6deab098b5c95f6e7270fc9f19bc9f00dc4dff6c
[dpdk.git] / drivers / net / memif / rte_eth_memif.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018-2019 Cisco Systems, Inc.  All rights reserved.
3  */
4
5 #include <stdlib.h>
6 #include <fcntl.h>
7 #include <unistd.h>
8 #include <sys/types.h>
9 #include <sys/socket.h>
10 #include <sys/un.h>
11 #include <sys/ioctl.h>
12 #include <sys/mman.h>
13 #include <linux/if_ether.h>
14 #include <errno.h>
15 #include <sys/eventfd.h>
16
17 #include <rte_version.h>
18 #include <rte_mbuf.h>
19 #include <rte_ether.h>
20 #include <rte_ethdev_driver.h>
21 #include <rte_ethdev_vdev.h>
22 #include <rte_malloc.h>
23 #include <rte_kvargs.h>
24 #include <rte_bus_vdev.h>
25 #include <rte_string_fns.h>
26 #include <rte_errno.h>
27 #include <rte_memory.h>
28 #include <rte_memzone.h>
29 #include <rte_eal_memconfig.h>
30
31 #include "rte_eth_memif.h"
32 #include "memif_socket.h"
33
34 #define ETH_MEMIF_ID_ARG                "id"
35 #define ETH_MEMIF_ROLE_ARG              "role"
36 #define ETH_MEMIF_PKT_BUFFER_SIZE_ARG   "bsize"
37 #define ETH_MEMIF_RING_SIZE_ARG         "rsize"
38 #define ETH_MEMIF_SOCKET_ARG            "socket"
39 #define ETH_MEMIF_MAC_ARG               "mac"
40 #define ETH_MEMIF_ZC_ARG                "zero-copy"
41 #define ETH_MEMIF_SECRET_ARG            "secret"
42
43 static const char * const valid_arguments[] = {
44         ETH_MEMIF_ID_ARG,
45         ETH_MEMIF_ROLE_ARG,
46         ETH_MEMIF_PKT_BUFFER_SIZE_ARG,
47         ETH_MEMIF_RING_SIZE_ARG,
48         ETH_MEMIF_SOCKET_ARG,
49         ETH_MEMIF_MAC_ARG,
50         ETH_MEMIF_ZC_ARG,
51         ETH_MEMIF_SECRET_ARG,
52         NULL
53 };
54
55 static const struct rte_eth_link pmd_link = {
56         .link_speed = ETH_SPEED_NUM_10G,
57         .link_duplex = ETH_LINK_FULL_DUPLEX,
58         .link_status = ETH_LINK_DOWN,
59         .link_autoneg = ETH_LINK_AUTONEG
60 };
61
62 #define MEMIF_MP_SEND_REGION            "memif_mp_send_region"
63
64
65 static int memif_region_init_zc(const struct rte_memseg_list *msl,
66                                 const struct rte_memseg *ms, void *arg);
67
68 const char *
69 memif_version(void)
70 {
71         return ("memif-" RTE_STR(MEMIF_VERSION_MAJOR) "." RTE_STR(MEMIF_VERSION_MINOR));
72 }
73
74 /* Message header to synchronize regions */
75 struct mp_region_msg {
76         char port_name[RTE_DEV_NAME_MAX_LEN];
77         memif_region_index_t idx;
78         memif_region_size_t size;
79 };
80
81 static int
82 memif_mp_send_region(const struct rte_mp_msg *msg, const void *peer)
83 {
84         struct rte_eth_dev *dev;
85         struct pmd_process_private *proc_private;
86         const struct mp_region_msg *msg_param = (const struct mp_region_msg *)msg->param;
87         struct rte_mp_msg reply;
88         struct mp_region_msg *reply_param = (struct mp_region_msg *)reply.param;
89         uint16_t port_id;
90         int ret;
91
92         /* Get requested port */
93         ret = rte_eth_dev_get_port_by_name(msg_param->port_name, &port_id);
94         if (ret) {
95                 MIF_LOG(ERR, "Failed to get port id for %s",
96                         msg_param->port_name);
97                 return -1;
98         }
99         dev = &rte_eth_devices[port_id];
100         proc_private = dev->process_private;
101
102         memset(&reply, 0, sizeof(reply));
103         strlcpy(reply.name, msg->name, sizeof(reply.name));
104         reply_param->idx = msg_param->idx;
105         if (proc_private->regions[msg_param->idx] != NULL) {
106                 reply_param->size = proc_private->regions[msg_param->idx]->region_size;
107                 reply.fds[0] = proc_private->regions[msg_param->idx]->fd;
108                 reply.num_fds = 1;
109         }
110         reply.len_param = sizeof(*reply_param);
111         if (rte_mp_reply(&reply, peer) < 0) {
112                 MIF_LOG(ERR, "Failed to reply to an add region request");
113                 return -1;
114         }
115
116         return 0;
117 }
118
119 /*
120  * Request regions
121  * Called by secondary process, when ports link status goes up.
122  */
123 static int
124 memif_mp_request_regions(struct rte_eth_dev *dev)
125 {
126         int ret, i;
127         struct timespec timeout = {.tv_sec = 5, .tv_nsec = 0};
128         struct rte_mp_msg msg, *reply;
129         struct rte_mp_reply replies;
130         struct mp_region_msg *msg_param = (struct mp_region_msg *)msg.param;
131         struct mp_region_msg *reply_param;
132         struct memif_region *r;
133         struct pmd_process_private *proc_private = dev->process_private;
134         struct pmd_internals *pmd = dev->data->dev_private;
135         /* in case of zero-copy slave, only request region 0 */
136         uint16_t max_region_num = (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) ?
137                                    1 : ETH_MEMIF_MAX_REGION_NUM;
138
139         MIF_LOG(DEBUG, "Requesting memory regions");
140
141         for (i = 0; i < max_region_num; i++) {
142                 /* Prepare the message */
143                 memset(&msg, 0, sizeof(msg));
144                 strlcpy(msg.name, MEMIF_MP_SEND_REGION, sizeof(msg.name));
145                 strlcpy(msg_param->port_name, dev->data->name,
146                         sizeof(msg_param->port_name));
147                 msg_param->idx = i;
148                 msg.len_param = sizeof(*msg_param);
149
150                 /* Send message */
151                 ret = rte_mp_request_sync(&msg, &replies, &timeout);
152                 if (ret < 0 || replies.nb_received != 1) {
153                         MIF_LOG(ERR, "Failed to send mp msg: %d",
154                                 rte_errno);
155                         return -1;
156                 }
157
158                 reply = &replies.msgs[0];
159                 reply_param = (struct mp_region_msg *)reply->param;
160
161                 if (reply_param->size > 0) {
162                         r = rte_zmalloc("region", sizeof(struct memif_region), 0);
163                         if (r == NULL) {
164                                 MIF_LOG(ERR, "Failed to alloc memif region.");
165                                 free(reply);
166                                 return -ENOMEM;
167                         }
168                         r->region_size = reply_param->size;
169                         if (reply->num_fds < 1) {
170                                 MIF_LOG(ERR, "Missing file descriptor.");
171                                 free(reply);
172                                 return -1;
173                         }
174                         r->fd = reply->fds[0];
175                         r->addr = NULL;
176
177                         proc_private->regions[reply_param->idx] = r;
178                         proc_private->regions_num++;
179                 }
180                 free(reply);
181         }
182
183         if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
184                 ret = rte_memseg_walk(memif_region_init_zc, (void *)proc_private);
185                 if (ret < 0)
186                         return ret;
187         }
188
189         return memif_connect(dev);
190 }
191
192 static int
193 memif_dev_info(struct rte_eth_dev *dev __rte_unused, struct rte_eth_dev_info *dev_info)
194 {
195         dev_info->max_mac_addrs = 1;
196         dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
197         dev_info->max_rx_queues = ETH_MEMIF_MAX_NUM_Q_PAIRS;
198         dev_info->max_tx_queues = ETH_MEMIF_MAX_NUM_Q_PAIRS;
199         dev_info->min_rx_bufsize = 0;
200
201         return 0;
202 }
203
204 static memif_ring_t *
205 memif_get_ring(struct pmd_internals *pmd, struct pmd_process_private *proc_private,
206                memif_ring_type_t type, uint16_t ring_num)
207 {
208         /* rings only in region 0 */
209         void *p = proc_private->regions[0]->addr;
210         int ring_size = sizeof(memif_ring_t) + sizeof(memif_desc_t) *
211             (1 << pmd->run.log2_ring_size);
212
213         p = (uint8_t *)p + (ring_num + type * pmd->run.num_s2m_rings) * ring_size;
214
215         return (memif_ring_t *)p;
216 }
217
218 static memif_region_offset_t
219 memif_get_ring_offset(struct rte_eth_dev *dev, struct memif_queue *mq,
220                       memif_ring_type_t type, uint16_t num)
221 {
222         struct pmd_internals *pmd = dev->data->dev_private;
223         struct pmd_process_private *proc_private = dev->process_private;
224
225         return ((uint8_t *)memif_get_ring(pmd, proc_private, type, num) -
226                 (uint8_t *)proc_private->regions[mq->region]->addr);
227 }
228
229 static memif_ring_t *
230 memif_get_ring_from_queue(struct pmd_process_private *proc_private,
231                           struct memif_queue *mq)
232 {
233         struct memif_region *r;
234
235         r = proc_private->regions[mq->region];
236         if (r == NULL)
237                 return NULL;
238
239         return (memif_ring_t *)((uint8_t *)r->addr + mq->ring_offset);
240 }
241
242 static void *
243 memif_get_buffer(struct pmd_process_private *proc_private, memif_desc_t *d)
244 {
245         return ((uint8_t *)proc_private->regions[d->region]->addr + d->offset);
246 }
247
248 /* Free mbufs received by master */
249 static void
250 memif_free_stored_mbufs(struct pmd_process_private *proc_private, struct memif_queue *mq)
251 {
252         uint16_t mask = (1 << mq->log2_ring_size) - 1;
253         memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
254
255         /* FIXME: improve performance */
256         /* The ring->tail acts as a guard variable between Tx and Rx
257          * threads, so using load-acquire pairs with store-release
258          * to synchronize it between threads.
259          */
260         while (mq->last_tail != __atomic_load_n(&ring->tail,
261                                                 __ATOMIC_ACQUIRE)) {
262                 RTE_MBUF_PREFETCH_TO_FREE(mq->buffers[(mq->last_tail + 1) & mask]);
263                 /* Decrement refcnt and free mbuf. (current segment) */
264                 rte_mbuf_refcnt_update(mq->buffers[mq->last_tail & mask], -1);
265                 rte_pktmbuf_free_seg(mq->buffers[mq->last_tail & mask]);
266                 mq->last_tail++;
267         }
268 }
269
270 static int
271 memif_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *cur_tail,
272                     struct rte_mbuf *tail)
273 {
274         /* Check for number-of-segments-overflow */
275         if (unlikely(head->nb_segs + tail->nb_segs > RTE_MBUF_MAX_NB_SEGS))
276                 return -EOVERFLOW;
277
278         /* Chain 'tail' onto the old tail */
279         cur_tail->next = tail;
280
281         /* accumulate number of segments and total length. */
282         head->nb_segs = (uint16_t)(head->nb_segs + tail->nb_segs);
283
284         tail->pkt_len = tail->data_len;
285         head->pkt_len += tail->pkt_len;
286
287         return 0;
288 }
289
290 static uint16_t
291 eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
292 {
293         struct memif_queue *mq = queue;
294         struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
295         struct pmd_process_private *proc_private =
296                 rte_eth_devices[mq->in_port].process_private;
297         memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
298         uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0;
299         uint16_t n_rx_pkts = 0;
300         uint16_t mbuf_size = rte_pktmbuf_data_room_size(mq->mempool) -
301                 RTE_PKTMBUF_HEADROOM;
302         uint16_t src_len, src_off, dst_len, dst_off, cp_len;
303         memif_ring_type_t type = mq->type;
304         memif_desc_t *d0;
305         struct rte_mbuf *mbuf, *mbuf_head, *mbuf_tail;
306         uint64_t b;
307         ssize_t size __rte_unused;
308         uint16_t head;
309         int ret;
310         struct rte_eth_link link;
311
312         if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
313                 return 0;
314         if (unlikely(ring == NULL)) {
315                 /* Secondary process will attempt to request regions. */
316                 ret = rte_eth_link_get(mq->in_port, &link);
317                 if (ret < 0)
318                         MIF_LOG(ERR, "Failed to get port %u link info: %s",
319                                 mq->in_port, rte_strerror(-ret));
320                 return 0;
321         }
322
323         /* consume interrupt */
324         if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0)
325                 size = read(mq->intr_handle.fd, &b, sizeof(b));
326
327         ring_size = 1 << mq->log2_ring_size;
328         mask = ring_size - 1;
329
330         if (type == MEMIF_RING_S2M) {
331                 cur_slot = mq->last_head;
332                 last_slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
333         } else {
334                 cur_slot = mq->last_tail;
335                 last_slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
336         }
337
338         if (cur_slot == last_slot)
339                 goto refill;
340         n_slots = last_slot - cur_slot;
341
342         while (n_slots && n_rx_pkts < nb_pkts) {
343                 mbuf_head = rte_pktmbuf_alloc(mq->mempool);
344                 if (unlikely(mbuf_head == NULL))
345                         goto no_free_bufs;
346                 mbuf = mbuf_head;
347                 mbuf->port = mq->in_port;
348
349 next_slot:
350                 s0 = cur_slot & mask;
351                 d0 = &ring->desc[s0];
352
353                 src_len = d0->length;
354                 dst_off = 0;
355                 src_off = 0;
356
357                 do {
358                         dst_len = mbuf_size - dst_off;
359                         if (dst_len == 0) {
360                                 dst_off = 0;
361                                 dst_len = mbuf_size;
362
363                                 /* store pointer to tail */
364                                 mbuf_tail = mbuf;
365                                 mbuf = rte_pktmbuf_alloc(mq->mempool);
366                                 if (unlikely(mbuf == NULL))
367                                         goto no_free_bufs;
368                                 mbuf->port = mq->in_port;
369                                 ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
370                                 if (unlikely(ret < 0)) {
371                                         MIF_LOG(ERR, "number-of-segments-overflow");
372                                         rte_pktmbuf_free(mbuf);
373                                         goto no_free_bufs;
374                                 }
375                         }
376                         cp_len = RTE_MIN(dst_len, src_len);
377
378                         rte_pktmbuf_data_len(mbuf) += cp_len;
379                         rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
380                         if (mbuf != mbuf_head)
381                                 rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
382
383                         memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, dst_off),
384                                (uint8_t *)memif_get_buffer(proc_private, d0) + src_off,
385                                cp_len);
386
387                         src_off += cp_len;
388                         dst_off += cp_len;
389                         src_len -= cp_len;
390                 } while (src_len);
391
392                 cur_slot++;
393                 n_slots--;
394
395                 if (d0->flags & MEMIF_DESC_FLAG_NEXT)
396                         goto next_slot;
397
398                 mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
399                 *bufs++ = mbuf_head;
400                 n_rx_pkts++;
401         }
402
403 no_free_bufs:
404         if (type == MEMIF_RING_S2M) {
405                 __atomic_store_n(&ring->tail, cur_slot, __ATOMIC_RELEASE);
406                 mq->last_head = cur_slot;
407         } else {
408                 mq->last_tail = cur_slot;
409         }
410
411 refill:
412         if (type == MEMIF_RING_M2S) {
413                 /* ring->head is updated by the receiver and this function
414                  * is called in the context of receiver thread. The loads in
415                  * the receiver do not need to synchronize with its own stores.
416                  */
417                 head = __atomic_load_n(&ring->head, __ATOMIC_RELAXED);
418                 n_slots = ring_size - head + mq->last_tail;
419
420                 while (n_slots--) {
421                         s0 = head++ & mask;
422                         d0 = &ring->desc[s0];
423                         d0->length = pmd->run.pkt_buffer_size;
424                 }
425                 __atomic_store_n(&ring->head, head, __ATOMIC_RELEASE);
426         }
427
428         mq->n_pkts += n_rx_pkts;
429         return n_rx_pkts;
430 }
431
432 static uint16_t
433 eth_memif_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
434 {
435         struct memif_queue *mq = queue;
436         struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
437         struct pmd_process_private *proc_private =
438                 rte_eth_devices[mq->in_port].process_private;
439         memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
440         uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0, head;
441         uint16_t n_rx_pkts = 0;
442         memif_desc_t *d0;
443         struct rte_mbuf *mbuf, *mbuf_tail;
444         struct rte_mbuf *mbuf_head = NULL;
445         int ret;
446         struct rte_eth_link link;
447
448         if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
449                 return 0;
450         if (unlikely(ring == NULL)) {
451                 /* Secondary process will attempt to request regions. */
452                 rte_eth_link_get(mq->in_port, &link);
453                 return 0;
454         }
455
456         /* consume interrupt */
457         if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
458                 uint64_t b;
459                 ssize_t size __rte_unused;
460                 size = read(mq->intr_handle.fd, &b, sizeof(b));
461         }
462
463         ring_size = 1 << mq->log2_ring_size;
464         mask = ring_size - 1;
465
466         cur_slot = mq->last_tail;
467         /* The ring->tail acts as a guard variable between Tx and Rx
468          * threads, so using load-acquire pairs with store-release
469          * to synchronize it between threads.
470          */
471         last_slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
472         if (cur_slot == last_slot)
473                 goto refill;
474         n_slots = last_slot - cur_slot;
475
476         while (n_slots && n_rx_pkts < nb_pkts) {
477                 s0 = cur_slot & mask;
478
479                 d0 = &ring->desc[s0];
480                 mbuf_head = mq->buffers[s0];
481                 mbuf = mbuf_head;
482
483 next_slot:
484                 /* prefetch next descriptor */
485                 if (n_rx_pkts + 1 < nb_pkts)
486                         rte_prefetch0(&ring->desc[(cur_slot + 1) & mask]);
487
488                 mbuf->port = mq->in_port;
489                 rte_pktmbuf_data_len(mbuf) = d0->length;
490                 rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
491
492                 mq->n_bytes += rte_pktmbuf_data_len(mbuf);
493
494                 cur_slot++;
495                 n_slots--;
496                 if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
497                         s0 = cur_slot & mask;
498                         d0 = &ring->desc[s0];
499                         mbuf_tail = mbuf;
500                         mbuf = mq->buffers[s0];
501                         ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
502                         if (unlikely(ret < 0)) {
503                                 MIF_LOG(ERR, "number-of-segments-overflow");
504                                 goto refill;
505                         }
506                         goto next_slot;
507                 }
508
509                 *bufs++ = mbuf_head;
510                 n_rx_pkts++;
511         }
512
513         mq->last_tail = cur_slot;
514
515 /* Supply master with new buffers */
516 refill:
517         /* The ring->head acts as a guard variable between Tx and Rx
518          * threads, so using load-acquire pairs with store-release
519          * to synchronize it between threads.
520          */
521         head = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
522         n_slots = ring_size - head + mq->last_tail;
523
524         if (n_slots < 32)
525                 goto no_free_mbufs;
526
527         ret = rte_pktmbuf_alloc_bulk(mq->mempool, &mq->buffers[head & mask], n_slots);
528         if (unlikely(ret < 0))
529                 goto no_free_mbufs;
530
531         while (n_slots--) {
532                 s0 = head++ & mask;
533                 if (n_slots > 0)
534                         rte_prefetch0(mq->buffers[head & mask]);
535                 d0 = &ring->desc[s0];
536                 /* store buffer header */
537                 mbuf = mq->buffers[s0];
538                 /* populate descriptor */
539                 d0->length = rte_pktmbuf_data_room_size(mq->mempool) -
540                                 RTE_PKTMBUF_HEADROOM;
541                 d0->region = 1;
542                 d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
543                         (uint8_t *)proc_private->regions[d0->region]->addr;
544         }
545 no_free_mbufs:
546         __atomic_store_n(&ring->head, head, __ATOMIC_RELEASE);
547
548         mq->n_pkts += n_rx_pkts;
549
550         return n_rx_pkts;
551 }
552
553 static uint16_t
554 eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
555 {
556         struct memif_queue *mq = queue;
557         struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
558         struct pmd_process_private *proc_private =
559                 rte_eth_devices[mq->in_port].process_private;
560         memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
561         uint16_t slot, saved_slot, n_free, ring_size, mask, n_tx_pkts = 0;
562         uint16_t src_len, src_off, dst_len, dst_off, cp_len;
563         memif_ring_type_t type = mq->type;
564         memif_desc_t *d0;
565         struct rte_mbuf *mbuf;
566         struct rte_mbuf *mbuf_head;
567         uint64_t a;
568         ssize_t size;
569         struct rte_eth_link link;
570
571         if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
572                 return 0;
573         if (unlikely(ring == NULL)) {
574                 int ret;
575
576                 /* Secondary process will attempt to request regions. */
577                 ret = rte_eth_link_get(mq->in_port, &link);
578                 if (ret < 0)
579                         MIF_LOG(ERR, "Failed to get port %u link info: %s",
580                                 mq->in_port, rte_strerror(-ret));
581                 return 0;
582         }
583
584         ring_size = 1 << mq->log2_ring_size;
585         mask = ring_size - 1;
586
587         if (type == MEMIF_RING_S2M) {
588                 slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
589                 n_free = ring_size - slot +
590                                 __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
591         } else {
592                 /* For M2S queues ring->tail is updated by the sender and
593                  * this function is called in the context of sending thread.
594                  * The loads in the sender do not need to synchronize with
595                  * its own stores. Hence, the following load can be a
596                  * relaxed load.
597                  */
598                 slot = __atomic_load_n(&ring->tail, __ATOMIC_RELAXED);
599                 n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
600         }
601
602         while (n_tx_pkts < nb_pkts && n_free) {
603                 mbuf_head = *bufs++;
604                 mbuf = mbuf_head;
605
606                 saved_slot = slot;
607                 d0 = &ring->desc[slot & mask];
608                 dst_off = 0;
609                 dst_len = (type == MEMIF_RING_S2M) ?
610                         pmd->run.pkt_buffer_size : d0->length;
611
612 next_in_chain:
613                 src_off = 0;
614                 src_len = rte_pktmbuf_data_len(mbuf);
615
616                 while (src_len) {
617                         if (dst_len == 0) {
618                                 if (n_free) {
619                                         slot++;
620                                         n_free--;
621                                         d0->flags |= MEMIF_DESC_FLAG_NEXT;
622                                         d0 = &ring->desc[slot & mask];
623                                         dst_off = 0;
624                                         dst_len = (type == MEMIF_RING_S2M) ?
625                                             pmd->run.pkt_buffer_size : d0->length;
626                                         d0->flags = 0;
627                                 } else {
628                                         slot = saved_slot;
629                                         goto no_free_slots;
630                                 }
631                         }
632                         cp_len = RTE_MIN(dst_len, src_len);
633
634                         memcpy((uint8_t *)memif_get_buffer(proc_private, d0) + dst_off,
635                                rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
636                                cp_len);
637
638                         mq->n_bytes += cp_len;
639                         src_off += cp_len;
640                         dst_off += cp_len;
641                         src_len -= cp_len;
642                         dst_len -= cp_len;
643
644                         d0->length = dst_off;
645                 }
646
647                 if (rte_pktmbuf_is_contiguous(mbuf) == 0) {
648                         mbuf = mbuf->next;
649                         goto next_in_chain;
650                 }
651
652                 n_tx_pkts++;
653                 slot++;
654                 n_free--;
655                 rte_pktmbuf_free(mbuf_head);
656         }
657
658 no_free_slots:
659         if (type == MEMIF_RING_S2M)
660                 __atomic_store_n(&ring->head, slot, __ATOMIC_RELEASE);
661         else
662                 __atomic_store_n(&ring->tail, slot, __ATOMIC_RELEASE);
663
664         if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
665                 a = 1;
666                 size = write(mq->intr_handle.fd, &a, sizeof(a));
667                 if (unlikely(size < 0)) {
668                         MIF_LOG(WARNING,
669                                 "Failed to send interrupt. %s", strerror(errno));
670                 }
671         }
672
673         mq->n_pkts += n_tx_pkts;
674         return n_tx_pkts;
675 }
676
677
678 static int
679 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
680                 memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
681                 uint16_t slot, uint16_t n_free)
682 {
683         memif_desc_t *d0;
684         int used_slots = 1;
685
686 next_in_chain:
687         /* store pointer to mbuf to free it later */
688         mq->buffers[slot & mask] = mbuf;
689         /* Increment refcnt to make sure the buffer is not freed before master
690          * receives it. (current segment)
691          */
692         rte_mbuf_refcnt_update(mbuf, 1);
693         /* populate descriptor */
694         d0 = &ring->desc[slot & mask];
695         d0->length = rte_pktmbuf_data_len(mbuf);
696         /* FIXME: get region index */
697         d0->region = 1;
698         d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
699                 (uint8_t *)proc_private->regions[d0->region]->addr;
700         d0->flags = 0;
701
702         /* check if buffer is chained */
703         if (rte_pktmbuf_is_contiguous(mbuf) == 0) {
704                 if (n_free < 2)
705                         return 0;
706                 /* mark buffer as chained */
707                 d0->flags |= MEMIF_DESC_FLAG_NEXT;
708                 /* advance mbuf */
709                 mbuf = mbuf->next;
710                 /* update counters */
711                 used_slots++;
712                 slot++;
713                 n_free--;
714                 goto next_in_chain;
715         }
716         return used_slots;
717 }
718
719 static uint16_t
720 eth_memif_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
721 {
722         struct memif_queue *mq = queue;
723         struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
724         struct pmd_process_private *proc_private =
725                 rte_eth_devices[mq->in_port].process_private;
726         memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
727         uint16_t slot, n_free, ring_size, mask, n_tx_pkts = 0;
728         memif_ring_type_t type = mq->type;
729         struct rte_eth_link link;
730
731         if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
732                 return 0;
733         if (unlikely(ring == NULL)) {
734                 /* Secondary process will attempt to request regions. */
735                 rte_eth_link_get(mq->in_port, &link);
736                 return 0;
737         }
738
739         ring_size = 1 << mq->log2_ring_size;
740         mask = ring_size - 1;
741
742         /* free mbufs received by master */
743         memif_free_stored_mbufs(proc_private, mq);
744
745         /* ring type always MEMIF_RING_S2M */
746         /* The ring->head acts as a guard variable between Tx and Rx
747          * threads, so using load-acquire pairs with store-release
748          * to synchronize it between threads.
749          */
750         slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
751         n_free = ring_size - slot + mq->last_tail;
752
753         int used_slots;
754
755         while (n_free && (n_tx_pkts < nb_pkts)) {
756                 while ((n_free > 4) && ((nb_pkts - n_tx_pkts) > 4)) {
757                         if ((nb_pkts - n_tx_pkts) > 8) {
758                                 rte_prefetch0(*bufs + 4);
759                                 rte_prefetch0(*bufs + 5);
760                                 rte_prefetch0(*bufs + 6);
761                                 rte_prefetch0(*bufs + 7);
762                         }
763                         used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
764                                 mask, slot, n_free);
765                         if (unlikely(used_slots < 1))
766                                 goto no_free_slots;
767                         n_tx_pkts++;
768                         slot += used_slots;
769                         n_free -= used_slots;
770
771                         used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
772                                 mask, slot, n_free);
773                         if (unlikely(used_slots < 1))
774                                 goto no_free_slots;
775                         n_tx_pkts++;
776                         slot += used_slots;
777                         n_free -= used_slots;
778
779                         used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
780                                 mask, slot, n_free);
781                         if (unlikely(used_slots < 1))
782                                 goto no_free_slots;
783                         n_tx_pkts++;
784                         slot += used_slots;
785                         n_free -= used_slots;
786
787                         used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
788                                 mask, slot, n_free);
789                         if (unlikely(used_slots < 1))
790                                 goto no_free_slots;
791                         n_tx_pkts++;
792                         slot += used_slots;
793                         n_free -= used_slots;
794                 }
795                 used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
796                         mask, slot, n_free);
797                 if (unlikely(used_slots < 1))
798                         goto no_free_slots;
799                 n_tx_pkts++;
800                 slot += used_slots;
801                 n_free -= used_slots;
802         }
803
804 no_free_slots:
805         /* update ring pointers */
806         if (type == MEMIF_RING_S2M)
807                 __atomic_store_n(&ring->head, slot, __ATOMIC_RELEASE);
808         else
809                 __atomic_store_n(&ring->tail, slot, __ATOMIC_RELEASE);
810
811         /* Send interrupt, if enabled. */
812         if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
813                 uint64_t a = 1;
814                 ssize_t size = write(mq->intr_handle.fd, &a, sizeof(a));
815                 if (unlikely(size < 0)) {
816                         MIF_LOG(WARNING,
817                                 "Failed to send interrupt. %s", strerror(errno));
818                 }
819         }
820
821         /* increment queue counters */
822         mq->n_pkts += n_tx_pkts;
823
824         return n_tx_pkts;
825 }
826
827 void
828 memif_free_regions(struct rte_eth_dev *dev)
829 {
830         struct pmd_process_private *proc_private = dev->process_private;
831         struct pmd_internals *pmd = dev->data->dev_private;
832         int i;
833         struct memif_region *r;
834
835         /* regions are allocated contiguously, so it's
836          * enough to loop until 'proc_private->regions_num'
837          */
838         for (i = 0; i < proc_private->regions_num; i++) {
839                 r = proc_private->regions[i];
840                 if (r != NULL) {
841                         /* This is memzone */
842                         if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
843                                 r->addr = NULL;
844                                 if (r->fd > 0)
845                                         close(r->fd);
846                         }
847                         if (r->addr != NULL) {
848                                 munmap(r->addr, r->region_size);
849                                 if (r->fd > 0) {
850                                         close(r->fd);
851                                         r->fd = -1;
852                                 }
853                         }
854                         rte_free(r);
855                         proc_private->regions[i] = NULL;
856                 }
857         }
858         proc_private->regions_num = 0;
859 }
860
861 static int
862 memif_region_init_zc(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
863                      void *arg)
864 {
865         struct pmd_process_private *proc_private = (struct pmd_process_private *)arg;
866         struct memif_region *r;
867
868         if (proc_private->regions_num < 1) {
869                 MIF_LOG(ERR, "Missing descriptor region");
870                 return -1;
871         }
872
873         r = proc_private->regions[proc_private->regions_num - 1];
874
875         if (r->addr != msl->base_va)
876                 r = proc_private->regions[++proc_private->regions_num - 1];
877
878         if (r == NULL) {
879                 r = rte_zmalloc("region", sizeof(struct memif_region), 0);
880                 if (r == NULL) {
881                         MIF_LOG(ERR, "Failed to alloc memif region.");
882                         return -ENOMEM;
883                 }
884
885                 r->addr = msl->base_va;
886                 r->region_size = ms->len;
887                 r->fd = rte_memseg_get_fd(ms);
888                 if (r->fd < 0)
889                         return -1;
890                 r->pkt_buffer_offset = 0;
891
892                 proc_private->regions[proc_private->regions_num - 1] = r;
893         } else {
894                 r->region_size += ms->len;
895         }
896
897         return 0;
898 }
899
900 static int
901 memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
902 {
903         struct pmd_internals *pmd = dev->data->dev_private;
904         struct pmd_process_private *proc_private = dev->process_private;
905         char shm_name[ETH_MEMIF_SHM_NAME_SIZE];
906         int ret = 0;
907         struct memif_region *r;
908
909         if (proc_private->regions_num >= ETH_MEMIF_MAX_REGION_NUM) {
910                 MIF_LOG(ERR, "Too many regions.");
911                 return -1;
912         }
913
914         r = rte_zmalloc("region", sizeof(struct memif_region), 0);
915         if (r == NULL) {
916                 MIF_LOG(ERR, "Failed to alloc memif region.");
917                 return -ENOMEM;
918         }
919
920         /* calculate buffer offset */
921         r->pkt_buffer_offset = (pmd->run.num_s2m_rings + pmd->run.num_m2s_rings) *
922             (sizeof(memif_ring_t) + sizeof(memif_desc_t) *
923             (1 << pmd->run.log2_ring_size));
924
925         r->region_size = r->pkt_buffer_offset;
926         /* if region has buffers, add buffers size to region_size */
927         if (has_buffers == 1)
928                 r->region_size += (uint32_t)(pmd->run.pkt_buffer_size *
929                         (1 << pmd->run.log2_ring_size) *
930                         (pmd->run.num_s2m_rings +
931                          pmd->run.num_m2s_rings));
932
933         memset(shm_name, 0, sizeof(char) * ETH_MEMIF_SHM_NAME_SIZE);
934         snprintf(shm_name, ETH_MEMIF_SHM_NAME_SIZE, "memif_region_%d",
935                  proc_private->regions_num);
936
937         r->fd = memfd_create(shm_name, MFD_ALLOW_SEALING);
938         if (r->fd < 0) {
939                 MIF_LOG(ERR, "Failed to create shm file: %s.", strerror(errno));
940                 ret = -1;
941                 goto error;
942         }
943
944         ret = fcntl(r->fd, F_ADD_SEALS, F_SEAL_SHRINK);
945         if (ret < 0) {
946                 MIF_LOG(ERR, "Failed to add seals to shm file: %s.", strerror(errno));
947                 goto error;
948         }
949
950         ret = ftruncate(r->fd, r->region_size);
951         if (ret < 0) {
952                 MIF_LOG(ERR, "Failed to truncate shm file: %s.", strerror(errno));
953                 goto error;
954         }
955
956         r->addr = mmap(NULL, r->region_size, PROT_READ |
957                        PROT_WRITE, MAP_SHARED, r->fd, 0);
958         if (r->addr == MAP_FAILED) {
959                 MIF_LOG(ERR, "Failed to mmap shm region: %s.", strerror(ret));
960                 ret = -1;
961                 goto error;
962         }
963
964         proc_private->regions[proc_private->regions_num] = r;
965         proc_private->regions_num++;
966
967         return ret;
968
969 error:
970         if (r->fd > 0)
971                 close(r->fd);
972         r->fd = -1;
973
974         return ret;
975 }
976
977 static int
978 memif_regions_init(struct rte_eth_dev *dev)
979 {
980         struct pmd_internals *pmd = dev->data->dev_private;
981         int ret;
982
983         /*
984          * Zero-copy exposes dpdk memory.
985          * Each memseg list will be represented by memif region.
986          * Zero-copy regions indexing: memseg list idx + 1,
987          * as we already have region 0 reserved for descriptors.
988          */
989         if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
990                 /* create region idx 0 containing descriptors */
991                 ret = memif_region_init_shm(dev, 0);
992                 if (ret < 0)
993                         return ret;
994                 ret = rte_memseg_walk(memif_region_init_zc, (void *)dev->process_private);
995                 if (ret < 0)
996                         return ret;
997         } else {
998                 /* create one memory region contaning rings and buffers */
999                 ret = memif_region_init_shm(dev, /* has buffers */ 1);
1000                 if (ret < 0)
1001                         return ret;
1002         }
1003
1004         return 0;
1005 }
1006
1007 static void
1008 memif_init_rings(struct rte_eth_dev *dev)
1009 {
1010         struct pmd_internals *pmd = dev->data->dev_private;
1011         struct pmd_process_private *proc_private = dev->process_private;
1012         memif_ring_t *ring;
1013         int i, j;
1014         uint16_t slot;
1015
1016         for (i = 0; i < pmd->run.num_s2m_rings; i++) {
1017                 ring = memif_get_ring(pmd, proc_private, MEMIF_RING_S2M, i);
1018                 __atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
1019                 __atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
1020                 ring->cookie = MEMIF_COOKIE;
1021                 ring->flags = 0;
1022
1023                 if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
1024                         continue;
1025
1026                 for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
1027                         slot = i * (1 << pmd->run.log2_ring_size) + j;
1028                         ring->desc[j].region = 0;
1029                         ring->desc[j].offset =
1030                                 proc_private->regions[0]->pkt_buffer_offset +
1031                                 (uint32_t)(slot * pmd->run.pkt_buffer_size);
1032                         ring->desc[j].length = pmd->run.pkt_buffer_size;
1033                 }
1034         }
1035
1036         for (i = 0; i < pmd->run.num_m2s_rings; i++) {
1037                 ring = memif_get_ring(pmd, proc_private, MEMIF_RING_M2S, i);
1038                 __atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
1039                 __atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
1040                 ring->cookie = MEMIF_COOKIE;
1041                 ring->flags = 0;
1042
1043                 if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
1044                         continue;
1045
1046                 for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
1047                         slot = (i + pmd->run.num_s2m_rings) *
1048                             (1 << pmd->run.log2_ring_size) + j;
1049                         ring->desc[j].region = 0;
1050                         ring->desc[j].offset =
1051                                 proc_private->regions[0]->pkt_buffer_offset +
1052                                 (uint32_t)(slot * pmd->run.pkt_buffer_size);
1053                         ring->desc[j].length = pmd->run.pkt_buffer_size;
1054                 }
1055         }
1056 }
1057
1058 /* called only by slave */
1059 static int
1060 memif_init_queues(struct rte_eth_dev *dev)
1061 {
1062         struct pmd_internals *pmd = dev->data->dev_private;
1063         struct memif_queue *mq;
1064         int i;
1065
1066         for (i = 0; i < pmd->run.num_s2m_rings; i++) {
1067                 mq = dev->data->tx_queues[i];
1068                 mq->log2_ring_size = pmd->run.log2_ring_size;
1069                 /* queues located only in region 0 */
1070                 mq->region = 0;
1071                 mq->ring_offset = memif_get_ring_offset(dev, mq, MEMIF_RING_S2M, i);
1072                 mq->last_head = 0;
1073                 mq->last_tail = 0;
1074                 mq->intr_handle.fd = eventfd(0, EFD_NONBLOCK);
1075                 if (mq->intr_handle.fd < 0) {
1076                         MIF_LOG(WARNING,
1077                                 "Failed to create eventfd for tx queue %d: %s.", i,
1078                                 strerror(errno));
1079                 }
1080                 mq->buffers = NULL;
1081                 if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1082                         mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
1083                                                   (1 << mq->log2_ring_size), 0);
1084                         if (mq->buffers == NULL)
1085                                 return -ENOMEM;
1086                 }
1087         }
1088
1089         for (i = 0; i < pmd->run.num_m2s_rings; i++) {
1090                 mq = dev->data->rx_queues[i];
1091                 mq->log2_ring_size = pmd->run.log2_ring_size;
1092                 /* queues located only in region 0 */
1093                 mq->region = 0;
1094                 mq->ring_offset = memif_get_ring_offset(dev, mq, MEMIF_RING_M2S, i);
1095                 mq->last_head = 0;
1096                 mq->last_tail = 0;
1097                 mq->intr_handle.fd = eventfd(0, EFD_NONBLOCK);
1098                 if (mq->intr_handle.fd < 0) {
1099                         MIF_LOG(WARNING,
1100                                 "Failed to create eventfd for rx queue %d: %s.", i,
1101                                 strerror(errno));
1102                 }
1103                 mq->buffers = NULL;
1104                 if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1105                         mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
1106                                                   (1 << mq->log2_ring_size), 0);
1107                         if (mq->buffers == NULL)
1108                                 return -ENOMEM;
1109                 }
1110         }
1111         return 0;
1112 }
1113
/*
 * Create the memory regions, initialize the rings located in them and set
 * up the queues, in that order.
 *
 * Returns 0 on success or the negative value of the first failing step.
 */
int
memif_init_regions_and_queues(struct rte_eth_dev *dev)
{
	int rc = memif_regions_init(dev);

	if (rc < 0)
		return rc;

	/* rings live inside the regions created above */
	memif_init_rings(dev);

	rc = memif_init_queues(dev);
	return (rc < 0) ? rc : 0;
}
1131
1132 int
1133 memif_connect(struct rte_eth_dev *dev)
1134 {
1135         struct pmd_internals *pmd = dev->data->dev_private;
1136         struct pmd_process_private *proc_private = dev->process_private;
1137         struct memif_region *mr;
1138         struct memif_queue *mq;
1139         memif_ring_t *ring;
1140         int i;
1141
1142         for (i = 0; i < proc_private->regions_num; i++) {
1143                 mr = proc_private->regions[i];
1144                 if (mr != NULL) {
1145                         if (mr->addr == NULL) {
1146                                 if (mr->fd < 0)
1147                                         return -1;
1148                                 mr->addr = mmap(NULL, mr->region_size,
1149                                                 PROT_READ | PROT_WRITE,
1150                                                 MAP_SHARED, mr->fd, 0);
1151                                 if (mr->addr == MAP_FAILED) {
1152                                         MIF_LOG(ERR, "mmap failed: %s\n",
1153                                                 strerror(errno));
1154                                         return -1;
1155                                 }
1156                         }
1157                         if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
1158                                 /* close memseg file */
1159                                 close(mr->fd);
1160                                 mr->fd = -1;
1161                         }
1162                 }
1163         }
1164
1165         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1166                 for (i = 0; i < pmd->run.num_s2m_rings; i++) {
1167                         mq = (pmd->role == MEMIF_ROLE_SLAVE) ?
1168                             dev->data->tx_queues[i] : dev->data->rx_queues[i];
1169                         ring = memif_get_ring_from_queue(proc_private, mq);
1170                         if (ring == NULL || ring->cookie != MEMIF_COOKIE) {
1171                                 MIF_LOG(ERR, "Wrong ring");
1172                                 return -1;
1173                         }
1174                         __atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
1175                         __atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
1176                         mq->last_head = 0;
1177                         mq->last_tail = 0;
1178                         /* enable polling mode */
1179                         if (pmd->role == MEMIF_ROLE_MASTER)
1180                                 ring->flags = MEMIF_RING_FLAG_MASK_INT;
1181                 }
1182                 for (i = 0; i < pmd->run.num_m2s_rings; i++) {
1183                         mq = (pmd->role == MEMIF_ROLE_SLAVE) ?
1184                             dev->data->rx_queues[i] : dev->data->tx_queues[i];
1185                         ring = memif_get_ring_from_queue(proc_private, mq);
1186                         if (ring == NULL || ring->cookie != MEMIF_COOKIE) {
1187                                 MIF_LOG(ERR, "Wrong ring");
1188                                 return -1;
1189                         }
1190                         __atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
1191                         __atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
1192                         mq->last_head = 0;
1193                         mq->last_tail = 0;
1194                         /* enable polling mode */
1195                         if (pmd->role == MEMIF_ROLE_SLAVE)
1196                                 ring->flags = MEMIF_RING_FLAG_MASK_INT;
1197                 }
1198
1199                 pmd->flags &= ~ETH_MEMIF_FLAG_CONNECTING;
1200                 pmd->flags |= ETH_MEMIF_FLAG_CONNECTED;
1201                 dev->data->dev_link.link_status = ETH_LINK_UP;
1202         }
1203         MIF_LOG(INFO, "Connected.");
1204         return 0;
1205 }
1206
1207 static int
1208 memif_dev_start(struct rte_eth_dev *dev)
1209 {
1210         struct pmd_internals *pmd = dev->data->dev_private;
1211         int ret = 0;
1212
1213         switch (pmd->role) {
1214         case MEMIF_ROLE_SLAVE:
1215                 ret = memif_connect_slave(dev);
1216                 break;
1217         case MEMIF_ROLE_MASTER:
1218                 ret = memif_connect_master(dev);
1219                 break;
1220         default:
1221                 MIF_LOG(ERR, "Unknown role: %d.", pmd->role);
1222                 ret = -1;
1223                 break;
1224         }
1225
1226         return ret;
1227 }
1228
1229 static int
1230 memif_dev_close(struct rte_eth_dev *dev)
1231 {
1232         struct pmd_internals *pmd = dev->data->dev_private;
1233         int i;
1234
1235         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1236                 memif_msg_enq_disconnect(pmd->cc, "Device closed", 0);
1237                 memif_disconnect(dev);
1238
1239                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1240                         (*dev->dev_ops->rx_queue_release)(dev->data->rx_queues[i]);
1241                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1242                         (*dev->dev_ops->tx_queue_release)(dev->data->tx_queues[i]);
1243
1244                 memif_socket_remove_device(dev);
1245         } else {
1246                 memif_disconnect(dev);
1247         }
1248
1249         rte_free(dev->process_private);
1250
1251         return 0;
1252 }
1253
1254 static int
1255 memif_dev_configure(struct rte_eth_dev *dev)
1256 {
1257         struct pmd_internals *pmd = dev->data->dev_private;
1258
1259         /*
1260          * SLAVE - TXQ
1261          * MASTER - RXQ
1262          */
1263         pmd->cfg.num_s2m_rings = (pmd->role == MEMIF_ROLE_SLAVE) ?
1264                                   dev->data->nb_tx_queues : dev->data->nb_rx_queues;
1265
1266         /*
1267          * SLAVE - RXQ
1268          * MASTER - TXQ
1269          */
1270         pmd->cfg.num_m2s_rings = (pmd->role == MEMIF_ROLE_SLAVE) ?
1271                                   dev->data->nb_rx_queues : dev->data->nb_tx_queues;
1272
1273         return 0;
1274 }
1275
1276 static int
1277 memif_tx_queue_setup(struct rte_eth_dev *dev,
1278                      uint16_t qid,
1279                      uint16_t nb_tx_desc __rte_unused,
1280                      unsigned int socket_id __rte_unused,
1281                      const struct rte_eth_txconf *tx_conf __rte_unused)
1282 {
1283         struct pmd_internals *pmd = dev->data->dev_private;
1284         struct memif_queue *mq;
1285
1286         mq = rte_zmalloc("tx-queue", sizeof(struct memif_queue), 0);
1287         if (mq == NULL) {
1288                 MIF_LOG(ERR, "Failed to allocate tx queue id: %u", qid);
1289                 return -ENOMEM;
1290         }
1291
1292         mq->type =
1293             (pmd->role == MEMIF_ROLE_SLAVE) ? MEMIF_RING_S2M : MEMIF_RING_M2S;
1294         mq->n_pkts = 0;
1295         mq->n_bytes = 0;
1296         mq->intr_handle.fd = -1;
1297         mq->intr_handle.type = RTE_INTR_HANDLE_EXT;
1298         mq->in_port = dev->data->port_id;
1299         dev->data->tx_queues[qid] = mq;
1300
1301         return 0;
1302 }
1303
1304 static int
1305 memif_rx_queue_setup(struct rte_eth_dev *dev,
1306                      uint16_t qid,
1307                      uint16_t nb_rx_desc __rte_unused,
1308                      unsigned int socket_id __rte_unused,
1309                      const struct rte_eth_rxconf *rx_conf __rte_unused,
1310                      struct rte_mempool *mb_pool)
1311 {
1312         struct pmd_internals *pmd = dev->data->dev_private;
1313         struct memif_queue *mq;
1314
1315         mq = rte_zmalloc("rx-queue", sizeof(struct memif_queue), 0);
1316         if (mq == NULL) {
1317                 MIF_LOG(ERR, "Failed to allocate rx queue id: %u", qid);
1318                 return -ENOMEM;
1319         }
1320
1321         mq->type = (pmd->role == MEMIF_ROLE_SLAVE) ? MEMIF_RING_M2S : MEMIF_RING_S2M;
1322         mq->n_pkts = 0;
1323         mq->n_bytes = 0;
1324         mq->intr_handle.fd = -1;
1325         mq->intr_handle.type = RTE_INTR_HANDLE_EXT;
1326         mq->mempool = mb_pool;
1327         mq->in_port = dev->data->port_id;
1328         dev->data->rx_queues[qid] = mq;
1329
1330         return 0;
1331 }
1332
/* Free a queue structure; a NULL queue is silently ignored. */
static void
memif_queue_release(void *queue)
{
	struct memif_queue *mq = (struct memif_queue *)queue;

	if (mq != NULL)
		rte_free(mq);
}
1343
1344 static int
1345 memif_link_update(struct rte_eth_dev *dev,
1346                   int wait_to_complete __rte_unused)
1347 {
1348         struct pmd_process_private *proc_private;
1349
1350         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1351                 proc_private = dev->process_private;
1352                 if (dev->data->dev_link.link_status == ETH_LINK_UP &&
1353                                 proc_private->regions_num == 0) {
1354                         memif_mp_request_regions(dev);
1355                 } else if (dev->data->dev_link.link_status == ETH_LINK_DOWN &&
1356                                 proc_private->regions_num > 0) {
1357                         memif_free_regions(dev);
1358                 }
1359         }
1360         return 0;
1361 }
1362
1363 static int
1364 memif_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1365 {
1366         struct pmd_internals *pmd = dev->data->dev_private;
1367         struct memif_queue *mq;
1368         int i;
1369         uint8_t tmp, nq;
1370
1371         stats->ipackets = 0;
1372         stats->ibytes = 0;
1373         stats->opackets = 0;
1374         stats->obytes = 0;
1375
1376         tmp = (pmd->role == MEMIF_ROLE_SLAVE) ? pmd->run.num_s2m_rings :
1377             pmd->run.num_m2s_rings;
1378         nq = (tmp < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? tmp :
1379             RTE_ETHDEV_QUEUE_STAT_CNTRS;
1380
1381         /* RX stats */
1382         for (i = 0; i < nq; i++) {
1383                 mq = dev->data->rx_queues[i];
1384                 stats->q_ipackets[i] = mq->n_pkts;
1385                 stats->q_ibytes[i] = mq->n_bytes;
1386                 stats->ipackets += mq->n_pkts;
1387                 stats->ibytes += mq->n_bytes;
1388         }
1389
1390         tmp = (pmd->role == MEMIF_ROLE_SLAVE) ? pmd->run.num_m2s_rings :
1391             pmd->run.num_s2m_rings;
1392         nq = (tmp < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? tmp :
1393             RTE_ETHDEV_QUEUE_STAT_CNTRS;
1394
1395         /* TX stats */
1396         for (i = 0; i < nq; i++) {
1397                 mq = dev->data->tx_queues[i];
1398                 stats->q_opackets[i] = mq->n_pkts;
1399                 stats->q_obytes[i] = mq->n_bytes;
1400                 stats->opackets += mq->n_pkts;
1401                 stats->obytes += mq->n_bytes;
1402         }
1403         return 0;
1404 }
1405
1406 static int
1407 memif_stats_reset(struct rte_eth_dev *dev)
1408 {
1409         struct pmd_internals *pmd = dev->data->dev_private;
1410         int i;
1411         struct memif_queue *mq;
1412
1413         for (i = 0; i < pmd->run.num_s2m_rings; i++) {
1414                 mq = (pmd->role == MEMIF_ROLE_SLAVE) ? dev->data->tx_queues[i] :
1415                     dev->data->rx_queues[i];
1416                 mq->n_pkts = 0;
1417                 mq->n_bytes = 0;
1418         }
1419         for (i = 0; i < pmd->run.num_m2s_rings; i++) {
1420                 mq = (pmd->role == MEMIF_ROLE_SLAVE) ? dev->data->rx_queues[i] :
1421                     dev->data->tx_queues[i];
1422                 mq->n_pkts = 0;
1423                 mq->n_bytes = 0;
1424         }
1425
1426         return 0;
1427 }
1428
1429 static int
1430 memif_rx_queue_intr_enable(struct rte_eth_dev *dev __rte_unused,
1431                            uint16_t qid __rte_unused)
1432 {
1433         MIF_LOG(WARNING, "Interrupt mode not supported.");
1434
1435         return -1;
1436 }
1437
1438 static int
1439 memif_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t qid __rte_unused)
1440 {
1441         struct pmd_internals *pmd __rte_unused = dev->data->dev_private;
1442
1443         return 0;
1444 }
1445
1446 static const struct eth_dev_ops ops = {
1447         .dev_start = memif_dev_start,
1448         .dev_close = memif_dev_close,
1449         .dev_infos_get = memif_dev_info,
1450         .dev_configure = memif_dev_configure,
1451         .tx_queue_setup = memif_tx_queue_setup,
1452         .rx_queue_setup = memif_rx_queue_setup,
1453         .rx_queue_release = memif_queue_release,
1454         .tx_queue_release = memif_queue_release,
1455         .rx_queue_intr_enable = memif_rx_queue_intr_enable,
1456         .rx_queue_intr_disable = memif_rx_queue_intr_disable,
1457         .link_update = memif_link_update,
1458         .stats_get = memif_stats_get,
1459         .stats_reset = memif_stats_reset,
1460 };
1461
1462 static int
1463 memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
1464              memif_interface_id_t id, uint32_t flags,
1465              const char *socket_filename,
1466              memif_log2_ring_size_t log2_ring_size,
1467              uint16_t pkt_buffer_size, const char *secret,
1468              struct rte_ether_addr *ether_addr)
1469 {
1470         int ret = 0;
1471         struct rte_eth_dev *eth_dev;
1472         struct rte_eth_dev_data *data;
1473         struct pmd_internals *pmd;
1474         struct pmd_process_private *process_private;
1475         const unsigned int numa_node = vdev->device.numa_node;
1476         const char *name = rte_vdev_device_name(vdev);
1477
1478         eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1479         if (eth_dev == NULL) {
1480                 MIF_LOG(ERR, "%s: Unable to allocate device struct.", name);
1481                 return -1;
1482         }
1483
1484         process_private = (struct pmd_process_private *)
1485                 rte_zmalloc(name, sizeof(struct pmd_process_private),
1486                             RTE_CACHE_LINE_SIZE);
1487
1488         if (process_private == NULL) {
1489                 MIF_LOG(ERR, "Failed to alloc memory for process private");
1490                 return -1;
1491         }
1492         eth_dev->process_private = process_private;
1493
1494         pmd = eth_dev->data->dev_private;
1495         memset(pmd, 0, sizeof(*pmd));
1496
1497         pmd->id = id;
1498         pmd->flags = flags;
1499         pmd->flags |= ETH_MEMIF_FLAG_DISABLED;
1500         pmd->role = role;
1501         /* Zero-copy flag irelevant to master. */
1502         if (pmd->role == MEMIF_ROLE_MASTER)
1503                 pmd->flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
1504
1505         ret = memif_socket_init(eth_dev, socket_filename);
1506         if (ret < 0)
1507                 return ret;
1508
1509         memset(pmd->secret, 0, sizeof(char) * ETH_MEMIF_SECRET_SIZE);
1510         if (secret != NULL)
1511                 strlcpy(pmd->secret, secret, sizeof(pmd->secret));
1512
1513         pmd->cfg.log2_ring_size = log2_ring_size;
1514         /* set in .dev_configure() */
1515         pmd->cfg.num_s2m_rings = 0;
1516         pmd->cfg.num_m2s_rings = 0;
1517
1518         pmd->cfg.pkt_buffer_size = pkt_buffer_size;
1519         rte_spinlock_init(&pmd->cc_lock);
1520
1521         data = eth_dev->data;
1522         data->dev_private = pmd;
1523         data->numa_node = numa_node;
1524         data->dev_link = pmd_link;
1525         data->mac_addrs = ether_addr;
1526         data->promiscuous = 1;
1527
1528         eth_dev->dev_ops = &ops;
1529         eth_dev->device = &vdev->device;
1530         if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1531                 eth_dev->rx_pkt_burst = eth_memif_rx_zc;
1532                 eth_dev->tx_pkt_burst = eth_memif_tx_zc;
1533         } else {
1534                 eth_dev->rx_pkt_burst = eth_memif_rx;
1535                 eth_dev->tx_pkt_burst = eth_memif_tx;
1536         }
1537
1538         rte_eth_dev_probing_finish(eth_dev);
1539
1540         return 0;
1541 }
1542
1543 static int
1544 memif_set_role(const char *key __rte_unused, const char *value,
1545                void *extra_args)
1546 {
1547         enum memif_role_t *role = (enum memif_role_t *)extra_args;
1548
1549         if (strstr(value, "master") != NULL) {
1550                 *role = MEMIF_ROLE_MASTER;
1551         } else if (strstr(value, "slave") != NULL) {
1552                 *role = MEMIF_ROLE_SLAVE;
1553         } else {
1554                 MIF_LOG(ERR, "Unknown role: %s.", value);
1555                 return -EINVAL;
1556         }
1557         return 0;
1558 }
1559
1560 static int
1561 memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
1562 {
1563         uint32_t *flags = (uint32_t *)extra_args;
1564
1565         if (strstr(value, "yes") != NULL) {
1566                 if (!rte_mcfg_get_single_file_segments()) {
1567                         MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
1568                         return -ENOTSUP;
1569                 }
1570                 *flags |= ETH_MEMIF_FLAG_ZERO_COPY;
1571         } else if (strstr(value, "no") != NULL) {
1572                 *flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
1573         } else {
1574                 MIF_LOG(ERR, "Failed to parse zero-copy param: %s.", value);
1575                 return -EINVAL;
1576         }
1577         return 0;
1578 }
1579
1580 static int
1581 memif_set_id(const char *key __rte_unused, const char *value, void *extra_args)
1582 {
1583         memif_interface_id_t *id = (memif_interface_id_t *)extra_args;
1584
1585         /* even if parsing fails, 0 is a valid id */
1586         *id = strtoul(value, NULL, 10);
1587         return 0;
1588 }
1589
1590 static int
1591 memif_set_bs(const char *key __rte_unused, const char *value, void *extra_args)
1592 {
1593         unsigned long tmp;
1594         uint16_t *pkt_buffer_size = (uint16_t *)extra_args;
1595
1596         tmp = strtoul(value, NULL, 10);
1597         if (tmp == 0 || tmp > 0xFFFF) {
1598                 MIF_LOG(ERR, "Invalid buffer size: %s.", value);
1599                 return -EINVAL;
1600         }
1601         *pkt_buffer_size = tmp;
1602         return 0;
1603 }
1604
1605 static int
1606 memif_set_rs(const char *key __rte_unused, const char *value, void *extra_args)
1607 {
1608         unsigned long tmp;
1609         memif_log2_ring_size_t *log2_ring_size =
1610             (memif_log2_ring_size_t *)extra_args;
1611
1612         tmp = strtoul(value, NULL, 10);
1613         if (tmp == 0 || tmp > ETH_MEMIF_MAX_LOG2_RING_SIZE) {
1614                 MIF_LOG(ERR, "Invalid ring size: %s (max %u).",
1615                         value, ETH_MEMIF_MAX_LOG2_RING_SIZE);
1616                 return -EINVAL;
1617         }
1618         *log2_ring_size = tmp;
1619         return 0;
1620 }
1621
1622 /* check if directory exists and if we have permission to read/write */
1623 static int
1624 memif_check_socket_filename(const char *filename)
1625 {
1626         char *dir = NULL, *tmp;
1627         uint32_t idx;
1628         int ret = 0;
1629
1630         if (strlen(filename) >= MEMIF_SOCKET_UN_SIZE) {
1631                 MIF_LOG(ERR, "Unix socket address too long (max 108).");
1632                 return -1;
1633         }
1634
1635         tmp = strrchr(filename, '/');
1636         if (tmp != NULL) {
1637                 idx = tmp - filename;
1638                 dir = rte_zmalloc("memif_tmp", sizeof(char) * (idx + 1), 0);
1639                 if (dir == NULL) {
1640                         MIF_LOG(ERR, "Failed to allocate memory.");
1641                         return -1;
1642                 }
1643                 strlcpy(dir, filename, sizeof(char) * (idx + 1));
1644         }
1645
1646         if (dir == NULL || (faccessat(-1, dir, F_OK | R_OK |
1647                                         W_OK, AT_EACCESS) < 0)) {
1648                 MIF_LOG(ERR, "Invalid socket directory.");
1649                 ret = -EINVAL;
1650         }
1651
1652         if (dir != NULL)
1653                 rte_free(dir);
1654
1655         return ret;
1656 }
1657
1658 static int
1659 memif_set_socket_filename(const char *key __rte_unused, const char *value,
1660                           void *extra_args)
1661 {
1662         const char **socket_filename = (const char **)extra_args;
1663
1664         *socket_filename = value;
1665         return memif_check_socket_filename(*socket_filename);
1666 }
1667
1668 static int
1669 memif_set_mac(const char *key __rte_unused, const char *value, void *extra_args)
1670 {
1671         struct rte_ether_addr *ether_addr = (struct rte_ether_addr *)extra_args;
1672
1673         if (rte_ether_unformat_addr(value, ether_addr) < 0)
1674                 MIF_LOG(WARNING, "Failed to parse mac '%s'.", value);
1675         return 0;
1676 }
1677
1678 static int
1679 memif_set_secret(const char *key __rte_unused, const char *value, void *extra_args)
1680 {
1681         const char **secret = (const char **)extra_args;
1682
1683         *secret = value;
1684         return 0;
1685 }
1686
1687 static int
1688 rte_pmd_memif_probe(struct rte_vdev_device *vdev)
1689 {
1690         RTE_BUILD_BUG_ON(sizeof(memif_msg_t) != 128);
1691         RTE_BUILD_BUG_ON(sizeof(memif_desc_t) != 16);
1692         int ret = 0;
1693         struct rte_kvargs *kvlist;
1694         const char *name = rte_vdev_device_name(vdev);
1695         enum memif_role_t role = MEMIF_ROLE_SLAVE;
1696         memif_interface_id_t id = 0;
1697         uint16_t pkt_buffer_size = ETH_MEMIF_DEFAULT_PKT_BUFFER_SIZE;
1698         memif_log2_ring_size_t log2_ring_size = ETH_MEMIF_DEFAULT_RING_SIZE;
1699         const char *socket_filename = ETH_MEMIF_DEFAULT_SOCKET_FILENAME;
1700         uint32_t flags = 0;
1701         const char *secret = NULL;
1702         struct rte_ether_addr *ether_addr = rte_zmalloc("",
1703                 sizeof(struct rte_ether_addr), 0);
1704         struct rte_eth_dev *eth_dev;
1705
1706         rte_eth_random_addr(ether_addr->addr_bytes);
1707
1708         MIF_LOG(INFO, "Initialize MEMIF: %s.", name);
1709
1710         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1711                 eth_dev = rte_eth_dev_attach_secondary(name);
1712                 if (!eth_dev) {
1713                         MIF_LOG(ERR, "Failed to probe %s", name);
1714                         return -1;
1715                 }
1716
1717                 eth_dev->dev_ops = &ops;
1718                 eth_dev->device = &vdev->device;
1719                 eth_dev->rx_pkt_burst = eth_memif_rx;
1720                 eth_dev->tx_pkt_burst = eth_memif_tx;
1721
1722                 if (!rte_eal_primary_proc_alive(NULL)) {
1723                         MIF_LOG(ERR, "Primary process is missing");
1724                         return -1;
1725                 }
1726
1727                 eth_dev->process_private = (struct pmd_process_private *)
1728                         rte_zmalloc(name,
1729                                 sizeof(struct pmd_process_private),
1730                                 RTE_CACHE_LINE_SIZE);
1731                 if (eth_dev->process_private == NULL) {
1732                         MIF_LOG(ERR,
1733                                 "Failed to alloc memory for process private");
1734                         return -1;
1735                 }
1736
1737                 rte_eth_dev_probing_finish(eth_dev);
1738
1739                 return 0;
1740         }
1741
1742         ret = rte_mp_action_register(MEMIF_MP_SEND_REGION, memif_mp_send_region);
1743         /*
1744          * Primary process can continue probing, but secondary process won't
1745          * be able to get memory regions information
1746          */
1747         if (ret < 0 && rte_errno != EEXIST)
1748                 MIF_LOG(WARNING, "Failed to register mp action callback: %s",
1749                         strerror(rte_errno));
1750
1751         kvlist = rte_kvargs_parse(rte_vdev_device_args(vdev), valid_arguments);
1752
1753         /* parse parameters */
1754         if (kvlist != NULL) {
1755                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_ROLE_ARG,
1756                                          &memif_set_role, &role);
1757                 if (ret < 0)
1758                         goto exit;
1759                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_ID_ARG,
1760                                          &memif_set_id, &id);
1761                 if (ret < 0)
1762                         goto exit;
1763                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_PKT_BUFFER_SIZE_ARG,
1764                                          &memif_set_bs, &pkt_buffer_size);
1765                 if (ret < 0)
1766                         goto exit;
1767                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_RING_SIZE_ARG,
1768                                          &memif_set_rs, &log2_ring_size);
1769                 if (ret < 0)
1770                         goto exit;
1771                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_SOCKET_ARG,
1772                                          &memif_set_socket_filename,
1773                                          (void *)(&socket_filename));
1774                 if (ret < 0)
1775                         goto exit;
1776                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_MAC_ARG,
1777                                          &memif_set_mac, ether_addr);
1778                 if (ret < 0)
1779                         goto exit;
1780                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_ZC_ARG,
1781                                          &memif_set_zc, &flags);
1782                 if (ret < 0)
1783                         goto exit;
1784                 ret = rte_kvargs_process(kvlist, ETH_MEMIF_SECRET_ARG,
1785                                          &memif_set_secret, (void *)(&secret));
1786                 if (ret < 0)
1787                         goto exit;
1788         }
1789
1790         /* create interface */
1791         ret = memif_create(vdev, role, id, flags, socket_filename,
1792                            log2_ring_size, pkt_buffer_size, secret, ether_addr);
1793
1794 exit:
1795         if (kvlist != NULL)
1796                 rte_kvargs_free(kvlist);
1797         return ret;
1798 }
1799
1800 static int
1801 rte_pmd_memif_remove(struct rte_vdev_device *vdev)
1802 {
1803         struct rte_eth_dev *eth_dev;
1804
1805         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(vdev));
1806         if (eth_dev == NULL)
1807                 return 0;
1808
1809         rte_eth_dev_close(eth_dev->data->port_id);
1810
1811         return 0;
1812 }
1813
1814 static struct rte_vdev_driver pmd_memif_drv = {
1815         .probe = rte_pmd_memif_probe,
1816         .remove = rte_pmd_memif_remove,
1817 };
1818
1819 RTE_PMD_REGISTER_VDEV(net_memif, pmd_memif_drv);
1820
1821 RTE_PMD_REGISTER_PARAM_STRING(net_memif,
1822                               ETH_MEMIF_ID_ARG "=<int>"
1823                               ETH_MEMIF_ROLE_ARG "=master|slave"
1824                               ETH_MEMIF_PKT_BUFFER_SIZE_ARG "=<int>"
1825                               ETH_MEMIF_RING_SIZE_ARG "=<int>"
1826                               ETH_MEMIF_SOCKET_ARG "=<string>"
1827                               ETH_MEMIF_MAC_ARG "=xx:xx:xx:xx:xx:xx"
1828                               ETH_MEMIF_ZC_ARG "=yes|no"
1829                               ETH_MEMIF_SECRET_ARG "=<string>");
1830
/*
 * Register the driver's log type: symbol memif_logtype, name pmd.net.memif,
 * with NOTICE passed as the level argument.
 */
RTE_LOG_REGISTER(memif_logtype, pmd.net.memif, NOTICE);