0562e58696859b6dd504eeaa1f18c70c47e76e5e
[dpdk.git] / drivers / net / af_xdp / rte_eth_af_xdp.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2019-2020 Intel Corporation.
3  */
4 #include <unistd.h>
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <poll.h>
9 #include <netinet/in.h>
10 #include <net/if.h>
11 #include <sys/socket.h>
12 #include <sys/ioctl.h>
13 #include <linux/if_ether.h>
14 #include <linux/if_xdp.h>
15 #include <linux/if_link.h>
16 #include <linux/ethtool.h>
17 #include <linux/sockios.h>
18 #include "af_xdp_deps.h"
19 #include <bpf/xsk.h>
20
21 #include <rte_ethdev.h>
22 #include <rte_ethdev_driver.h>
23 #include <rte_ethdev_vdev.h>
24 #include <rte_kvargs.h>
25 #include <rte_bus_vdev.h>
26 #include <rte_string_fns.h>
27 #include <rte_branch_prediction.h>
28 #include <rte_common.h>
29 #include <rte_dev.h>
30 #include <rte_eal.h>
31 #include <rte_ether.h>
32 #include <rte_lcore.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memzone.h>
36 #include <rte_mempool.h>
37 #include <rte_mbuf.h>
38 #include <rte_malloc.h>
39 #include <rte_ring.h>
40 #include <rte_spinlock.h>
41
42 #include "compat.h"
43
44
45 #ifndef SOL_XDP
46 #define SOL_XDP 283
47 #endif
48
49 #ifndef AF_XDP
50 #define AF_XDP 44
51 #endif
52
53 #ifndef PF_XDP
54 #define PF_XDP AF_XDP
55 #endif
56
/* Register the driver's log type under the name pmd.net.af_xdp (NOTICE is
 * the level argument handed to the registration macro).
 */
RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);

/*
 * Driver logging helper: forwards to rte_log() on af_xdp_logtype and
 * prefixes every message with the name of the calling function.
 * 'level' is the suffix of an RTE_LOG_* constant (e.g. ERR, INFO, DEBUG);
 * callers provide their own trailing newline in 'fmt'.
 */
#define AF_XDP_LOG(level, fmt, args...)                 \
        rte_log(RTE_LOG_ ## level, af_xdp_logtype,      \
                "%s(): " fmt, __func__, ##args)
62
/* Frame size used to derive the maximum MTU when the copy-mode path is built. */
#define ETH_AF_XDP_FRAME_SIZE           2048
/* NOTE(review): presumably the number of frames in a copy-mode UMEM; not
 * referenced in the visible part of the file - confirm at the point of use.
 */
#define ETH_AF_XDP_NUM_BUFFERS          4096
/* Default ring size reported for RX and TX queues. */
#define ETH_AF_XDP_DFLT_NUM_DESCS       XSK_RING_CONS__DEFAULT_NUM_DESCS
/* NOTE(review): presumably the defaults for the start_queue / queue_count
 * device arguments; confirm in the argument parsing code.
 */
#define ETH_AF_XDP_DFLT_START_QUEUE_IDX 0
#define ETH_AF_XDP_DFLT_QUEUE_COUNT     1

/* Upper bound on packets handled per RX burst and, in copy mode, per TX
 * burst; also sizes the on-stack arrays used by those paths.
 */
#define ETH_AF_XDP_RX_BATCH_SIZE        32
#define ETH_AF_XDP_TX_BATCH_SIZE        32
71
72
/* Bookkeeping for one UMEM, which may be shared by several queues. */
struct xsk_umem_info {
        struct xsk_umem *umem;          /* libbpf UMEM handle */
        struct rte_ring *buf_ring;      /* copy mode: ring of free frame addresses */
        const struct rte_memzone *mz;   /* copy mode: memzone whose addr is the UMEM base */
        struct rte_mempool *mb_pool;    /* zero-copy mode: mempool overlaid by the UMEM */
        void *buffer;                   /* zero-copy mode: base used for address arithmetic */
        uint8_t refcnt;                 /* users of this UMEM; accessed with __atomic builtins */
        uint32_t max_xsks;              /* refcnt must stay below this for a new sharer */
};
82
/* Software RX counters kept per queue and reported by eth_stats_get(). */
struct rx_stats {
        uint64_t rx_pkts;       /* packets returned by the RX burst functions */
        uint64_t rx_bytes;      /* sum of the descriptor lengths of those packets */
        uint64_t rx_dropped;    /* added to imissed; not incremented in the visible code */
};
88
/* Per-queue RX state. Also holds the fill and completion rings of the queue. */
struct pkt_rx_queue {
        struct xsk_ring_cons rx;        /* RX descriptor ring */
        struct xsk_umem_info *umem;     /* UMEM backing this queue */
        struct xsk_socket *xsk;         /* AF_XDP socket of this queue */
        struct rte_mempool *mb_pool;    /* mempool the queue allocates mbufs from */

        struct rx_stats stats;

        struct xsk_ring_prod fq;        /* fill ring: frames handed to the kernel for RX */
        struct xsk_ring_cons cq;        /* completion ring; drained by the paired TX queue */

        struct pkt_tx_queue *pair;      /* TX queue paired with this RX queue */
        struct pollfd fds[1];           /* polled when the fill ring asks for a wakeup */
        int xsk_queue_idx;              /* queue id compared when checking for an existing ctx */
};
104
/* Software TX counters kept per queue and reported by eth_stats_get(). */
struct tx_stats {
        uint64_t tx_pkts;       /* packets placed on the TX ring */
        uint64_t tx_bytes;      /* sum of pkt_len of those packets */
        uint64_t tx_dropped;    /* reported as oerrors */
};
110
/* Per-queue TX state; the completion ring lives in the paired RX queue. */
struct pkt_tx_queue {
        struct xsk_ring_prod tx;        /* TX descriptor ring */
        struct xsk_umem_info *umem;     /* UMEM backing this queue */

        struct tx_stats stats;

        struct pkt_rx_queue *pair;      /* RX queue that owns the socket and the cq */
        int xsk_queue_idx;              /* NOTE(review): presumably mirrors pair->xsk_queue_idx */
};
120
/* Private data of one AF_XDP ethdev (dev->data->dev_private). */
struct pmd_internals {
        int if_index;                   /* interface index; reported in dev_info */
        char if_name[IFNAMSIZ];         /* interface name */
        int start_queue_idx;            /* NOTE(review): presumably the first netdev queue used */
        int queue_cnt;                  /* number of queue pairs; advertised as max rx/tx queues */
        int max_queue_cnt;              /* NOTE(review): not used in the visible code */
        int combined_queue_cnt;         /* NOTE(review): not used in the visible code */
        bool shared_umem;               /* track this port so UMEMs can be shared */
        char prog_path[PATH_MAX];       /* NOTE(review): presumably the path of a custom XDP program */
        bool custom_prog_configured;    /* NOTE(review): presumably set once that program is loaded */

        struct rte_ether_addr eth_addr;

        struct pkt_rx_queue *rx_queues; /* array indexed by queue, queue_cnt entries */
        struct pkt_tx_queue *tx_queues; /* array indexed by queue, queue_cnt entries */
};
137
/* Names of the device arguments understood by this driver. */
#define ETH_AF_XDP_IFACE_ARG                    "iface"
#define ETH_AF_XDP_START_QUEUE_ARG              "start_queue"
#define ETH_AF_XDP_QUEUE_COUNT_ARG              "queue_count"
#define ETH_AF_XDP_SHARED_UMEM_ARG              "shared_umem"
#define ETH_AF_XDP_PROG_ARG                     "xdp_prog"

/* NULL-terminated list of the argument names above. */
static const char * const valid_arguments[] = {
        ETH_AF_XDP_IFACE_ARG,
        ETH_AF_XDP_START_QUEUE_ARG,
        ETH_AF_XDP_QUEUE_COUNT_ARG,
        ETH_AF_XDP_SHARED_UMEM_ARG,
        ETH_AF_XDP_PROG_ARG,
        NULL
};
152
/* Link template: 10G, full duplex, autoneg, initially down. */
static const struct rte_eth_link pmd_link = {
        .link_speed = ETH_SPEED_NUM_10G,
        .link_duplex = ETH_LINK_FULL_DUPLEX,
        .link_status = ETH_LINK_DOWN,
        .link_autoneg = ETH_LINK_AUTONEG
};
159
/* List which tracks PMDs to facilitate sharing UMEMs across them. */
struct internal_list {
        TAILQ_ENTRY(internal_list) next;        /* list linkage */
        struct rte_eth_dev *eth_dev;            /* the tracked port */
};

TAILQ_HEAD(internal_list_head, internal_list);
/* Global list of tracked ports. */
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

/* Taken around every traversal and modification of internal_list. */
static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
171
172 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
173 static inline int
174 reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
175                       struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
176 {
177         uint32_t idx;
178         uint16_t i;
179
180         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
181                 for (i = 0; i < reserve_size; i++)
182                         rte_pktmbuf_free(bufs[i]);
183                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
184                 return -1;
185         }
186
187         for (i = 0; i < reserve_size; i++) {
188                 __u64 *fq_addr;
189                 uint64_t addr;
190
191                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
192                 addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
193                                 umem->mb_pool->header_size;
194                 *fq_addr = addr;
195         }
196
197         xsk_ring_prod__submit(fq, reserve_size);
198
199         return 0;
200 }
201 #else
202 static inline int
203 reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
204                       struct rte_mbuf **bufs __rte_unused,
205                       struct xsk_ring_prod *fq)
206 {
207         void *addrs[reserve_size];
208         uint32_t idx;
209         uint16_t i;
210
211         if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
212                     != reserve_size) {
213                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
214                 return -1;
215         }
216
217         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
218                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
219                 rte_ring_enqueue_bulk(umem->buf_ring, addrs,
220                                 reserve_size, NULL);
221                 return -1;
222         }
223
224         for (i = 0; i < reserve_size; i++) {
225                 __u64 *fq_addr;
226
227                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
228                 *fq_addr = (uint64_t)addrs[i];
229         }
230
231         xsk_ring_prod__submit(fq, reserve_size);
232
233         return 0;
234 }
235 #endif
236
/*
 * Refill the fill ring using whichever implementation was compiled in:
 * zero-copy when XDP_UMEM_UNALIGNED_CHUNK_FLAG is defined, copy otherwise.
 * Returns the result of that implementation.
 */
static inline int
reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
                   struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
        int rc;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
        rc = reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
#else
        rc = reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
#endif

        return rc;
}
247
248 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
249 static uint16_t
250 af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
251 {
252         struct pkt_rx_queue *rxq = queue;
253         struct xsk_ring_cons *rx = &rxq->rx;
254         struct xsk_ring_prod *fq = &rxq->fq;
255         struct xsk_umem_info *umem = rxq->umem;
256         uint32_t idx_rx = 0;
257         unsigned long rx_bytes = 0;
258         int rcvd, i;
259         struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
260
261         /* allocate bufs for fill queue replenishment after rx */
262         if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
263                 AF_XDP_LOG(DEBUG,
264                         "Failed to get enough buffers for fq.\n");
265                 return 0;
266         }
267
268         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
269
270         if (rcvd == 0) {
271 #if defined(XDP_USE_NEED_WAKEUP)
272                 if (xsk_ring_prod__needs_wakeup(fq))
273                         (void)poll(rxq->fds, 1, 1000);
274 #endif
275
276                 goto out;
277         }
278
279         for (i = 0; i < rcvd; i++) {
280                 const struct xdp_desc *desc;
281                 uint64_t addr;
282                 uint32_t len;
283                 uint64_t offset;
284
285                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
286                 addr = desc->addr;
287                 len = desc->len;
288
289                 offset = xsk_umem__extract_offset(addr);
290                 addr = xsk_umem__extract_addr(addr);
291
292                 bufs[i] = (struct rte_mbuf *)
293                                 xsk_umem__get_data(umem->buffer, addr +
294                                         umem->mb_pool->header_size);
295                 bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
296                         rte_pktmbuf_priv_size(umem->mb_pool) -
297                         umem->mb_pool->header_size;
298
299                 rte_pktmbuf_pkt_len(bufs[i]) = len;
300                 rte_pktmbuf_data_len(bufs[i]) = len;
301                 rx_bytes += len;
302         }
303
304         xsk_ring_cons__release(rx, rcvd);
305
306         (void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);
307
308         /* statistics */
309         rxq->stats.rx_pkts += rcvd;
310         rxq->stats.rx_bytes += rx_bytes;
311
312 out:
313         if (rcvd != nb_pkts)
314                 rte_mempool_put_bulk(umem->mb_pool, (void **)&fq_bufs[rcvd],
315                                      nb_pkts - rcvd);
316
317         return rcvd;
318 }
319 #else
320 static uint16_t
321 af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
322 {
323         struct pkt_rx_queue *rxq = queue;
324         struct xsk_ring_cons *rx = &rxq->rx;
325         struct xsk_umem_info *umem = rxq->umem;
326         struct xsk_ring_prod *fq = &rxq->fq;
327         uint32_t idx_rx = 0;
328         unsigned long rx_bytes = 0;
329         int rcvd, i;
330         uint32_t free_thresh = fq->size >> 1;
331         struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
332
333         if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
334                 (void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
335                                          NULL, fq);
336
337         if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
338                 return 0;
339
340         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
341         if (rcvd == 0) {
342 #if defined(XDP_USE_NEED_WAKEUP)
343                 if (xsk_ring_prod__needs_wakeup(fq))
344                         (void)poll(rxq->fds, 1, 1000);
345 #endif
346
347                 goto out;
348         }
349
350         for (i = 0; i < rcvd; i++) {
351                 const struct xdp_desc *desc;
352                 uint64_t addr;
353                 uint32_t len;
354                 void *pkt;
355
356                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
357                 addr = desc->addr;
358                 len = desc->len;
359                 pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);
360
361                 rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
362                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
363                 rte_pktmbuf_pkt_len(mbufs[i]) = len;
364                 rte_pktmbuf_data_len(mbufs[i]) = len;
365                 rx_bytes += len;
366                 bufs[i] = mbufs[i];
367         }
368
369         xsk_ring_cons__release(rx, rcvd);
370
371         /* statistics */
372         rxq->stats.rx_pkts += rcvd;
373         rxq->stats.rx_bytes += rx_bytes;
374
375 out:
376         if (rcvd != nb_pkts)
377                 rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
378                                      nb_pkts - rcvd);
379
380         return rcvd;
381 }
382 #endif
383
384 static uint16_t
385 eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
386 {
387         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
388
389 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
390         return af_xdp_rx_zc(queue, bufs, nb_pkts);
391 #else
392         return af_xdp_rx_cp(queue, bufs, nb_pkts);
393 #endif
394 }
395
396 static void
397 pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
398 {
399         size_t i, n;
400         uint32_t idx_cq = 0;
401
402         n = xsk_ring_cons__peek(cq, size, &idx_cq);
403
404         for (i = 0; i < n; i++) {
405                 uint64_t addr;
406                 addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
407 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
408                 addr = xsk_umem__extract_addr(addr);
409                 rte_pktmbuf_free((struct rte_mbuf *)
410                                         xsk_umem__get_data(umem->buffer,
411                                         addr + umem->mb_pool->header_size));
412 #else
413                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
414 #endif
415         }
416
417         xsk_ring_cons__release(cq, n);
418 }
419
420 static void
421 kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
422 {
423         struct xsk_umem_info *umem = txq->umem;
424
425         pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
426
427 #if defined(XDP_USE_NEED_WAKEUP)
428         if (xsk_ring_prod__needs_wakeup(&txq->tx))
429 #endif
430                 while (send(xsk_socket__fd(txq->pair->xsk), NULL,
431                             0, MSG_DONTWAIT) < 0) {
432                         /* some thing unexpected */
433                         if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
434                                 break;
435
436                         /* pull from completion queue to leave more space */
437                         if (errno == EAGAIN)
438                                 pull_umem_cq(umem,
439                                              XSK_RING_CONS__DEFAULT_NUM_DESCS,
440                                              cq);
441                 }
442 }
443
444 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
445 static uint16_t
446 af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
447 {
448         struct pkt_tx_queue *txq = queue;
449         struct xsk_umem_info *umem = txq->umem;
450         struct rte_mbuf *mbuf;
451         unsigned long tx_bytes = 0;
452         int i;
453         uint32_t idx_tx;
454         uint16_t count = 0;
455         struct xdp_desc *desc;
456         uint64_t addr, offset;
457         struct xsk_ring_cons *cq = &txq->pair->cq;
458         uint32_t free_thresh = cq->size >> 1;
459
460         if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
461                 pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
462
463         for (i = 0; i < nb_pkts; i++) {
464                 mbuf = bufs[i];
465
466                 if (mbuf->pool == umem->mb_pool) {
467                         if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
468                                 kick_tx(txq, cq);
469                                 if (!xsk_ring_prod__reserve(&txq->tx, 1,
470                                                             &idx_tx))
471                                         goto out;
472                         }
473                         desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
474                         desc->len = mbuf->pkt_len;
475                         addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
476                                         umem->mb_pool->header_size;
477                         offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
478                                         (uint64_t)mbuf +
479                                         umem->mb_pool->header_size;
480                         offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
481                         desc->addr = addr | offset;
482                         count++;
483                 } else {
484                         struct rte_mbuf *local_mbuf =
485                                         rte_pktmbuf_alloc(umem->mb_pool);
486                         void *pkt;
487
488                         if (local_mbuf == NULL)
489                                 goto out;
490
491                         if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
492                                 rte_pktmbuf_free(local_mbuf);
493                                 kick_tx(txq, cq);
494                                 goto out;
495                         }
496
497                         desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
498                         desc->len = mbuf->pkt_len;
499
500                         addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
501                                         umem->mb_pool->header_size;
502                         offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
503                                         (uint64_t)local_mbuf +
504                                         umem->mb_pool->header_size;
505                         pkt = xsk_umem__get_data(umem->buffer, addr + offset);
506                         offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
507                         desc->addr = addr | offset;
508                         rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
509                                         desc->len);
510                         rte_pktmbuf_free(mbuf);
511                         count++;
512                 }
513
514                 tx_bytes += mbuf->pkt_len;
515         }
516
517         kick_tx(txq, cq);
518
519 out:
520         xsk_ring_prod__submit(&txq->tx, count);
521
522         txq->stats.tx_pkts += count;
523         txq->stats.tx_bytes += tx_bytes;
524         txq->stats.tx_dropped += nb_pkts - count;
525
526         return count;
527 }
528 #else
529 static uint16_t
530 af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
531 {
532         struct pkt_tx_queue *txq = queue;
533         struct xsk_umem_info *umem = txq->umem;
534         struct rte_mbuf *mbuf;
535         void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
536         unsigned long tx_bytes = 0;
537         int i;
538         uint32_t idx_tx;
539         struct xsk_ring_cons *cq = &txq->pair->cq;
540
541         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
542
543         pull_umem_cq(umem, nb_pkts, cq);
544
545         nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
546                                         nb_pkts, NULL);
547         if (nb_pkts == 0)
548                 return 0;
549
550         if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
551                 kick_tx(txq, cq);
552                 rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
553                 return 0;
554         }
555
556         for (i = 0; i < nb_pkts; i++) {
557                 struct xdp_desc *desc;
558                 void *pkt;
559
560                 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
561                 mbuf = bufs[i];
562                 desc->len = mbuf->pkt_len;
563
564                 desc->addr = (uint64_t)addrs[i];
565                 pkt = xsk_umem__get_data(umem->mz->addr,
566                                          desc->addr);
567                 rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
568                 tx_bytes += mbuf->pkt_len;
569                 rte_pktmbuf_free(mbuf);
570         }
571
572         xsk_ring_prod__submit(&txq->tx, nb_pkts);
573
574         kick_tx(txq, cq);
575
576         txq->stats.tx_pkts += nb_pkts;
577         txq->stats.tx_bytes += tx_bytes;
578
579         return nb_pkts;
580 }
581 #endif
582
/*
 * TX burst entry point: forwards to the zero-copy or copy implementation
 * chosen at build time and returns its result.
 */
static uint16_t
eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        uint16_t sent;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
        sent = af_xdp_tx_zc(queue, bufs, nb_pkts);
#else
        sent = af_xdp_tx_cp(queue, bufs, nb_pkts);
#endif

        return sent;
}
592
593 static int
594 eth_dev_start(struct rte_eth_dev *dev)
595 {
596         dev->data->dev_link.link_status = ETH_LINK_UP;
597
598         return 0;
599 }
600
601 /* This function gets called when the current port gets stopped. */
602 static void
603 eth_dev_stop(struct rte_eth_dev *dev)
604 {
605         dev->data->dev_link.link_status = ETH_LINK_DOWN;
606 }
607
608 /* Find ethdev in list */
609 static inline struct internal_list *
610 find_internal_resource(struct pmd_internals *port_int)
611 {
612         int found = 0;
613         struct internal_list *list = NULL;
614
615         if (port_int == NULL)
616                 return NULL;
617
618         pthread_mutex_lock(&internal_list_lock);
619
620         TAILQ_FOREACH(list, &internal_list, next) {
621                 struct pmd_internals *list_int =
622                                 list->eth_dev->data->dev_private;
623                 if (list_int == port_int) {
624                         found = 1;
625                         break;
626                 }
627         }
628
629         pthread_mutex_unlock(&internal_list_lock);
630
631         if (!found)
632                 return NULL;
633
634         return list;
635 }
636
637 /* Check if the netdev,qid context already exists */
638 static inline bool
639 ctx_exists(struct pkt_rx_queue *rxq, const char *ifname,
640                 struct pkt_rx_queue *list_rxq, const char *list_ifname)
641 {
642         bool exists = false;
643
644         if (rxq->xsk_queue_idx == list_rxq->xsk_queue_idx &&
645                         !strncmp(ifname, list_ifname, IFNAMSIZ)) {
646                 AF_XDP_LOG(ERR, "ctx %s,%i already exists, cannot share umem\n",
647                                         ifname, rxq->xsk_queue_idx);
648                 exists = true;
649         }
650
651         return exists;
652 }
653
654 /* Get a pointer to an existing UMEM which overlays the rxq's mb_pool */
655 static inline int
656 get_shared_umem(struct pkt_rx_queue *rxq, const char *ifname,
657                         struct xsk_umem_info **umem)
658 {
659         struct internal_list *list;
660         struct pmd_internals *internals;
661         int i = 0, ret = 0;
662         struct rte_mempool *mb_pool = rxq->mb_pool;
663
664         if (mb_pool == NULL)
665                 return ret;
666
667         pthread_mutex_lock(&internal_list_lock);
668
669         TAILQ_FOREACH(list, &internal_list, next) {
670                 internals = list->eth_dev->data->dev_private;
671                 for (i = 0; i < internals->queue_cnt; i++) {
672                         struct pkt_rx_queue *list_rxq =
673                                                 &internals->rx_queues[i];
674                         if (rxq == list_rxq)
675                                 continue;
676                         if (mb_pool == internals->rx_queues[i].mb_pool) {
677                                 if (ctx_exists(rxq, ifname, list_rxq,
678                                                 internals->if_name)) {
679                                         ret = -1;
680                                         goto out;
681                                 }
682                                 if (__atomic_load_n(
683                                         &internals->rx_queues[i].umem->refcnt,
684                                                         __ATOMIC_ACQUIRE)) {
685                                         *umem = internals->rx_queues[i].umem;
686                                         goto out;
687                                 }
688                         }
689                 }
690         }
691
692 out:
693         pthread_mutex_unlock(&internal_list_lock);
694
695         return ret;
696 }
697
698 static int
699 eth_dev_configure(struct rte_eth_dev *dev)
700 {
701         struct pmd_internals *internal = dev->data->dev_private;
702
703         /* rx/tx must be paired */
704         if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
705                 return -EINVAL;
706
707         if (internal->shared_umem) {
708                 struct internal_list *list = NULL;
709                 const char *name = dev->device->name;
710
711                 /* Ensure PMD is not already inserted into the list */
712                 list = find_internal_resource(internal);
713                 if (list)
714                         return 0;
715
716                 list = rte_zmalloc_socket(name, sizeof(*list), 0,
717                                         dev->device->numa_node);
718                 if (list == NULL)
719                         return -1;
720
721                 list->eth_dev = dev;
722                 pthread_mutex_lock(&internal_list_lock);
723                 TAILQ_INSERT_TAIL(&internal_list, list, next);
724                 pthread_mutex_unlock(&internal_list_lock);
725         }
726
727         return 0;
728 }
729
730 static int
731 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
732 {
733         struct pmd_internals *internals = dev->data->dev_private;
734
735         dev_info->if_index = internals->if_index;
736         dev_info->max_mac_addrs = 1;
737         dev_info->max_rx_pktlen = ETH_FRAME_LEN;
738         dev_info->max_rx_queues = internals->queue_cnt;
739         dev_info->max_tx_queues = internals->queue_cnt;
740
741         dev_info->min_mtu = RTE_ETHER_MIN_MTU;
742 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
743         dev_info->max_mtu = getpagesize() -
744                                 sizeof(struct rte_mempool_objhdr) -
745                                 sizeof(struct rte_mbuf) -
746                                 RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
747 #else
748         dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
749 #endif
750
751         dev_info->default_rxportconf.nb_queues = 1;
752         dev_info->default_txportconf.nb_queues = 1;
753         dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
754         dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
755
756         return 0;
757 }
758
759 static int
760 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
761 {
762         struct pmd_internals *internals = dev->data->dev_private;
763         struct xdp_statistics xdp_stats;
764         struct pkt_rx_queue *rxq;
765         struct pkt_tx_queue *txq;
766         socklen_t optlen;
767         int i, ret;
768
769         for (i = 0; i < dev->data->nb_rx_queues; i++) {
770                 optlen = sizeof(struct xdp_statistics);
771                 rxq = &internals->rx_queues[i];
772                 txq = rxq->pair;
773                 stats->q_ipackets[i] = rxq->stats.rx_pkts;
774                 stats->q_ibytes[i] = rxq->stats.rx_bytes;
775
776                 stats->q_opackets[i] = txq->stats.tx_pkts;
777                 stats->q_obytes[i] = txq->stats.tx_bytes;
778
779                 stats->ipackets += stats->q_ipackets[i];
780                 stats->ibytes += stats->q_ibytes[i];
781                 stats->imissed += rxq->stats.rx_dropped;
782                 stats->oerrors += txq->stats.tx_dropped;
783                 ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
784                                 XDP_STATISTICS, &xdp_stats, &optlen);
785                 if (ret != 0) {
786                         AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
787                         return -1;
788                 }
789                 stats->imissed += xdp_stats.rx_dropped;
790
791                 stats->opackets += stats->q_opackets[i];
792                 stats->obytes += stats->q_obytes[i];
793         }
794
795         return 0;
796 }
797
798 static int
799 eth_stats_reset(struct rte_eth_dev *dev)
800 {
801         struct pmd_internals *internals = dev->data->dev_private;
802         int i;
803
804         for (i = 0; i < internals->queue_cnt; i++) {
805                 memset(&internals->rx_queues[i].stats, 0,
806                                         sizeof(struct rx_stats));
807                 memset(&internals->tx_queues[i].stats, 0,
808                                         sizeof(struct tx_stats));
809         }
810
811         return 0;
812 }
813
814 static void
815 remove_xdp_program(struct pmd_internals *internals)
816 {
817         uint32_t curr_prog_id = 0;
818
819         if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
820                                 XDP_FLAGS_UPDATE_IF_NOEXIST)) {
821                 AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
822                 return;
823         }
824         bpf_set_link_xdp_fd(internals->if_index, -1,
825                         XDP_FLAGS_UPDATE_IF_NOEXIST);
826 }
827
828 static void
829 xdp_umem_destroy(struct xsk_umem_info *umem)
830 {
831 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
832         umem->mb_pool = NULL;
833 #else
834         rte_memzone_free(umem->mz);
835         umem->mz = NULL;
836
837         rte_ring_free(umem->buf_ring);
838         umem->buf_ring = NULL;
839 #endif
840
841         rte_free(umem);
842         umem = NULL;
843 }
844
845 static int
846 eth_dev_close(struct rte_eth_dev *dev)
847 {
848         struct pmd_internals *internals = dev->data->dev_private;
849         struct pkt_rx_queue *rxq;
850         int i;
851
852         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
853                 return 0;
854
855         AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
856                 rte_socket_id());
857
858         for (i = 0; i < internals->queue_cnt; i++) {
859                 rxq = &internals->rx_queues[i];
860                 if (rxq->umem == NULL)
861                         break;
862                 xsk_socket__delete(rxq->xsk);
863
864                 if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
865                                 == 0) {
866                         (void)xsk_umem__delete(rxq->umem->umem);
867                         xdp_umem_destroy(rxq->umem);
868                 }
869
870                 /* free pkt_tx_queue */
871                 rte_free(rxq->pair);
872                 rte_free(rxq);
873         }
874
875         /*
876          * MAC is not allocated dynamically, setting it to NULL would prevent
877          * from releasing it in rte_eth_dev_release_port.
878          */
879         dev->data->mac_addrs = NULL;
880
881         remove_xdp_program(internals);
882
883         if (internals->shared_umem) {
884                 struct internal_list *list;
885
886                 /* Remove ethdev from list used to track and share UMEMs */
887                 list = find_internal_resource(internals);
888                 if (list) {
889                         pthread_mutex_lock(&internal_list_lock);
890                         TAILQ_REMOVE(&internal_list, list, next);
891                         pthread_mutex_unlock(&internal_list_lock);
892                         rte_free(list);
893                 }
894         }
895
896         return 0;
897 }
898
899 static void
900 eth_queue_release(void *q __rte_unused)
901 {
902 }
903
904 static int
905 eth_link_update(struct rte_eth_dev *dev __rte_unused,
906                 int wait_to_complete __rte_unused)
907 {
908         return 0;
909 }
910
911 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
912 static inline uint64_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
913 {
914         struct rte_mempool_memhdr *memhdr;
915         uint64_t memhdr_addr, aligned_addr;
916
917         memhdr = STAILQ_FIRST(&mp->mem_list);
918         memhdr_addr = (uint64_t)memhdr->addr;
919         aligned_addr = memhdr_addr & ~(getpagesize() - 1);
920         *align = memhdr_addr - aligned_addr;
921
922         return aligned_addr;
923 }
924
925 static struct
926 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
927                                   struct pkt_rx_queue *rxq)
928 {
929         struct xsk_umem_info *umem = NULL;
930         int ret;
931         struct xsk_umem_config usr_config = {
932                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
933                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
934                 .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
935         void *base_addr = NULL;
936         struct rte_mempool *mb_pool = rxq->mb_pool;
937         uint64_t umem_size, align = 0;
938
939         if (internals->shared_umem) {
940                 if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
941                         return NULL;
942
943                 if (umem != NULL &&
944                         __atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
945                                         umem->max_xsks) {
946                         AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
947                                         internals->if_name, rxq->xsk_queue_idx);
948                         __atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
949                 }
950         }
951
952         if (umem == NULL) {
953                 usr_config.frame_size =
954                         rte_mempool_calc_obj_size(mb_pool->elt_size,
955                                                   mb_pool->flags, NULL);
956                 usr_config.frame_headroom = mb_pool->header_size +
957                                                 sizeof(struct rte_mbuf) +
958                                                 rte_pktmbuf_priv_size(mb_pool) +
959                                                 RTE_PKTMBUF_HEADROOM;
960
961                 umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
962                                           rte_socket_id());
963                 if (umem == NULL) {
964                         AF_XDP_LOG(ERR, "Failed to allocate umem info");
965                         return NULL;
966                 }
967
968                 umem->mb_pool = mb_pool;
969                 base_addr = (void *)get_base_addr(mb_pool, &align);
970                 umem_size = mb_pool->populated_size * usr_config.frame_size +
971                                 align;
972
973                 ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
974                                 &rxq->fq, &rxq->cq, &usr_config);
975                 if (ret) {
976                         AF_XDP_LOG(ERR, "Failed to create umem");
977                         goto err;
978                 }
979                 umem->buffer = base_addr;
980
981                 if (internals->shared_umem) {
982                         umem->max_xsks = mb_pool->populated_size /
983                                                 ETH_AF_XDP_NUM_BUFFERS;
984                         AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
985                                                 mb_pool->name, umem->max_xsks);
986                 }
987
988                 __atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
989         }
990
991 #else
992 static struct
993 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
994                                   struct pkt_rx_queue *rxq)
995 {
996         struct xsk_umem_info *umem;
997         const struct rte_memzone *mz;
998         struct xsk_umem_config usr_config = {
999                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1000                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1001                 .frame_size = ETH_AF_XDP_FRAME_SIZE,
1002                 .frame_headroom = 0 };
1003         char ring_name[RTE_RING_NAMESIZE];
1004         char mz_name[RTE_MEMZONE_NAMESIZE];
1005         int ret;
1006         uint64_t i;
1007
1008         umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
1009         if (umem == NULL) {
1010                 AF_XDP_LOG(ERR, "Failed to allocate umem info");
1011                 return NULL;
1012         }
1013
1014         snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
1015                        internals->if_name, rxq->xsk_queue_idx);
1016         umem->buf_ring = rte_ring_create(ring_name,
1017                                          ETH_AF_XDP_NUM_BUFFERS,
1018                                          rte_socket_id(),
1019                                          0x0);
1020         if (umem->buf_ring == NULL) {
1021                 AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
1022                 goto err;
1023         }
1024
1025         for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
1026                 rte_ring_enqueue(umem->buf_ring,
1027                                  (void *)(i * ETH_AF_XDP_FRAME_SIZE));
1028
1029         snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
1030                        internals->if_name, rxq->xsk_queue_idx);
1031         mz = rte_memzone_reserve_aligned(mz_name,
1032                         ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1033                         rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
1034                         getpagesize());
1035         if (mz == NULL) {
1036                 AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
1037                 goto err;
1038         }
1039
1040         ret = xsk_umem__create(&umem->umem, mz->addr,
1041                                ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1042                                &rxq->fq, &rxq->cq,
1043                                &usr_config);
1044
1045         if (ret) {
1046                 AF_XDP_LOG(ERR, "Failed to create umem");
1047                 goto err;
1048         }
1049         umem->mz = mz;
1050
1051 #endif
1052         return umem;
1053
1054 err:
1055         xdp_umem_destroy(umem);
1056         return NULL;
1057 }
1058
1059 static int
1060 load_custom_xdp_prog(const char *prog_path, int if_index)
1061 {
1062         int ret, prog_fd = -1;
1063         struct bpf_object *obj;
1064         struct bpf_map *map;
1065
1066         ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
1067         if (ret) {
1068                 AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
1069                 return ret;
1070         }
1071
1072         /*
1073          * The loaded program must provision for a map of xsks, such that some
1074          * traffic can be redirected to userspace. When the xsk is created,
1075          * libbpf inserts it into the map.
1076          */
1077         map = bpf_object__find_map_by_name(obj, "xsks_map");
1078         if (!map) {
1079                 AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
1080                 return -1;
1081         }
1082
1083         /* Link the program with the given network device */
1084         ret = bpf_set_link_xdp_fd(if_index, prog_fd,
1085                                         XDP_FLAGS_UPDATE_IF_NOEXIST);
1086         if (ret) {
1087                 AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
1088                                 prog_fd);
1089                 return -1;
1090         }
1091
1092         AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
1093                                 prog_path, prog_fd);
1094
1095         return 0;
1096 }
1097
1098 static int
1099 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
1100               int ring_size)
1101 {
1102         struct xsk_socket_config cfg;
1103         struct pkt_tx_queue *txq = rxq->pair;
1104         int ret = 0;
1105         int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
1106         struct rte_mbuf *fq_bufs[reserve_size];
1107
1108         rxq->umem = xdp_umem_configure(internals, rxq);
1109         if (rxq->umem == NULL)
1110                 return -ENOMEM;
1111         txq->umem = rxq->umem;
1112
1113         cfg.rx_size = ring_size;
1114         cfg.tx_size = ring_size;
1115         cfg.libbpf_flags = 0;
1116         cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
1117         cfg.bind_flags = 0;
1118
1119 #if defined(XDP_USE_NEED_WAKEUP)
1120         cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
1121 #endif
1122
1123         if (strnlen(internals->prog_path, PATH_MAX) &&
1124                                 !internals->custom_prog_configured) {
1125                 ret = load_custom_xdp_prog(internals->prog_path,
1126                                            internals->if_index);
1127                 if (ret) {
1128                         AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
1129                                         internals->prog_path);
1130                         goto err;
1131                 }
1132                 internals->custom_prog_configured = 1;
1133         }
1134
1135         if (internals->shared_umem)
1136                 ret = create_shared_socket(&rxq->xsk, internals->if_name,
1137                                 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1138                                 &txq->tx, &rxq->fq, &rxq->cq, &cfg);
1139         else
1140                 ret = xsk_socket__create(&rxq->xsk, internals->if_name,
1141                                 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1142                                 &txq->tx, &cfg);
1143
1144         if (ret) {
1145                 AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
1146                 goto err;
1147         }
1148
1149 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1150         if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
1151                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
1152                 goto err;
1153         }
1154 #endif
1155         ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
1156         if (ret) {
1157                 xsk_socket__delete(rxq->xsk);
1158                 AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
1159                 goto err;
1160         }
1161
1162         return 0;
1163
1164 err:
1165         if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
1166                 xdp_umem_destroy(rxq->umem);
1167
1168         return ret;
1169 }
1170
1171 static int
1172 eth_rx_queue_setup(struct rte_eth_dev *dev,
1173                    uint16_t rx_queue_id,
1174                    uint16_t nb_rx_desc,
1175                    unsigned int socket_id __rte_unused,
1176                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1177                    struct rte_mempool *mb_pool)
1178 {
1179         struct pmd_internals *internals = dev->data->dev_private;
1180         struct pkt_rx_queue *rxq;
1181         int ret;
1182
1183         rxq = &internals->rx_queues[rx_queue_id];
1184
1185         AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
1186                    rx_queue_id, rxq->xsk_queue_idx);
1187
1188 #ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
1189         uint32_t buf_size, data_size;
1190
1191         /* Now get the space available for data in the mbuf */
1192         buf_size = rte_pktmbuf_data_room_size(mb_pool) -
1193                 RTE_PKTMBUF_HEADROOM;
1194         data_size = ETH_AF_XDP_FRAME_SIZE;
1195
1196         if (data_size > buf_size) {
1197                 AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
1198                         dev->device->name, data_size, buf_size);
1199                 ret = -ENOMEM;
1200                 goto err;
1201         }
1202 #endif
1203
1204         rxq->mb_pool = mb_pool;
1205
1206         if (xsk_configure(internals, rxq, nb_rx_desc)) {
1207                 AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
1208                 ret = -EINVAL;
1209                 goto err;
1210         }
1211
1212         rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
1213         rxq->fds[0].events = POLLIN;
1214
1215         dev->data->rx_queues[rx_queue_id] = rxq;
1216         return 0;
1217
1218 err:
1219         return ret;
1220 }
1221
1222 static int
1223 eth_tx_queue_setup(struct rte_eth_dev *dev,
1224                    uint16_t tx_queue_id,
1225                    uint16_t nb_tx_desc __rte_unused,
1226                    unsigned int socket_id __rte_unused,
1227                    const struct rte_eth_txconf *tx_conf __rte_unused)
1228 {
1229         struct pmd_internals *internals = dev->data->dev_private;
1230         struct pkt_tx_queue *txq;
1231
1232         txq = &internals->tx_queues[tx_queue_id];
1233
1234         dev->data->tx_queues[tx_queue_id] = txq;
1235         return 0;
1236 }
1237
1238 static int
1239 eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1240 {
1241         struct pmd_internals *internals = dev->data->dev_private;
1242         struct ifreq ifr = { .ifr_mtu = mtu };
1243         int ret;
1244         int s;
1245
1246         s = socket(PF_INET, SOCK_DGRAM, 0);
1247         if (s < 0)
1248                 return -EINVAL;
1249
1250         strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
1251         ret = ioctl(s, SIOCSIFMTU, &ifr);
1252         close(s);
1253
1254         return (ret < 0) ? -errno : 0;
1255 }
1256
1257 static int
1258 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
1259 {
1260         struct ifreq ifr;
1261         int ret = 0;
1262         int s;
1263
1264         s = socket(PF_INET, SOCK_DGRAM, 0);
1265         if (s < 0)
1266                 return -errno;
1267
1268         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1269         if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
1270                 ret = -errno;
1271                 goto out;
1272         }
1273         ifr.ifr_flags &= mask;
1274         ifr.ifr_flags |= flags;
1275         if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
1276                 ret = -errno;
1277                 goto out;
1278         }
1279 out:
1280         close(s);
1281         return ret;
1282 }
1283
1284 static int
1285 eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
1286 {
1287         struct pmd_internals *internals = dev->data->dev_private;
1288
1289         return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
1290 }
1291
1292 static int
1293 eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
1294 {
1295         struct pmd_internals *internals = dev->data->dev_private;
1296
1297         return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
1298 }
1299
1300 static const struct eth_dev_ops ops = {
1301         .dev_start = eth_dev_start,
1302         .dev_stop = eth_dev_stop,
1303         .dev_close = eth_dev_close,
1304         .dev_configure = eth_dev_configure,
1305         .dev_infos_get = eth_dev_info,
1306         .mtu_set = eth_dev_mtu_set,
1307         .promiscuous_enable = eth_dev_promiscuous_enable,
1308         .promiscuous_disable = eth_dev_promiscuous_disable,
1309         .rx_queue_setup = eth_rx_queue_setup,
1310         .tx_queue_setup = eth_tx_queue_setup,
1311         .rx_queue_release = eth_queue_release,
1312         .tx_queue_release = eth_queue_release,
1313         .link_update = eth_link_update,
1314         .stats_get = eth_stats_get,
1315         .stats_reset = eth_stats_reset,
1316 };
1317
1318 /** parse integer from integer argument */
1319 static int
1320 parse_integer_arg(const char *key __rte_unused,
1321                   const char *value, void *extra_args)
1322 {
1323         int *i = (int *)extra_args;
1324         char *end;
1325
1326         *i = strtol(value, &end, 10);
1327         if (*i < 0) {
1328                 AF_XDP_LOG(ERR, "Argument has to be positive.\n");
1329                 return -EINVAL;
1330         }
1331
1332         return 0;
1333 }
1334
1335 /** parse name argument */
1336 static int
1337 parse_name_arg(const char *key __rte_unused,
1338                const char *value, void *extra_args)
1339 {
1340         char *name = extra_args;
1341
1342         if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
1343                 AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
1344                            value, IFNAMSIZ);
1345                 return -EINVAL;
1346         }
1347
1348         strlcpy(name, value, IFNAMSIZ);
1349
1350         return 0;
1351 }
1352
1353 /** parse xdp prog argument */
1354 static int
1355 parse_prog_arg(const char *key __rte_unused,
1356                const char *value, void *extra_args)
1357 {
1358         char *path = extra_args;
1359
1360         if (strnlen(value, PATH_MAX) == PATH_MAX) {
1361                 AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
1362                            value, PATH_MAX);
1363                 return -EINVAL;
1364         }
1365
1366         if (access(value, F_OK) != 0) {
1367                 AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
1368                            value, strerror(errno));
1369                 return -EINVAL;
1370         }
1371
1372         strlcpy(path, value, PATH_MAX);
1373
1374         return 0;
1375 }
1376
1377 static int
1378 xdp_get_channels_info(const char *if_name, int *max_queues,
1379                                 int *combined_queues)
1380 {
1381         struct ethtool_channels channels;
1382         struct ifreq ifr;
1383         int fd, ret;
1384
1385         fd = socket(AF_INET, SOCK_DGRAM, 0);
1386         if (fd < 0)
1387                 return -1;
1388
1389         channels.cmd = ETHTOOL_GCHANNELS;
1390         ifr.ifr_data = (void *)&channels;
1391         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1392         ret = ioctl(fd, SIOCETHTOOL, &ifr);
1393         if (ret) {
1394                 if (errno == EOPNOTSUPP) {
1395                         ret = 0;
1396                 } else {
1397                         ret = -errno;
1398                         goto out;
1399                 }
1400         }
1401
1402         if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
1403                 /* If the device says it has no channels, then all traffic
1404                  * is sent to a single stream, so max queues = 1.
1405                  */
1406                 *max_queues = 1;
1407                 *combined_queues = 1;
1408         } else {
1409                 *max_queues = channels.max_combined;
1410                 *combined_queues = channels.combined_count;
1411         }
1412
1413  out:
1414         close(fd);
1415         return ret;
1416 }
1417
1418 static int
1419 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
1420                         int *queue_cnt, int *shared_umem, char *prog_path)
1421 {
1422         int ret;
1423
1424         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
1425                                  &parse_name_arg, if_name);
1426         if (ret < 0)
1427                 goto free_kvlist;
1428
1429         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
1430                                  &parse_integer_arg, start_queue);
1431         if (ret < 0)
1432                 goto free_kvlist;
1433
1434         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
1435                                  &parse_integer_arg, queue_cnt);
1436         if (ret < 0 || *queue_cnt <= 0) {
1437                 ret = -EINVAL;
1438                 goto free_kvlist;
1439         }
1440
1441         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
1442                                 &parse_integer_arg, shared_umem);
1443         if (ret < 0)
1444                 goto free_kvlist;
1445
1446         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
1447                                  &parse_prog_arg, prog_path);
1448         if (ret < 0)
1449                 goto free_kvlist;
1450
1451 free_kvlist:
1452         rte_kvargs_free(kvlist);
1453         return ret;
1454 }
1455
1456 static int
1457 get_iface_info(const char *if_name,
1458                struct rte_ether_addr *eth_addr,
1459                int *if_index)
1460 {
1461         struct ifreq ifr;
1462         int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
1463
1464         if (sock < 0)
1465                 return -1;
1466
1467         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1468         if (ioctl(sock, SIOCGIFINDEX, &ifr))
1469                 goto error;
1470
1471         *if_index = ifr.ifr_ifindex;
1472
1473         if (ioctl(sock, SIOCGIFHWADDR, &ifr))
1474                 goto error;
1475
1476         rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1477
1478         close(sock);
1479         return 0;
1480
1481 error:
1482         close(sock);
1483         return -1;
1484 }
1485
1486 static struct rte_eth_dev *
1487 init_internals(struct rte_vdev_device *dev, const char *if_name,
1488                 int start_queue_idx, int queue_cnt, int shared_umem,
1489                 const char *prog_path)
1490 {
1491         const char *name = rte_vdev_device_name(dev);
1492         const unsigned int numa_node = dev->device.numa_node;
1493         struct pmd_internals *internals;
1494         struct rte_eth_dev *eth_dev;
1495         int ret;
1496         int i;
1497
1498         internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
1499         if (internals == NULL)
1500                 return NULL;
1501
1502         internals->start_queue_idx = start_queue_idx;
1503         internals->queue_cnt = queue_cnt;
1504         strlcpy(internals->if_name, if_name, IFNAMSIZ);
1505         strlcpy(internals->prog_path, prog_path, PATH_MAX);
1506         internals->custom_prog_configured = 0;
1507
1508 #ifndef ETH_AF_XDP_SHARED_UMEM
1509         if (shared_umem) {
1510                 AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
1511                                 "Check kernel and libbpf version\n");
1512                 goto err_free_internals;
1513         }
1514 #endif
1515         internals->shared_umem = shared_umem;
1516
1517         if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
1518                                   &internals->combined_queue_cnt)) {
1519                 AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
1520                                 if_name);
1521                 goto err_free_internals;
1522         }
1523
1524         if (queue_cnt > internals->combined_queue_cnt) {
1525                 AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
1526                                 queue_cnt, internals->combined_queue_cnt);
1527                 goto err_free_internals;
1528         }
1529
1530         internals->rx_queues = rte_zmalloc_socket(NULL,
1531                                         sizeof(struct pkt_rx_queue) * queue_cnt,
1532                                         0, numa_node);
1533         if (internals->rx_queues == NULL) {
1534                 AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
1535                 goto err_free_internals;
1536         }
1537
1538         internals->tx_queues = rte_zmalloc_socket(NULL,
1539                                         sizeof(struct pkt_tx_queue) * queue_cnt,
1540                                         0, numa_node);
1541         if (internals->tx_queues == NULL) {
1542                 AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
1543                 goto err_free_rx;
1544         }
1545         for (i = 0; i < queue_cnt; i++) {
1546                 internals->tx_queues[i].pair = &internals->rx_queues[i];
1547                 internals->rx_queues[i].pair = &internals->tx_queues[i];
1548                 internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
1549                 internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
1550         }
1551
1552         ret = get_iface_info(if_name, &internals->eth_addr,
1553                              &internals->if_index);
1554         if (ret)
1555                 goto err_free_tx;
1556
1557         eth_dev = rte_eth_vdev_allocate(dev, 0);
1558         if (eth_dev == NULL)
1559                 goto err_free_tx;
1560
1561         eth_dev->data->dev_private = internals;
1562         eth_dev->data->dev_link = pmd_link;
1563         eth_dev->data->mac_addrs = &internals->eth_addr;
1564         eth_dev->dev_ops = &ops;
1565         eth_dev->rx_pkt_burst = eth_af_xdp_rx;
1566         eth_dev->tx_pkt_burst = eth_af_xdp_tx;
1567
1568 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1569         AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
1570 #endif
1571
1572         return eth_dev;
1573
1574 err_free_tx:
1575         rte_free(internals->tx_queues);
1576 err_free_rx:
1577         rte_free(internals->rx_queues);
1578 err_free_internals:
1579         rte_free(internals);
1580         return NULL;
1581 }
1582
1583 static int
1584 rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
1585 {
1586         struct rte_kvargs *kvlist;
1587         char if_name[IFNAMSIZ] = {'\0'};
1588         int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
1589         int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
1590         int shared_umem = 0;
1591         char prog_path[PATH_MAX] = {'\0'};
1592         struct rte_eth_dev *eth_dev = NULL;
1593         const char *name;
1594
1595         AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
1596                 rte_vdev_device_name(dev));
1597
1598         name = rte_vdev_device_name(dev);
1599         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1600                 strlen(rte_vdev_device_args(dev)) == 0) {
1601                 eth_dev = rte_eth_dev_attach_secondary(name);
1602                 if (eth_dev == NULL) {
1603                         AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
1604                         return -EINVAL;
1605                 }
1606                 eth_dev->dev_ops = &ops;
1607                 rte_eth_dev_probing_finish(eth_dev);
1608                 return 0;
1609         }
1610
1611         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1612         if (kvlist == NULL) {
1613                 AF_XDP_LOG(ERR, "Invalid kvargs key\n");
1614                 return -EINVAL;
1615         }
1616
1617         if (dev->device.numa_node == SOCKET_ID_ANY)
1618                 dev->device.numa_node = rte_socket_id();
1619
1620         if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
1621                              &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
1622                 AF_XDP_LOG(ERR, "Invalid kvargs value\n");
1623                 return -EINVAL;
1624         }
1625
1626         if (strlen(if_name) == 0) {
1627                 AF_XDP_LOG(ERR, "Network interface must be specified\n");
1628                 return -EINVAL;
1629         }
1630
1631         eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
1632                                         xsk_queue_cnt, shared_umem, prog_path);
1633         if (eth_dev == NULL) {
1634                 AF_XDP_LOG(ERR, "Failed to init internals\n");
1635                 return -1;
1636         }
1637
1638         rte_eth_dev_probing_finish(eth_dev);
1639
1640         return 0;
1641 }
1642
1643 static int
1644 rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
1645 {
1646         struct rte_eth_dev *eth_dev = NULL;
1647
1648         AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
1649                 rte_socket_id());
1650
1651         if (dev == NULL)
1652                 return -1;
1653
1654         /* find the ethdev entry */
1655         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1656         if (eth_dev == NULL)
1657                 return 0;
1658
1659         eth_dev_close(eth_dev);
1660         rte_eth_dev_release_port(eth_dev);
1661
1662
1663         return 0;
1664 }
1665
1666 static struct rte_vdev_driver pmd_af_xdp_drv = {
1667         .probe = rte_pmd_af_xdp_probe,
1668         .remove = rte_pmd_af_xdp_remove,
1669 };
1670
1671 RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
1672 RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
1673                               "iface=<string> "
1674                               "start_queue=<int> "
1675                               "queue_count=<int> "
1676                               "shared_umem=<int> "
1677                               "xdp_prog=<string> ");