net/af_xdp: enable custom XDP program loading
[dpdk.git] / drivers / net / af_xdp / rte_eth_af_xdp.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2019-2020 Intel Corporation.
3  */
4 #include <unistd.h>
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <poll.h>
9 #include <netinet/in.h>
10 #include <net/if.h>
11 #include <sys/socket.h>
12 #include <sys/ioctl.h>
13 #include <linux/if_ether.h>
14 #include <linux/if_xdp.h>
15 #include <linux/if_link.h>
16 #include <linux/ethtool.h>
17 #include <linux/sockios.h>
18 #include "af_xdp_deps.h"
19 #include <bpf/xsk.h>
20
21 #include <rte_ethdev.h>
22 #include <rte_ethdev_driver.h>
23 #include <rte_ethdev_vdev.h>
24 #include <rte_kvargs.h>
25 #include <rte_bus_vdev.h>
26 #include <rte_string_fns.h>
27 #include <rte_branch_prediction.h>
28 #include <rte_common.h>
29 #include <rte_dev.h>
30 #include <rte_eal.h>
31 #include <rte_ether.h>
32 #include <rte_lcore.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memzone.h>
36 #include <rte_mempool.h>
37 #include <rte_mbuf.h>
38 #include <rte_malloc.h>
39 #include <rte_ring.h>
40 #include <rte_spinlock.h>
41
42 #include "compat.h"
43
44
45 #ifndef SOL_XDP
46 #define SOL_XDP 283
47 #endif
48
49 #ifndef AF_XDP
50 #define AF_XDP 44
51 #endif
52
53 #ifndef PF_XDP
54 #define PF_XDP AF_XDP
55 #endif
56
57 RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);
58
59 #define AF_XDP_LOG(level, fmt, args...)                 \
60         rte_log(RTE_LOG_ ## level, af_xdp_logtype,      \
61                 "%s(): " fmt, __func__, ##args)
62
63 #define ETH_AF_XDP_FRAME_SIZE           2048
64 #define ETH_AF_XDP_NUM_BUFFERS          4096
65 #define ETH_AF_XDP_DFLT_NUM_DESCS       XSK_RING_CONS__DEFAULT_NUM_DESCS
66 #define ETH_AF_XDP_DFLT_START_QUEUE_IDX 0
67 #define ETH_AF_XDP_DFLT_QUEUE_COUNT     1
68
69 #define ETH_AF_XDP_RX_BATCH_SIZE        32
70 #define ETH_AF_XDP_TX_BATCH_SIZE        32
71
72
73 struct xsk_umem_info {
74         struct xsk_umem *umem;
75         struct rte_ring *buf_ring;
76         const struct rte_memzone *mz;
77         struct rte_mempool *mb_pool;
78         void *buffer;
79         uint8_t refcnt;
80         uint32_t max_xsks;
81 };
82
83 struct rx_stats {
84         uint64_t rx_pkts;
85         uint64_t rx_bytes;
86         uint64_t rx_dropped;
87 };
88
89 struct pkt_rx_queue {
90         struct xsk_ring_cons rx;
91         struct xsk_umem_info *umem;
92         struct xsk_socket *xsk;
93         struct rte_mempool *mb_pool;
94
95         struct rx_stats stats;
96
97         struct xsk_ring_prod fq;
98         struct xsk_ring_cons cq;
99
100         struct pkt_tx_queue *pair;
101         struct pollfd fds[1];
102         int xsk_queue_idx;
103 };
104
105 struct tx_stats {
106         uint64_t tx_pkts;
107         uint64_t tx_bytes;
108         uint64_t tx_dropped;
109 };
110
111 struct pkt_tx_queue {
112         struct xsk_ring_prod tx;
113         struct xsk_umem_info *umem;
114
115         struct tx_stats stats;
116
117         struct pkt_rx_queue *pair;
118         int xsk_queue_idx;
119 };
120
121 struct pmd_internals {
122         int if_index;
123         char if_name[IFNAMSIZ];
124         int start_queue_idx;
125         int queue_cnt;
126         int max_queue_cnt;
127         int combined_queue_cnt;
128         bool shared_umem;
129         char prog_path[PATH_MAX];
130         bool custom_prog_configured;
131
132         struct rte_ether_addr eth_addr;
133
134         struct pkt_rx_queue *rx_queues;
135         struct pkt_tx_queue *tx_queues;
136 };
137
138 #define ETH_AF_XDP_IFACE_ARG                    "iface"
139 #define ETH_AF_XDP_START_QUEUE_ARG              "start_queue"
140 #define ETH_AF_XDP_QUEUE_COUNT_ARG              "queue_count"
141 #define ETH_AF_XDP_SHARED_UMEM_ARG              "shared_umem"
142 #define ETH_AF_XDP_PROG_ARG                     "xdp_prog"
143
144 static const char * const valid_arguments[] = {
145         ETH_AF_XDP_IFACE_ARG,
146         ETH_AF_XDP_START_QUEUE_ARG,
147         ETH_AF_XDP_QUEUE_COUNT_ARG,
148         ETH_AF_XDP_SHARED_UMEM_ARG,
149         ETH_AF_XDP_PROG_ARG,
150         NULL
151 };
152
153 static const struct rte_eth_link pmd_link = {
154         .link_speed = ETH_SPEED_NUM_10G,
155         .link_duplex = ETH_LINK_FULL_DUPLEX,
156         .link_status = ETH_LINK_DOWN,
157         .link_autoneg = ETH_LINK_AUTONEG
158 };
159
160 /* List which tracks PMDs to facilitate sharing UMEMs across them. */
161 struct internal_list {
162         TAILQ_ENTRY(internal_list) next;
163         struct rte_eth_dev *eth_dev;
164 };
165
166 TAILQ_HEAD(internal_list_head, internal_list);
167 static struct internal_list_head internal_list =
168         TAILQ_HEAD_INITIALIZER(internal_list);
169
170 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
171
172 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
173 static inline int
174 reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
175                       struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
176 {
177         uint32_t idx;
178         uint16_t i;
179
180         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
181                 for (i = 0; i < reserve_size; i++)
182                         rte_pktmbuf_free(bufs[i]);
183                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
184                 return -1;
185         }
186
187         for (i = 0; i < reserve_size; i++) {
188                 __u64 *fq_addr;
189                 uint64_t addr;
190
191                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
192                 addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
193                                 umem->mb_pool->header_size;
194                 *fq_addr = addr;
195         }
196
197         xsk_ring_prod__submit(fq, reserve_size);
198
199         return 0;
200 }
201 #else
202 static inline int
203 reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
204                       struct rte_mbuf **bufs __rte_unused,
205                       struct xsk_ring_prod *fq)
206 {
207         void *addrs[reserve_size];
208         uint32_t idx;
209         uint16_t i;
210
211         if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
212                     != reserve_size) {
213                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
214                 return -1;
215         }
216
217         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
218                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
219                 rte_ring_enqueue_bulk(umem->buf_ring, addrs,
220                                 reserve_size, NULL);
221                 return -1;
222         }
223
224         for (i = 0; i < reserve_size; i++) {
225                 __u64 *fq_addr;
226
227                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
228                 *fq_addr = (uint64_t)addrs[i];
229         }
230
231         xsk_ring_prod__submit(fq, reserve_size);
232
233         return 0;
234 }
235 #endif
236
237 static inline int
238 reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
239                    struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
240 {
241 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
242         return reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
243 #else
244         return reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
245 #endif
246 }
247
248 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
249 static uint16_t
250 af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
251 {
252         struct pkt_rx_queue *rxq = queue;
253         struct xsk_ring_cons *rx = &rxq->rx;
254         struct xsk_ring_prod *fq = &rxq->fq;
255         struct xsk_umem_info *umem = rxq->umem;
256         uint32_t idx_rx = 0;
257         unsigned long rx_bytes = 0;
258         int rcvd, i;
259         struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
260
261         /* allocate bufs for fill queue replenishment after rx */
262         if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
263                 AF_XDP_LOG(DEBUG,
264                         "Failed to get enough buffers for fq.\n");
265                 return 0;
266         }
267
268         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
269
270         if (rcvd == 0) {
271 #if defined(XDP_USE_NEED_WAKEUP)
272                 if (xsk_ring_prod__needs_wakeup(fq))
273                         (void)poll(rxq->fds, 1, 1000);
274 #endif
275
276                 goto out;
277         }
278
279         for (i = 0; i < rcvd; i++) {
280                 const struct xdp_desc *desc;
281                 uint64_t addr;
282                 uint32_t len;
283                 uint64_t offset;
284
285                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
286                 addr = desc->addr;
287                 len = desc->len;
288
289                 offset = xsk_umem__extract_offset(addr);
290                 addr = xsk_umem__extract_addr(addr);
291
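                /*
                 * The zero copy UMEM overlays the mempool, so the chunk the
                 * kernel hands back maps directly to the mbuf that owns it:
                 * the chunk begins at the mempool object header, followed by
                 * the mbuf struct, its private area and headroom. Recover the
                 * mbuf pointer from the chunk address and derive data_off
                 * from the in-chunk packet offset.
                 */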
292                 bufs[i] = (struct rte_mbuf *)
293                                 xsk_umem__get_data(umem->buffer, addr +
294                                         umem->mb_pool->header_size);
295                 bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
296                         rte_pktmbuf_priv_size(umem->mb_pool) -
297                         umem->mb_pool->header_size;
298
299                 rte_pktmbuf_pkt_len(bufs[i]) = len;
300                 rte_pktmbuf_data_len(bufs[i]) = len;
301                 rx_bytes += len;
302         }
303
304         xsk_ring_cons__release(rx, rcvd);
305
306         (void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);
307
308         /* statistics */
309         rxq->stats.rx_pkts += rcvd;
310         rxq->stats.rx_bytes += rx_bytes;
311
312 out:
313         if (rcvd != nb_pkts)
314                 rte_mempool_put_bulk(umem->mb_pool, (void **)&fq_bufs[rcvd],
315                                      nb_pkts - rcvd);
316
317         return rcvd;
318 }
319 #else
320 static uint16_t
321 af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
322 {
323         struct pkt_rx_queue *rxq = queue;
324         struct xsk_ring_cons *rx = &rxq->rx;
325         struct xsk_umem_info *umem = rxq->umem;
326         struct xsk_ring_prod *fq = &rxq->fq;
327         uint32_t idx_rx = 0;
328         unsigned long rx_bytes = 0;
329         int rcvd, i;
330         uint32_t free_thresh = fq->size >> 1;
331         struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
332
333         if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
334                 (void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
335                                          NULL, fq);
336
337         if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
338                 return 0;
339
340         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
341         if (rcvd == 0) {
342 #if defined(XDP_USE_NEED_WAKEUP)
343                 if (xsk_ring_prod__needs_wakeup(fq))
344                         (void)poll(rxq->fds, 1, 1000);
345 #endif
346
347                 goto out;
348         }
349
350         for (i = 0; i < rcvd; i++) {
351                 const struct xdp_desc *desc;
352                 uint64_t addr;
353                 uint32_t len;
354                 void *pkt;
355
356                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
357                 addr = desc->addr;
358                 len = desc->len;
359                 pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);
360
361                 rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
362                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
363                 rte_pktmbuf_pkt_len(mbufs[i]) = len;
364                 rte_pktmbuf_data_len(mbufs[i]) = len;
365                 rx_bytes += len;
366                 bufs[i] = mbufs[i];
367         }
368
369         xsk_ring_cons__release(rx, rcvd);
370
371         /* statistics */
372         rxq->stats.rx_pkts += rcvd;
373         rxq->stats.rx_bytes += rx_bytes;
374
375 out:
376         if (rcvd != nb_pkts)
377                 rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
378                                      nb_pkts - rcvd);
379
380         return rcvd;
381 }
382 #endif
383
384 static uint16_t
385 eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
386 {
387         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
388
389 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
390         return af_xdp_rx_zc(queue, bufs, nb_pkts);
391 #else
392         return af_xdp_rx_cp(queue, bufs, nb_pkts);
393 #endif
394 }
395
396 static void
397 pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
398 {
399         size_t i, n;
400         uint32_t idx_cq = 0;
401
402         n = xsk_ring_cons__peek(cq, size, &idx_cq);
403
404         for (i = 0; i < n; i++) {
405                 uint64_t addr;
406                 addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
407 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
408                 addr = xsk_umem__extract_addr(addr);
409                 rte_pktmbuf_free((struct rte_mbuf *)
410                                         xsk_umem__get_data(umem->buffer,
411                                         addr + umem->mb_pool->header_size));
412 #else
413                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
414 #endif
415         }
416
417         xsk_ring_cons__release(cq, n);
418 }
419
420 static void
421 kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
422 {
423         struct xsk_umem_info *umem = txq->umem;
424
425         pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
426
427 #if defined(XDP_USE_NEED_WAKEUP)
428         if (xsk_ring_prod__needs_wakeup(&txq->tx))
429 #endif
430                 while (send(xsk_socket__fd(txq->pair->xsk), NULL,
431                             0, MSG_DONTWAIT) < 0) {
432                         /* something unexpected */
433                         if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
434                                 break;
435
436                         /* pull from completion queue to leave more space */
437                         if (errno == EAGAIN)
438                                 pull_umem_cq(umem,
439                                              XSK_RING_CONS__DEFAULT_NUM_DESCS,
440                                              cq);
441                 }
442 }
443
444 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
445 static uint16_t
446 af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
447 {
448         struct pkt_tx_queue *txq = queue;
449         struct xsk_umem_info *umem = txq->umem;
450         struct rte_mbuf *mbuf;
451         unsigned long tx_bytes = 0;
452         int i;
453         uint32_t idx_tx;
454         uint16_t count = 0;
455         struct xdp_desc *desc;
456         uint64_t addr, offset;
457         struct xsk_ring_cons *cq = &txq->pair->cq;
458         uint32_t free_thresh = cq->size >> 1;
459
460         if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
461                 pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
462
463         for (i = 0; i < nb_pkts; i++) {
464                 mbuf = bufs[i];
465
466                 if (mbuf->pool == umem->mb_pool) {
467                         if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
468                                 kick_tx(txq, cq);
469                                 if (!xsk_ring_prod__reserve(&txq->tx, 1,
470                                                             &idx_tx))
471                                         goto out;
472                         }
473                         desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
474                         desc->len = mbuf->pkt_len;
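                        /*
                         * In unaligned chunk mode the 64-bit descriptor
                         * address carries two fields: the chunk's UMEM
                         * relative base address in the lower bits and the
                         * offset of the packet data within the chunk in the
                         * upper bits, shifted by
                         * XSK_UNALIGNED_BUF_OFFSET_SHIFT.
                         */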
475                         addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
476                                         umem->mb_pool->header_size;
477                         offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
478                                         (uint64_t)mbuf +
479                                         umem->mb_pool->header_size;
480                         offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
481                         desc->addr = addr | offset;
482                         count++;
483                 } else {
484                         struct rte_mbuf *local_mbuf =
485                                         rte_pktmbuf_alloc(umem->mb_pool);
486                         void *pkt;
487
488                         if (local_mbuf == NULL)
489                                 goto out;
490
491                         if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
492                                 rte_pktmbuf_free(local_mbuf);
493                                 kick_tx(txq, cq);
494                                 goto out;
495                         }
496
497                         desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
498                         desc->len = mbuf->pkt_len;
499
500                         addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
501                                         umem->mb_pool->header_size;
502                         offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
503                                         (uint64_t)local_mbuf +
504                                         umem->mb_pool->header_size;
505                         pkt = xsk_umem__get_data(umem->buffer, addr + offset);
506                         offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
507                         desc->addr = addr | offset;
508                         rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
509                                         desc->len);
510                         rte_pktmbuf_free(mbuf);
511                         count++;
512                 }
513
514                 tx_bytes += mbuf->pkt_len;
515         }
516
517         kick_tx(txq, cq);
518
519 out:
520         xsk_ring_prod__submit(&txq->tx, count);
521
522         txq->stats.tx_pkts += count;
523         txq->stats.tx_bytes += tx_bytes;
524         txq->stats.tx_dropped += nb_pkts - count;
525
526         return count;
527 }
528 #else
529 static uint16_t
530 af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
531 {
532         struct pkt_tx_queue *txq = queue;
533         struct xsk_umem_info *umem = txq->umem;
534         struct rte_mbuf *mbuf;
535         void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
536         unsigned long tx_bytes = 0;
537         int i;
538         uint32_t idx_tx;
539         struct xsk_ring_cons *cq = &txq->pair->cq;
540
541         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
542
543         pull_umem_cq(umem, nb_pkts, cq);
544
545         nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
546                                         nb_pkts, NULL);
547         if (nb_pkts == 0)
548                 return 0;
549
550         if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
551                 kick_tx(txq, cq);
552                 rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
553                 return 0;
554         }
555
556         for (i = 0; i < nb_pkts; i++) {
557                 struct xdp_desc *desc;
558                 void *pkt;
559
560                 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
561                 mbuf = bufs[i];
562                 desc->len = mbuf->pkt_len;
563
564                 desc->addr = (uint64_t)addrs[i];
565                 pkt = xsk_umem__get_data(umem->mz->addr,
566                                          desc->addr);
567                 rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
568                 tx_bytes += mbuf->pkt_len;
569                 rte_pktmbuf_free(mbuf);
570         }
571
572         xsk_ring_prod__submit(&txq->tx, nb_pkts);
573
574         kick_tx(txq, cq);
575
576         txq->stats.tx_pkts += nb_pkts;
577         txq->stats.tx_bytes += tx_bytes;
578
579         return nb_pkts;
580 }
581 #endif
582
583 static uint16_t
584 eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
585 {
586 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
587         return af_xdp_tx_zc(queue, bufs, nb_pkts);
588 #else
589         return af_xdp_tx_cp(queue, bufs, nb_pkts);
590 #endif
591 }
592
593 static int
594 eth_dev_start(struct rte_eth_dev *dev)
595 {
596         dev->data->dev_link.link_status = ETH_LINK_UP;
597
598         return 0;
599 }
600
601 /* This function gets called when the current port gets stopped. */
602 static void
603 eth_dev_stop(struct rte_eth_dev *dev)
604 {
605         dev->data->dev_link.link_status = ETH_LINK_DOWN;
606 }
607
608 /* Find ethdev in list */
609 static inline struct internal_list *
610 find_internal_resource(struct pmd_internals *port_int)
611 {
612         int found = 0;
613         struct internal_list *list = NULL;
614
615         if (port_int == NULL)
616                 return NULL;
617
618         pthread_mutex_lock(&internal_list_lock);
619
620         TAILQ_FOREACH(list, &internal_list, next) {
621                 struct pmd_internals *list_int =
622                                 list->eth_dev->data->dev_private;
623                 if (list_int == port_int) {
624                         found = 1;
625                         break;
626                 }
627         }
628
629         pthread_mutex_unlock(&internal_list_lock);
630
631         if (!found)
632                 return NULL;
633
634         return list;
635 }
636
637 /* Get a pointer to an existing UMEM which overlays the rxq's mb_pool */
638 static inline struct xsk_umem_info *
639 get_shared_umem(struct pkt_rx_queue *rxq) {
640         struct internal_list *list;
641         struct pmd_internals *internals;
642         int i = 0;
643         struct rte_mempool *mb_pool = rxq->mb_pool;
644
645         if (mb_pool == NULL)
646                 return NULL;
647
648         pthread_mutex_lock(&internal_list_lock);
649
650         TAILQ_FOREACH(list, &internal_list, next) {
651                 internals = list->eth_dev->data->dev_private;
652                 for (i = 0; i < internals->queue_cnt; i++) {
653                         struct pkt_rx_queue *list_rxq =
654                                                 &internals->rx_queues[i];
655                         if (rxq == list_rxq)
656                                 continue;
657                         if (mb_pool == internals->rx_queues[i].mb_pool) {
658                                 if (__atomic_load_n(
659                                         &internals->rx_queues[i].umem->refcnt,
660                                                         __ATOMIC_ACQUIRE)) {
661                                         pthread_mutex_unlock(
662                                                         &internal_list_lock);
663                                         return internals->rx_queues[i].umem;
664                                 }
665                         }
666                 }
667         }
668
669         pthread_mutex_unlock(&internal_list_lock);
670
671         return NULL;
672 }
673
674 static int
675 eth_dev_configure(struct rte_eth_dev *dev)
676 {
677         struct pmd_internals *internal = dev->data->dev_private;
678
679         /* rx/tx must be paired */
680         if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
681                 return -EINVAL;
682
683         if (internal->shared_umem) {
684                 struct internal_list *list = NULL;
685                 const char *name = dev->device->name;
686
687                 /* Ensure PMD is not already inserted into the list */
688                 list = find_internal_resource(internal);
689                 if (list)
690                         return 0;
691
692                 list = rte_zmalloc_socket(name, sizeof(*list), 0,
693                                         dev->device->numa_node);
694                 if (list == NULL)
695                         return -1;
696
697                 list->eth_dev = dev;
698                 pthread_mutex_lock(&internal_list_lock);
699                 TAILQ_INSERT_TAIL(&internal_list, list, next);
700                 pthread_mutex_unlock(&internal_list_lock);
701         }
702
703         return 0;
704 }
705
706 static int
707 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
708 {
709         struct pmd_internals *internals = dev->data->dev_private;
710
711         dev_info->if_index = internals->if_index;
712         dev_info->max_mac_addrs = 1;
713         dev_info->max_rx_pktlen = ETH_FRAME_LEN;
714         dev_info->max_rx_queues = internals->queue_cnt;
715         dev_info->max_tx_queues = internals->queue_cnt;
716
717         dev_info->min_mtu = RTE_ETHER_MIN_MTU;
718 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
719         dev_info->max_mtu = getpagesize() -
720                                 sizeof(struct rte_mempool_objhdr) -
721                                 sizeof(struct rte_mbuf) -
722                                 RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
723 #else
724         dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
725 #endif
726
727         dev_info->default_rxportconf.nb_queues = 1;
728         dev_info->default_txportconf.nb_queues = 1;
729         dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
730         dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
731
732         return 0;
733 }
734
735 static int
736 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
737 {
738         struct pmd_internals *internals = dev->data->dev_private;
739         struct xdp_statistics xdp_stats;
740         struct pkt_rx_queue *rxq;
741         struct pkt_tx_queue *txq;
742         socklen_t optlen;
743         int i, ret;
744
745         for (i = 0; i < dev->data->nb_rx_queues; i++) {
746                 optlen = sizeof(struct xdp_statistics);
747                 rxq = &internals->rx_queues[i];
748                 txq = rxq->pair;
749                 stats->q_ipackets[i] = rxq->stats.rx_pkts;
750                 stats->q_ibytes[i] = rxq->stats.rx_bytes;
751
752                 stats->q_opackets[i] = txq->stats.tx_pkts;
753                 stats->q_obytes[i] = txq->stats.tx_bytes;
754
755                 stats->ipackets += stats->q_ipackets[i];
756                 stats->ibytes += stats->q_ibytes[i];
757                 stats->imissed += rxq->stats.rx_dropped;
758                 stats->oerrors += txq->stats.tx_dropped;
759                 ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
760                                 XDP_STATISTICS, &xdp_stats, &optlen);
761                 if (ret != 0) {
762                         AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
763                         return -1;
764                 }
765                 stats->imissed += xdp_stats.rx_dropped;
766
767                 stats->opackets += stats->q_opackets[i];
768                 stats->obytes += stats->q_obytes[i];
769         }
770
771         return 0;
772 }
773
774 static int
775 eth_stats_reset(struct rte_eth_dev *dev)
776 {
777         struct pmd_internals *internals = dev->data->dev_private;
778         int i;
779
780         for (i = 0; i < internals->queue_cnt; i++) {
781                 memset(&internals->rx_queues[i].stats, 0,
782                                         sizeof(struct rx_stats));
783                 memset(&internals->tx_queues[i].stats, 0,
784                                         sizeof(struct tx_stats));
785         }
786
787         return 0;
788 }
789
790 static void
791 remove_xdp_program(struct pmd_internals *internals)
792 {
793         uint32_t curr_prog_id = 0;
794
795         if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
796                                 XDP_FLAGS_UPDATE_IF_NOEXIST)) {
797                 AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
798                 return;
799         }
800         bpf_set_link_xdp_fd(internals->if_index, -1,
801                         XDP_FLAGS_UPDATE_IF_NOEXIST);
802 }
803
804 static void
805 xdp_umem_destroy(struct xsk_umem_info *umem)
806 {
807 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
808         umem->mb_pool = NULL;
809 #else
810         rte_memzone_free(umem->mz);
811         umem->mz = NULL;
812
813         rte_ring_free(umem->buf_ring);
814         umem->buf_ring = NULL;
815 #endif
816
817         rte_free(umem);
818         umem = NULL;
819 }
820
821 static int
822 eth_dev_close(struct rte_eth_dev *dev)
823 {
824         struct pmd_internals *internals = dev->data->dev_private;
825         struct pkt_rx_queue *rxq;
826         int i;
827
828         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
829                 return 0;
830
831         AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
832                 rte_socket_id());
833
834         for (i = 0; i < internals->queue_cnt; i++) {
835                 rxq = &internals->rx_queues[i];
836                 if (rxq->umem == NULL)
837                         break;
838                 xsk_socket__delete(rxq->xsk);
839
840                 if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
841                                 == 0) {
842                         (void)xsk_umem__delete(rxq->umem->umem);
843                         xdp_umem_destroy(rxq->umem);
844                 }
845
846                 /* free pkt_tx_queue */
847                 rte_free(rxq->pair);
848                 rte_free(rxq);
849         }
850
851         /*
852          * MAC is not allocated dynamically; set it to NULL so that
853          * rte_eth_dev_release_port() does not attempt to free it.
854          */
855         dev->data->mac_addrs = NULL;
856
857         remove_xdp_program(internals);
858
859         if (internals->shared_umem) {
860                 struct internal_list *list;
861
862                 /* Remove ethdev from list used to track and share UMEMs */
863                 list = find_internal_resource(internals);
864                 if (list) {
865                         pthread_mutex_lock(&internal_list_lock);
866                         TAILQ_REMOVE(&internal_list, list, next);
867                         pthread_mutex_unlock(&internal_list_lock);
868                         rte_free(list);
869                 }
870         }
871
872         return 0;
873 }
874
875 static void
876 eth_queue_release(void *q __rte_unused)
877 {
878 }
879
880 static int
881 eth_link_update(struct rte_eth_dev *dev __rte_unused,
882                 int wait_to_complete __rte_unused)
883 {
884         return 0;
885 }
886
887 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
888 static inline uint64_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
889 {
890         struct rte_mempool_memhdr *memhdr;
891         uint64_t memhdr_addr, aligned_addr;
892
893         memhdr = STAILQ_FIRST(&mp->mem_list);
894         memhdr_addr = (uint64_t)memhdr->addr;
895         aligned_addr = memhdr_addr & ~(getpagesize() - 1);
896         *align = memhdr_addr - aligned_addr;
897
898         return aligned_addr;
899 }
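/*
 * Worked example for get_base_addr() (values are illustrative only): with a
 * 4096-byte page size and the first memhdr at 0x7f0000001234, the returned
 * aligned address is 0x7f0000001000 and *align is set to 0x234. The caller
 * registers the UMEM from the aligned address and adds 'align' to the UMEM
 * size so the whole mempool area stays covered.
 */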
900
901 static struct
902 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
903                                   struct pkt_rx_queue *rxq)
904 {
905         struct xsk_umem_info *umem = NULL;
906         int ret;
907         struct xsk_umem_config usr_config = {
908                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
909                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
910                 .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
911         void *base_addr = NULL;
912         struct rte_mempool *mb_pool = rxq->mb_pool;
913         uint64_t umem_size, align = 0;
914
915         if (internals->shared_umem) {
916                 umem = get_shared_umem(rxq);
917                 if (umem != NULL &&
918                         __atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
919                                         umem->max_xsks) {
920                         AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
921                                         internals->if_name, rxq->xsk_queue_idx);
922                         __atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
923                 }
924         }
925
926         if (umem == NULL) {
927                 usr_config.frame_size =
928                         rte_mempool_calc_obj_size(mb_pool->elt_size,
929                                                   mb_pool->flags, NULL);
930                 usr_config.frame_headroom = mb_pool->header_size +
931                                                 sizeof(struct rte_mbuf) +
932                                                 rte_pktmbuf_priv_size(mb_pool) +
933                                                 RTE_PKTMBUF_HEADROOM;
934
935                 umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
936                                           rte_socket_id());
937                 if (umem == NULL) {
938                         AF_XDP_LOG(ERR, "Failed to allocate umem info.\n");
939                         return NULL;
940                 }
941
942                 umem->mb_pool = mb_pool;
943                 base_addr = (void *)get_base_addr(mb_pool, &align);
944                 umem_size = mb_pool->populated_size * usr_config.frame_size +
945                                 align;
946
947                 ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
948                                 &rxq->fq, &rxq->cq, &usr_config);
949                 if (ret) {
950                         AF_XDP_LOG(ERR, "Failed to create umem.\n");
951                         goto err;
952                 }
953                 umem->buffer = base_addr;
954
955                 if (internals->shared_umem) {
956                         umem->max_xsks = mb_pool->populated_size /
957                                                 ETH_AF_XDP_NUM_BUFFERS;
958                         AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
959                                                 mb_pool->name, umem->max_xsks);
960                 }
961
962                 __atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
963         }
964
965 #else
966 static struct
967 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
968                                   struct pkt_rx_queue *rxq)
969 {
970         struct xsk_umem_info *umem;
971         const struct rte_memzone *mz;
972         struct xsk_umem_config usr_config = {
973                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
974                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
975                 .frame_size = ETH_AF_XDP_FRAME_SIZE,
976                 .frame_headroom = 0 };
977         char ring_name[RTE_RING_NAMESIZE];
978         char mz_name[RTE_MEMZONE_NAMESIZE];
979         int ret;
980         uint64_t i;
981
982         umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
983         if (umem == NULL) {
984                 AF_XDP_LOG(ERR, "Failed to allocate umem info.\n");
985                 return NULL;
986         }
987
988         snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
989                        internals->if_name, rxq->xsk_queue_idx);
990         umem->buf_ring = rte_ring_create(ring_name,
991                                          ETH_AF_XDP_NUM_BUFFERS,
992                                          rte_socket_id(),
993                                          0x0);
994         if (umem->buf_ring == NULL) {
995                 AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
996                 goto err;
997         }
998
999         for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
1000                 rte_ring_enqueue(umem->buf_ring,
1001                                  (void *)(i * ETH_AF_XDP_FRAME_SIZE));
1002
1003         snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
1004                        internals->if_name, rxq->xsk_queue_idx);
1005         mz = rte_memzone_reserve_aligned(mz_name,
1006                         ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1007                         rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
1008                         getpagesize());
1009         if (mz == NULL) {
1010                 AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
1011                 goto err;
1012         }
1013
1014         ret = xsk_umem__create(&umem->umem, mz->addr,
1015                                ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1016                                &rxq->fq, &rxq->cq,
1017                                &usr_config);
1018
1019         if (ret) {
1020                 AF_XDP_LOG(ERR, "Failed to create umem.\n");
1021                 goto err;
1022         }
1023         umem->mz = mz;
1024
1025 #endif
1026         return umem;
1027
1028 err:
1029         xdp_umem_destroy(umem);
1030         return NULL;
1031 }
1032
1033 static int
1034 load_custom_xdp_prog(const char *prog_path, int if_index)
1035 {
1036         int ret, prog_fd = -1;
1037         struct bpf_object *obj;
1038         struct bpf_map *map;
1039
1040         ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
1041         if (ret) {
1042                 AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
1043                 return ret;
1044         }
1045
1046         /*
1047          * The loaded program must provide a map of xsks named "xsks_map",
1048          * so that some traffic can be redirected to userspace. When the xsk
1049          * is created, libbpf inserts it into the map.
1050          */
1051         map = bpf_object__find_map_by_name(obj, "xsks_map");
1052         if (!map) {
1053                 AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
1054                 return -1;
1055         }
1056
1057         /* Link the program with the given network device */
1058         ret = bpf_set_link_xdp_fd(if_index, prog_fd,
1059                                         XDP_FLAGS_UPDATE_IF_NOEXIST);
1060         if (ret) {
1061                 AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
1062                                 prog_fd);
1063                 return -1;
1064         }
1065
1066         AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
1067                                 prog_path, prog_fd);
1068
1069         return 0;
1070 }
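
/*
 * A minimal sketch of a program that load_custom_xdp_prog() above could load
 * via the xdp_prog devarg (illustrative only; it is built separately with
 * clang -target bpf and is not part of this driver). The map must be named
 * "xsks_map"; the program name and max_entries below are arbitrary examples.
 *
 *     #include <linux/bpf.h>
 *     #include <bpf/bpf_helpers.h>
 *
 *     struct bpf_map_def SEC("maps") xsks_map = {
 *             .type = BPF_MAP_TYPE_XSKMAP,
 *             .key_size = sizeof(int),
 *             .value_size = sizeof(int),
 *             .max_entries = 64,
 *     };
 *
 *     SEC("xdp")
 *     int xdp_sock_prog(struct xdp_md *ctx)
 *     {
 *             // Redirect to the AF_XDP socket bound to this queue, if any;
 *             // a failed map lookup falls back to XDP_ABORTED.
 *             return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
 *     }
 *
 *     char _license[] SEC("license") = "GPL";
 */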
1071
1072 static int
1073 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
1074               int ring_size)
1075 {
1076         struct xsk_socket_config cfg;
1077         struct pkt_tx_queue *txq = rxq->pair;
1078         int ret = 0;
1079         int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
1080         struct rte_mbuf *fq_bufs[reserve_size];
1081
1082         rxq->umem = xdp_umem_configure(internals, rxq);
1083         if (rxq->umem == NULL)
1084                 return -ENOMEM;
1085         txq->umem = rxq->umem;
1086
1087         cfg.rx_size = ring_size;
1088         cfg.tx_size = ring_size;
1089         cfg.libbpf_flags = 0;
1090         cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
1091         cfg.bind_flags = 0;
1092
1093 #if defined(XDP_USE_NEED_WAKEUP)
1094         cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
1095 #endif
1096
1097         if (strnlen(internals->prog_path, PATH_MAX) &&
1098                                 !internals->custom_prog_configured) {
1099                 ret = load_custom_xdp_prog(internals->prog_path,
1100                                            internals->if_index);
1101                 if (ret) {
1102                         AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
1103                                         internals->prog_path);
1104                         goto err;
1105                 }
1106                 internals->custom_prog_configured = 1;
1107         }
1108
1109         if (internals->shared_umem)
1110                 ret = create_shared_socket(&rxq->xsk, internals->if_name,
1111                                 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1112                                 &txq->tx, &rxq->fq, &rxq->cq, &cfg);
1113         else
1114                 ret = xsk_socket__create(&rxq->xsk, internals->if_name,
1115                                 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1116                                 &txq->tx, &cfg);
1117
1118         if (ret) {
1119                 AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
1120                 goto err;
1121         }
1122
1123 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1124         if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
1125                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
1126                 goto err;
1127         }
1128 #endif
1129         ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
1130         if (ret) {
1131                 xsk_socket__delete(rxq->xsk);
1132                 AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
1133                 goto err;
1134         }
1135
1136         return 0;
1137
1138 err:
1139         if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
1140                 xdp_umem_destroy(rxq->umem);
1141
1142         return ret;
1143 }
1144
1145 static int
1146 eth_rx_queue_setup(struct rte_eth_dev *dev,
1147                    uint16_t rx_queue_id,
1148                    uint16_t nb_rx_desc,
1149                    unsigned int socket_id __rte_unused,
1150                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1151                    struct rte_mempool *mb_pool)
1152 {
1153         struct pmd_internals *internals = dev->data->dev_private;
1154         struct pkt_rx_queue *rxq;
1155         int ret;
1156
1157         rxq = &internals->rx_queues[rx_queue_id];
1158
1159         AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
1160                    rx_queue_id, rxq->xsk_queue_idx);
1161
1162 #ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
1163         uint32_t buf_size, data_size;
1164
1165         /* Now get the space available for data in the mbuf */
1166         buf_size = rte_pktmbuf_data_room_size(mb_pool) -
1167                 RTE_PKTMBUF_HEADROOM;
1168         data_size = ETH_AF_XDP_FRAME_SIZE;
1169
1170         if (data_size > buf_size) {
1171                 AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
1172                         dev->device->name, data_size, buf_size);
1173                 ret = -ENOMEM;
1174                 goto err;
1175         }
1176 #endif
1177
1178         rxq->mb_pool = mb_pool;
1179
1180         if (xsk_configure(internals, rxq, nb_rx_desc)) {
1181                 AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
1182                 ret = -EINVAL;
1183                 goto err;
1184         }
1185
1186         rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
1187         rxq->fds[0].events = POLLIN;
1188
1189         dev->data->rx_queues[rx_queue_id] = rxq;
1190         return 0;
1191
1192 err:
1193         return ret;
1194 }
1195
1196 static int
1197 eth_tx_queue_setup(struct rte_eth_dev *dev,
1198                    uint16_t tx_queue_id,
1199                    uint16_t nb_tx_desc __rte_unused,
1200                    unsigned int socket_id __rte_unused,
1201                    const struct rte_eth_txconf *tx_conf __rte_unused)
1202 {
1203         struct pmd_internals *internals = dev->data->dev_private;
1204         struct pkt_tx_queue *txq;
1205
1206         txq = &internals->tx_queues[tx_queue_id];
1207
1208         dev->data->tx_queues[tx_queue_id] = txq;
1209         return 0;
1210 }
1211
1212 static int
1213 eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1214 {
1215         struct pmd_internals *internals = dev->data->dev_private;
1216         struct ifreq ifr = { .ifr_mtu = mtu };
1217         int ret;
1218         int s;
1219
1220         s = socket(PF_INET, SOCK_DGRAM, 0);
1221         if (s < 0)
1222                 return -EINVAL;
1223
1224         strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
1225         ret = ioctl(s, SIOCSIFMTU, &ifr);
1226         close(s);
1227
1228         return (ret < 0) ? -errno : 0;
1229 }
1230
1231 static int
1232 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
1233 {
1234         struct ifreq ifr;
1235         int ret = 0;
1236         int s;
1237
1238         s = socket(PF_INET, SOCK_DGRAM, 0);
1239         if (s < 0)
1240                 return -errno;
1241
1242         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1243         if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
1244                 ret = -errno;
1245                 goto out;
1246         }
1247         ifr.ifr_flags &= mask;
1248         ifr.ifr_flags |= flags;
1249         if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
1250                 ret = -errno;
1251                 goto out;
1252         }
1253 out:
1254         close(s);
1255         return ret;
1256 }
1257
1258 static int
1259 eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
1260 {
1261         struct pmd_internals *internals = dev->data->dev_private;
1262
1263         return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
1264 }
1265
1266 static int
1267 eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
1268 {
1269         struct pmd_internals *internals = dev->data->dev_private;
1270
1271         return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
1272 }
1273
1274 static const struct eth_dev_ops ops = {
1275         .dev_start = eth_dev_start,
1276         .dev_stop = eth_dev_stop,
1277         .dev_close = eth_dev_close,
1278         .dev_configure = eth_dev_configure,
1279         .dev_infos_get = eth_dev_info,
1280         .mtu_set = eth_dev_mtu_set,
1281         .promiscuous_enable = eth_dev_promiscuous_enable,
1282         .promiscuous_disable = eth_dev_promiscuous_disable,
1283         .rx_queue_setup = eth_rx_queue_setup,
1284         .tx_queue_setup = eth_tx_queue_setup,
1285         .rx_queue_release = eth_queue_release,
1286         .tx_queue_release = eth_queue_release,
1287         .link_update = eth_link_update,
1288         .stats_get = eth_stats_get,
1289         .stats_reset = eth_stats_reset,
1290 };
1291
1292 /** parse integer from integer argument */
1293 static int
1294 parse_integer_arg(const char *key __rte_unused,
1295                   const char *value, void *extra_args)
1296 {
1297         int *i = (int *)extra_args;
1298         char *end;
1299
1300         *i = strtol(value, &end, 10);
1301         if (*i < 0) {
1302                 AF_XDP_LOG(ERR, "Argument has to be non-negative.\n");
1303                 return -EINVAL;
1304         }
1305
1306         return 0;
1307 }
1308
1309 /** parse name argument */
1310 static int
1311 parse_name_arg(const char *key __rte_unused,
1312                const char *value, void *extra_args)
1313 {
1314         char *name = extra_args;
1315
1316         if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
1317                 AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
1318                            value, IFNAMSIZ);
1319                 return -EINVAL;
1320         }
1321
1322         strlcpy(name, value, IFNAMSIZ);
1323
1324         return 0;
1325 }
1326
1327 /** parse xdp prog argument */
1328 static int
1329 parse_prog_arg(const char *key __rte_unused,
1330                const char *value, void *extra_args)
1331 {
1332         char *path = extra_args;
1333
1334         if (strnlen(value, PATH_MAX) == PATH_MAX) {
1335                 AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
1336                            value, PATH_MAX);
1337                 return -EINVAL;
1338         }
1339
1340         if (access(value, F_OK) != 0) {
1341                 AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
1342                            value, strerror(errno));
1343                 return -EINVAL;
1344         }
1345
1346         strlcpy(path, value, PATH_MAX);
1347
1348         return 0;
1349 }
1350
1351 static int
1352 xdp_get_channels_info(const char *if_name, int *max_queues,
1353                                 int *combined_queues)
1354 {
1355         struct ethtool_channels channels;
1356         struct ifreq ifr;
1357         int fd, ret;
1358
1359         fd = socket(AF_INET, SOCK_DGRAM, 0);
1360         if (fd < 0)
1361                 return -1;
1362
1363         channels.cmd = ETHTOOL_GCHANNELS;
1364         ifr.ifr_data = (void *)&channels;
1365         strncpy(ifr.ifr_name, if_name, IFNAMSIZ);
1366         ret = ioctl(fd, SIOCETHTOOL, &ifr);
1367         if (ret) {
1368                 if (errno == EOPNOTSUPP) {
1369                         ret = 0;
1370                 } else {
1371                         ret = -errno;
1372                         goto out;
1373                 }
1374         }
1375
1376         if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
1377                 /* If the device says it has no channels, then all traffic
1378                  * is sent to a single stream, so max queues = 1.
1379                  */
1380                 *max_queues = 1;
1381                 *combined_queues = 1;
1382         } else {
1383                 *max_queues = channels.max_combined;
1384                 *combined_queues = channels.combined_count;
1385         }
1386
1387  out:
1388         close(fd);
1389         return ret;
1390 }
1391
1392 static int
1393 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
1394                         int *queue_cnt, int *shared_umem, char *prog_path)
1395 {
1396         int ret;
1397
1398         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
1399                                  &parse_name_arg, if_name);
1400         if (ret < 0)
1401                 goto free_kvlist;
1402
1403         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
1404                                  &parse_integer_arg, start_queue);
1405         if (ret < 0)
1406                 goto free_kvlist;
1407
1408         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
1409                                  &parse_integer_arg, queue_cnt);
1410         if (ret < 0 || *queue_cnt <= 0) {
1411                 ret = -EINVAL;
1412                 goto free_kvlist;
1413         }
1414
1415         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
1416                                 &parse_integer_arg, shared_umem);
1417         if (ret < 0)
1418                 goto free_kvlist;
1419
1420         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
1421                                  &parse_prog_arg, prog_path);
1422         if (ret < 0)
1423                 goto free_kvlist;
1424
1425 free_kvlist:
1426         rte_kvargs_free(kvlist);
1427         return ret;
1428 }
1429
1430 static int
1431 get_iface_info(const char *if_name,
1432                struct rte_ether_addr *eth_addr,
1433                int *if_index)
1434 {
1435         struct ifreq ifr;
1436         int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
1437
1438         if (sock < 0)
1439                 return -1;
1440
1441         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1442         if (ioctl(sock, SIOCGIFINDEX, &ifr))
1443                 goto error;
1444
1445         *if_index = ifr.ifr_ifindex;
1446
1447         if (ioctl(sock, SIOCGIFHWADDR, &ifr))
1448                 goto error;
1449
1450         rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1451
1452         close(sock);
1453         return 0;
1454
1455 error:
1456         close(sock);
1457         return -1;
1458 }
1459
1460 static struct rte_eth_dev *
1461 init_internals(struct rte_vdev_device *dev, const char *if_name,
1462                 int start_queue_idx, int queue_cnt, int shared_umem,
1463                 const char *prog_path)
1464 {
1465         const char *name = rte_vdev_device_name(dev);
1466         const unsigned int numa_node = dev->device.numa_node;
1467         struct pmd_internals *internals;
1468         struct rte_eth_dev *eth_dev;
1469         int ret;
1470         int i;
1471
1472         internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
1473         if (internals == NULL)
1474                 return NULL;
1475
1476         internals->start_queue_idx = start_queue_idx;
1477         internals->queue_cnt = queue_cnt;
1478         strlcpy(internals->if_name, if_name, IFNAMSIZ);
1479         strlcpy(internals->prog_path, prog_path, PATH_MAX);
1480         internals->custom_prog_configured = 0;
1481
1482 #ifndef ETH_AF_XDP_SHARED_UMEM
1483         if (shared_umem) {
1484                 AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
1485                                 "Check kernel and libbpf versions\n");
1486                 goto err_free_internals;
1487         }
1488 #endif
1489         internals->shared_umem = shared_umem;
1490
1491         if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
1492                                   &internals->combined_queue_cnt)) {
1493                 AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
1494                                 if_name);
1495                 goto err_free_internals;
1496         }
1497
1498         if (queue_cnt > internals->combined_queue_cnt) {
1499                 AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
1500                                 queue_cnt, internals->combined_queue_cnt);
1501                 goto err_free_internals;
1502         }
1503
1504         internals->rx_queues = rte_zmalloc_socket(NULL,
1505                                         sizeof(struct pkt_rx_queue) * queue_cnt,
1506                                         0, numa_node);
1507         if (internals->rx_queues == NULL) {
1508                 AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
1509                 goto err_free_internals;
1510         }
1511
1512         internals->tx_queues = rte_zmalloc_socket(NULL,
1513                                         sizeof(struct pkt_tx_queue) * queue_cnt,
1514                                         0, numa_node);
1515         if (internals->tx_queues == NULL) {
1516                 AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
1517                 goto err_free_rx;
1518         }
1519         for (i = 0; i < queue_cnt; i++) {
1520                 internals->tx_queues[i].pair = &internals->rx_queues[i];
1521                 internals->rx_queues[i].pair = &internals->tx_queues[i];
1522                 internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
1523                 internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
1524         }
1525
1526         ret = get_iface_info(if_name, &internals->eth_addr,
1527                              &internals->if_index);
1528         if (ret)
1529                 goto err_free_tx;
1530
1531         eth_dev = rte_eth_vdev_allocate(dev, 0);
1532         if (eth_dev == NULL)
1533                 goto err_free_tx;
1534
1535         eth_dev->data->dev_private = internals;
1536         eth_dev->data->dev_link = pmd_link;
1537         eth_dev->data->mac_addrs = &internals->eth_addr;
1538         eth_dev->dev_ops = &ops;
1539         eth_dev->rx_pkt_burst = eth_af_xdp_rx;
1540         eth_dev->tx_pkt_burst = eth_af_xdp_tx;
1541
1542 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1543         AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
1544 #endif
1545
1546         return eth_dev;
1547
1548 err_free_tx:
1549         rte_free(internals->tx_queues);
1550 err_free_rx:
1551         rte_free(internals->rx_queues);
1552 err_free_internals:
1553         rte_free(internals);
1554         return NULL;
1555 }
1556
1557 static int
1558 rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
1559 {
1560         struct rte_kvargs *kvlist;
1561         char if_name[IFNAMSIZ] = {'\0'};
1562         int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
1563         int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
1564         int shared_umem = 0;
1565         char prog_path[PATH_MAX] = {'\0'};
1566         struct rte_eth_dev *eth_dev = NULL;
1567         const char *name;
1568
1569         AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
1570                 rte_vdev_device_name(dev));
1571
1572         name = rte_vdev_device_name(dev);
1573         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1574                 strlen(rte_vdev_device_args(dev)) == 0) {
1575                 eth_dev = rte_eth_dev_attach_secondary(name);
1576                 if (eth_dev == NULL) {
1577                         AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
1578                         return -EINVAL;
1579                 }
1580                 eth_dev->dev_ops = &ops;
1581                 rte_eth_dev_probing_finish(eth_dev);
1582                 return 0;
1583         }
1584
1585         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1586         if (kvlist == NULL) {
1587                 AF_XDP_LOG(ERR, "Invalid kvargs key\n");
1588                 return -EINVAL;
1589         }
1590
1591         if (dev->device.numa_node == SOCKET_ID_ANY)
1592                 dev->device.numa_node = rte_socket_id();
1593
1594         if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
1595                              &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
1596                 AF_XDP_LOG(ERR, "Invalid kvargs value\n");
1597                 return -EINVAL;
1598         }
1599
1600         if (strlen(if_name) == 0) {
1601                 AF_XDP_LOG(ERR, "Network interface must be specified\n");
1602                 return -EINVAL;
1603         }
1604
1605         eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
1606                                         xsk_queue_cnt, shared_umem, prog_path);
1607         if (eth_dev == NULL) {
1608                 AF_XDP_LOG(ERR, "Failed to init internals\n");
1609                 return -1;
1610         }
1611
1612         rte_eth_dev_probing_finish(eth_dev);
1613
1614         return 0;
1615 }
1616
1617 static int
1618 rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
1619 {
1620         struct rte_eth_dev *eth_dev = NULL;
1621
1622         AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
1623                 rte_socket_id());
1624
1625         if (dev == NULL)
1626                 return -1;
1627
1628         /* find the ethdev entry */
1629         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1630         if (eth_dev == NULL)
1631                 return 0;
1632
1633         eth_dev_close(eth_dev);
1634         rte_eth_dev_release_port(eth_dev);
1635
1636
1637         return 0;
1638 }
1639
1640 static struct rte_vdev_driver pmd_af_xdp_drv = {
1641         .probe = rte_pmd_af_xdp_probe,
1642         .remove = rte_pmd_af_xdp_remove,
1643 };
1644
1645 RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
1646 RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
1647                               "iface=<string> "
1648                               "start_queue=<int> "
1649                               "queue_count=<int> "
1650                               "shared_umem=<int> "
1651                               "xdp_prog=<string> ");
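
/*
 * Example invocation (illustrative only; the interface name, core list and
 * object path are placeholders):
 *
 *     dpdk-testpmd -l 0-1 --vdev net_af_xdp0,iface=ens786f1,start_queue=0,\
 *                  queue_count=1,xdp_prog=/path/to/xdp_prog.o -- -i
 *
 * Without the xdp_prog argument libbpf loads its default XDP program; with
 * it, the given object file is loaded instead and must contain an "xsks_map"
 * as required by load_custom_xdp_prog(). shared_umem=1 additionally lets
 * vdevs that use the same mempool share one UMEM.
 */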