[dpdk.git] drivers/net/af_xdp/rte_eth_af_xdp.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2019-2020 Intel Corporation.
3  */
4 #include <unistd.h>
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <poll.h>
9 #include <netinet/in.h>
10 #include <net/if.h>
11 #include <sys/socket.h>
12 #include <sys/ioctl.h>
13 #include <linux/if_ether.h>
14 #include <linux/if_xdp.h>
15 #include <linux/if_link.h>
16 #include <linux/ethtool.h>
17 #include <linux/sockios.h>
18 #include "af_xdp_deps.h"
19 #include <bpf/xsk.h>
20
21 #include <rte_ethdev.h>
22 #include <rte_ethdev_driver.h>
23 #include <rte_ethdev_vdev.h>
24 #include <rte_kvargs.h>
25 #include <rte_bus_vdev.h>
26 #include <rte_string_fns.h>
27 #include <rte_branch_prediction.h>
28 #include <rte_common.h>
29 #include <rte_dev.h>
30 #include <rte_eal.h>
31 #include <rte_ether.h>
32 #include <rte_lcore.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memzone.h>
36 #include <rte_mempool.h>
37 #include <rte_mbuf.h>
38 #include <rte_malloc.h>
39 #include <rte_ring.h>
40 #include <rte_spinlock.h>
41
42 #include "compat.h"
43
44
45 #ifndef SOL_XDP
46 #define SOL_XDP 283
47 #endif
48
49 #ifndef AF_XDP
50 #define AF_XDP 44
51 #endif
52
53 #ifndef PF_XDP
54 #define PF_XDP AF_XDP
55 #endif
56
57 RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);
58
59 #define AF_XDP_LOG(level, fmt, args...)                 \
60         rte_log(RTE_LOG_ ## level, af_xdp_logtype,      \
61                 "%s(): " fmt, __func__, ##args)
62
63 #define ETH_AF_XDP_FRAME_SIZE           2048
64 #define ETH_AF_XDP_NUM_BUFFERS          4096
65 #define ETH_AF_XDP_DFLT_NUM_DESCS       XSK_RING_CONS__DEFAULT_NUM_DESCS
66 #define ETH_AF_XDP_DFLT_START_QUEUE_IDX 0
67 #define ETH_AF_XDP_DFLT_QUEUE_COUNT     1
68
69 #define ETH_AF_XDP_RX_BATCH_SIZE        32
70 #define ETH_AF_XDP_TX_BATCH_SIZE        32
71
72
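/*
 * Per-UMEM bookkeeping. When XDP_UMEM_UNALIGNED_CHUNK_FLAG is available the
 * UMEM overlays an mbuf mempool (mb_pool/buffer); otherwise it is backed by a
 * dedicated memzone (mz) plus a ring of free frame addresses (buf_ring).
 * refcnt counts the sockets using this UMEM (more than one only when
 * shared_umem is enabled), bounded by max_xsks.
 */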
73 struct xsk_umem_info {
74         struct xsk_umem *umem;
75         struct rte_ring *buf_ring;
76         const struct rte_memzone *mz;
77         struct rte_mempool *mb_pool;
78         void *buffer;
79         uint8_t refcnt;
80         uint32_t max_xsks;
81 };
82
83 struct rx_stats {
84         uint64_t rx_pkts;
85         uint64_t rx_bytes;
86         uint64_t rx_dropped;
87 };
88
89 struct pkt_rx_queue {
90         struct xsk_ring_cons rx;
91         struct xsk_umem_info *umem;
92         struct xsk_socket *xsk;
93         struct rte_mempool *mb_pool;
94
95         struct rx_stats stats;
96
97         struct xsk_ring_prod fq;
98         struct xsk_ring_cons cq;
99
100         struct pkt_tx_queue *pair;
101         struct pollfd fds[1];
102         int xsk_queue_idx;
103 };
104
105 struct tx_stats {
106         uint64_t tx_pkts;
107         uint64_t tx_bytes;
108         uint64_t tx_dropped;
109 };
110
111 struct pkt_tx_queue {
112         struct xsk_ring_prod tx;
113         struct xsk_umem_info *umem;
114
115         struct tx_stats stats;
116
117         struct pkt_rx_queue *pair;
118         int xsk_queue_idx;
119 };
120
121 struct pmd_internals {
122         int if_index;
123         char if_name[IFNAMSIZ];
124         int start_queue_idx;
125         int queue_cnt;
126         int max_queue_cnt;
127         int combined_queue_cnt;
128         bool shared_umem;
129         char prog_path[PATH_MAX];
130         bool custom_prog_configured;
131
132         struct rte_ether_addr eth_addr;
133
134         struct pkt_rx_queue *rx_queues;
135         struct pkt_tx_queue *tx_queues;
136 };
137
138 #define ETH_AF_XDP_IFACE_ARG                    "iface"
139 #define ETH_AF_XDP_START_QUEUE_ARG              "start_queue"
140 #define ETH_AF_XDP_QUEUE_COUNT_ARG              "queue_count"
141 #define ETH_AF_XDP_SHARED_UMEM_ARG              "shared_umem"
142 #define ETH_AF_XDP_PROG_ARG                     "xdp_prog"
143
144 static const char * const valid_arguments[] = {
145         ETH_AF_XDP_IFACE_ARG,
146         ETH_AF_XDP_START_QUEUE_ARG,
147         ETH_AF_XDP_QUEUE_COUNT_ARG,
148         ETH_AF_XDP_SHARED_UMEM_ARG,
149         ETH_AF_XDP_PROG_ARG,
150         NULL
151 };
152
153 static const struct rte_eth_link pmd_link = {
154         .link_speed = ETH_SPEED_NUM_10G,
155         .link_duplex = ETH_LINK_FULL_DUPLEX,
156         .link_status = ETH_LINK_DOWN,
157         .link_autoneg = ETH_LINK_AUTONEG
158 };
159
160 /* List which tracks PMDs to facilitate sharing UMEMs across them. */
161 struct internal_list {
162         TAILQ_ENTRY(internal_list) next;
163         struct rte_eth_dev *eth_dev;
164 };
165
166 TAILQ_HEAD(internal_list_head, internal_list);
167 static struct internal_list_head internal_list =
168         TAILQ_HEAD_INITIALIZER(internal_list);
169
170 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
171
172 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
173 static inline int
174 reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
175                       struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
176 {
177         uint32_t idx;
178         uint16_t i;
179
180         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
181                 for (i = 0; i < reserve_size; i++)
182                         rte_pktmbuf_free(bufs[i]);
183                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
184                 return -1;
185         }
186
187         for (i = 0; i < reserve_size; i++) {
188                 __u64 *fq_addr;
189                 uint64_t addr;
190
191                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
192                 addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
193                                 umem->mb_pool->header_size;
194                 *fq_addr = addr;
195         }
196
197         xsk_ring_prod__submit(fq, reserve_size);
198
199         return 0;
200 }
201 #else
202 static inline int
203 reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
204                       struct rte_mbuf **bufs __rte_unused,
205                       struct xsk_ring_prod *fq)
206 {
207         void *addrs[reserve_size];
208         uint32_t idx;
209         uint16_t i;
210
211         if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
212                     != reserve_size) {
213                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
214                 return -1;
215         }
216
217         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
218                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
219                 rte_ring_enqueue_bulk(umem->buf_ring, addrs,
220                                 reserve_size, NULL);
221                 return -1;
222         }
223
224         for (i = 0; i < reserve_size; i++) {
225                 __u64 *fq_addr;
226
227                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
228                 *fq_addr = (uint64_t)addrs[i];
229         }
230
231         xsk_ring_prod__submit(fq, reserve_size);
232
233         return 0;
234 }
235 #endif
236
237 static inline int
238 reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
239                    struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
240 {
241 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
242         return reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
243 #else
244         return reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
245 #endif
246 }
247
248 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
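/*
 * Zero-copy RX: received descriptors point straight into the UMEM, which
 * overlays the mbuf mempool, so each frame is converted back into its mbuf
 * without a copy. Fresh mbufs are allocated up front and used to replenish
 * the fill queue for the frames just consumed.
 */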
249 static uint16_t
250 af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
251 {
252         struct pkt_rx_queue *rxq = queue;
253         struct xsk_ring_cons *rx = &rxq->rx;
254         struct xsk_ring_prod *fq = &rxq->fq;
255         struct xsk_umem_info *umem = rxq->umem;
256         uint32_t idx_rx = 0;
257         unsigned long rx_bytes = 0;
258         int rcvd, i;
259         struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
260
261         /* allocate bufs for fill queue replenishment after rx */
262         if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
263                 AF_XDP_LOG(DEBUG,
264                         "Failed to get enough buffers for fq.\n");
265                 return 0;
266         }
267
268         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
269
270         if (rcvd == 0) {
271 #if defined(XDP_USE_NEED_WAKEUP)
272                 if (xsk_ring_prod__needs_wakeup(fq))
273                         (void)poll(rxq->fds, 1, 1000);
274 #endif
275
276                 goto out;
277         }
278
279         for (i = 0; i < rcvd; i++) {
280                 const struct xdp_desc *desc;
281                 uint64_t addr;
282                 uint32_t len;
283                 uint64_t offset;
284
285                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
286                 addr = desc->addr;
287                 len = desc->len;
288
289                 offset = xsk_umem__extract_offset(addr);
290                 addr = xsk_umem__extract_addr(addr);
291
292                 bufs[i] = (struct rte_mbuf *)
293                                 xsk_umem__get_data(umem->buffer, addr +
294                                         umem->mb_pool->header_size);
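                /*
                 * The descriptor offset is relative to the start of the UMEM
                 * chunk (the mempool object header). Recover the mbuf's
                 * data_off by subtracting everything that sits between the
                 * chunk start and the mbuf data buffer.
                 */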
295                 bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
296                         rte_pktmbuf_priv_size(umem->mb_pool) -
297                         umem->mb_pool->header_size;
298
299                 rte_pktmbuf_pkt_len(bufs[i]) = len;
300                 rte_pktmbuf_data_len(bufs[i]) = len;
301                 rx_bytes += len;
302         }
303
304         xsk_ring_cons__release(rx, rcvd);
305
306         (void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);
307
308         /* statistics */
309         rxq->stats.rx_pkts += rcvd;
310         rxq->stats.rx_bytes += rx_bytes;
311
312 out:
313         if (rcvd != nb_pkts)
314                 rte_mempool_put_bulk(umem->mb_pool, (void **)&fq_bufs[rcvd],
315                                      nb_pkts - rcvd);
316
317         return rcvd;
318 }
319 #else
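/*
 * Copy-mode RX: the kernel writes packets into the dedicated UMEM memzone;
 * each packet is copied into an mbuf from the queue's mempool and the UMEM
 * frame address is returned to buf_ring for reuse.
 */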
320 static uint16_t
321 af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
322 {
323         struct pkt_rx_queue *rxq = queue;
324         struct xsk_ring_cons *rx = &rxq->rx;
325         struct xsk_umem_info *umem = rxq->umem;
326         struct xsk_ring_prod *fq = &rxq->fq;
327         uint32_t idx_rx = 0;
328         unsigned long rx_bytes = 0;
329         int rcvd, i;
330         uint32_t free_thresh = fq->size >> 1;
331         struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
332
333         if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
334                 (void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
335                                          NULL, fq);
336
337         if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
338                 return 0;
339
340         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
341         if (rcvd == 0) {
342 #if defined(XDP_USE_NEED_WAKEUP)
343                 if (xsk_ring_prod__needs_wakeup(fq))
344                         (void)poll(rxq->fds, 1, 1000);
345 #endif
346
347                 goto out;
348         }
349
350         for (i = 0; i < rcvd; i++) {
351                 const struct xdp_desc *desc;
352                 uint64_t addr;
353                 uint32_t len;
354                 void *pkt;
355
356                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
357                 addr = desc->addr;
358                 len = desc->len;
359                 pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);
360
361                 rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
362                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
363                 rte_pktmbuf_pkt_len(mbufs[i]) = len;
364                 rte_pktmbuf_data_len(mbufs[i]) = len;
365                 rx_bytes += len;
366                 bufs[i] = mbufs[i];
367         }
368
369         xsk_ring_cons__release(rx, rcvd);
370
371         /* statistics */
372         rxq->stats.rx_pkts += rcvd;
373         rxq->stats.rx_bytes += rx_bytes;
374
375 out:
376         if (rcvd != nb_pkts)
377                 rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
378                                      nb_pkts - rcvd);
379
380         return rcvd;
381 }
382 #endif
383
384 static uint16_t
385 eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
386 {
387         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
388
389 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
390         return af_xdp_rx_zc(queue, bufs, nb_pkts);
391 #else
392         return af_xdp_rx_cp(queue, bufs, nb_pkts);
393 #endif
394 }
395
396 static void
397 pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
398 {
399         size_t i, n;
400         uint32_t idx_cq = 0;
401
402         n = xsk_ring_cons__peek(cq, size, &idx_cq);
403
404         for (i = 0; i < n; i++) {
405                 uint64_t addr;
406                 addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
407 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
408                 addr = xsk_umem__extract_addr(addr);
409                 rte_pktmbuf_free((struct rte_mbuf *)
410                                         xsk_umem__get_data(umem->buffer,
411                                         addr + umem->mb_pool->header_size));
412 #else
413                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
414 #endif
415         }
416
417         xsk_ring_cons__release(cq, n);
418 }
419
420 static void
421 kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
422 {
423         struct xsk_umem_info *umem = txq->umem;
424
425         pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
426
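        /*
         * With the need_wakeup feature the kernel is only kicked via send()
         * when it has explicitly requested a wakeup; without it, a kick is
         * issued on every call.
         */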
427 #if defined(XDP_USE_NEED_WAKEUP)
428         if (xsk_ring_prod__needs_wakeup(&txq->tx))
429 #endif
430                 while (send(xsk_socket__fd(txq->pair->xsk), NULL,
431                             0, MSG_DONTWAIT) < 0) {
432                         /* something unexpected */
433                         if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
434                                 break;
435
436                         /* pull from completion queue to leave more space */
437                         if (errno == EAGAIN)
438                                 pull_umem_cq(umem,
439                                              XSK_RING_CONS__DEFAULT_NUM_DESCS,
440                                              cq);
441                 }
442 }
443
444 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
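/*
 * Zero-copy TX: mbufs that already live in the UMEM mempool are transmitted
 * in place by encoding their address and data offset into the descriptor;
 * mbufs from other pools are first copied into a local mbuf allocated from
 * the UMEM mempool.
 */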
445 static uint16_t
446 af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
447 {
448         struct pkt_tx_queue *txq = queue;
449         struct xsk_umem_info *umem = txq->umem;
450         struct rte_mbuf *mbuf;
451         unsigned long tx_bytes = 0;
452         int i;
453         uint32_t idx_tx;
454         uint16_t count = 0;
455         struct xdp_desc *desc;
456         uint64_t addr, offset;
457         struct xsk_ring_cons *cq = &txq->pair->cq;
458         uint32_t free_thresh = cq->size >> 1;
459
460         if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
461                 pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
462
463         for (i = 0; i < nb_pkts; i++) {
464                 mbuf = bufs[i];
465
466                 if (mbuf->pool == umem->mb_pool) {
467                         if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
468                                 kick_tx(txq, cq);
469                                 if (!xsk_ring_prod__reserve(&txq->tx, 1,
470                                                             &idx_tx))
471                                         goto out;
472                         }
473                         desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
474                         desc->len = mbuf->pkt_len;
475                         addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
476                                         umem->mb_pool->header_size;
477                         offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
478                                         (uint64_t)mbuf +
479                                         umem->mb_pool->header_size;
480                         offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
481                         desc->addr = addr | offset;
482                         count++;
483                 } else {
484                         struct rte_mbuf *local_mbuf =
485                                         rte_pktmbuf_alloc(umem->mb_pool);
486                         void *pkt;
487
488                         if (local_mbuf == NULL)
489                                 goto out;
490
491                         if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
492                                 rte_pktmbuf_free(local_mbuf);
493                                 kick_tx(txq, cq);
494                                 goto out;
495                         }
496
497                         desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
498                         desc->len = mbuf->pkt_len;
499
500                         addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
501                                         umem->mb_pool->header_size;
502                         offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
503                                         (uint64_t)local_mbuf +
504                                         umem->mb_pool->header_size;
505                         pkt = xsk_umem__get_data(umem->buffer, addr + offset);
506                         offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
507                         desc->addr = addr | offset;
508                         rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
509                                         desc->len);
510                         rte_pktmbuf_free(mbuf);
511                         count++;
512                 }
513
514                 tx_bytes += mbuf->pkt_len;
515         }
516
517         kick_tx(txq, cq);
518
519 out:
520         xsk_ring_prod__submit(&txq->tx, count);
521
522         txq->stats.tx_pkts += count;
523         txq->stats.tx_bytes += tx_bytes;
524         txq->stats.tx_dropped += nb_pkts - count;
525
526         return count;
527 }
528 #else
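/*
 * Copy-mode TX: free UMEM frames are dequeued from buf_ring, the packet data
 * is copied into them, and the frame addresses are posted to the TX ring.
 */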
529 static uint16_t
530 af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
531 {
532         struct pkt_tx_queue *txq = queue;
533         struct xsk_umem_info *umem = txq->umem;
534         struct rte_mbuf *mbuf;
535         void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
536         unsigned long tx_bytes = 0;
537         int i;
538         uint32_t idx_tx;
539         struct xsk_ring_cons *cq = &txq->pair->cq;
540
541         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
542
543         pull_umem_cq(umem, nb_pkts, cq);
544
545         nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
546                                         nb_pkts, NULL);
547         if (nb_pkts == 0)
548                 return 0;
549
550         if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
551                 kick_tx(txq, cq);
552                 rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
553                 return 0;
554         }
555
556         for (i = 0; i < nb_pkts; i++) {
557                 struct xdp_desc *desc;
558                 void *pkt;
559
560                 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
561                 mbuf = bufs[i];
562                 desc->len = mbuf->pkt_len;
563
564                 desc->addr = (uint64_t)addrs[i];
565                 pkt = xsk_umem__get_data(umem->mz->addr,
566                                          desc->addr);
567                 rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
568                 tx_bytes += mbuf->pkt_len;
569                 rte_pktmbuf_free(mbuf);
570         }
571
572         xsk_ring_prod__submit(&txq->tx, nb_pkts);
573
574         kick_tx(txq, cq);
575
576         txq->stats.tx_pkts += nb_pkts;
577         txq->stats.tx_bytes += tx_bytes;
578
579         return nb_pkts;
580 }
581 #endif
582
583 static uint16_t
584 eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
585 {
586 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
587         return af_xdp_tx_zc(queue, bufs, nb_pkts);
588 #else
589         return af_xdp_tx_cp(queue, bufs, nb_pkts);
590 #endif
591 }
592
593 static int
594 eth_dev_start(struct rte_eth_dev *dev)
595 {
596         dev->data->dev_link.link_status = ETH_LINK_UP;
597
598         return 0;
599 }
600
601 /* This function gets called when the current port gets stopped. */
602 static int
603 eth_dev_stop(struct rte_eth_dev *dev)
604 {
605         dev->data->dev_link.link_status = ETH_LINK_DOWN;
606         return 0;
607 }
608
609 /* Find ethdev in list */
610 static inline struct internal_list *
611 find_internal_resource(struct pmd_internals *port_int)
612 {
613         int found = 0;
614         struct internal_list *list = NULL;
615
616         if (port_int == NULL)
617                 return NULL;
618
619         pthread_mutex_lock(&internal_list_lock);
620
621         TAILQ_FOREACH(list, &internal_list, next) {
622                 struct pmd_internals *list_int =
623                                 list->eth_dev->data->dev_private;
624                 if (list_int == port_int) {
625                         found = 1;
626                         break;
627                 }
628         }
629
630         pthread_mutex_unlock(&internal_list_lock);
631
632         if (!found)
633                 return NULL;
634
635         return list;
636 }
637
638 /* Check if the netdev,qid context already exists */
639 static inline bool
640 ctx_exists(struct pkt_rx_queue *rxq, const char *ifname,
641                 struct pkt_rx_queue *list_rxq, const char *list_ifname)
642 {
643         bool exists = false;
644
645         if (rxq->xsk_queue_idx == list_rxq->xsk_queue_idx &&
646                         !strncmp(ifname, list_ifname, IFNAMSIZ)) {
647                 AF_XDP_LOG(ERR, "ctx %s,%i already exists, cannot share umem\n",
648                                         ifname, rxq->xsk_queue_idx);
649                 exists = true;
650         }
651
652         return exists;
653 }
654
655 /* Get a pointer to an existing UMEM which overlays the rxq's mb_pool */
656 static inline int
657 get_shared_umem(struct pkt_rx_queue *rxq, const char *ifname,
658                         struct xsk_umem_info **umem)
659 {
660         struct internal_list *list;
661         struct pmd_internals *internals;
662         int i = 0, ret = 0;
663         struct rte_mempool *mb_pool = rxq->mb_pool;
664
665         if (mb_pool == NULL)
666                 return ret;
667
668         pthread_mutex_lock(&internal_list_lock);
669
670         TAILQ_FOREACH(list, &internal_list, next) {
671                 internals = list->eth_dev->data->dev_private;
672                 for (i = 0; i < internals->queue_cnt; i++) {
673                         struct pkt_rx_queue *list_rxq =
674                                                 &internals->rx_queues[i];
675                         if (rxq == list_rxq)
676                                 continue;
677                         if (mb_pool == internals->rx_queues[i].mb_pool) {
678                                 if (ctx_exists(rxq, ifname, list_rxq,
679                                                 internals->if_name)) {
680                                         ret = -1;
681                                         goto out;
682                                 }
683                                 if (__atomic_load_n(
684                                         &internals->rx_queues[i].umem->refcnt,
685                                                         __ATOMIC_ACQUIRE)) {
686                                         *umem = internals->rx_queues[i].umem;
687                                         goto out;
688                                 }
689                         }
690                 }
691         }
692
693 out:
694         pthread_mutex_unlock(&internal_list_lock);
695
696         return ret;
697 }
698
699 static int
700 eth_dev_configure(struct rte_eth_dev *dev)
701 {
702         struct pmd_internals *internal = dev->data->dev_private;
703
704         /* rx/tx must be paired */
705         if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
706                 return -EINVAL;
707
708         if (internal->shared_umem) {
709                 struct internal_list *list = NULL;
710                 const char *name = dev->device->name;
711
712                 /* Ensure PMD is not already inserted into the list */
713                 list = find_internal_resource(internal);
714                 if (list)
715                         return 0;
716
717                 list = rte_zmalloc_socket(name, sizeof(*list), 0,
718                                         dev->device->numa_node);
719                 if (list == NULL)
720                         return -1;
721
722                 list->eth_dev = dev;
723                 pthread_mutex_lock(&internal_list_lock);
724                 TAILQ_INSERT_TAIL(&internal_list, list, next);
725                 pthread_mutex_unlock(&internal_list_lock);
726         }
727
728         return 0;
729 }
730
731 static int
732 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
733 {
734         struct pmd_internals *internals = dev->data->dev_private;
735
736         dev_info->if_index = internals->if_index;
737         dev_info->max_mac_addrs = 1;
738         dev_info->max_rx_pktlen = ETH_FRAME_LEN;
739         dev_info->max_rx_queues = internals->queue_cnt;
740         dev_info->max_tx_queues = internals->queue_cnt;
741
742         dev_info->min_mtu = RTE_ETHER_MIN_MTU;
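        /*
         * In zero-copy mode each UMEM frame is one mempool object and must
         * fit within a page, so the MTU is bounded by the page size minus the
         * per-object and per-packet overheads; in copy mode it is bounded by
         * the fixed UMEM frame size.
         */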
743 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
744         dev_info->max_mtu = getpagesize() -
745                                 sizeof(struct rte_mempool_objhdr) -
746                                 sizeof(struct rte_mbuf) -
747                                 RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
748 #else
749         dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
750 #endif
751
752         dev_info->default_rxportconf.nb_queues = 1;
753         dev_info->default_txportconf.nb_queues = 1;
754         dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
755         dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
756
757         return 0;
758 }
759
760 static int
761 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
762 {
763         struct pmd_internals *internals = dev->data->dev_private;
764         struct xdp_statistics xdp_stats;
765         struct pkt_rx_queue *rxq;
766         struct pkt_tx_queue *txq;
767         socklen_t optlen;
768         int i, ret;
769
770         for (i = 0; i < dev->data->nb_rx_queues; i++) {
771                 optlen = sizeof(struct xdp_statistics);
772                 rxq = &internals->rx_queues[i];
773                 txq = rxq->pair;
774                 stats->q_ipackets[i] = rxq->stats.rx_pkts;
775                 stats->q_ibytes[i] = rxq->stats.rx_bytes;
776
777                 stats->q_opackets[i] = txq->stats.tx_pkts;
778                 stats->q_obytes[i] = txq->stats.tx_bytes;
779
780                 stats->ipackets += stats->q_ipackets[i];
781                 stats->ibytes += stats->q_ibytes[i];
782                 stats->imissed += rxq->stats.rx_dropped;
783                 stats->oerrors += txq->stats.tx_dropped;
784                 ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
785                                 XDP_STATISTICS, &xdp_stats, &optlen);
786                 if (ret != 0) {
787                         AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
788                         return -1;
789                 }
790                 stats->imissed += xdp_stats.rx_dropped;
791
792                 stats->opackets += stats->q_opackets[i];
793                 stats->obytes += stats->q_obytes[i];
794         }
795
796         return 0;
797 }
798
799 static int
800 eth_stats_reset(struct rte_eth_dev *dev)
801 {
802         struct pmd_internals *internals = dev->data->dev_private;
803         int i;
804
805         for (i = 0; i < internals->queue_cnt; i++) {
806                 memset(&internals->rx_queues[i].stats, 0,
807                                         sizeof(struct rx_stats));
808                 memset(&internals->tx_queues[i].stats, 0,
809                                         sizeof(struct tx_stats));
810         }
811
812         return 0;
813 }
814
815 static void
816 remove_xdp_program(struct pmd_internals *internals)
817 {
818         uint32_t curr_prog_id = 0;
819
820         if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
821                                 XDP_FLAGS_UPDATE_IF_NOEXIST)) {
822                 AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
823                 return;
824         }
825         bpf_set_link_xdp_fd(internals->if_index, -1,
826                         XDP_FLAGS_UPDATE_IF_NOEXIST);
827 }
828
829 static void
830 xdp_umem_destroy(struct xsk_umem_info *umem)
831 {
832 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
833         umem->mb_pool = NULL;
834 #else
835         rte_memzone_free(umem->mz);
836         umem->mz = NULL;
837
838         rte_ring_free(umem->buf_ring);
839         umem->buf_ring = NULL;
840 #endif
841
842         rte_free(umem);
843         umem = NULL;
844 }
845
846 static int
847 eth_dev_close(struct rte_eth_dev *dev)
848 {
849         struct pmd_internals *internals = dev->data->dev_private;
850         struct pkt_rx_queue *rxq;
851         int i;
852
853         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
854                 return 0;
855
856         AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
857                 rte_socket_id());
858
859         for (i = 0; i < internals->queue_cnt; i++) {
860                 rxq = &internals->rx_queues[i];
861                 if (rxq->umem == NULL)
862                         break;
863                 xsk_socket__delete(rxq->xsk);
864
865                 if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
866                                 == 0) {
867                         (void)xsk_umem__delete(rxq->umem->umem);
868                         xdp_umem_destroy(rxq->umem);
869                 }
870
871                 /* free pkt_tx_queue */
872                 rte_free(rxq->pair);
873                 rte_free(rxq);
874         }
875
876         /*
877          * The MAC address is not allocated dynamically; set it to NULL so
878          * that rte_eth_dev_release_port() does not attempt to free it.
879          */
880         dev->data->mac_addrs = NULL;
881
882         remove_xdp_program(internals);
883
884         if (internals->shared_umem) {
885                 struct internal_list *list;
886
887                 /* Remove ethdev from list used to track and share UMEMs */
888                 list = find_internal_resource(internals);
889                 if (list) {
890                         pthread_mutex_lock(&internal_list_lock);
891                         TAILQ_REMOVE(&internal_list, list, next);
892                         pthread_mutex_unlock(&internal_list_lock);
893                         rte_free(list);
894                 }
895         }
896
897         return 0;
898 }
899
900 static void
901 eth_queue_release(void *q __rte_unused)
902 {
903 }
904
905 static int
906 eth_link_update(struct rte_eth_dev *dev __rte_unused,
907                 int wait_to_complete __rte_unused)
908 {
909         return 0;
910 }
911
912 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
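/*
 * Return the mempool's base virtual address aligned down to a page boundary
 * (the UMEM area must be page-aligned); *align receives the number of bytes
 * dropped by the alignment so the caller can enlarge the UMEM size to cover
 * the whole mempool area.
 */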
913 static inline uint64_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
914 {
915         struct rte_mempool_memhdr *memhdr;
916         uint64_t memhdr_addr, aligned_addr;
917
918         memhdr = STAILQ_FIRST(&mp->mem_list);
919         memhdr_addr = (uint64_t)memhdr->addr;
920         aligned_addr = memhdr_addr & ~(getpagesize() - 1);
921         *align = memhdr_addr - aligned_addr;
922
923         return aligned_addr;
924 }
925
926 static struct
927 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
928                                   struct pkt_rx_queue *rxq)
929 {
930         struct xsk_umem_info *umem = NULL;
931         int ret;
932         struct xsk_umem_config usr_config = {
933                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
934                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
935                 .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
936         void *base_addr = NULL;
937         struct rte_mempool *mb_pool = rxq->mb_pool;
938         uint64_t umem_size, align = 0;
939
940         if (internals->shared_umem) {
941                 if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
942                         return NULL;
943
944                 if (umem != NULL &&
945                         __atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
946                                         umem->max_xsks) {
947                         AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
948                                         internals->if_name, rxq->xsk_queue_idx);
949                         __atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
950                 }
951         }
952
953         if (umem == NULL) {
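                /*
                 * Derive the UMEM layout from the mempool: each mempool
                 * object becomes one UMEM frame, and the frame headroom skips
                 * the object header, the rte_mbuf, its private area and the
                 * mbuf headroom so that the kernel places packet data in the
                 * mbuf's data room.
                 */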
954                 usr_config.frame_size =
955                         rte_mempool_calc_obj_size(mb_pool->elt_size,
956                                                   mb_pool->flags, NULL);
957                 usr_config.frame_headroom = mb_pool->header_size +
958                                                 sizeof(struct rte_mbuf) +
959                                                 rte_pktmbuf_priv_size(mb_pool) +
960                                                 RTE_PKTMBUF_HEADROOM;
961
962                 umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
963                                           rte_socket_id());
964                 if (umem == NULL) {
965                         AF_XDP_LOG(ERR, "Failed to allocate umem info");
966                         return NULL;
967                 }
968
969                 umem->mb_pool = mb_pool;
970                 base_addr = (void *)get_base_addr(mb_pool, &align);
971                 umem_size = mb_pool->populated_size * usr_config.frame_size +
972                                 align;
973
974                 ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
975                                 &rxq->fq, &rxq->cq, &usr_config);
976                 if (ret) {
977                         AF_XDP_LOG(ERR, "Failed to create umem");
978                         goto err;
979                 }
980                 umem->buffer = base_addr;
981
982                 if (internals->shared_umem) {
983                         umem->max_xsks = mb_pool->populated_size /
984                                                 ETH_AF_XDP_NUM_BUFFERS;
985                         AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
986                                                 mb_pool->name, umem->max_xsks);
987                 }
988
989                 __atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
990         }
991
992 #else
993 static struct
994 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
995                                   struct pkt_rx_queue *rxq)
996 {
997         struct xsk_umem_info *umem;
998         const struct rte_memzone *mz;
999         struct xsk_umem_config usr_config = {
1000                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1001                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1002                 .frame_size = ETH_AF_XDP_FRAME_SIZE,
1003                 .frame_headroom = 0 };
1004         char ring_name[RTE_RING_NAMESIZE];
1005         char mz_name[RTE_MEMZONE_NAMESIZE];
1006         int ret;
1007         uint64_t i;
1008
1009         umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
1010         if (umem == NULL) {
1011                 AF_XDP_LOG(ERR, "Failed to allocate umem info");
1012                 return NULL;
1013         }
1014
1015         snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
1016                        internals->if_name, rxq->xsk_queue_idx);
1017         umem->buf_ring = rte_ring_create(ring_name,
1018                                          ETH_AF_XDP_NUM_BUFFERS,
1019                                          rte_socket_id(),
1020                                          0x0);
1021         if (umem->buf_ring == NULL) {
1022                 AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
1023                 goto err;
1024         }
1025
1026         for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
1027                 rte_ring_enqueue(umem->buf_ring,
1028                                  (void *)(i * ETH_AF_XDP_FRAME_SIZE));
1029
1030         snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
1031                        internals->if_name, rxq->xsk_queue_idx);
1032         mz = rte_memzone_reserve_aligned(mz_name,
1033                         ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1034                         rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
1035                         getpagesize());
1036         if (mz == NULL) {
1037                 AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
1038                 goto err;
1039         }
1040
1041         ret = xsk_umem__create(&umem->umem, mz->addr,
1042                                ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1043                                &rxq->fq, &rxq->cq,
1044                                &usr_config);
1045
1046         if (ret) {
1047                 AF_XDP_LOG(ERR, "Failed to create umem");
1048                 goto err;
1049         }
1050         umem->mz = mz;
1051
1052 #endif
1053         return umem;
1054
1055 err:
1056         xdp_umem_destroy(umem);
1057         return NULL;
1058 }
1059
1060 static int
1061 load_custom_xdp_prog(const char *prog_path, int if_index)
1062 {
1063         int ret, prog_fd = -1;
1064         struct bpf_object *obj;
1065         struct bpf_map *map;
1066
1067         ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
1068         if (ret) {
1069                 AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
1070                 return ret;
1071         }
1072
1073         /*
1074          * The loaded program must provision for a map of xsks, such that some
1075          * traffic can be redirected to userspace. When the xsk is created,
1076          * libbpf inserts it into the map.
1077          */
1078         map = bpf_object__find_map_by_name(obj, "xsks_map");
1079         if (!map) {
1080                 AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
1081                 return -1;
1082         }
1083
1084         /* Link the program with the given network device */
1085         ret = bpf_set_link_xdp_fd(if_index, prog_fd,
1086                                         XDP_FLAGS_UPDATE_IF_NOEXIST);
1087         if (ret) {
1088                 AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
1089                                 prog_fd);
1090                 return -1;
1091         }
1092
1093         AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
1094                                 prog_path, prog_fd);
1095
1096         return 0;
1097 }
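
/*
 * Illustrative sketch only (not part of this driver): a custom program
 * supplied via the "xdp_prog" devarg must define a BPF_MAP_TYPE_XSKMAP named
 * "xsks_map" so that libbpf can insert the created xsk and the program can
 * redirect traffic to it. The map size, program name and fallback behaviour
 * below are assumptions, not requirements of this PMD.
 *
 *   #include <linux/bpf.h>
 *   #include <bpf/bpf_helpers.h>
 *
 *   struct {
 *           __uint(type, BPF_MAP_TYPE_XSKMAP);
 *           __uint(key_size, sizeof(int));
 *           __uint(value_size, sizeof(int));
 *           __uint(max_entries, 64);
 *   } xsks_map SEC(".maps");
 *
 *   SEC("xdp")
 *   int xdp_sock_prog(struct xdp_md *ctx)
 *   {
 *           int index = ctx->rx_queue_index;
 *
 *           // Redirect to the AF_XDP socket bound to this queue, if any
 *           if (bpf_map_lookup_elem(&xsks_map, &index))
 *                   return bpf_redirect_map(&xsks_map, index, 0);
 *
 *           return XDP_PASS;
 *   }
 */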
1098
1099 static int
1100 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
1101               int ring_size)
1102 {
1103         struct xsk_socket_config cfg;
1104         struct pkt_tx_queue *txq = rxq->pair;
1105         int ret = 0;
1106         int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
1107         struct rte_mbuf *fq_bufs[reserve_size];
1108
1109         rxq->umem = xdp_umem_configure(internals, rxq);
1110         if (rxq->umem == NULL)
1111                 return -ENOMEM;
1112         txq->umem = rxq->umem;
1113
1114         cfg.rx_size = ring_size;
1115         cfg.tx_size = ring_size;
1116         cfg.libbpf_flags = 0;
1117         cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
1118         cfg.bind_flags = 0;
1119
1120 #if defined(XDP_USE_NEED_WAKEUP)
1121         cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
1122 #endif
1123
1124         if (strnlen(internals->prog_path, PATH_MAX) &&
1125                                 !internals->custom_prog_configured) {
1126                 ret = load_custom_xdp_prog(internals->prog_path,
1127                                            internals->if_index);
1128                 if (ret) {
1129                         AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
1130                                         internals->prog_path);
1131                         goto err;
1132                 }
1133                 internals->custom_prog_configured = 1;
1134         }
1135
1136         if (internals->shared_umem)
1137                 ret = create_shared_socket(&rxq->xsk, internals->if_name,
1138                                 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1139                                 &txq->tx, &rxq->fq, &rxq->cq, &cfg);
1140         else
1141                 ret = xsk_socket__create(&rxq->xsk, internals->if_name,
1142                                 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1143                                 &txq->tx, &cfg);
1144
1145         if (ret) {
1146                 AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
1147                 goto err;
1148         }
1149
1150 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1151         if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
1152                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
1153                 goto err;
1154         }
1155 #endif
1156         ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
1157         if (ret) {
1158                 xsk_socket__delete(rxq->xsk);
1159                 AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
1160                 goto err;
1161         }
1162
1163         return 0;
1164
1165 err:
1166         if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
1167                 xdp_umem_destroy(rxq->umem);
1168
1169         return ret;
1170 }
1171
1172 static int
1173 eth_rx_queue_setup(struct rte_eth_dev *dev,
1174                    uint16_t rx_queue_id,
1175                    uint16_t nb_rx_desc,
1176                    unsigned int socket_id __rte_unused,
1177                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1178                    struct rte_mempool *mb_pool)
1179 {
1180         struct pmd_internals *internals = dev->data->dev_private;
1181         struct pkt_rx_queue *rxq;
1182         int ret;
1183
1184         rxq = &internals->rx_queues[rx_queue_id];
1185
1186         AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
1187                    rx_queue_id, rxq->xsk_queue_idx);
1188
1189 #ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
1190         uint32_t buf_size, data_size;
1191
1192         /* Now get the space available for data in the mbuf */
1193         buf_size = rte_pktmbuf_data_room_size(mb_pool) -
1194                 RTE_PKTMBUF_HEADROOM;
1195         data_size = ETH_AF_XDP_FRAME_SIZE;
1196
1197         if (data_size > buf_size) {
1198                 AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
1199                         dev->device->name, data_size, buf_size);
1200                 ret = -ENOMEM;
1201                 goto err;
1202         }
1203 #endif
1204
1205         rxq->mb_pool = mb_pool;
1206
1207         if (xsk_configure(internals, rxq, nb_rx_desc)) {
1208                 AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
1209                 ret = -EINVAL;
1210                 goto err;
1211         }
1212
1213         rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
1214         rxq->fds[0].events = POLLIN;
1215
1216         dev->data->rx_queues[rx_queue_id] = rxq;
1217         return 0;
1218
1219 err:
1220         return ret;
1221 }
1222
1223 static int
1224 eth_tx_queue_setup(struct rte_eth_dev *dev,
1225                    uint16_t tx_queue_id,
1226                    uint16_t nb_tx_desc __rte_unused,
1227                    unsigned int socket_id __rte_unused,
1228                    const struct rte_eth_txconf *tx_conf __rte_unused)
1229 {
1230         struct pmd_internals *internals = dev->data->dev_private;
1231         struct pkt_tx_queue *txq;
1232
1233         txq = &internals->tx_queues[tx_queue_id];
1234
1235         dev->data->tx_queues[tx_queue_id] = txq;
1236         return 0;
1237 }
1238
1239 static int
1240 eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1241 {
1242         struct pmd_internals *internals = dev->data->dev_private;
1243         struct ifreq ifr = { .ifr_mtu = mtu };
1244         int ret;
1245         int s;
1246
1247         s = socket(PF_INET, SOCK_DGRAM, 0);
1248         if (s < 0)
1249                 return -EINVAL;
1250
1251         strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
1252         ret = ioctl(s, SIOCSIFMTU, &ifr);
1253         close(s);
1254
1255         return (ret < 0) ? -errno : 0;
1256 }
1257
1258 static int
1259 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
1260 {
1261         struct ifreq ifr;
1262         int ret = 0;
1263         int s;
1264
1265         s = socket(PF_INET, SOCK_DGRAM, 0);
1266         if (s < 0)
1267                 return -errno;
1268
1269         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1270         if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
1271                 ret = -errno;
1272                 goto out;
1273         }
1274         ifr.ifr_flags &= mask;
1275         ifr.ifr_flags |= flags;
1276         if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
1277                 ret = -errno;
1278                 goto out;
1279         }
1280 out:
1281         close(s);
1282         return ret;
1283 }
1284
1285 static int
1286 eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
1287 {
1288         struct pmd_internals *internals = dev->data->dev_private;
1289
1290         return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
1291 }
1292
1293 static int
1294 eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
1295 {
1296         struct pmd_internals *internals = dev->data->dev_private;
1297
1298         return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
1299 }
1300
1301 static const struct eth_dev_ops ops = {
1302         .dev_start = eth_dev_start,
1303         .dev_stop = eth_dev_stop,
1304         .dev_close = eth_dev_close,
1305         .dev_configure = eth_dev_configure,
1306         .dev_infos_get = eth_dev_info,
1307         .mtu_set = eth_dev_mtu_set,
1308         .promiscuous_enable = eth_dev_promiscuous_enable,
1309         .promiscuous_disable = eth_dev_promiscuous_disable,
1310         .rx_queue_setup = eth_rx_queue_setup,
1311         .tx_queue_setup = eth_tx_queue_setup,
1312         .rx_queue_release = eth_queue_release,
1313         .tx_queue_release = eth_queue_release,
1314         .link_update = eth_link_update,
1315         .stats_get = eth_stats_get,
1316         .stats_reset = eth_stats_reset,
1317 };
1318
1319 /** parse integer from integer argument */
1320 static int
1321 parse_integer_arg(const char *key __rte_unused,
1322                   const char *value, void *extra_args)
1323 {
1324         int *i = (int *)extra_args;
1325         char *end;
1326
1327         *i = strtol(value, &end, 10);
1328         if (*i < 0) {
1329                 AF_XDP_LOG(ERR, "Argument has to be non-negative.\n");
1330                 return -EINVAL;
1331         }
1332
1333         return 0;
1334 }
1335
1336 /** parse name argument */
1337 static int
1338 parse_name_arg(const char *key __rte_unused,
1339                const char *value, void *extra_args)
1340 {
1341         char *name = extra_args;
1342
1343         if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
1344                 AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
1345                            value, IFNAMSIZ);
1346                 return -EINVAL;
1347         }
1348
1349         strlcpy(name, value, IFNAMSIZ);
1350
1351         return 0;
1352 }
1353
1354 /** parse xdp prog argument */
1355 static int
1356 parse_prog_arg(const char *key __rte_unused,
1357                const char *value, void *extra_args)
1358 {
1359         char *path = extra_args;
1360
1361         if (strnlen(value, PATH_MAX) == PATH_MAX) {
1362                 AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
1363                            value, PATH_MAX);
1364                 return -EINVAL;
1365         }
1366
1367         if (access(value, F_OK) != 0) {
1368                 AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
1369                            value, strerror(errno));
1370                 return -EINVAL;
1371         }
1372
1373         strlcpy(path, value, PATH_MAX);
1374
1375         return 0;
1376 }
1377
1378 static int
1379 xdp_get_channels_info(const char *if_name, int *max_queues,
1380                                 int *combined_queues)
1381 {
1382         struct ethtool_channels channels;
1383         struct ifreq ifr;
1384         int fd, ret;
1385
1386         fd = socket(AF_INET, SOCK_DGRAM, 0);
1387         if (fd < 0)
1388                 return -1;
1389
1390         channels.cmd = ETHTOOL_GCHANNELS;
1391         ifr.ifr_data = (void *)&channels;
1392         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1393         ret = ioctl(fd, SIOCETHTOOL, &ifr);
1394         if (ret) {
1395                 if (errno == EOPNOTSUPP) {
1396                         ret = 0;
1397                 } else {
1398                         ret = -errno;
1399                         goto out;
1400                 }
1401         }
1402
1403         if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
1404                 /* If the device says it has no channels, then all traffic
1405                  * is sent to a single stream, so max queues = 1.
1406                  */
1407                 *max_queues = 1;
1408                 *combined_queues = 1;
1409         } else {
1410                 *max_queues = channels.max_combined;
1411                 *combined_queues = channels.combined_count;
1412         }
1413
1414  out:
1415         close(fd);
1416         return ret;
1417 }
1418
1419 static int
1420 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
1421                         int *queue_cnt, int *shared_umem, char *prog_path)
1422 {
1423         int ret;
1424
1425         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
1426                                  &parse_name_arg, if_name);
1427         if (ret < 0)
1428                 goto free_kvlist;
1429
1430         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
1431                                  &parse_integer_arg, start_queue);
1432         if (ret < 0)
1433                 goto free_kvlist;
1434
1435         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
1436                                  &parse_integer_arg, queue_cnt);
1437         if (ret < 0 || *queue_cnt <= 0) {
1438                 ret = -EINVAL;
1439                 goto free_kvlist;
1440         }
1441
1442         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
1443                                 &parse_integer_arg, shared_umem);
1444         if (ret < 0)
1445                 goto free_kvlist;
1446
1447         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
1448                                  &parse_prog_arg, prog_path);
1449         if (ret < 0)
1450                 goto free_kvlist;
1451
1452 free_kvlist:
1453         rte_kvargs_free(kvlist);
1454         return ret;
1455 }
1456
1457 static int
1458 get_iface_info(const char *if_name,
1459                struct rte_ether_addr *eth_addr,
1460                int *if_index)
1461 {
1462         struct ifreq ifr;
1463         int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
1464
1465         if (sock < 0)
1466                 return -1;
1467
1468         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1469         if (ioctl(sock, SIOCGIFINDEX, &ifr))
1470                 goto error;
1471
1472         *if_index = ifr.ifr_ifindex;
1473
1474         if (ioctl(sock, SIOCGIFHWADDR, &ifr))
1475                 goto error;
1476
1477         rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1478
1479         close(sock);
1480         return 0;
1481
1482 error:
1483         close(sock);
1484         return -1;
1485 }
1486
1487 static struct rte_eth_dev *
1488 init_internals(struct rte_vdev_device *dev, const char *if_name,
1489                 int start_queue_idx, int queue_cnt, int shared_umem,
1490                 const char *prog_path)
1491 {
1492         const char *name = rte_vdev_device_name(dev);
1493         const unsigned int numa_node = dev->device.numa_node;
1494         struct pmd_internals *internals;
1495         struct rte_eth_dev *eth_dev;
1496         int ret;
1497         int i;
1498
1499         internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
1500         if (internals == NULL)
1501                 return NULL;
1502
1503         internals->start_queue_idx = start_queue_idx;
1504         internals->queue_cnt = queue_cnt;
1505         strlcpy(internals->if_name, if_name, IFNAMSIZ);
1506         strlcpy(internals->prog_path, prog_path, PATH_MAX);
1507         internals->custom_prog_configured = 0;
1508
1509 #ifndef ETH_AF_XDP_SHARED_UMEM
1510         if (shared_umem) {
1511                 AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
1512                                 "Check kernel and libbpf version\n");
1513                 goto err_free_internals;
1514         }
1515 #endif
1516         internals->shared_umem = shared_umem;
1517
1518         if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
1519                                   &internals->combined_queue_cnt)) {
1520                 AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
1521                                 if_name);
1522                 goto err_free_internals;
1523         }
1524
1525         if (queue_cnt > internals->combined_queue_cnt) {
1526                 AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
1527                                 queue_cnt, internals->combined_queue_cnt);
1528                 goto err_free_internals;
1529         }
1530
1531         internals->rx_queues = rte_zmalloc_socket(NULL,
1532                                         sizeof(struct pkt_rx_queue) * queue_cnt,
1533                                         0, numa_node);
1534         if (internals->rx_queues == NULL) {
1535                 AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
1536                 goto err_free_internals;
1537         }
1538
1539         internals->tx_queues = rte_zmalloc_socket(NULL,
1540                                         sizeof(struct pkt_tx_queue) * queue_cnt,
1541                                         0, numa_node);
1542         if (internals->tx_queues == NULL) {
1543                 AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
1544                 goto err_free_rx;
1545         }
1546         for (i = 0; i < queue_cnt; i++) {
1547                 internals->tx_queues[i].pair = &internals->rx_queues[i];
1548                 internals->rx_queues[i].pair = &internals->tx_queues[i];
1549                 internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
1550                 internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
1551         }
1552
1553         ret = get_iface_info(if_name, &internals->eth_addr,
1554                              &internals->if_index);
1555         if (ret)
1556                 goto err_free_tx;
1557
1558         eth_dev = rte_eth_vdev_allocate(dev, 0);
1559         if (eth_dev == NULL)
1560                 goto err_free_tx;
1561
1562         eth_dev->data->dev_private = internals;
1563         eth_dev->data->dev_link = pmd_link;
1564         eth_dev->data->mac_addrs = &internals->eth_addr;
1565         eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1566         eth_dev->dev_ops = &ops;
1567         eth_dev->rx_pkt_burst = eth_af_xdp_rx;
1568         eth_dev->tx_pkt_burst = eth_af_xdp_tx;
1569
1570 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1571         AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
1572 #endif
1573
1574         return eth_dev;
1575
1576 err_free_tx:
1577         rte_free(internals->tx_queues);
1578 err_free_rx:
1579         rte_free(internals->rx_queues);
1580 err_free_internals:
1581         rte_free(internals);
1582         return NULL;
1583 }
1584
1585 static int
1586 rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
1587 {
1588         struct rte_kvargs *kvlist;
1589         char if_name[IFNAMSIZ] = {'\0'};
1590         int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
1591         int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
1592         int shared_umem = 0;
1593         char prog_path[PATH_MAX] = {'\0'};
1594         struct rte_eth_dev *eth_dev = NULL;
1595         const char *name;
1596
1597         AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
1598                 rte_vdev_device_name(dev));
1599
1600         name = rte_vdev_device_name(dev);
1601         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1602                 strlen(rte_vdev_device_args(dev)) == 0) {
1603                 eth_dev = rte_eth_dev_attach_secondary(name);
1604                 if (eth_dev == NULL) {
1605                         AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
1606                         return -EINVAL;
1607                 }
1608                 eth_dev->dev_ops = &ops;
1609                 rte_eth_dev_probing_finish(eth_dev);
1610                 return 0;
1611         }
1612
1613         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1614         if (kvlist == NULL) {
1615                 AF_XDP_LOG(ERR, "Invalid kvargs key\n");
1616                 return -EINVAL;
1617         }
1618
1619         if (dev->device.numa_node == SOCKET_ID_ANY)
1620                 dev->device.numa_node = rte_socket_id();
1621
1622         if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
1623                              &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
1624                 AF_XDP_LOG(ERR, "Invalid kvargs value\n");
1625                 return -EINVAL;
1626         }
1627
1628         if (strlen(if_name) == 0) {
1629                 AF_XDP_LOG(ERR, "Network interface must be specified\n");
1630                 return -EINVAL;
1631         }
1632
1633         eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
1634                                         xsk_queue_cnt, shared_umem, prog_path);
1635         if (eth_dev == NULL) {
1636                 AF_XDP_LOG(ERR, "Failed to init internals\n");
1637                 return -1;
1638         }
1639
1640         rte_eth_dev_probing_finish(eth_dev);
1641
1642         return 0;
1643 }
1644
1645 static int
1646 rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
1647 {
1648         struct rte_eth_dev *eth_dev = NULL;
1649
1650         AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
1651                 rte_socket_id());
1652
1653         if (dev == NULL)
1654                 return -1;
1655
1656         /* find the ethdev entry */
1657         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1658         if (eth_dev == NULL)
1659                 return 0;
1660
1661         eth_dev_close(eth_dev);
1662         rte_eth_dev_release_port(eth_dev);
1663
1664
1665         return 0;
1666 }
1667
1668 static struct rte_vdev_driver pmd_af_xdp_drv = {
1669         .probe = rte_pmd_af_xdp_probe,
1670         .remove = rte_pmd_af_xdp_remove,
1671 };
1672
1673 RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
1674 RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
1675                               "iface=<string> "
1676                               "start_queue=<int> "
1677                               "queue_count=<int> "
1678                               "shared_umem=<int> "
1679                               "xdp_prog=<string> ");
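
/*
 * Illustrative usage only (interface name and core list are assumptions):
 * the PMD is attached to an existing kernel netdev via EAL --vdev arguments,
 * e.g.
 *
 *   dpdk-testpmd -l 0-1 --vdev net_af_xdp0,iface=ens786f1,start_queue=0,queue_count=1 -- -i
 *
 * With shared_umem=1, ports whose rx queues use the same mempool share a
 * single UMEM, subject to the max_xsks limit computed in
 * xdp_umem_configure().
 */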