net/af_xdp: remove unused struct member
[dpdk.git] / drivers / net / af_xdp / rte_eth_af_xdp.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2019 Intel Corporation.
3  */
4 #include <unistd.h>
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <netinet/in.h>
9 #include <net/if.h>
10 #include <sys/socket.h>
11 #include <sys/ioctl.h>
12 #include <linux/if_ether.h>
13 #include <linux/if_xdp.h>
14 #include <linux/if_link.h>
15 #include <linux/ethtool.h>
16 #include <linux/sockios.h>
17 #include "af_xdp_deps.h"
18 #include <bpf/xsk.h>
19
20 #include <rte_ethdev.h>
21 #include <rte_ethdev_driver.h>
22 #include <rte_ethdev_vdev.h>
23 #include <rte_kvargs.h>
24 #include <rte_bus_vdev.h>
25 #include <rte_string_fns.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_common.h>
28 #include <rte_config.h>
29 #include <rte_dev.h>
30 #include <rte_eal.h>
31 #include <rte_ether.h>
32 #include <rte_lcore.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memzone.h>
36 #include <rte_mbuf.h>
37 #include <rte_malloc.h>
38 #include <rte_ring.h>
39
/*
 * Fallback values for the AF_XDP socket constants, used only when the
 * system headers included above do not already provide them.
 */
#if !defined(SOL_XDP)
#define SOL_XDP 283
#endif

#if !defined(AF_XDP)
#define AF_XDP 44
#endif

/* The protocol family is an alias of the address family. */
#if !defined(PF_XDP)
#define PF_XDP AF_XDP
#endif
51
/*
 * Log type handed to rte_log() by AF_XDP_LOG. It is zero-initialised here;
 * the code that assigns it is not in this part of the file.
 */
static int af_xdp_logtype;

/*
 * Driver logging helper: emits "<function>(): <message>" at RTE_LOG_<level>.
 * The caller supplies the trailing newline in fmt.
 */
#define AF_XDP_LOG(level, fmt, ...)				\
	rte_log(RTE_LOG_ ## level, af_xdp_logtype,		\
		"%s(): " fmt, __func__, ##__VA_ARGS__)
57
/* Size in bytes of one umem frame. */
#define ETH_AF_XDP_FRAME_SIZE           XSK_UMEM__DEFAULT_FRAME_SIZE
/* Number of frames in each queue's umem (and slots in its buffer ring). */
#define ETH_AF_XDP_NUM_BUFFERS          4096
/* Headroom reserved at the start of every frame. */
#define ETH_AF_XDP_DATA_HEADROOM        0
/* Fill/completion ring size and the default rx/tx ring size we advertise. */
#define ETH_AF_XDP_DFLT_NUM_DESCS       XSK_RING_CONS__DEFAULT_NUM_DESCS
/* Defaults for the "start_queue" / "queue_count" device arguments. */
#define ETH_AF_XDP_DFLT_START_QUEUE_IDX 0
#define ETH_AF_XDP_DFLT_QUEUE_COUNT     1

/* Upper bound on packets handled by one rx / tx burst call. */
#define ETH_AF_XDP_RX_BATCH_SIZE        32
#define ETH_AF_XDP_TX_BATCH_SIZE        32

67
68
69 struct xsk_umem_info {
70         struct xsk_ring_prod fq;
71         struct xsk_ring_cons cq;
72         struct xsk_umem *umem;
73         struct rte_ring *buf_ring;
74         const struct rte_memzone *mz;
75         int pmd_zc;
76 };
77
/* Software rx counters kept per queue. */
struct rx_stats {
	uint64_t rx_pkts;	/* packets delivered by the rx burst function */
	uint64_t rx_bytes;	/* bytes delivered by the rx burst function */
	uint64_t rx_dropped;	/* reported as part of imissed by eth_stats_get() */
};
83
84 struct pkt_rx_queue {
85         struct xsk_ring_cons rx;
86         struct xsk_umem_info *umem;
87         struct xsk_socket *xsk;
88         struct rte_mempool *mb_pool;
89
90         struct rx_stats stats;
91
92         struct pkt_tx_queue *pair;
93         int xsk_queue_idx;
94 };
95
/* Software tx counters kept per queue. */
struct tx_stats {
	uint64_t tx_pkts;	/* packets submitted by the tx burst function */
	uint64_t err_pkts;	/* reported as oerrors by eth_stats_get() */
	uint64_t tx_bytes;	/* bytes submitted by the tx burst function */
};
101
102 struct pkt_tx_queue {
103         struct xsk_ring_prod tx;
104
105         struct tx_stats stats;
106
107         struct pkt_rx_queue *pair;
108         int xsk_queue_idx;
109 };
110
111 struct pmd_internals {
112         int if_index;
113         char if_name[IFNAMSIZ];
114         int start_queue_idx;
115         int queue_cnt;
116         int max_queue_cnt;
117         int combined_queue_cnt;
118
119         int pmd_zc;
120         struct rte_ether_addr eth_addr;
121
122         struct pkt_rx_queue *rx_queues;
123         struct pkt_tx_queue *tx_queues;
124 };
125
/* Names of the device arguments understood by this driver. */
#define ETH_AF_XDP_IFACE_ARG                    "iface"
#define ETH_AF_XDP_START_QUEUE_ARG              "start_queue"
#define ETH_AF_XDP_QUEUE_COUNT_ARG              "queue_count"
#define ETH_AF_XDP_PMD_ZC_ARG                   "pmd_zero_copy"

/* NULL-terminated list of the argument names above. */
static const char * const valid_arguments[] = {
	ETH_AF_XDP_IFACE_ARG,
	ETH_AF_XDP_START_QUEUE_ARG,
	ETH_AF_XDP_QUEUE_COUNT_ARG,
	ETH_AF_XDP_PMD_ZC_ARG,
	NULL,
};
138
139 static const struct rte_eth_link pmd_link = {
140         .link_speed = ETH_SPEED_NUM_10G,
141         .link_duplex = ETH_LINK_FULL_DUPLEX,
142         .link_status = ETH_LINK_DOWN,
143         .link_autoneg = ETH_LINK_AUTONEG
144 };
145
146 static inline int
147 reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size)
148 {
149         struct xsk_ring_prod *fq = &umem->fq;
150         void *addrs[reserve_size];
151         uint32_t idx;
152         uint16_t i;
153
154         if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
155                     != reserve_size) {
156                 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
157                 return -1;
158         }
159
160         if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
161                 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
162                 rte_ring_enqueue_bulk(umem->buf_ring, addrs,
163                                 reserve_size, NULL);
164                 return -1;
165         }
166
167         for (i = 0; i < reserve_size; i++) {
168                 __u64 *fq_addr;
169
170                 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
171                 *fq_addr = (uint64_t)addrs[i];
172         }
173
174         xsk_ring_prod__submit(fq, reserve_size);
175
176         return 0;
177 }
178
179 static void
180 umem_buf_release_to_fq(void *addr, void *opaque)
181 {
182         struct xsk_umem_info *umem = (struct xsk_umem_info *)opaque;
183         uint64_t umem_addr = (uint64_t)addr - umem->mz->addr_64;
184
185         rte_ring_enqueue(umem->buf_ring, (void *)umem_addr);
186 }
187
188 static uint16_t
189 eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
190 {
191         struct pkt_rx_queue *rxq = queue;
192         struct xsk_ring_cons *rx = &rxq->rx;
193         struct xsk_umem_info *umem = rxq->umem;
194         struct xsk_ring_prod *fq = &umem->fq;
195         uint32_t idx_rx = 0;
196         uint32_t free_thresh = fq->size >> 1;
197         int pmd_zc = umem->pmd_zc;
198         struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
199         unsigned long dropped = 0;
200         unsigned long rx_bytes = 0;
201         int rcvd, i;
202
203         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
204
205         if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
206                 return 0;
207
208         rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
209         if (rcvd == 0)
210                 goto out;
211
212         if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
213                 (void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE);
214
215         for (i = 0; i < rcvd; i++) {
216                 const struct xdp_desc *desc;
217                 uint64_t addr;
218                 uint32_t len;
219                 void *pkt;
220                 uint16_t buf_len = ETH_AF_XDP_FRAME_SIZE;
221                 struct rte_mbuf_ext_shared_info *shinfo;
222
223                 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
224                 addr = desc->addr;
225                 len = desc->len;
226                 pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);
227
228                 if (pmd_zc) {
229                         shinfo = rte_pktmbuf_ext_shinfo_init_helper(pkt,
230                                         &buf_len, umem_buf_release_to_fq, umem);
231
232                         rte_pktmbuf_attach_extbuf(mbufs[i], pkt, 0, buf_len,
233                                                   shinfo);
234                 } else {
235                         rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *),
236                                                         pkt, len);
237                         rte_ring_enqueue(umem->buf_ring, (void *)addr);
238                 }
239                 rte_pktmbuf_pkt_len(mbufs[i]) = len;
240                 rte_pktmbuf_data_len(mbufs[i]) = len;
241                 rx_bytes += len;
242                 bufs[i] = mbufs[i];
243         }
244
245         xsk_ring_cons__release(rx, rcvd);
246
247         /* statistics */
248         rxq->stats.rx_pkts += (rcvd - dropped);
249         rxq->stats.rx_bytes += rx_bytes;
250
251 out:
252         if (rcvd != nb_pkts)
253                 rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
254                                      nb_pkts - rcvd);
255
256         return rcvd;
257 }
258
259 static void
260 pull_umem_cq(struct xsk_umem_info *umem, int size)
261 {
262         struct xsk_ring_cons *cq = &umem->cq;
263         size_t i, n;
264         uint32_t idx_cq = 0;
265
266         n = xsk_ring_cons__peek(cq, size, &idx_cq);
267
268         for (i = 0; i < n; i++) {
269                 uint64_t addr;
270                 addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
271                 rte_ring_enqueue(umem->buf_ring, (void *)addr);
272         }
273
274         xsk_ring_cons__release(cq, n);
275 }
276
277 static void
278 kick_tx(struct pkt_tx_queue *txq)
279 {
280         struct xsk_umem_info *umem = txq->pair->umem;
281
282         while (send(xsk_socket__fd(txq->pair->xsk), NULL,
283                       0, MSG_DONTWAIT) < 0) {
284                 /* some thing unexpected */
285                 if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
286                         break;
287
288                 /* pull from completion queue to leave more space */
289                 if (errno == EAGAIN)
290                         pull_umem_cq(umem, ETH_AF_XDP_TX_BATCH_SIZE);
291         }
292         pull_umem_cq(umem, ETH_AF_XDP_TX_BATCH_SIZE);
293 }
294
295 static inline bool
296 in_umem_range(struct xsk_umem_info *umem, uint64_t addr)
297 {
298         uint64_t mz_base_addr = umem->mz->addr_64;
299
300         return addr >= mz_base_addr && addr < mz_base_addr + umem->mz->len;
301 }
302
303 static uint16_t
304 eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
305 {
306         struct pkt_tx_queue *txq = queue;
307         struct xsk_umem_info *umem = txq->pair->umem;
308         struct rte_mbuf *mbuf;
309         int pmd_zc = umem->pmd_zc;
310         void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
311         unsigned long tx_bytes = 0;
312         int i;
313         uint32_t idx_tx;
314
315         nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
316
317         pull_umem_cq(umem, nb_pkts);
318
319         nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
320                                         nb_pkts, NULL);
321         if (nb_pkts == 0)
322                 return 0;
323
324         if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
325                 kick_tx(txq);
326                 rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
327                 return 0;
328         }
329
330         for (i = 0; i < nb_pkts; i++) {
331                 struct xdp_desc *desc;
332                 void *pkt;
333
334                 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
335                 mbuf = bufs[i];
336                 desc->len = mbuf->pkt_len;
337
338                 /*
339                  * We need to make sure the external mbuf address is within
340                  * current port's umem memzone range
341                  */
342                 if (pmd_zc && RTE_MBUF_HAS_EXTBUF(mbuf) &&
343                                 in_umem_range(umem, (uint64_t)mbuf->buf_addr)) {
344                         desc->addr = (uint64_t)mbuf->buf_addr -
345                                 umem->mz->addr_64;
346                         mbuf->buf_addr = xsk_umem__get_data(umem->mz->addr,
347                                         (uint64_t)addrs[i]);
348                 } else {
349                         desc->addr = (uint64_t)addrs[i];
350                         pkt = xsk_umem__get_data(umem->mz->addr,
351                                         desc->addr);
352                         rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
353                                         desc->len);
354                 }
355                 tx_bytes += mbuf->pkt_len;
356         }
357
358         xsk_ring_prod__submit(&txq->tx, nb_pkts);
359
360         kick_tx(txq);
361
362         txq->stats.tx_pkts += nb_pkts;
363         txq->stats.tx_bytes += tx_bytes;
364
365         for (i = 0; i < nb_pkts; i++)
366                 rte_pktmbuf_free(bufs[i]);
367
368         return nb_pkts;
369 }
370
371 static int
372 eth_dev_start(struct rte_eth_dev *dev)
373 {
374         dev->data->dev_link.link_status = ETH_LINK_UP;
375
376         return 0;
377 }
378
379 /* This function gets called when the current port gets stopped. */
380 static void
381 eth_dev_stop(struct rte_eth_dev *dev)
382 {
383         dev->data->dev_link.link_status = ETH_LINK_DOWN;
384 }
385
386 static int
387 eth_dev_configure(struct rte_eth_dev *dev)
388 {
389         /* rx/tx must be paired */
390         if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
391                 return -EINVAL;
392
393         return 0;
394 }
395
396 static void
397 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
398 {
399         struct pmd_internals *internals = dev->data->dev_private;
400
401         dev_info->if_index = internals->if_index;
402         dev_info->max_mac_addrs = 1;
403         dev_info->max_rx_pktlen = ETH_FRAME_LEN;
404         dev_info->max_rx_queues = internals->queue_cnt;
405         dev_info->max_tx_queues = internals->queue_cnt;
406
407         dev_info->min_mtu = RTE_ETHER_MIN_MTU;
408         dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
409
410         dev_info->default_rxportconf.nb_queues = 1;
411         dev_info->default_txportconf.nb_queues = 1;
412         dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
413         dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
414 }
415
416 static int
417 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
418 {
419         struct pmd_internals *internals = dev->data->dev_private;
420         struct xdp_statistics xdp_stats;
421         struct pkt_rx_queue *rxq;
422         struct pkt_tx_queue *txq;
423         socklen_t optlen;
424         int i, ret;
425
426         for (i = 0; i < dev->data->nb_rx_queues; i++) {
427                 optlen = sizeof(struct xdp_statistics);
428                 rxq = &internals->rx_queues[i];
429                 txq = rxq->pair;
430                 stats->q_ipackets[i] = rxq->stats.rx_pkts;
431                 stats->q_ibytes[i] = rxq->stats.rx_bytes;
432
433                 stats->q_opackets[i] = txq->stats.tx_pkts;
434                 stats->q_obytes[i] = txq->stats.tx_bytes;
435
436                 stats->ipackets += stats->q_ipackets[i];
437                 stats->ibytes += stats->q_ibytes[i];
438                 stats->imissed += rxq->stats.rx_dropped;
439                 ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
440                                 XDP_STATISTICS, &xdp_stats, &optlen);
441                 if (ret != 0) {
442                         AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
443                         return -1;
444                 }
445                 stats->imissed += xdp_stats.rx_dropped;
446
447                 stats->opackets += stats->q_opackets[i];
448                 stats->oerrors += txq->stats.err_pkts;
449                 stats->obytes += stats->q_obytes[i];
450         }
451
452         return 0;
453 }
454
455 static void
456 eth_stats_reset(struct rte_eth_dev *dev)
457 {
458         struct pmd_internals *internals = dev->data->dev_private;
459         int i;
460
461         for (i = 0; i < internals->queue_cnt; i++) {
462                 memset(&internals->rx_queues[i].stats, 0,
463                                         sizeof(struct rx_stats));
464                 memset(&internals->tx_queues[i].stats, 0,
465                                         sizeof(struct tx_stats));
466         }
467 }
468
469 static void
470 remove_xdp_program(struct pmd_internals *internals)
471 {
472         uint32_t curr_prog_id = 0;
473
474         if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
475                                 XDP_FLAGS_UPDATE_IF_NOEXIST)) {
476                 AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
477                 return;
478         }
479         bpf_set_link_xdp_fd(internals->if_index, -1,
480                         XDP_FLAGS_UPDATE_IF_NOEXIST);
481 }
482
483 static void
484 xdp_umem_destroy(struct xsk_umem_info *umem)
485 {
486         rte_memzone_free(umem->mz);
487         umem->mz = NULL;
488
489         rte_ring_free(umem->buf_ring);
490         umem->buf_ring = NULL;
491
492         rte_free(umem);
493         umem = NULL;
494 }
495
496 static void
497 eth_dev_close(struct rte_eth_dev *dev)
498 {
499         struct pmd_internals *internals = dev->data->dev_private;
500         struct pkt_rx_queue *rxq;
501         int i;
502
503         AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
504                 rte_socket_id());
505
506         for (i = 0; i < internals->queue_cnt; i++) {
507                 rxq = &internals->rx_queues[i];
508                 if (rxq->umem == NULL)
509                         break;
510                 xsk_socket__delete(rxq->xsk);
511                 (void)xsk_umem__delete(rxq->umem->umem);
512                 xdp_umem_destroy(rxq->umem);
513
514                 /* free pkt_tx_queue */
515                 rte_free(rxq->pair);
516                 rte_free(rxq);
517         }
518
519         /*
520          * MAC is not allocated dynamically, setting it to NULL would prevent
521          * from releasing it in rte_eth_dev_release_port.
522          */
523         dev->data->mac_addrs = NULL;
524
525         remove_xdp_program(internals);
526 }
527
/*
 * Queue release callback, shared by rx and tx. Intentionally does
 * nothing with q.
 */
static void
eth_queue_release(void *q)
{
	RTE_SET_USED(q);
}
532
/*
 * Link update callback. Nothing is queried or changed here; both
 * arguments are ignored and 0 is always returned.
 */
static int
eth_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	RTE_SET_USED(dev);
	RTE_SET_USED(wait_to_complete);

	return 0;
}
539
540 static struct
541 xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
542                                   struct pkt_rx_queue *rxq)
543 {
544         struct xsk_umem_info *umem;
545         const struct rte_memzone *mz;
546         struct xsk_umem_config usr_config = {
547                 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
548                 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
549                 .frame_size = ETH_AF_XDP_FRAME_SIZE,
550                 .frame_headroom = ETH_AF_XDP_DATA_HEADROOM };
551         char ring_name[RTE_RING_NAMESIZE];
552         char mz_name[RTE_MEMZONE_NAMESIZE];
553         int ret;
554         uint64_t i;
555
556         umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
557         if (umem == NULL) {
558                 AF_XDP_LOG(ERR, "Failed to allocate umem info");
559                 return NULL;
560         }
561
562         snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
563                        internals->if_name, rxq->xsk_queue_idx);
564         umem->buf_ring = rte_ring_create(ring_name,
565                                          ETH_AF_XDP_NUM_BUFFERS,
566                                          rte_socket_id(),
567                                          0x0);
568         if (umem->buf_ring == NULL) {
569                 AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
570                 goto err;
571         }
572
573         for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
574                 rte_ring_enqueue(umem->buf_ring,
575                                  (void *)(i * ETH_AF_XDP_FRAME_SIZE +
576                                           ETH_AF_XDP_DATA_HEADROOM));
577
578         snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
579                        internals->if_name, rxq->xsk_queue_idx);
580         mz = rte_memzone_reserve_aligned(mz_name,
581                         ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
582                         rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
583                         getpagesize());
584         if (mz == NULL) {
585                 AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
586                 goto err;
587         }
588
589         ret = xsk_umem__create(&umem->umem, mz->addr,
590                                ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
591                                &umem->fq, &umem->cq,
592                                &usr_config);
593
594         if (ret) {
595                 AF_XDP_LOG(ERR, "Failed to create umem");
596                 goto err;
597         }
598         umem->mz = mz;
599
600         return umem;
601
602 err:
603         xdp_umem_destroy(umem);
604         return NULL;
605 }
606
607 static int
608 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
609               int ring_size)
610 {
611         struct xsk_socket_config cfg;
612         struct pkt_tx_queue *txq = rxq->pair;
613         int ret = 0;
614         int reserve_size;
615
616         rxq->umem = xdp_umem_configure(internals, rxq);
617         if (rxq->umem == NULL)
618                 return -ENOMEM;
619
620         cfg.rx_size = ring_size;
621         cfg.tx_size = ring_size;
622         cfg.libbpf_flags = 0;
623         cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
624         cfg.bind_flags = 0;
625         ret = xsk_socket__create(&rxq->xsk, internals->if_name,
626                         rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
627                         &txq->tx, &cfg);
628         if (ret) {
629                 AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
630                 goto err;
631         }
632
633         reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS / 2;
634         ret = reserve_fill_queue(rxq->umem, reserve_size);
635         if (ret) {
636                 xsk_socket__delete(rxq->xsk);
637                 AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
638                 goto err;
639         }
640
641         return 0;
642
643 err:
644         xdp_umem_destroy(rxq->umem);
645
646         return ret;
647 }
648
649 static int
650 eth_rx_queue_setup(struct rte_eth_dev *dev,
651                    uint16_t rx_queue_id,
652                    uint16_t nb_rx_desc,
653                    unsigned int socket_id __rte_unused,
654                    const struct rte_eth_rxconf *rx_conf __rte_unused,
655                    struct rte_mempool *mb_pool)
656 {
657         struct pmd_internals *internals = dev->data->dev_private;
658         uint32_t buf_size, data_size;
659         struct pkt_rx_queue *rxq;
660         int ret;
661
662         rxq = &internals->rx_queues[rx_queue_id];
663
664         AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
665                    rx_queue_id, rxq->xsk_queue_idx);
666         /* Now get the space available for data in the mbuf */
667         buf_size = rte_pktmbuf_data_room_size(mb_pool) -
668                 RTE_PKTMBUF_HEADROOM;
669         data_size = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
670
671         if (data_size > buf_size) {
672                 AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
673                         dev->device->name, data_size, buf_size);
674                 ret = -ENOMEM;
675                 goto err;
676         }
677
678         rxq->mb_pool = mb_pool;
679
680         if (xsk_configure(internals, rxq, nb_rx_desc)) {
681                 AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
682                 ret = -EINVAL;
683                 goto err;
684         }
685
686         rxq->umem->pmd_zc = internals->pmd_zc;
687
688         dev->data->rx_queues[rx_queue_id] = rxq;
689         return 0;
690
691 err:
692         return ret;
693 }
694
695 static int
696 eth_tx_queue_setup(struct rte_eth_dev *dev,
697                    uint16_t tx_queue_id,
698                    uint16_t nb_tx_desc __rte_unused,
699                    unsigned int socket_id __rte_unused,
700                    const struct rte_eth_txconf *tx_conf __rte_unused)
701 {
702         struct pmd_internals *internals = dev->data->dev_private;
703         struct pkt_tx_queue *txq;
704
705         txq = &internals->tx_queues[tx_queue_id];
706
707         dev->data->tx_queues[tx_queue_id] = txq;
708         return 0;
709 }
710
711 static int
712 eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
713 {
714         struct pmd_internals *internals = dev->data->dev_private;
715         struct ifreq ifr = { .ifr_mtu = mtu };
716         int ret;
717         int s;
718
719         s = socket(PF_INET, SOCK_DGRAM, 0);
720         if (s < 0)
721                 return -EINVAL;
722
723         strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
724         ret = ioctl(s, SIOCSIFMTU, &ifr);
725         close(s);
726
727         return (ret < 0) ? -errno : 0;
728 }
729
730 static void
731 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
732 {
733         struct ifreq ifr;
734         int s;
735
736         s = socket(PF_INET, SOCK_DGRAM, 0);
737         if (s < 0)
738                 return;
739
740         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
741         if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0)
742                 goto out;
743         ifr.ifr_flags &= mask;
744         ifr.ifr_flags |= flags;
745         if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0)
746                 goto out;
747 out:
748         close(s);
749 }
750
751 static void
752 eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
753 {
754         struct pmd_internals *internals = dev->data->dev_private;
755
756         eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
757 }
758
759 static void
760 eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
761 {
762         struct pmd_internals *internals = dev->data->dev_private;
763
764         eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
765 }
766
767 static const struct eth_dev_ops ops = {
768         .dev_start = eth_dev_start,
769         .dev_stop = eth_dev_stop,
770         .dev_close = eth_dev_close,
771         .dev_configure = eth_dev_configure,
772         .dev_infos_get = eth_dev_info,
773         .mtu_set = eth_dev_mtu_set,
774         .promiscuous_enable = eth_dev_promiscuous_enable,
775         .promiscuous_disable = eth_dev_promiscuous_disable,
776         .rx_queue_setup = eth_rx_queue_setup,
777         .tx_queue_setup = eth_tx_queue_setup,
778         .rx_queue_release = eth_queue_release,
779         .tx_queue_release = eth_queue_release,
780         .link_update = eth_link_update,
781         .stats_get = eth_stats_get,
782         .stats_reset = eth_stats_reset,
783 };
784
785 /** parse integer from integer argument */
786 static int
787 parse_integer_arg(const char *key __rte_unused,
788                   const char *value, void *extra_args)
789 {
790         int *i = (int *)extra_args;
791         char *end;
792
793         *i = strtol(value, &end, 10);
794         if (*i < 0) {
795                 AF_XDP_LOG(ERR, "Argument has to be positive.\n");
796                 return -EINVAL;
797         }
798
799         return 0;
800 }
801
802 /** parse name argument */
803 static int
804 parse_name_arg(const char *key __rte_unused,
805                const char *value, void *extra_args)
806 {
807         char *name = extra_args;
808
809         if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
810                 AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
811                            value, IFNAMSIZ);
812                 return -EINVAL;
813         }
814
815         strlcpy(name, value, IFNAMSIZ);
816
817         return 0;
818 }
819
820 static int
821 xdp_get_channels_info(const char *if_name, int *max_queues,
822                                 int *combined_queues)
823 {
824         struct ethtool_channels channels;
825         struct ifreq ifr;
826         int fd, ret;
827
828         fd = socket(AF_INET, SOCK_DGRAM, 0);
829         if (fd < 0)
830                 return -1;
831
832         channels.cmd = ETHTOOL_GCHANNELS;
833         ifr.ifr_data = (void *)&channels;
834         strncpy(ifr.ifr_name, if_name, IFNAMSIZ);
835         ret = ioctl(fd, SIOCETHTOOL, &ifr);
836         if (ret && errno != EOPNOTSUPP) {
837                 ret = -errno;
838                 goto out;
839         }
840
841         if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
842                 /* If the device says it has no channels, then all traffic
843                  * is sent to a single stream, so max queues = 1.
844                  */
845                 *max_queues = 1;
846                 *combined_queues = 1;
847         } else {
848                 *max_queues = channels.max_combined;
849                 *combined_queues = channels.combined_count;
850         }
851
852  out:
853         close(fd);
854         return ret;
855 }
856
857 static int
858 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
859                         int *queue_cnt, int *pmd_zc)
860 {
861         int ret;
862
863         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
864                                  &parse_name_arg, if_name);
865         if (ret < 0)
866                 goto free_kvlist;
867
868         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
869                                  &parse_integer_arg, start_queue);
870         if (ret < 0)
871                 goto free_kvlist;
872
873         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
874                                  &parse_integer_arg, queue_cnt);
875         if (ret < 0 || *queue_cnt <= 0) {
876                 ret = -EINVAL;
877                 goto free_kvlist;
878         }
879
880         ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PMD_ZC_ARG,
881                                  &parse_integer_arg, pmd_zc);
882         if (ret < 0)
883                 goto free_kvlist;
884
885 free_kvlist:
886         rte_kvargs_free(kvlist);
887         return ret;
888 }
889
890 static int
891 get_iface_info(const char *if_name,
892                struct rte_ether_addr *eth_addr,
893                int *if_index)
894 {
895         struct ifreq ifr;
896         int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
897
898         if (sock < 0)
899                 return -1;
900
901         strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
902         if (ioctl(sock, SIOCGIFINDEX, &ifr))
903                 goto error;
904
905         *if_index = ifr.ifr_ifindex;
906
907         if (ioctl(sock, SIOCGIFHWADDR, &ifr))
908                 goto error;
909
910         rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
911
912         close(sock);
913         return 0;
914
915 error:
916         close(sock);
917         return -1;
918 }
919
920 static struct rte_eth_dev *
921 init_internals(struct rte_vdev_device *dev, const char *if_name,
922                         int start_queue_idx, int queue_cnt, int pmd_zc)
923 {
924         const char *name = rte_vdev_device_name(dev);
925         const unsigned int numa_node = dev->device.numa_node;
926         struct pmd_internals *internals;
927         struct rte_eth_dev *eth_dev;
928         int ret;
929         int i;
930
931         internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
932         if (internals == NULL)
933                 return NULL;
934
935         internals->start_queue_idx = start_queue_idx;
936         internals->queue_cnt = queue_cnt;
937         internals->pmd_zc = pmd_zc;
938         strlcpy(internals->if_name, if_name, IFNAMSIZ);
939
940         if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
941                                   &internals->combined_queue_cnt)) {
942                 AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
943                                 if_name);
944                 goto err_free_internals;
945         }
946
947         if (queue_cnt > internals->combined_queue_cnt) {
948                 AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
949                                 queue_cnt, internals->combined_queue_cnt);
950                 goto err_free_internals;
951         }
952
953         internals->rx_queues = rte_zmalloc_socket(NULL,
954                                         sizeof(struct pkt_rx_queue) * queue_cnt,
955                                         0, numa_node);
956         if (internals->rx_queues == NULL) {
957                 AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
958                 goto err_free_internals;
959         }
960
961         internals->tx_queues = rte_zmalloc_socket(NULL,
962                                         sizeof(struct pkt_tx_queue) * queue_cnt,
963                                         0, numa_node);
964         if (internals->tx_queues == NULL) {
965                 AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
966                 goto err_free_rx;
967         }
968         for (i = 0; i < queue_cnt; i++) {
969                 internals->tx_queues[i].pair = &internals->rx_queues[i];
970                 internals->rx_queues[i].pair = &internals->tx_queues[i];
971                 internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
972                 internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
973         }
974
975         ret = get_iface_info(if_name, &internals->eth_addr,
976                              &internals->if_index);
977         if (ret)
978                 goto err_free_tx;
979
980         eth_dev = rte_eth_vdev_allocate(dev, 0);
981         if (eth_dev == NULL)
982                 goto err_free_tx;
983
984         eth_dev->data->dev_private = internals;
985         eth_dev->data->dev_link = pmd_link;
986         eth_dev->data->mac_addrs = &internals->eth_addr;
987         eth_dev->dev_ops = &ops;
988         eth_dev->rx_pkt_burst = eth_af_xdp_rx;
989         eth_dev->tx_pkt_burst = eth_af_xdp_tx;
990         /* Let rte_eth_dev_close() release the port resources. */
991         eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
992
993         if (internals->pmd_zc)
994                 AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
995
996         return eth_dev;
997
998 err_free_tx:
999         rte_free(internals->tx_queues);
1000 err_free_rx:
1001         rte_free(internals->rx_queues);
1002 err_free_internals:
1003         rte_free(internals);
1004         return NULL;
1005 }
1006
1007 static int
1008 rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
1009 {
1010         struct rte_kvargs *kvlist;
1011         char if_name[IFNAMSIZ] = {'\0'};
1012         int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
1013         int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
1014         struct rte_eth_dev *eth_dev = NULL;
1015         const char *name;
1016         int pmd_zc = 0;
1017
1018         AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
1019                 rte_vdev_device_name(dev));
1020
1021         name = rte_vdev_device_name(dev);
1022         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1023                 strlen(rte_vdev_device_args(dev)) == 0) {
1024                 eth_dev = rte_eth_dev_attach_secondary(name);
1025                 if (eth_dev == NULL) {
1026                         AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
1027                         return -EINVAL;
1028                 }
1029                 eth_dev->dev_ops = &ops;
1030                 rte_eth_dev_probing_finish(eth_dev);
1031                 return 0;
1032         }
1033
1034         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1035         if (kvlist == NULL) {
1036                 AF_XDP_LOG(ERR, "Invalid kvargs key\n");
1037                 return -EINVAL;
1038         }
1039
1040         if (dev->device.numa_node == SOCKET_ID_ANY)
1041                 dev->device.numa_node = rte_socket_id();
1042
1043         if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
1044                              &xsk_queue_cnt, &pmd_zc) < 0) {
1045                 AF_XDP_LOG(ERR, "Invalid kvargs value\n");
1046                 return -EINVAL;
1047         }
1048
1049         if (strlen(if_name) == 0) {
1050                 AF_XDP_LOG(ERR, "Network interface must be specified\n");
1051                 return -EINVAL;
1052         }
1053
1054         eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
1055                                         xsk_queue_cnt, pmd_zc);
1056         if (eth_dev == NULL) {
1057                 AF_XDP_LOG(ERR, "Failed to init internals\n");
1058                 return -1;
1059         }
1060
1061         rte_eth_dev_probing_finish(eth_dev);
1062
1063         return 0;
1064 }
1065
1066 static int
1067 rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
1068 {
1069         struct rte_eth_dev *eth_dev = NULL;
1070
1071         AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
1072                 rte_socket_id());
1073
1074         if (dev == NULL)
1075                 return -1;
1076
1077         /* find the ethdev entry */
1078         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1079         if (eth_dev == NULL)
1080                 return 0;
1081
1082         eth_dev_close(eth_dev);
1083         rte_eth_dev_release_port(eth_dev);
1084
1085
1086         return 0;
1087 }
1088
1089 static struct rte_vdev_driver pmd_af_xdp_drv = {
1090         .probe = rte_pmd_af_xdp_probe,
1091         .remove = rte_pmd_af_xdp_remove,
1092 };
1093
1094 RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
1095 RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
1096                               "iface=<string> "
1097                               "start_queue=<int> "
1098                               "queue_count=<int> "
1099                               "pmd_zero_copy=<0|1>");
1100
1101 RTE_INIT(af_xdp_init_log)
1102 {
1103         af_xdp_logtype = rte_log_register("pmd.net.af_xdp");
1104         if (af_xdp_logtype >= 0)
1105                 rte_log_set_level(af_xdp_logtype, RTE_LOG_NOTICE);
1106 }