net/sfc: fence off 8 bits in Rx mark for tunnel offload
[dpdk.git] / drivers / net / af_xdp / rte_eth_af_xdp.c
index 703b741..b362ccd 100644 (file)
@@ -5,7 +5,6 @@
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
-#include <poll.h>
 #include <netinet/in.h>
 #include <net/if.h>
 #include <sys/socket.h>
@@ -19,8 +18,8 @@
 #include <bpf/xsk.h>
 
 #include <rte_ethdev.h>
-#include <rte_ethdev_driver.h>
-#include <rte_ethdev_vdev.h>
+#include <ethdev_driver.h>
+#include <ethdev_vdev.h>
 #include <rte_kvargs.h>
 #include <rte_bus_vdev.h>
 #include <rte_string_fns.h>
 #include <rte_malloc.h>
 #include <rte_ring.h>
 #include <rte_spinlock.h>
+#include <rte_power_intrinsics.h>
 
 #include "compat.h"
 
+#ifndef SO_PREFER_BUSY_POLL
+#define SO_PREFER_BUSY_POLL 69
+#endif
+#ifndef SO_BUSY_POLL_BUDGET
+#define SO_BUSY_POLL_BUDGET 70
+#endif
+
 
 #ifndef SOL_XDP
 #define SOL_XDP 283
@@ -54,7 +61,7 @@
 #define PF_XDP AF_XDP
 #endif
 
-RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);
+RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype, NOTICE);
 
 #define AF_XDP_LOG(level, fmt, args...)                        \
        rte_log(RTE_LOG_ ## level, af_xdp_logtype,      \
@@ -65,9 +72,11 @@ RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);
 #define ETH_AF_XDP_DFLT_NUM_DESCS      XSK_RING_CONS__DEFAULT_NUM_DESCS
 #define ETH_AF_XDP_DFLT_START_QUEUE_IDX        0
 #define ETH_AF_XDP_DFLT_QUEUE_COUNT    1
+#define ETH_AF_XDP_DFLT_BUSY_BUDGET    64
+#define ETH_AF_XDP_DFLT_BUSY_TIMEOUT   20
 
-#define ETH_AF_XDP_RX_BATCH_SIZE       32
-#define ETH_AF_XDP_TX_BATCH_SIZE       32
+#define ETH_AF_XDP_RX_BATCH_SIZE       XSK_RING_CONS__DEFAULT_NUM_DESCS
+#define ETH_AF_XDP_TX_BATCH_SIZE       XSK_RING_CONS__DEFAULT_NUM_DESCS
 
 
 struct xsk_umem_info {
@@ -100,6 +109,7 @@ struct pkt_rx_queue {
        struct pkt_tx_queue *pair;
        struct pollfd fds[1];
        int xsk_queue_idx;
+       int busy_budget;
 };
 
 struct tx_stats {
@@ -140,6 +150,7 @@ struct pmd_internals {
 #define ETH_AF_XDP_QUEUE_COUNT_ARG             "queue_count"
 #define ETH_AF_XDP_SHARED_UMEM_ARG             "shared_umem"
 #define ETH_AF_XDP_PROG_ARG                    "xdp_prog"
+#define ETH_AF_XDP_BUDGET_ARG                  "busy_budget"
 
 static const char * const valid_arguments[] = {
        ETH_AF_XDP_IFACE_ARG,
@@ -147,6 +158,7 @@ static const char * const valid_arguments[] = {
        ETH_AF_XDP_QUEUE_COUNT_ARG,
        ETH_AF_XDP_SHARED_UMEM_ARG,
        ETH_AF_XDP_PROG_ARG,
+       ETH_AF_XDP_BUDGET_ARG,
        NULL
 };
 
@@ -261,10 +273,17 @@ af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
        nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
 
        if (nb_pkts == 0) {
-#if defined(XDP_USE_NEED_WAKEUP)
-               if (xsk_ring_prod__needs_wakeup(fq))
-                       (void)poll(rxq->fds, 1, 1000);
-#endif
+               /* we can assume a kernel >= 5.11 is in use if busy polling is
+                * enabled and thus we can safely use the recvfrom() syscall
+                * which is only supported for AF_XDP sockets in kernels >=
+                * 5.11.
+                */
+               if (rxq->busy_budget) {
+                       (void)recvfrom(xsk_socket__fd(rxq->xsk), NULL, 0,
+                                      MSG_DONTWAIT, NULL, NULL);
+               } else if (xsk_ring_prod__needs_wakeup(fq)) {
+                       (void)poll(&rxq->fds[0], 1, 1000);
+               }
 
                return 0;
        }
@@ -329,8 +348,7 @@ af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
        struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
 
        if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
-               (void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
-                                        NULL, fq);
+               (void)reserve_fill_queue(umem, nb_pkts, NULL, fq);
 
        nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
        if (nb_pkts == 0) {
@@ -379,10 +397,8 @@ af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 #endif
 
 static uint16_t
-eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
-       nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
-
 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
        return af_xdp_rx_zc(queue, bufs, nb_pkts);
 #else
@@ -390,6 +406,32 @@ eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 #endif
 }
 
+/* Call af_xdp_rx() for up to nb_pkts mbufs, splitting requests larger than
+ * ETH_AF_XDP_RX_BATCH_SIZE into several calls; returns the summed count.
+ */
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       uint16_t total = 0;
+       uint16_t chunk, got;
+
+       /* Fast path: the request already fits in a single batch. */
+       if (likely(nb_pkts <= ETH_AF_XDP_RX_BATCH_SIZE))
+               return af_xdp_rx(queue, bufs, nb_pkts);
+
+       /* Slow path: serve the request in chunks of at most
+        * ETH_AF_XDP_RX_BATCH_SIZE, stopping at the first short chunk.
+        */
+       do {
+               chunk = (uint16_t)RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
+               got = af_xdp_rx(queue, bufs + total, chunk);
+               total = (uint16_t)(total + got);
+               nb_pkts = (uint16_t)(nb_pkts - got);
+       } while (got >= chunk && nb_pkts != 0);
+
+       return total;
+}
+
 static void
 pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
 {
@@ -421,9 +463,7 @@ kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
 
        pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
 
-#if defined(XDP_USE_NEED_WAKEUP)
-       if (xsk_ring_prod__needs_wakeup(&txq->tx))
-#endif
+       if (tx_syscall_needed(&txq->tx))
                while (send(xsk_socket__fd(txq->pair->xsk), NULL,
                            0, MSG_DONTWAIT) < 0) {
                        /* some thing unexpected */
@@ -487,7 +527,6 @@ af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
                        if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
                                rte_pktmbuf_free(local_mbuf);
-                               kick_tx(txq, cq);
                                goto out;
                        }
 
@@ -511,10 +550,9 @@ af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
                tx_bytes += mbuf->pkt_len;
        }
 
-       kick_tx(txq, cq);
-
 out:
        xsk_ring_prod__submit(&txq->tx, count);
+       kick_tx(txq, cq);
 
        txq->stats.tx_pkts += count;
        txq->stats.tx_bytes += tx_bytes;
@@ -535,8 +573,6 @@ af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
        uint32_t idx_tx;
        struct xsk_ring_cons *cq = &txq->pair->cq;
 
-       nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
-
        pull_umem_cq(umem, nb_pkts, cq);
 
        nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
@@ -575,6 +611,32 @@ af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
        return nb_pkts;
 }
+
+/* Call af_xdp_tx_cp() for up to nb_pkts mbufs, splitting requests larger
+ * than ETH_AF_XDP_TX_BATCH_SIZE into several calls; returns the summed count.
+ */
+static uint16_t
+af_xdp_tx_cp_batch(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       uint16_t sent = 0;
+
+       /* Common case: one call covers the whole request. */
+       if (likely(nb_pkts <= ETH_AF_XDP_TX_BATCH_SIZE))
+               return af_xdp_tx_cp(queue, bufs, nb_pkts);
+
+       for (;;) {
+               uint16_t want, done;
+               want = (uint16_t)RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
+               done = af_xdp_tx_cp(queue, bufs + sent, want);
+               sent = (uint16_t)(sent + done);
+               nb_pkts = (uint16_t)(nb_pkts - done);
+               /* A short chunk or an exhausted request ends the loop. */
+               if (done < want || nb_pkts == 0)
+                       break;
+       }
+
+       return sent;
+}
 #endif
 
 static uint16_t
@@ -583,7 +645,7 @@ eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
        return af_xdp_tx_zc(queue, bufs, nb_pkts);
 #else
-       return af_xdp_tx_cp(queue, bufs, nb_pkts);
+       return af_xdp_tx_cp_batch(queue, bufs, nb_pkts);
 #endif
 }
 
@@ -725,6 +787,38 @@ eth_dev_configure(struct rte_eth_dev *dev)
        return 0;
 }
 
+#define CLB_VAL_IDX 0
+static int
+eth_monitor_callback(const uint64_t value,
+               const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+       /* compare only the low 32 bits of value with the stored snapshot */
+       const uint64_t low32 = value & (uint64_t)(uint32_t)~0;
+       if (low32 != opaque[CLB_VAL_IDX])
+               return -1; /* value changed: abort power optimized state */
+       return 0;
+}
+
+/* Fill pmc so that a monitor can watch this queue's Rx producer index;
+ * always returns 0.
+ */
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+       struct pkt_rx_queue *q = rx_queue;
+
+       /* address to monitor: the Rx ring producer index */
+       pmc->addr = (void *)q->rx.producer;
+       /* the index is 32 bits wide */
+       pmc->size = sizeof(uint32_t);
+
+       /* snapshot of the cached producer value, checked by the callback */
+       pmc->opaque[CLB_VAL_IDX] = (uint32_t)q->rx.cached_prod;
+       pmc->fn = eth_monitor_callback;
+
+       return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -746,6 +840,8 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
        dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
 #endif
 
+       dev_info->default_rxportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
+       dev_info->default_txportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
        dev_info->default_rxportconf.nb_queues = 1;
        dev_info->default_txportconf.nb_queues = 1;
        dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
@@ -893,11 +989,6 @@ eth_dev_close(struct rte_eth_dev *dev)
        return 0;
 }
 
-static void
-eth_queue_release(void *q __rte_unused)
-{
-}
-
 static int
 eth_link_update(struct rte_eth_dev *dev __rte_unused,
                int wait_to_complete __rte_unused)
@@ -1093,6 +1184,65 @@ load_custom_xdp_prog(const char *prog_path, int if_index)
        return 0;
 }
 
+/* Enable preferred busy polling via setsockopt(); returns -1 only if undoing a partial setup fails. */
+static int
+configure_preferred_busy_poll(struct pkt_rx_queue *rxq)
+{
+       int sock_opt = 1; /* option value, reused for every setsockopt() call */
+       int fd = xsk_socket__fd(rxq->xsk);
+       int ret = 0;
+
+       ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
+                       (void *)&sock_opt, sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(DEBUG, "Failed to set SO_PREFER_BUSY_POLL\n");
+               goto err_prefer; /* nothing has been set yet, nothing to undo */
+       }
+
+       sock_opt = ETH_AF_XDP_DFLT_BUSY_TIMEOUT;
+       ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
+                       sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL\n");
+               goto err_timeout; /* only SO_PREFER_BUSY_POLL needs undoing */
+       }
+
+       sock_opt = rxq->busy_budget;
+       ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
+                       (void *)&sock_opt, sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL_BUDGET\n");
+       } else {
+               AF_XDP_LOG(INFO, "Busy polling budget set to: %u\n",
+                                       rxq->busy_budget);
+               return 0; /* all three options applied; busy_budget is kept */
+       }
+
+       /* SO_BUSY_POLL_BUDGET failed: reset SO_BUSY_POLL to 0 here, then fall
+        * through the labels below to reset SO_PREFER_BUSY_POLL and clear
+        * busy_budget, so the queue proceeds without busy polling support.
+        */
+       sock_opt = 0;
+       ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
+                       sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(ERR, "Failed to unset SO_BUSY_POLL\n");
+               return -1; /* socket left partially configured */
+       }
+
+err_timeout: /* undo SO_PREFER_BUSY_POLL */
+       sock_opt = 0;
+       ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
+                       (void *)&sock_opt, sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(ERR, "Failed to unset SO_PREFER_BUSY_POLL\n");
+               return -1; /* socket left partially configured */
+       }
+
+err_prefer: /* no socket option left set at this point */
+       rxq->busy_budget = 0; /* a zero budget marks busy polling as unused */
+       return 0; /* reported as success: setup continues without busy polling */
+}
+
 static int
 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
              int ring_size)
@@ -1145,11 +1295,21 @@ xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
        }
 
 #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
-       if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
+       ret = rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size);
+       if (ret) {
                AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
                goto err;
        }
 #endif
+
+       if (rxq->busy_budget) {
+               ret = configure_preferred_busy_poll(rxq);
+               if (ret) {
+                       AF_XDP_LOG(ERR, "Failed configure busy polling.\n");
+                       goto err;
+               }
+       }
+
        ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
        if (ret) {
                xsk_socket__delete(rxq->xsk);
@@ -1207,6 +1367,9 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
                goto err;
        }
 
+       if (!rxq->busy_budget)
+               AF_XDP_LOG(DEBUG, "Preferred busy polling not enabled\n");
+
        rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
        rxq->fds[0].events = POLLIN;
 
@@ -1306,13 +1469,30 @@ static const struct eth_dev_ops ops = {
        .promiscuous_disable = eth_dev_promiscuous_disable,
        .rx_queue_setup = eth_rx_queue_setup,
        .tx_queue_setup = eth_tx_queue_setup,
-       .rx_queue_release = eth_queue_release,
-       .tx_queue_release = eth_queue_release,
        .link_update = eth_link_update,
        .stats_get = eth_stats_get,
        .stats_reset = eth_stats_reset,
+       .get_monitor_addr = eth_get_monitor_addr,
 };
 
+/** parse busy_budget argument; reject non-numeric or out-of-range input */
+static int
+parse_budget_arg(const char *key __rte_unused,
+                 const char *value, void *extra_args)
+{
+       char *end;
+       /* keep the long result so huge values are rejected, not truncated */
+       long budget = strtol(value, &end, 10);
+
+       if (end == value || *end != '\0' || budget < 0 || budget > UINT16_MAX) {
+               AF_XDP_LOG(ERR, "Invalid busy_budget %s, must be >= 0 and <= %u\n",
+                               value, UINT16_MAX);
+               return -EINVAL;
+       }
+       *(int *)extra_args = (int)budget;
+       return 0;
+}
+
 /** parse integer from integer argument */
 static int
 parse_integer_arg(const char *key __rte_unused,
@@ -1415,7 +1595,8 @@ xdp_get_channels_info(const char *if_name, int *max_queues,
 
 static int
 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
-                       int *queue_cnt, int *shared_umem, char *prog_path)
+                       int *queue_cnt, int *shared_umem, char *prog_path,
+                       int *busy_budget)
 {
        int ret;
 
@@ -1446,6 +1627,11 @@ parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
        if (ret < 0)
                goto free_kvlist;
 
+       ret = rte_kvargs_process(kvlist, ETH_AF_XDP_BUDGET_ARG,
+                               &parse_budget_arg, busy_budget);
+       if (ret < 0)
+               goto free_kvlist;
+
 free_kvlist:
        rte_kvargs_free(kvlist);
        return ret;
@@ -1484,7 +1670,7 @@ error:
 static struct rte_eth_dev *
 init_internals(struct rte_vdev_device *dev, const char *if_name,
                int start_queue_idx, int queue_cnt, int shared_umem,
-               const char *prog_path)
+               const char *prog_path, int busy_budget)
 {
        const char *name = rte_vdev_device_name(dev);
        const unsigned int numa_node = dev->device.numa_node;
@@ -1545,6 +1731,7 @@ init_internals(struct rte_vdev_device *dev, const char *if_name,
                internals->rx_queues[i].pair = &internals->tx_queues[i];
                internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
                internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
+               internals->rx_queues[i].busy_budget = busy_budget;
        }
 
        ret = get_iface_info(if_name, &internals->eth_addr,
@@ -1588,6 +1775,7 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
        int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
        int shared_umem = 0;
        char prog_path[PATH_MAX] = {'\0'};
+       int busy_budget = -1;
        struct rte_eth_dev *eth_dev = NULL;
        const char *name;
 
@@ -1595,16 +1783,11 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
                rte_vdev_device_name(dev));
 
        name = rte_vdev_device_name(dev);
-       if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
-               strlen(rte_vdev_device_args(dev)) == 0) {
-               eth_dev = rte_eth_dev_attach_secondary(name);
-               if (eth_dev == NULL) {
-                       AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
-                       return -EINVAL;
-               }
-               eth_dev->dev_ops = &ops;
-               rte_eth_dev_probing_finish(eth_dev);
-               return 0;
+       if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+               AF_XDP_LOG(ERR, "Failed to probe %s. "
+                               "AF_XDP PMD does not support secondary processes.\n",
+                               name);
+               return -ENOTSUP;
        }
 
        kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
@@ -1617,7 +1800,8 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
                dev->device.numa_node = rte_socket_id();
 
        if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
-                            &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
+                            &xsk_queue_cnt, &shared_umem, prog_path,
+                            &busy_budget) < 0) {
                AF_XDP_LOG(ERR, "Invalid kvargs value\n");
                return -EINVAL;
        }
@@ -1627,8 +1811,12 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
                return -EINVAL;
        }
 
+       busy_budget = busy_budget == -1 ? ETH_AF_XDP_DFLT_BUSY_BUDGET :
+                                       busy_budget;
+
        eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
-                                       xsk_queue_cnt, shared_umem, prog_path);
+                                       xsk_queue_cnt, shared_umem, prog_path,
+                                       busy_budget);
        if (eth_dev == NULL) {
                AF_XDP_LOG(ERR, "Failed to init internals\n");
                return -1;
@@ -1673,4 +1861,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
                              "start_queue=<int> "
                              "queue_count=<int> "
                              "shared_umem=<int> "
-                             "xdp_prog=<string> ");
+                             "xdp_prog=<string> "
+                             "busy_budget=<int>");