net/af_xdp: prefer busy polling
author Ciara Loftus <ciara.loftus@intel.com>
Wed, 10 Mar 2021 07:48:16 +0000 (07:48 +0000)
committer Ferruh Yigit <ferruh.yigit@intel.com>
Wed, 10 Mar 2021 17:49:32 +0000 (18:49 +0100)
This commit introduces support for preferred busy polling
to the AF_XDP PMD. This feature aims to improve single-core
performance for AF_XDP sockets under heavy load.

A new vdev arg is introduced called 'busy_budget' whose default
value is 64. busy_budget is the value supplied to the kernel
with the SO_BUSY_POLL_BUDGET socket option and represents the
busy-polling NAPI budget. To set the budget to a different value,
e.g. 256:

--vdev=net_af_xdp0,iface=eth0,busy_budget=256

Preferred busy polling is enabled by default provided a kernel with
version >= v5.11 is in use. To disable it, set the budget to zero.

The following settings are also strongly recommended to be used in
conjunction with this feature:

echo 2 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs
echo 200000 | sudo tee /sys/class/net/eth0/gro_flush_timeout

.. where eth0 is the interface being used by the PMD.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
doc/guides/nics/af_xdp.rst
doc/guides/rel_notes/release_21_05.rst
drivers/net/af_xdp/compat.h
drivers/net/af_xdp/rte_eth_af_xdp.c

index 5ed2437..8bf40b5 100644 (file)
@@ -35,6 +35,7 @@ The following options can be provided to set up an af_xdp port in DPDK.
 *   ``shared_umem`` - PMD will attempt to share UMEM with others (optional,
     default 0);
 *   ``xdp_prog`` - path to custom xdp program (optional, default none);
+*   ``busy_budget`` - busy polling budget (optional, default 64);
 
 Prerequisites
 -------------
@@ -51,6 +52,7 @@ This is a Linux-specific PMD, thus the following prerequisites apply:
 *  For shared_umem, it requires kernel version v5.10 or later and libbpf version
    v0.2.0 or later.
 *  For 32-bit OS, a kernel with version 5.4 or later is required.
+*  For busy polling, kernel version v5.11 or later is required.
 
 Set up an af_xdp interface
 -----------------------------
@@ -107,4 +109,38 @@ Limitations
   .. code-block:: console
 
     --vdev net_af_xdp0,iface=ens786f1,shared_umem=1 \
-    --vdev net_af_xdp1,iface=ens786f2,shared_umem=1 \
\ No newline at end of file
+    --vdev net_af_xdp1,iface=ens786f2,shared_umem=1 \
+
+- **Preferred Busy Polling**
+
+  The SO_PREFER_BUSY_POLL socket option was introduced in kernel v5.11. It can
+  deliver a performance improvement for sockets with heavy traffic loads and
+  can significantly improve single-core performance in this context.
+
+  The feature is enabled by default in the AF_XDP PMD. To disable it, set the
+  'busy_budget' vdev arg to zero:
+
+  .. code-block:: console
+
+    --vdev net_af_xdp0,iface=ens786f1,busy_budget=0
+
+  The default 'busy_budget' is 64 and it represents the number of packets the
+  kernel will attempt to process in the netdev's NAPI context. You can change
+  the value, for example to 256, like so:
+
+  .. code-block:: console
+
+    --vdev net_af_xdp0,iface=ens786f1,busy_budget=256
+
+  It is also strongly recommended to set the following for optimal performance:
+
+  .. code-block:: console
+
+    echo 2 | sudo tee /sys/class/net/ens786f1/napi_defer_hard_irqs
+    echo 200000 | sudo tee /sys/class/net/ens786f1/gro_flush_timeout
+
+  The above defers interrupts for interface ens786f1 and schedules its NAPI
+  context from a watchdog timer instead of from softirqs. More information
+  on this feature can be found at [1].
+
+  [1] https://lwn.net/Articles/837010/
\ No newline at end of file
index 21dc6d2..32f0127 100644 (file)
@@ -76,6 +76,10 @@ New Features
 
   * Added support for txgbevf PMD.
 
+* **Updated the AF_XDP driver.**
+
+  * Added support for preferred busy polling.
+
 * **Updated testpmd.**
 
   * Added a command line option to configure forced speed for Ethernet port.
index 7aa40d5..545c8aa 100644 (file)
@@ -39,3 +39,17 @@ create_shared_socket(struct xsk_socket **xsk_ptr __rte_unused,
        return -1;
 }
 #endif
+
+#ifdef XDP_USE_NEED_WAKEUP
+/*
+ * Tell the datapath whether it should issue a syscall for ring q.
+ * The result is non-zero when the ring's need-wakeup check is non-zero or
+ * when a non-zero busy polling budget is configured.
+ */
+static int
+syscall_needed(struct xsk_ring_prod *q, uint32_t busy_budget)
+{
+       int wakeup = xsk_ring_prod__needs_wakeup(q);
+
+       return wakeup | busy_budget;
+}
+#else
+/*
+ * XDP_USE_NEED_WAKEUP is not defined: the ring is not consulted and the
+ * result is just the busy polling budget.
+ */
+static int
+syscall_needed(struct xsk_ring_prod *q __rte_unused, uint32_t busy_budget)
+{
+       return busy_budget;
+}
+#endif
index 9c0e935..a64fef1 100644 (file)
 
 #include "compat.h"
 
+#ifndef SO_PREFER_BUSY_POLL
+#define SO_PREFER_BUSY_POLL 69
+#endif
+#ifndef SO_BUSY_POLL_BUDGET
+#define SO_BUSY_POLL_BUDGET 70
+#endif
+
 
 #ifndef SOL_XDP
 #define SOL_XDP 283
@@ -65,6 +72,8 @@ RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);
 #define ETH_AF_XDP_DFLT_NUM_DESCS      XSK_RING_CONS__DEFAULT_NUM_DESCS
 #define ETH_AF_XDP_DFLT_START_QUEUE_IDX        0
 #define ETH_AF_XDP_DFLT_QUEUE_COUNT    1
+#define ETH_AF_XDP_DFLT_BUSY_BUDGET    64
+#define ETH_AF_XDP_DFLT_BUSY_TIMEOUT   20
 
 #define ETH_AF_XDP_RX_BATCH_SIZE       XSK_RING_CONS__DEFAULT_NUM_DESCS
 #define ETH_AF_XDP_TX_BATCH_SIZE       XSK_RING_CONS__DEFAULT_NUM_DESCS
@@ -100,6 +109,7 @@ struct pkt_rx_queue {
        struct pkt_tx_queue *pair;
        struct pollfd fds[1];
        int xsk_queue_idx;
+       int busy_budget;
 };
 
 struct tx_stats {
@@ -140,6 +150,7 @@ struct pmd_internals {
 #define ETH_AF_XDP_QUEUE_COUNT_ARG             "queue_count"
 #define ETH_AF_XDP_SHARED_UMEM_ARG             "shared_umem"
 #define ETH_AF_XDP_PROG_ARG                    "xdp_prog"
+#define ETH_AF_XDP_BUDGET_ARG                  "busy_budget"
 
 static const char * const valid_arguments[] = {
        ETH_AF_XDP_IFACE_ARG,
@@ -147,6 +158,7 @@ static const char * const valid_arguments[] = {
        ETH_AF_XDP_QUEUE_COUNT_ARG,
        ETH_AF_XDP_SHARED_UMEM_ARG,
        ETH_AF_XDP_PROG_ARG,
+       ETH_AF_XDP_BUDGET_ARG,
        NULL
 };
 
@@ -261,11 +273,9 @@ af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
        nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
 
        if (nb_pkts == 0) {
-#if defined(XDP_USE_NEED_WAKEUP)
-               if (xsk_ring_prod__needs_wakeup(fq))
+               if (syscall_needed(&rxq->fq, rxq->busy_budget))
                        recvfrom(xsk_socket__fd(rxq->xsk), NULL, 0,
                                MSG_DONTWAIT, NULL, NULL);
-#endif
 
                return 0;
        }
@@ -446,9 +456,7 @@ kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
 
        pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
 
-#if defined(XDP_USE_NEED_WAKEUP)
-       if (xsk_ring_prod__needs_wakeup(&txq->tx))
-#endif
+       if (syscall_needed(&txq->tx, txq->pair->busy_budget))
                while (send(xsk_socket__fd(txq->pair->xsk), NULL,
                            0, MSG_DONTWAIT) < 0) {
                        /* some thing unexpected */
@@ -795,6 +803,8 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
        dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
 #endif
 
+       dev_info->default_rxportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
+       dev_info->default_txportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
        dev_info->default_rxportconf.nb_queues = 1;
        dev_info->default_txportconf.nb_queues = 1;
        dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
@@ -1142,6 +1152,65 @@ load_custom_xdp_prog(const char *prog_path, int if_index)
        return 0;
 }
 
+/*
+ * Try to enable preferred busy polling on the queue's XSK socket.
+ *
+ * Sets SO_PREFER_BUSY_POLL, SO_BUSY_POLL (ETH_AF_XDP_DFLT_BUSY_TIMEOUT) and
+ * SO_BUSY_POLL_BUDGET (rxq->busy_budget), in that order. If any of them
+ * cannot be set, the options applied so far are reset to zero in reverse
+ * order, rxq->busy_budget is cleared and 0 is still returned so that the
+ * caller carries on without busy polling.
+ *
+ * Returns 0 on success or on a clean fallback, -1 if an option that had
+ * been set could not be reset.
+ */
+static int
+configure_preferred_busy_poll(struct pkt_rx_queue *rxq)
+{
+       int sock_opt = 1;
+       int fd = xsk_socket__fd(rxq->xsk);
+       int ret;
+
+       ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
+                       (void *)&sock_opt, sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(DEBUG, "Failed to set SO_PREFER_BUSY_POLL\n");
+               /* Nothing was applied yet, so there is nothing to undo. */
+               goto err_prefer;
+       }
+
+       sock_opt = ETH_AF_XDP_DFLT_BUSY_TIMEOUT;
+       ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
+                       sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL\n");
+               /* Only SO_PREFER_BUSY_POLL needs to be reset. */
+               goto err_timeout;
+       }
+
+       sock_opt = rxq->busy_budget;
+       ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
+                       (void *)&sock_opt, sizeof(sock_opt));
+       if (ret >= 0) {
+               /* busy_budget is an int, so print it with %d. */
+               AF_XDP_LOG(INFO, "Busy polling budget set to: %d\n",
+                                       rxq->busy_budget);
+               return 0;
+       }
+       AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL_BUDGET\n");
+
+       /* setsockopt failure - attempt to restore xsk to default state and
+        * proceed without busy polling support.
+        */
+       sock_opt = 0;
+       ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
+                       sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(ERR, "Failed to unset SO_BUSY_POLL\n");
+               return -1;
+       }
+
+err_timeout:
+       sock_opt = 0;
+       ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
+                       (void *)&sock_opt, sizeof(sock_opt));
+       if (ret < 0) {
+               AF_XDP_LOG(ERR, "Failed to unset SO_PREFER_BUSY_POLL\n");
+               return -1;
+       }
+
+err_prefer:
+       /* A zero budget marks busy polling as disabled for this queue. */
+       rxq->busy_budget = 0;
+       return 0;
+}
+
+
 static int
 xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
              int ring_size)
@@ -1200,6 +1269,15 @@ xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
                goto err;
        }
 #endif
+
+       if (rxq->busy_budget) {
+               ret = configure_preferred_busy_poll(rxq);
+               if (ret) {
+                       AF_XDP_LOG(ERR, "Failed configure busy polling.\n");
+                       goto err;
+               }
+       }
+
        ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
        if (ret) {
                xsk_socket__delete(rxq->xsk);
@@ -1257,6 +1335,9 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
                goto err;
        }
 
+       if (!rxq->busy_budget)
+               AF_XDP_LOG(DEBUG, "Preferred busy polling not enabled\n");
+
        rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
        rxq->fds[0].events = POLLIN;
 
@@ -1363,6 +1444,24 @@ static const struct eth_dev_ops ops = {
        .stats_reset = eth_stats_reset,
 };
 
+/**
+ * Parse the busy_budget argument.
+ *
+ * The value must be a decimal integer in the range [0, UINT16_MAX] with no
+ * trailing characters. The int pointed to by extra_args is written only
+ * when the value is valid.
+ *
+ * Returns 0 on success, -EINVAL for a missing, malformed or out-of-range
+ * value.
+ */
+static int
+parse_budget_arg(const char *key __rte_unused,
+                 const char *value, void *extra_args)
+{
+       int *i = (int *)extra_args;
+       /* Treat a missing value like an empty string so it is rejected. */
+       const char *str = value != NULL ? value : "";
+       char *end;
+       long budget;
+
+       /* Range-check the long before narrowing to int: an overflowing
+        * input makes strtol() return LONG_MIN/LONG_MAX, which is rejected
+        * by the same bounds.
+        */
+       budget = strtol(str, &end, 10);
+       if (end == str || *end != '\0' ||
+           budget < 0 || budget > UINT16_MAX) {
+               AF_XDP_LOG(ERR, "Invalid busy_budget '%s', must be >= 0 and <= %u\n",
+                               str, (unsigned int)UINT16_MAX);
+               return -EINVAL;
+       }
+
+       *i = (int)budget;
+
+       return 0;
+}
+
+
 /** parse integer from integer argument */
 static int
 parse_integer_arg(const char *key __rte_unused,
@@ -1465,7 +1564,8 @@ xdp_get_channels_info(const char *if_name, int *max_queues,
 
 static int
 parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
-                       int *queue_cnt, int *shared_umem, char *prog_path)
+                       int *queue_cnt, int *shared_umem, char *prog_path,
+                       int *busy_budget)
 {
        int ret;
 
@@ -1496,6 +1596,11 @@ parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
        if (ret < 0)
                goto free_kvlist;
 
+       ret = rte_kvargs_process(kvlist, ETH_AF_XDP_BUDGET_ARG,
+                               &parse_budget_arg, busy_budget);
+       if (ret < 0)
+               goto free_kvlist;
+
 free_kvlist:
        rte_kvargs_free(kvlist);
        return ret;
@@ -1534,7 +1639,7 @@ error:
 static struct rte_eth_dev *
 init_internals(struct rte_vdev_device *dev, const char *if_name,
                int start_queue_idx, int queue_cnt, int shared_umem,
-               const char *prog_path)
+               const char *prog_path, int busy_budget)
 {
        const char *name = rte_vdev_device_name(dev);
        const unsigned int numa_node = dev->device.numa_node;
@@ -1595,6 +1700,7 @@ init_internals(struct rte_vdev_device *dev, const char *if_name,
                internals->rx_queues[i].pair = &internals->tx_queues[i];
                internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
                internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
+               internals->rx_queues[i].busy_budget = busy_budget;
        }
 
        ret = get_iface_info(if_name, &internals->eth_addr,
@@ -1638,6 +1744,7 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
        int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
        int shared_umem = 0;
        char prog_path[PATH_MAX] = {'\0'};
+       int busy_budget = -1;
        struct rte_eth_dev *eth_dev = NULL;
        const char *name;
 
@@ -1667,7 +1774,8 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
                dev->device.numa_node = rte_socket_id();
 
        if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
-                            &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
+                            &xsk_queue_cnt, &shared_umem, prog_path,
+                            &busy_budget) < 0) {
                AF_XDP_LOG(ERR, "Invalid kvargs value\n");
                return -EINVAL;
        }
@@ -1677,8 +1785,12 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
                return -EINVAL;
        }
 
+       busy_budget = busy_budget == -1 ? ETH_AF_XDP_DFLT_BUSY_BUDGET :
+                                       busy_budget;
+
        eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
-                                       xsk_queue_cnt, shared_umem, prog_path);
+                                       xsk_queue_cnt, shared_umem, prog_path,
+                                       busy_budget);
        if (eth_dev == NULL) {
                AF_XDP_LOG(ERR, "Failed to init internals\n");
                return -1;
@@ -1723,4 +1835,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
                              "start_queue=<int> "
                              "queue_count=<int> "
                              "shared_umem=<int> "
-                             "xdp_prog=<string> ");
+                             "xdp_prog=<string> "
+                             "busy_budget=<int>");