net/mlx5: implement vectorized MPRQ burst
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6dd211e..e4ce9a9 100644
@@ -9,19 +9,6 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <errno.h>
-#include <net/if.h>
-#include <sys/mman.h>
-#include <linux/rtnetlink.h>
-
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <infiniband/verbs.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include <rte_malloc.h>
 #include <rte_ethdev_driver.h>
@@ -40,6 +27,7 @@
 #include <mlx5_common.h>
 #include <mlx5_common_os.h>
 #include <mlx5_common_mp.h>
+#include <mlx5_common_pci.h>
 #include <mlx5_malloc.h>
 
 #include "mlx5_defs.h"
@@ -253,6 +241,28 @@ static const struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
                .free = mlx5_free,
                .type = "mlx5_jump_ipool",
        },
+       {
+               .size = sizeof(struct mlx5_flow_dv_sample_resource),
+               .trunk_size = 64,
+               .grow_trunk = 3,
+               .grow_shift = 2,
+               .need_lock = 0,
+               .release_mem_en = 1,
+               .malloc = mlx5_malloc,
+               .free = mlx5_free,
+               .type = "mlx5_sample_ipool",
+       },
+       {
+               .size = sizeof(struct mlx5_flow_dv_dest_array_resource),
+               .trunk_size = 64,
+               .grow_trunk = 3,
+               .grow_shift = 2,
+               .need_lock = 0,
+               .release_mem_en = 1,
+               .malloc = mlx5_malloc,
+               .free = mlx5_free,
+               .type = "mlx5_dest_array_ipool",
+       },
 #endif
        {
                .size = sizeof(struct mlx5_flow_meter),
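
Note: the two entries added above for the sample and dest-array resources reuse the indexed-pool growth parameters (trunk_size, grow_trunk, grow_shift) already used by the neighbouring caches. The standalone sketch below only illustrates how trunk capacities are assumed to grow under those settings; trunk_size_get() is a hypothetical helper, not the PMD's mlx5_utils implementation.

#include <stdint.h>
#include <stdio.h>

static uint32_t
trunk_size_get(uint32_t trunk_size, uint32_t grow_trunk,
	       uint32_t grow_shift, uint32_t trunk_idx)
{
	/* Growth is assumed to saturate after grow_trunk trunks. */
	uint32_t steps = trunk_idx < grow_trunk ? trunk_idx : grow_trunk;

	return trunk_size << (steps * grow_shift);
}

int
main(void)
{
	uint32_t i;

	for (i = 0; i < 5; i++)
		printf("trunk %u: %u entries\n", i, trunk_size_get(64, 3, 2, i));
	/* Prints 64, 256, 1024, 4096, 4096 under this assumption. */
	return 0;
}
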
@@ -474,14 +484,13 @@ mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
 
        memset(&sh->cmng, 0, sizeof(sh->cmng));
        TAILQ_INIT(&sh->cmng.flow_counters);
-       for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
-               sh->cmng.ccont[i].min_id = MLX5_CNT_BATCH_OFFSET;
-               sh->cmng.ccont[i].max_id = -1;
-               sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
-               TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
-               rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
-               TAILQ_INIT(&sh->cmng.ccont[i].counters);
-               rte_spinlock_init(&sh->cmng.ccont[i].csl);
+       sh->cmng.min_id = MLX5_CNT_BATCH_OFFSET;
+       sh->cmng.max_id = -1;
+       sh->cmng.last_pool_idx = POOL_IDX_INVALID;
+       rte_spinlock_init(&sh->cmng.pool_update_sl);
+       for (i = 0; i < MLX5_COUNTER_TYPE_MAX; i++) {
+               TAILQ_INIT(&sh->cmng.counters[i]);
+               rte_spinlock_init(&sh->cmng.csl[i]);
        }
 }
 
@@ -512,8 +521,7 @@ static void
 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
 {
        struct mlx5_counter_stats_mem_mng *mng;
-       int i;
-       int j;
+       int i, j;
        int retries = 1024;
 
        rte_errno = 0;
@@ -523,34 +531,33 @@ mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
                        break;
                rte_pause();
        }
-       for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
+
+       if (sh->cmng.pools) {
                struct mlx5_flow_counter_pool *pool;
-               uint32_t batch = !!(i > 1);
+               uint16_t n_valid = sh->cmng.n_valid;
+               bool fallback = sh->cmng.counter_fallback;
 
-               if (!sh->cmng.ccont[i].pools)
-                       continue;
-               pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
-               while (pool) {
-                       if (batch && pool->min_dcs)
+               for (i = 0; i < n_valid; ++i) {
+                       pool = sh->cmng.pools[i];
+                       if (!fallback && pool->min_dcs)
                                claim_zero(mlx5_devx_cmd_destroy
                                                               (pool->min_dcs));
                        for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
-                               if (MLX5_POOL_GET_CNT(pool, j)->action)
+                               struct mlx5_flow_counter *cnt =
+                                               MLX5_POOL_GET_CNT(pool, j);
+
+                               if (cnt->action)
                                        claim_zero
                                         (mlx5_glue->destroy_flow_action
-                                         (MLX5_POOL_GET_CNT
-                                         (pool, j)->action));
-                               if (!batch && MLX5_GET_POOL_CNT_EXT
-                                   (pool, j)->dcs)
+                                         (cnt->action));
+                               if (fallback && MLX5_POOL_GET_CNT
+                                   (pool, j)->dcs_when_free)
                                        claim_zero(mlx5_devx_cmd_destroy
-                                                  (MLX5_GET_POOL_CNT_EXT
-                                                   (pool, j)->dcs));
+                                                  (cnt->dcs_when_free));
                        }
-                       TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
                        mlx5_free(pool);
-                       pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
                }
-               mlx5_free(sh->cmng.ccont[i].pools);
+               mlx5_free(sh->cmng.pools);
        }
        mng = LIST_FIRST(&sh->cmng.mem_mngs);
        while (mng) {
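
Note: the rewritten close loop above matches the flattened counter management set up in mlx5_flow_counters_mng_init() earlier in this patch: the per-container TAILQs, locks and pool arrays are replaced by one resizable pool array plus per-type free-counter lists. The sketch below only summarizes the fields these hunks rely on; names follow the diff, types are simplified stand-ins, and the real struct in mlx5.h has more members.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/queue.h>

struct flow_counter;                       /* ~ struct mlx5_flow_counter */
struct flow_counter_pool;                  /* ~ struct mlx5_flow_counter_pool */

#define COUNTER_TYPE_MAX 2                 /* ~ MLX5_COUNTER_TYPE_MAX */

/* Simplified sketch of the flattened counter manager used by this patch. */
struct counters_mng_sketch {
	struct flow_counter_pool **pools;  /* single resizable pool array */
	uint16_t n_valid;                  /* number of allocated pools */
	bool counter_fallback;             /* per-counter DevX objects in use */
	uint32_t min_id;                   /* DevX counter object ID range */
	uint32_t max_id;
	uint16_t last_pool_idx;
	/* One free-counter list per counter type instead of per container;
	 * the real struct pairs each list with a spinlock (csl[]). */
	TAILQ_HEAD(, flow_counter) counters[COUNTER_TYPE_MAX];
};

int
main(void)
{
	printf("sketch occupies %zu bytes\n",
	       sizeof(struct counters_mng_sketch));
	return 0;
}
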
@@ -716,6 +723,144 @@ mlx5_flex_parser_ecpri_release(struct rte_eth_dev *dev)
        prf->obj = NULL;
 }
 
+/*
+ * Allocate Rx and Tx UARs in a robust fashion.
+ * This routine handles the following UAR allocation issues:
+ *
+ *  - tries to allocate the UAR with the most appropriate memory
+ *    mapping type from the ones supported by the host
+ *
+ *  - tries to allocate a UAR with a non-NULL base address.
+ *    OFED 5.0.x and upstream rdma-core before v29 returned NULL as the
+ *    UAR base address if the UAR was not the first object in the UAR page.
+ *    This caused the PMD to fail, so we keep requesting UARs until one
+ *    with a non-NULL base address is returned.
+ */
+static int
+mlx5_alloc_rxtx_uars(struct mlx5_dev_ctx_shared *sh,
+                    const struct mlx5_dev_config *config)
+{
+       uint32_t uar_mapping, retry;
+       int err = 0;
+       void *base_addr;
+
+       for (retry = 0; retry < MLX5_ALLOC_UAR_RETRY; ++retry) {
+#ifdef MLX5DV_UAR_ALLOC_TYPE_NC
+               /* Control the mapping type according to the settings. */
+               uar_mapping = (config->dbnc == MLX5_TXDB_NCACHED) ?
+                             MLX5DV_UAR_ALLOC_TYPE_NC :
+                             MLX5DV_UAR_ALLOC_TYPE_BF;
+#else
+               RTE_SET_USED(config);
+               /*
+                * It seems there is no way to control the memory mapping
+                * type for the UAR; the default "Write-Combining" type is
+                * assumed. The UAR initialization on queue creation queries
+                * the actual mapping type chosen by Verbs/kernel and sets up
+                * the PMD datapath accordingly.
+                */
+               uar_mapping = 0;
+#endif
+               sh->tx_uar = mlx5_glue->devx_alloc_uar(sh->ctx, uar_mapping);
+#ifdef MLX5DV_UAR_ALLOC_TYPE_NC
+               if (!sh->tx_uar &&
+                   uar_mapping == MLX5DV_UAR_ALLOC_TYPE_BF) {
+                       if (config->dbnc == MLX5_TXDB_CACHED ||
+                           config->dbnc == MLX5_TXDB_HEURISTIC)
+                               DRV_LOG(WARNING, "Devarg tx_db_nc setting "
+                                                "is not supported by DevX");
+                       /*
+                        * In some environments, such as virtual machines,
+                        * the Write-Combining mapping may not be supported
+                        * and UAR allocation fails. Try the "Non-Cached"
+                        * mapping in that case. The tx_burst routines take
+                        * the UAR mapping type into account during UAR setup
+                        * on queue creation.
+                        */
+                       DRV_LOG(WARNING, "Failed to allocate Tx DevX UAR (BF)");
+                       uar_mapping = MLX5DV_UAR_ALLOC_TYPE_NC;
+                       sh->tx_uar = mlx5_glue->devx_alloc_uar
+                                                       (sh->ctx, uar_mapping);
+               } else if (!sh->tx_uar &&
+                          uar_mapping == MLX5DV_UAR_ALLOC_TYPE_NC) {
+                       if (config->dbnc == MLX5_TXDB_NCACHED)
+                               DRV_LOG(WARNING, "Devarg tx_db_nc setting "
+                                                "is not supported by DevX");
+                       /*
+                        * If Verbs/kernel does not support the "Non-Cached"
+                        * mapping, fall back to "Write-Combining".
+                        */
+                       DRV_LOG(WARNING, "Failed to allocate Tx DevX UAR (NC)");
+                       uar_mapping = MLX5DV_UAR_ALLOC_TYPE_BF;
+                       sh->tx_uar = mlx5_glue->devx_alloc_uar
+                                                       (sh->ctx, uar_mapping);
+               }
+#endif
+               if (!sh->tx_uar) {
+                       DRV_LOG(ERR, "Failed to allocate Tx DevX UAR (BF/NC)");
+                       err = ENOMEM;
+                       goto exit;
+               }
+               base_addr = mlx5_os_get_devx_uar_base_addr(sh->tx_uar);
+               if (base_addr)
+                       break;
+               /*
+                * The UARs are allocated by rdma-core within the
+                * IB device context; on context closure all UARs
+                * are freed, so there is no memory/object leakage.
+                */
+               DRV_LOG(WARNING, "Retrying to allocate Tx DevX UAR");
+               sh->tx_uar = NULL;
+       }
+       /* Check whether we finally succeeded with valid UAR allocation. */
+       if (!sh->tx_uar) {
+               DRV_LOG(ERR, "Failed to allocate Tx DevX UAR (NULL base)");
+               err = ENOMEM;
+               goto exit;
+       }
+       for (retry = 0; retry < MLX5_ALLOC_UAR_RETRY; ++retry) {
+               uar_mapping = 0;
+               sh->devx_rx_uar = mlx5_glue->devx_alloc_uar
+                                                       (sh->ctx, uar_mapping);
+#ifdef MLX5DV_UAR_ALLOC_TYPE_NC
+               if (!sh->devx_rx_uar &&
+                   uar_mapping == MLX5DV_UAR_ALLOC_TYPE_BF) {
+                       /*
+                        * The Rx UAR is used to control interrupts only,
+                        * so there should be no noticeable datapath impact;
+                        * the "Non-Cached" mapping can be tried safely.
+                        */
+                       DRV_LOG(WARNING, "Failed to allocate Rx DevX UAR (BF)");
+                       uar_mapping = MLX5DV_UAR_ALLOC_TYPE_NC;
+                       sh->devx_rx_uar = mlx5_glue->devx_alloc_uar
+                                                       (sh->ctx, uar_mapping);
+               }
+#endif
+               if (!sh->devx_rx_uar) {
+                       DRV_LOG(ERR, "Failed to allocate Rx DevX UAR (BF/NC)");
+                       err = ENOMEM;
+                       goto exit;
+               }
+               base_addr = mlx5_os_get_devx_uar_base_addr(sh->devx_rx_uar);
+               if (base_addr)
+                       break;
+               /*
+                * The UARs are allocated by rdma-core within the
+                * IB device context; on context closure all UARs
+                * are freed, so there is no memory/object leakage.
+                */
+               DRV_LOG(WARNING, "Retrying to allocate Rx DevX UAR");
+               sh->devx_rx_uar = NULL;
+       }
+       /* Check whether we finally succeeded with valid UAR allocation. */
+       if (!sh->devx_rx_uar) {
+               DRV_LOG(ERR, "Failed to allocate Rx DevX UAR (NULL base)");
+               err = ENOMEM;
+       }
+exit:
+       return err;
+}
+
 /**
  * Allocate shared device context. If there is multiport device the
  * master and representors will share this context, if there is single
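
Note: the retry loops above exist because, as the function's header comment says, OFED 5.0.x and older rdma-core could hand back a UAR whose base address is NULL when the UAR was not the first object in its page. The toy sketch below models only that retry-until-non-NULL-base pattern with a stub allocator; it does not use the real mlx5_glue->devx_alloc_uar()/mlx5_os_get_devx_uar_base_addr() calls shown in the hunk.

#include <stddef.h>
#include <stdio.h>

#define ALLOC_UAR_RETRY 32                 /* ~ MLX5_ALLOC_UAR_RETRY */

struct uar_stub {
	void *base_addr;
};

/* Stub allocator: returns a bad (NULL-base) UAR on the first call, as older
 * rdma-core releases could, and a usable one afterwards. */
static struct uar_stub *
alloc_uar_stub(void)
{
	static struct uar_stub bad = { .base_addr = NULL };
	static struct uar_stub good = { .base_addr = (void *)0x1000 };
	static int calls;

	return calls++ == 0 ? &bad : &good;
}

int
main(void)
{
	struct uar_stub *uar = NULL;
	unsigned int retry;

	for (retry = 0; retry < ALLOC_UAR_RETRY; ++retry) {
		uar = alloc_uar_stub();
		if (uar == NULL)
			break;              /* hard allocation failure */
		if (uar->base_addr != NULL)
			break;              /* usable UAR found */
		/* NULL base address: the object stays owned by the device
		 * context and is released on context closure; try again. */
		uar = NULL;
	}
	printf("%s\n", uar != NULL ? "got a usable UAR" : "allocation failed");
	return uar != NULL ? 0 : 1;
}
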
@@ -799,6 +944,14 @@ mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
                goto error;
        }
        if (sh->devx) {
+               /* Query the EQN for this core. */
+               err = mlx5_glue->devx_query_eqn(sh->ctx, 0, &sh->eqn);
+               if (err) {
+                       rte_errno = errno;
+                       DRV_LOG(ERR, "Failed to query event queue number %d.",
+                               rte_errno);
+                       goto error;
+               }
                err = mlx5_os_get_pdn(sh->pd, &sh->pdn);
                if (err) {
                        DRV_LOG(ERR, "Fail to extract pdn from PD");
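
Note: the EQN queried above is cached in the shared context for later DevX completion queue creation. mlx5_glue->devx_query_eqn() is assumed here to be a thin wrapper over the rdma-core verb of the same name; the standalone sketch below queries vector 0 on the first IB device, with error handling trimmed.

#include <stdint.h>
#include <stdio.h>
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>

int
main(void)
{
	int num = 0;
	struct ibv_device **list = ibv_get_device_list(&num);
	struct ibv_context *ctx;
	uint32_t eqn = 0;

	if (list == NULL || num <= 0)
		return 1;
	ctx = ibv_open_device(list[0]);
	ibv_free_device_list(list);
	if (ctx == NULL)
		return 1;
	/* Vector 0, as in the hunk above. */
	if (mlx5dv_devx_query_eqn(ctx, 0, &eqn) == 0)
		printf("EQN for vector 0: %u\n", eqn);
	ibv_close_device(ctx);
	return 0;
}
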
@@ -817,12 +970,14 @@ mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
                        err = ENOMEM;
                        goto error;
                }
-               sh->tx_uar = mlx5_glue->devx_alloc_uar(sh->ctx, 0);
-               if (!sh->tx_uar) {
-                       DRV_LOG(ERR, "Failed to allocate DevX UAR.");
-                       err = ENOMEM;
+               err = mlx5_alloc_rxtx_uars(sh, config);
+               if (err)
                        goto error;
-               }
+               MLX5_ASSERT(sh->tx_uar);
+               MLX5_ASSERT(mlx5_os_get_devx_uar_base_addr(sh->tx_uar));
+
+               MLX5_ASSERT(sh->devx_rx_uar);
+               MLX5_ASSERT(mlx5_os_get_devx_uar_base_addr(sh->devx_rx_uar));
        }
        sh->flow_id_pool = mlx5_flow_id_pool_alloc
                                        ((1 << HAIRPIN_FLOW_ID_BITS) - 1);
@@ -878,18 +1033,16 @@ error:
        pthread_mutex_destroy(&sh->txpp.mutex);
        pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
        MLX5_ASSERT(sh);
-       if (sh->cnt_id_tbl) {
+       if (sh->cnt_id_tbl)
                mlx5_l3t_destroy(sh->cnt_id_tbl);
-               sh->cnt_id_tbl = NULL;
-       }
-       if (sh->tx_uar) {
-               mlx5_glue->devx_free_uar(sh->tx_uar);
-               sh->tx_uar = NULL;
-       }
        if (sh->tis)
                claim_zero(mlx5_devx_cmd_destroy(sh->tis));
        if (sh->td)
                claim_zero(mlx5_devx_cmd_destroy(sh->td));
+       if (sh->devx_rx_uar)
+               mlx5_glue->devx_free_uar(sh->devx_rx_uar);
+       if (sh->tx_uar)
+               mlx5_glue->devx_free_uar(sh->tx_uar);
        if (sh->pd)
                claim_zero(mlx5_glue->dealloc_pd(sh->pd));
        if (sh->ctx)
@@ -940,6 +1093,7 @@ mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
        mlx5_mr_release_cache(&sh->share_cache);
        /* Remove context from the global device list. */
        LIST_REMOVE(sh, next);
+       pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
        /*
         *  Ensure there is no async event handler installed.
         *  Only primary process handles async device events.
@@ -961,12 +1115,15 @@ mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
                claim_zero(mlx5_devx_cmd_destroy(sh->tis));
        if (sh->td)
                claim_zero(mlx5_devx_cmd_destroy(sh->td));
+       if (sh->devx_rx_uar)
+               mlx5_glue->devx_free_uar(sh->devx_rx_uar);
        if (sh->ctx)
                claim_zero(mlx5_glue->close_device(sh->ctx));
        if (sh->flow_id_pool)
                mlx5_flow_id_pool_release(sh->flow_id_pool);
        pthread_mutex_destroy(&sh->txpp.mutex);
        mlx5_free(sh);
+       return;
 exit:
        pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
 }
@@ -1211,7 +1368,7 @@ mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
  * @param dev
  *   Pointer to Ethernet device structure.
  */
-void
+int
 mlx5_dev_close(struct rte_eth_dev *dev)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -1221,14 +1378,14 @@ mlx5_dev_close(struct rte_eth_dev *dev)
        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                /* Check if process_private released. */
                if (!dev->process_private)
-                       return;
+                       return 0;
                mlx5_tx_uar_uninit_secondary(dev);
                mlx5_proc_priv_uninit(dev);
                rte_eth_dev_release_port(dev);
-               return;
+               return 0;
        }
        if (!priv->sh)
-               return;
+               return 0;
        DRV_LOG(DEBUG, "port %u closing device \"%s\"",
                dev->data->port_id,
                ((priv->sh->ctx != NULL) ?
@@ -1281,9 +1438,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
        if (priv->reta_idx != NULL)
                mlx5_free(priv->reta_idx);
        if (priv->config.vf)
-               mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
-                                      dev->data->mac_addrs,
-                                      MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
+               mlx5_os_mac_addr_flush(dev);
        if (priv->nl_socket_route >= 0)
                close(priv->nl_socket_route);
        if (priv->nl_socket_rdma >= 0)
@@ -1321,7 +1476,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
        /*
         * Free the shared context in last turn, because the cleanup
         * routines above may use some shared fields, like
-        * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieveing
+        * mlx5_os_mac_addr_flush() uses ibdev_path for retrieving
         * ifindex if Netlink fails.
         */
        mlx5_free_shared_dev_ctx(priv->sh);
@@ -1351,6 +1506,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
         * it is freed when dev_private is freed.
         */
        dev->data->mac_addrs = NULL;
+       return 0;
 }
 
 /**
@@ -1881,6 +2037,7 @@ static int
 mlx5_pci_remove(struct rte_pci_device *pci_dev)
 {
        uint16_t port_id;
+       int ret = 0;
 
        RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) {
                /*
@@ -1888,11 +2045,11 @@ mlx5_pci_remove(struct rte_pci_device *pci_dev)
                 * call the close function explicitly for secondary process.
                 */
                if (rte_eal_process_type() == RTE_PROC_SECONDARY)
-                       mlx5_dev_close(&rte_eth_devices[port_id]);
+                       ret |= mlx5_dev_close(&rte_eth_devices[port_id]);
                else
-                       rte_eth_dev_close(port_id);
+                       ret |= rte_eth_dev_close(port_id);
        }
-       return 0;
+       return ret == 0 ? 0 : -EIO;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {
@@ -1965,16 +2122,19 @@ static const struct rte_pci_id mlx5_pci_id_map[] = {
        }
 };
 
-struct rte_pci_driver mlx5_driver = {
-       .driver = {
-               .name = MLX5_DRIVER_NAME
+static struct mlx5_pci_driver mlx5_driver = {
+       .driver_class = MLX5_CLASS_NET,
+       .pci_driver = {
+               .driver = {
+                       .name = MLX5_DRIVER_NAME,
+               },
+               .id_table = mlx5_pci_id_map,
+               .probe = mlx5_os_pci_probe,
+               .remove = mlx5_pci_remove,
+               .dma_map = mlx5_dma_map,
+               .dma_unmap = mlx5_dma_unmap,
+               .drv_flags = PCI_DRV_FLAGS,
        },
-       .id_table = mlx5_pci_id_map,
-       .probe = mlx5_os_pci_probe,
-       .remove = mlx5_pci_remove,
-       .dma_map = mlx5_dma_map,
-       .dma_unmap = mlx5_dma_unmap,
-       .drv_flags = PCI_DRV_FLAGS,
 };
 
 /* Initialize driver log type. */
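
Note: the driver is now registered through the mlx5 common PCI layer so that several classes (net, vDPA, RegEx) can share one PCI device. The sketch below only illustrates the wrap-and-register shape used above with simplified stand-in types; the real struct mlx5_pci_driver and mlx5_pci_driver_register() live in the mlx5 common PCI code, which is expected to register a single shared rte_pci_driver with the PCI bus on behalf of all classes (omitted here).

#include <stdio.h>
#include <sys/queue.h>

enum drv_class { CLASS_NET = 1, CLASS_VDPA = 2, CLASS_REGEX = 4 };

struct pci_driver_stub {                   /* ~ struct rte_pci_driver */
	const char *name;
};

struct class_pci_driver {                  /* ~ struct mlx5_pci_driver */
	enum drv_class driver_class;
	struct pci_driver_stub pci_driver;
	TAILQ_ENTRY(class_pci_driver) next;
};

static TAILQ_HEAD(, class_pci_driver) drv_list =
	TAILQ_HEAD_INITIALIZER(drv_list);

/* Stand-in for mlx5_pci_driver_register(): record the class driver so a
 * shared PCI probe could later dispatch to every registered class. */
static void
class_driver_register(struct class_pci_driver *drv)
{
	TAILQ_INSERT_TAIL(&drv_list, drv, next);
}

int
main(void)
{
	static struct class_pci_driver net_drv = {
		.driver_class = CLASS_NET,
		.pci_driver = { .name = "mlx5_net" },
	};
	struct class_pci_driver *it;

	class_driver_register(&net_drv);
	TAILQ_FOREACH(it, &drv_list, next)
		printf("registered class %d (%s)\n",
		       it->driver_class, it->pci_driver.name);
	return 0;
}
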
@@ -1985,12 +2145,13 @@ RTE_LOG_REGISTER(mlx5_logtype, pmd.net.mlx5, NOTICE)
  */
 RTE_INIT(rte_mlx5_pmd_init)
 {
+       mlx5_common_init();
        /* Build the static tables for Verbs conversion. */
        mlx5_set_ptype_table();
        mlx5_set_cksum_table();
        mlx5_set_swp_types_table();
        if (mlx5_glue)
-               rte_pci_register(&mlx5_driver);
+               mlx5_pci_driver_register(&mlx5_driver);
 }
 
 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);