+/*
+ * Send FLOW_AGED event if needed.
+ *
+ * Walks every port of the shared context. A port is considered only when its
+ * MLX5_AGE_EVENT_NEW flag is set; that flag is consumed. The callback is then
+ * invoked only if MLX5_AGE_TRIGGER is set too, and that flag is cleared
+ * before the callback runs.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ */
+void
+mlx5_age_event_prepare(struct mlx5_dev_ctx_shared *sh)
+{
+	uint32_t port_idx;
+
+	for (port_idx = 0; port_idx < sh->max_port; port_idx++) {
+		struct mlx5_age_info *info = &sh->port[port_idx].age_info;
+
+		if (MLX5_AGE_GET(info, MLX5_AGE_EVENT_NEW)) {
+			/* Consume the "new event" mark whatever happens next. */
+			MLX5_AGE_UNSET(info, MLX5_AGE_EVENT_NEW);
+			if (!MLX5_AGE_GET(info, MLX5_AGE_TRIGGER))
+				continue;
+			MLX5_AGE_UNSET(info, MLX5_AGE_TRIGGER);
+			rte_eth_dev_callback_process
+				(&rte_eth_devices[sh->port[port_idx].devx_ih_port_id],
+				 RTE_ETH_EVENT_FLOW_AGED, NULL);
+		}
+	}
+}
+
+/*
+ * Initialize the ASO connection tracking structure.
+ *
+ * The call is a no-op when sh->ct_mng is already set. Otherwise a zeroed
+ * management structure is allocated, the connection tracking ASO queue is
+ * initialized and the locks and the free list of the structure are set up.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_aso_ct_mng_init(struct mlx5_dev_ctx_shared *sh)
+{
+	int err;
+
+	if (sh->ct_mng)
+		return 0;
+	sh->ct_mng = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*sh->ct_mng),
+				 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+	if (!sh->ct_mng) {
+		DRV_LOG(ERR, "ASO CT management allocation failed.");
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	err = mlx5_aso_queue_init(sh, ASO_OPC_MOD_CONNECTION_TRACKING);
+	if (err) {
+		mlx5_free(sh->ct_mng);
+		/*
+		 * Clear the pointer: a stale value would make the next call
+		 * return success on freed memory and would make the close
+		 * path release the structure a second time.
+		 */
+		sh->ct_mng = NULL;
+		/* rte_errno should be extracted from the failure. */
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	rte_spinlock_init(&sh->ct_mng->ct_sl);
+	rte_rwlock_init(&sh->ct_mng->resize_rwl);
+	LIST_INIT(&sh->ct_mng->free_cts);
+	return 0;
+}
+
+/*
+ * Close and release all the resources of the
+ * ASO connection tracking management structure.
+ *
+ * The ASO queue is uninitialized first, then the pools are released from the
+ * last allocated one down to the first. For every action of a pool the
+ * reference counter is dropped once and its DR actions (when compiled in)
+ * are destroyed; actions that were still referenced are only reported in a
+ * debug message.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object to free.
+ */
+static void
+mlx5_flow_aso_ct_mng_close(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_aso_ct_pools_mng *mng = sh->ct_mng;
+	struct mlx5_aso_ct_pool *ct_pool;
+	struct mlx5_aso_ct_action *ct;
+	uint32_t idx;
+	uint32_t val;
+	uint32_t cnt;
+	int i;
+
+	mlx5_aso_queue_uninit(sh, ASO_OPC_MOD_CONNECTION_TRACKING);
+	idx = mng->next;
+	while (idx--) {
+		/* Number of actions of this pool that are still referenced. */
+		cnt = 0;
+		ct_pool = mng->pools[idx];
+		for (i = 0; i < MLX5_ASO_CT_ACTIONS_PER_POOL; i++) {
+			ct = &ct_pool->actions[i];
+			val = __atomic_fetch_sub(&ct->refcnt, 1,
+						 __ATOMIC_RELAXED);
+			MLX5_ASSERT(val == 1);
+			if (val > 1)
+				cnt++;
+#ifdef HAVE_MLX5_DR_ACTION_ASO_CT
+			if (ct->dr_action_orig)
+				claim_zero(mlx5_glue->destroy_flow_action
+							(ct->dr_action_orig));
+			if (ct->dr_action_rply)
+				claim_zero(mlx5_glue->destroy_flow_action
+							(ct->dr_action_rply));
+#endif
+		}
+		claim_zero(mlx5_devx_cmd_destroy(ct_pool->devx_obj));
+		if (cnt) {
+			/* Report the pool index, not the inner loop counter. */
+			DRV_LOG(DEBUG, "%u ASO CT objects are being used in the pool %u",
+				cnt, idx);
+		}
+		mlx5_free(ct_pool);
+		/* in case of failure. */
+		mng->next--;
+	}
+	mlx5_free(mng->pools);
+	mlx5_free(mng);
+	/* Management structure must be cleared to 0s during allocation. */
+	sh->ct_mng = NULL;
+}
+
+/**
+ * Initialize the flow resources' indexed mempool.
+ *
+ * Every pool starts from its template in mlx5_ipool_cfg[]; the template is
+ * copied, adjusted from the device configuration and handed to
+ * mlx5_ipool_create(). The result is stored in sh->ipool[] as is.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ * @param[in] config
+ *   Pointer to user dev config.
+ */
+static void
+mlx5_flow_ipool_create(struct mlx5_dev_ctx_shared *sh,
+		       const struct mlx5_dev_config *config)
+{
+	uint8_t pool;
+
+	for (pool = 0; pool < MLX5_IPOOL_MAX; ++pool) {
+		struct mlx5_indexed_pool_config pool_cfg = mlx5_ipool_cfg[pool];
+
+		/*
+		 * Set MLX5_IPOOL_MLX5_FLOW ipool size
+		 * according to PCI function flow configuration.
+		 */
+		if (pool == MLX5_IPOOL_MLX5_FLOW)
+			pool_cfg.size = config->dv_flow_en ?
+					sizeof(struct mlx5_flow_handle) :
+					MLX5_FLOW_HANDLE_VERBS_SIZE;
+		if (!config->reclaim_mode) {
+			pool_cfg.release_mem_en = 0;
+		} else {
+			/* Reclaim mode: release memory, no per-core cache. */
+			pool_cfg.release_mem_en = 1;
+			pool_cfg.per_core_cache = 0;
+		}
+		sh->ipool[pool] = mlx5_ipool_create(&pool_cfg);
+	}
+}
+
+
+/**
+ * Release the flow resources' indexed mempool.
+ *
+ * All sh->ipool[] entries are destroyed unconditionally; the entries of
+ * sh->mdh_ipools[] are destroyed only when they are set.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ */
+static void
+mlx5_flow_ipool_destroy(struct mlx5_dev_ctx_shared *sh)
+{
+	uint8_t n;
+
+	for (n = 0; n < MLX5_IPOOL_MAX; ++n)
+		mlx5_ipool_destroy(sh->ipool[n]);
+	for (n = 0; n < MLX5_MAX_MODIFY_NUM; ++n) {
+		if (!sh->mdh_ipools[n])
+			continue;
+		mlx5_ipool_destroy(sh->mdh_ipools[n]);
+	}
+}
+
+/*
+ * Check if dynamic flex parser for eCPRI already exists.
+ *
+ * The parser is considered present when the object pointer of the
+ * MLX5_FLEX_PARSER_ECPRI_0 profile in the shared context is set.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   true on exists, false on not.
+ */
+bool
+mlx5_flex_parser_ecpri_exist(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	return priv->sh->fp[MLX5_FLEX_PARSER_ECPRI_0].obj != NULL;
+}
+
+/*
+ * Allocation of a flex parser for eCPRI. Once created, this parser related
+ * resources will be held until the device is closed.
+ *
+ * The node is described with a fixed 8-byte header, an input arc from the
+ * MAC node matched on RTE_ETHER_TYPE_ECPRI, and two samples at offsets 0 and
+ * sizeof(uint32_t). After creation the two sample IDs are queried and stored
+ * in the MLX5_FLEX_PARSER_ECPRI_0 profile of the shared context.
+ *
+ * If the sample IDs cannot be queried the freshly created object is destroyed
+ * and the profile is reset, so that mlx5_flex_parser_ecpri_exist() does not
+ * report a parser whose IDs were never filled in.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flex_parser_ecpri_alloc(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flex_parser_profiles *prf =
+				&priv->sh->fp[MLX5_FLEX_PARSER_ECPRI_0];
+	struct mlx5_devx_graph_node_attr node = {
+		.modify_field_select = 0,
+	};
+	uint32_t ids[8];
+	int ret;
+
+	if (!priv->config.hca_attr.parse_graph_flex_node) {
+		DRV_LOG(ERR, "Dynamic flex parser is not supported "
+			"for device %s.", priv->dev_data->name);
+		/* Honor the documented contract: rte_errno is set on error. */
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	node.header_length_mode = MLX5_GRAPH_NODE_LEN_FIXED;
+	/* 8 bytes now: 4B common header + 4B message body header. */
+	node.header_length_base_value = 0x8;
+	/* After MAC layer: Ether / VLAN. */
+	node.in[0].arc_parse_graph_node = MLX5_GRAPH_ARC_NODE_MAC;
+	/* Type of compared condition should be 0xAEFE in the L2 layer. */
+	node.in[0].compare_condition_value = RTE_ETHER_TYPE_ECPRI;
+	/* Sample #0: type in common header. */
+	node.sample[0].flow_match_sample_en = 1;
+	/* Fixed offset. */
+	node.sample[0].flow_match_sample_offset_mode = 0x0;
+	/* Only the 2nd byte will be used. */
+	node.sample[0].flow_match_sample_field_base_offset = 0x0;
+	/* Sample #1: message payload. */
+	node.sample[1].flow_match_sample_en = 1;
+	/* Fixed offset. */
+	node.sample[1].flow_match_sample_offset_mode = 0x0;
+	/*
+	 * Only the first two bytes will be used right now, and its offset will
+	 * start after the common header that with the length of a DW(u32).
+	 */
+	node.sample[1].flow_match_sample_field_base_offset = sizeof(uint32_t);
+	prf->obj = mlx5_devx_cmd_create_flex_parser(priv->sh->ctx, &node);
+	if (!prf->obj) {
+		DRV_LOG(ERR, "Failed to create flex parser node object.");
+		return (rte_errno == 0) ? -ENODEV : -rte_errno;
+	}
+	prf->num = 2;
+	ret = mlx5_devx_cmd_query_parse_samples(prf->obj, ids, prf->num);
+	if (ret) {
+		/* Save the cause before logging/cleanup can overwrite it. */
+		int err = (rte_errno == 0) ? ENODEV : rte_errno;
+
+		DRV_LOG(ERR, "Failed to query sample IDs.");
+		/* Do not keep a half-initialized parser profile around. */
+		mlx5_devx_cmd_destroy(prf->obj);
+		prf->obj = NULL;
+		prf->num = 0;
+		rte_errno = err;
+		return -err;
+	}
+	prf->offset[0] = 0x0;
+	prf->offset[1] = sizeof(uint32_t);
+	prf->ids[0] = ids[0];
+	prf->ids[1] = ids[1];
+	return 0;
+}
+
+/*
+ * Destroy the flex parser node, including the parser itself, input / output
+ * arcs and DW samples. Resources could be reused then.
+ *
+ * The object pointer of the eCPRI profile is cleared whether or not an
+ * object was present.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+static void
+mlx5_flex_parser_ecpri_release(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flex_parser_profiles *parser;
+
+	parser = &priv->sh->fp[MLX5_FLEX_PARSER_ECPRI_0];
+	if (parser->obj != NULL)
+		mlx5_devx_cmd_destroy(parser->obj);
+	parser->obj = NULL;
+}
+
+/*
+ * Allocate Rx and Tx UARs in robust fashion.
+ * This routine handles the following UAR allocation issues:
+ *
+ * - tries to allocate the UAR with the most appropriate memory
+ * mapping type from the ones supported by the host
+ *
+ * - tries to allocate the UAR with non-NULL base address
+ * OFED 5.0.x and Upstream rdma_core before v29 returned the NULL as
+ * UAR base address if UAR was not the first object in the UAR page.
+ * It caused the PMD failure and we should try to get another UAR
+ * till we get the first one with non-NULL base address returned.
+ *
+ * The Tx UAR is stored in sh->tx_uar and the Rx UAR in sh->devx_rx_uar.
+ * Nothing is released here on failure: a Tx UAR that was already obtained
+ * stays in sh->tx_uar when the Rx allocation fails.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_dev_ctx_shared object.
+ * @param[in] config
+ * Pointer to device configuration structure. Only the dbnc field is read,
+ * and only when MLX5DV_UAR_ALLOC_TYPE_NC is defined.
+ *
+ * @return
+ * 0 on success, ENOMEM (positive value) otherwise. rte_errno is not
+ * touched by this routine.
+ */
+static int
+mlx5_alloc_rxtx_uars(struct mlx5_dev_ctx_shared *sh,
+ const struct mlx5_dev_config *config)
+{
+ uint32_t uar_mapping, retry;
+ int err = 0;
+ void *base_addr;
+
+ /* Tx UAR: at most MLX5_ALLOC_UAR_RETRY attempts to get a non-NULL base. */
+ for (retry = 0; retry < MLX5_ALLOC_UAR_RETRY; ++retry) {
+#ifdef MLX5DV_UAR_ALLOC_TYPE_NC
+ /* Control the mapping type according to the settings. */
+ uar_mapping = (config->dbnc == MLX5_TXDB_NCACHED) ?
+ MLX5DV_UAR_ALLOC_TYPE_NC :
+ MLX5DV_UAR_ALLOC_TYPE_BF;
+#else
+ RTE_SET_USED(config);
+ /*
+ * It seems we have no way to control the memory mapping type
+ * for the UAR, the default "Write-Combining" type is supposed.
+ * The UAR initialization on queue creation queries the
+ * actual mapping type done by Verbs/kernel and setups the
+ * PMD datapath accordingly.
+ */
+ uar_mapping = 0;
+#endif
+ sh->tx_uar = mlx5_glue->devx_alloc_uar(sh->ctx, uar_mapping);
+#ifdef MLX5DV_UAR_ALLOC_TYPE_NC
+ /* First attempt failed: retry once with the other mapping type. */
+ if (!sh->tx_uar &&
+ uar_mapping == MLX5DV_UAR_ALLOC_TYPE_BF) {
+ if (config->dbnc == MLX5_TXDB_CACHED ||
+ config->dbnc == MLX5_TXDB_HEURISTIC)
+ DRV_LOG(WARNING, "Devarg tx_db_nc setting "
+ "is not supported by DevX");
+ /*
+ * In some environments like virtual machine
+ * the Write Combining mapped might be not supported
+ * and UAR allocation fails. We try "Non-Cached"
+ * mapping for the case. The tx_burst routines take
+ * the UAR mapping type into account on UAR setup
+ * on queue creation.
+ */
+ DRV_LOG(DEBUG, "Failed to allocate Tx DevX UAR (BF)");
+ uar_mapping = MLX5DV_UAR_ALLOC_TYPE_NC;
+ sh->tx_uar = mlx5_glue->devx_alloc_uar
+ (sh->ctx, uar_mapping);
+ } else if (!sh->tx_uar &&
+ uar_mapping == MLX5DV_UAR_ALLOC_TYPE_NC) {
+ if (config->dbnc == MLX5_TXDB_NCACHED)
+ DRV_LOG(WARNING, "Devarg tx_db_nc settings "
+ "is not supported by DevX");
+ /*
+ * If Verbs/kernel does not support "Non-Cached"
+ * try the "Write-Combining".
+ */
+ DRV_LOG(DEBUG, "Failed to allocate Tx DevX UAR (NC)");
+ uar_mapping = MLX5DV_UAR_ALLOC_TYPE_BF;
+ sh->tx_uar = mlx5_glue->devx_alloc_uar
+ (sh->ctx, uar_mapping);
+ }
+#endif
+ if (!sh->tx_uar) {
+ DRV_LOG(ERR, "Failed to allocate Tx DevX UAR (BF/NC)");
+ err = ENOMEM;
+ goto exit;
+ }
+ base_addr = mlx5_os_get_devx_uar_base_addr(sh->tx_uar);
+ if (base_addr)
+ break;
+ /*
+ * The UARs are allocated by rdma_core within the
+ * IB device context, on context closure all UARs
+ * will be freed, should be no memory/object leakage.
+ */
+ DRV_LOG(DEBUG, "Retrying to allocate Tx DevX UAR");
+ /* The UAR with NULL base is dropped without being freed here. */
+ sh->tx_uar = NULL;
+ }
+ /* Check whether we finally succeeded with valid UAR allocation. */
+ if (!sh->tx_uar) {
+ DRV_LOG(ERR, "Failed to allocate Tx DevX UAR (NULL base)");
+ err = ENOMEM;
+ goto exit;
+ }
+ /* Rx UAR: same retry scheme, every attempt starts with mapping 0. */
+ for (retry = 0; retry < MLX5_ALLOC_UAR_RETRY; ++retry) {
+ uar_mapping = 0;
+ sh->devx_rx_uar = mlx5_glue->devx_alloc_uar
+ (sh->ctx, uar_mapping);
+#ifdef MLX5DV_UAR_ALLOC_TYPE_NC
+ if (!sh->devx_rx_uar &&
+ uar_mapping == MLX5DV_UAR_ALLOC_TYPE_BF) {
+ /*
+ * Rx UAR is used to control interrupts only,
+ * should be no datapath noticeable impact,
+ * can try "Non-Cached" mapping safely.
+ */
+ DRV_LOG(DEBUG, "Failed to allocate Rx DevX UAR (BF)");
+ uar_mapping = MLX5DV_UAR_ALLOC_TYPE_NC;
+ sh->devx_rx_uar = mlx5_glue->devx_alloc_uar
+ (sh->ctx, uar_mapping);
+ }
+#endif
+ if (!sh->devx_rx_uar) {
+ DRV_LOG(ERR, "Failed to allocate Rx DevX UAR (BF/NC)");
+ err = ENOMEM;
+ goto exit;
+ }
+ base_addr = mlx5_os_get_devx_uar_base_addr(sh->devx_rx_uar);
+ if (base_addr)
+ break;
+ /*
+ * The UARs are allocated by rdma_core within the
+ * IB device context, on context closure all UARs
+ * will be freed, should be no memory/object leakage.
+ */
+ DRV_LOG(DEBUG, "Retrying to allocate Rx DevX UAR");
+ sh->devx_rx_uar = NULL;
+ }
+ /* Check whether we finally succeeded with valid UAR allocation. */
+ if (!sh->devx_rx_uar) {
+ DRV_LOG(ERR, "Failed to allocate Rx DevX UAR (NULL base)");
+ err = ENOMEM;
+ }
+exit:
+ return err;
+}
+
+/**
+ * Allocate shared device context. If there is multiport device the
+ * master and representors will share this context, if there is single
+ * port dedicated device, the context will be used by only given
+ * port due to unification.
+ *
+ * Routine first searches the context for the specified device name,
+ * if found the shared context assumed and reference counter is incremented.
+ * If no context found the new one is created and initialized with specified
+ * device context and parameters.
+ *
+ * The whole routine runs under mlx5_dev_ctx_list_mutex. On any failure after
+ * the context memory was obtained, everything acquired so far is released in
+ * the error path and the context memory is freed.
+ *
+ * @param[in] spawn
+ *   Pointer to the device attributes (name, port, etc).
+ * @param[in] config
+ *   Pointer to device configuration structure.
+ *
+ * @return
+ *   Pointer to mlx5_dev_ctx_shared object on success,
+ *   otherwise NULL and rte_errno is set.
+ */
+struct mlx5_dev_ctx_shared *
+mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
+			   const struct mlx5_dev_config *config)
+{
+	struct mlx5_dev_ctx_shared *sh;
+	int err = 0;
+	uint32_t i;
+	struct mlx5_devx_tis_attr tis_attr = { 0 };
+
+	MLX5_ASSERT(spawn);
+	/* Secondary process should not create the shared context. */
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
+	/* Search for IB context by device name. */
+	LIST_FOREACH(sh, &mlx5_dev_ctx_list, next) {
+		if (!strcmp(sh->ibdev_name,
+			mlx5_os_get_dev_device_name(spawn->phys_dev))) {
+			sh->refcnt++;
+			goto exit;
+		}
+	}
+	/* No device found, we have to create new shared context. */
+	MLX5_ASSERT(spawn->max_port);
+	sh = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE,
+			 sizeof(struct mlx5_dev_ctx_shared) +
+			 spawn->max_port *
+			 sizeof(struct mlx5_dev_shared_port),
+			 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+	if (!sh) {
+		DRV_LOG(ERR, "shared context allocation failure");
+		rte_errno = ENOMEM;
+		goto exit;
+	}
+	/*
+	 * The mutex is destroyed unconditionally both in the error path
+	 * below and in mlx5_free_shared_dev_ctx(), so it has to be
+	 * initialized before the first "goto error" can be taken.
+	 */
+	pthread_mutex_init(&sh->txpp.mutex, NULL);
+	sh->numa_node = spawn->numa_node;
+	if (spawn->bond_info)
+		sh->bond = *spawn->bond_info;
+	err = mlx5_os_open_device(spawn, config, sh);
+	if (!sh->ctx)
+		goto error;
+	err = mlx5_os_get_dev_attr(sh->ctx, &sh->device_attr);
+	if (err) {
+		DRV_LOG(DEBUG, "mlx5_os_get_dev_attr() failed");
+		goto error;
+	}
+	sh->refcnt = 1;
+	sh->max_port = spawn->max_port;
+	sh->reclaim_mode = config->reclaim_mode;
+	/*
+	 * The destination buffers were zero-allocated and at most
+	 * size - 1 bytes are copied, so the names stay NUL-terminated.
+	 */
+	strncpy(sh->ibdev_name, mlx5_os_get_ctx_device_name(sh->ctx),
+		sizeof(sh->ibdev_name) - 1);
+	strncpy(sh->ibdev_path, mlx5_os_get_ctx_device_path(sh->ctx),
+		sizeof(sh->ibdev_path) - 1);
+	/*
+	 * Setting port_id to max unallowed value means
+	 * there is no interrupt subhandler installed for
+	 * the given port index i.
+	 */
+	for (i = 0; i < sh->max_port; i++) {
+		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
+		sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
+	}
+	sh->pd = mlx5_os_alloc_pd(sh->ctx);
+	if (sh->pd == NULL) {
+		DRV_LOG(ERR, "PD allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	if (sh->devx) {
+		err = mlx5_os_get_pdn(sh->pd, &sh->pdn);
+		if (err) {
+			DRV_LOG(ERR, "Fail to extract pdn from PD");
+			goto error;
+		}
+		sh->td = mlx5_devx_cmd_create_td(sh->ctx);
+		if (!sh->td) {
+			DRV_LOG(ERR, "TD allocation failure");
+			err = ENOMEM;
+			goto error;
+		}
+		tis_attr.transport_domain = sh->td->id;
+		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
+		if (!sh->tis) {
+			DRV_LOG(ERR, "TIS allocation failure");
+			err = ENOMEM;
+			goto error;
+		}
+		err = mlx5_alloc_rxtx_uars(sh, config);
+		if (err)
+			goto error;
+		MLX5_ASSERT(sh->tx_uar);
+		MLX5_ASSERT(mlx5_os_get_devx_uar_base_addr(sh->tx_uar));
+
+		MLX5_ASSERT(sh->devx_rx_uar);
+		MLX5_ASSERT(mlx5_os_get_devx_uar_base_addr(sh->devx_rx_uar));
+	}
+#ifndef RTE_ARCH_64
+	/* Initialize UAR access locks for 32bit implementations. */
+	rte_spinlock_init(&sh->uar_lock_cq);
+	for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
+		rte_spinlock_init(&sh->uar_lock[i]);
+#endif
+	/*
+	 * Once the device is added to the list of memory event
+	 * callback, its global MR cache table cannot be expanded
+	 * on the fly because of deadlock. If it overflows, lookup
+	 * should be done by searching MR list linearly, which is slow.
+	 *
+	 * At this point the device is not added to the memory
+	 * event list yet, context is just being created.
+	 */
+	err = mlx5_mr_btree_init(&sh->share_cache.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 sh->numa_node);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	mlx5_os_set_reg_mr_cb(&sh->share_cache.reg_mr_cb,
+			      &sh->share_cache.dereg_mr_cb);
+	mlx5_os_dev_shared_handler_install(sh);
+	sh->cnt_id_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_DWORD);
+	if (!sh->cnt_id_tbl) {
+		err = rte_errno;
+		goto error;
+	}
+	/* The flow workspace is set up only for the first context. */
+	if (LIST_EMPTY(&mlx5_dev_ctx_list)) {
+		err = mlx5_flow_os_init_workspace_once();
+		if (err)
+			goto error;
+	}
+	mlx5_flow_aging_init(sh);
+	mlx5_flow_counters_mng_init(sh);
+	mlx5_flow_ipool_create(sh, config);
+	/* Add device to memory callback list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+			 sh, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	/* Add context to the global device list. */
+	LIST_INSERT_HEAD(&mlx5_dev_ctx_list, sh, next);
+	rte_spinlock_init(&sh->geneve_tlv_opt_sl);
+exit:
+	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
+	return sh;
+error:
+	pthread_mutex_destroy(&sh->txpp.mutex);
+	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
+	MLX5_ASSERT(sh);
+	/* Release only what was actually acquired before the failure. */
+	if (sh->cnt_id_tbl)
+		mlx5_l3t_destroy(sh->cnt_id_tbl);
+	if (sh->share_cache.cache.table)
+		mlx5_mr_btree_free(&sh->share_cache.cache);
+	if (sh->tis)
+		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+	if (sh->td)
+		claim_zero(mlx5_devx_cmd_destroy(sh->td));
+	if (sh->devx_rx_uar)
+		mlx5_glue->devx_free_uar(sh->devx_rx_uar);
+	if (sh->tx_uar)
+		mlx5_glue->devx_free_uar(sh->tx_uar);
+	if (sh->pd)
+		claim_zero(mlx5_os_dealloc_pd(sh->pd));
+	if (sh->ctx)
+		claim_zero(mlx5_glue->close_device(sh->ctx));
+	mlx5_free(sh);
+	MLX5_ASSERT(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Free shared IB device context. Decrement counter and if zero free
+ * all allocated resources and close handles.
+ *
+ * The reference counter and the global lists are handled under
+ * mlx5_dev_ctx_list_mutex; the mutex is released before the remaining
+ * resources of the context are torn down.
+ *
+ * @param[in] sh
+ * Pointer to mlx5_dev_ctx_shared object to free
+ */
+void
+mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
+{
+ pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ /* Check the object presence in the list. */
+ struct mlx5_dev_ctx_shared *lctx;
+
+ LIST_FOREACH(lctx, &mlx5_dev_ctx_list, next)
+ if (lctx == sh)
+ break;
+ MLX5_ASSERT(lctx);
+ if (lctx != sh) {
+ DRV_LOG(ERR, "Freeing non-existing shared IB context");
+ goto exit;
+ }
+#endif
+ MLX5_ASSERT(sh);
+ MLX5_ASSERT(sh->refcnt);
+ /* Secondary process should not free the shared context. */
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ /* Other users still hold the context: only the counter is dropped. */
+ if (--sh->refcnt)
+ goto exit;
+ /* Remove from memory callback device list. */
+ rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+ LIST_REMOVE(sh, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+ /* Release created Memory Regions. */
+ mlx5_mr_release_cache(&sh->share_cache);
+ /* Remove context from the global device list. */
+ LIST_REMOVE(sh, next);
+ /* Release flow workspaces objects on the last device. */
+ if (LIST_EMPTY(&mlx5_dev_ctx_list))
+ mlx5_flow_os_release_workspace();
+ pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
+ /*
+ * Ensure there is no async event handler installed.
+ * Only primary process handles async device events.
+ **/
+ mlx5_flow_counters_mng_close(sh);
+ if (sh->aso_age_mng) {
+ mlx5_flow_aso_age_mng_close(sh);
+ sh->aso_age_mng = NULL;
+ }
+ if (sh->mtrmng)
+ mlx5_aso_flow_mtrs_mng_close(sh);
+ mlx5_flow_ipool_destroy(sh);
+ mlx5_os_dev_shared_handler_uninstall(sh);
+ if (sh->cnt_id_tbl) {
+ mlx5_l3t_destroy(sh->cnt_id_tbl);
+ sh->cnt_id_tbl = NULL;
+ }
+ if (sh->tx_uar) {
+ mlx5_glue->devx_free_uar(sh->tx_uar);
+ sh->tx_uar = NULL;
+ }
+ if (sh->pd)
+ claim_zero(mlx5_os_dealloc_pd(sh->pd));
+ if (sh->tis)
+ claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+ if (sh->td)
+ claim_zero(mlx5_devx_cmd_destroy(sh->td));
+ if (sh->devx_rx_uar)
+ mlx5_glue->devx_free_uar(sh->devx_rx_uar);
+ if (sh->ctx)
+ claim_zero(mlx5_glue->close_device(sh->ctx));
+ MLX5_ASSERT(sh->geneve_tlv_option_resource == NULL);
+ pthread_mutex_destroy(&sh->txpp.mutex);
+ mlx5_free(sh);
+ /* The list mutex was already released above. */
+ return;
+exit:
+ pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
+}
+
+/**
+ * Destroy table hash list.
+ *
+ * Does nothing when the shared context has no flow table hash list. After
+ * destruction the pointer in the shared context is cleared, so calling this
+ * routine again is harmless.
+ *
+ * @param[in] priv
+ *   Pointer to the private device data structure.
+ */
+void
+mlx5_free_table_hash_list(struct mlx5_priv *priv)
+{
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+
+	if (!sh->flow_tbls)
+		return;
+	mlx5_hlist_destroy(sh->flow_tbls);
+	/* Avoid a dangling pointer and a second destroy on a later call. */
+	sh->flow_tbls = NULL;
+}
+
+/**
+ * Initialize flow table hash list and create the root tables entry
+ * for each domain.
+ *
+ * The whole body is compiled only when HAVE_IBV_FLOW_DV_SUPPORT is defined
+ * or HAVE_INFINIBAND_VERBS_H is not; otherwise the routine just returns 0.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ *
+ * @return
+ * Zero on success, positive error code otherwise.
+ */
+int
+mlx5_alloc_table_hash_list(struct mlx5_priv *priv __rte_unused)
+{
+ int err = 0;
+ /* Tables are only used in DV and DR modes. */
+#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
+ struct mlx5_dev_ctx_shared *sh = priv->sh;
+ char s[MLX5_NAME_SIZE];
+
+ MLX5_ASSERT(sh);
+ /* The hash list is named after the IB device of the shared context. */
+ snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
+ sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE,
+ false, true, sh,
+ flow_dv_tbl_create_cb,
+ flow_dv_tbl_match_cb,
+ flow_dv_tbl_remove_cb,
+ flow_dv_tbl_clone_cb,
+ flow_dv_tbl_clone_free_cb);
+ if (!sh->flow_tbls) {
+ DRV_LOG(ERR, "flow tables with hash creation failed.");
+ err = ENOMEM;
+ return err;
+ }
+#ifndef HAVE_MLX5DV_DR
+ struct rte_flow_error error;
+ struct rte_eth_dev *dev = &rte_eth_devices[priv->dev_data->port_id];
+
+ /*
+ * In case we have not DR support, the zero tables should be created
+ * because DV expect to see them even if they cannot be created by
+ * RDMA-CORE.
+ */
+ if (!flow_dv_tbl_resource_get(dev, 0, 0, 0, 0,
+ NULL, 0, 1, 0, &error) ||
+ !flow_dv_tbl_resource_get(dev, 0, 1, 0, 0,
+ NULL, 0, 1, 0, &error) ||
+ !flow_dv_tbl_resource_get(dev, 0, 0, 1, 0,
+ NULL, 0, 1, 0, &error)) {
+ err = ENOMEM;
+ goto error;
+ }
+ return err;
+error:
+ /* Any of the three lookups failed: drop the hash list created above. */
+ mlx5_free_table_hash_list(priv);
+#endif /* HAVE_MLX5DV_DR */
+#endif
+ return err;
+}
+
+/**
+ * Retrieve integer value from environment variable.
+ *
+ * The text of the variable is converted with atoi().
+ *
+ * @param[in] name
+ *   Environment variable name.
+ *
+ * @return
+ *   Integer value, 0 if the variable is not set.
+ */
+int
+mlx5_getenv_int(const char *name)
+{
+	const char *text = getenv(name);
+
+	return (text != NULL) ? atoi(text) : 0;
+}
+
+/**
+ * DPDK callback to add udp tunnel port
+ *
+ * Nothing is configured here: the routine only accepts the two
+ * protocol/port pairs listed below and rejects everything else.
+ *
+ * @param[in] dev
+ *   A pointer to eth_dev
+ * @param[in] udp_tunnel
+ *   A pointer to udp tunnel
+ *
+ * @return
+ *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
+ */
+int
+mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
+			 struct rte_eth_udp_tunnel *udp_tunnel)
+{
+	bool vxlan_ok;
+	bool vxlan_gpe_ok;
+
+	MLX5_ASSERT(udp_tunnel != NULL);
+	/* VXLAN on UDP port 4789. */
+	vxlan_ok = udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
+		   udp_tunnel->udp_port == 4789;
+	/* VXLAN-GPE on UDP port 4790. */
+	vxlan_gpe_ok = udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
+		       udp_tunnel->udp_port == 4790;
+	if (vxlan_ok || vxlan_gpe_ok)
+		return 0;
+	return -ENOTSUP;
+}
+
+/**
+ * Initialize process private data structure.
+ *
+ * Any previously installed process private data is released first. The new
+ * structure is zero-allocated together with a trailing table of one pointer
+ * per Tx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_proc_priv_init(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_proc_priv *proc;
+	size_t alloc_sz;
+
+	mlx5_proc_priv_uninit(dev);
+	/*
+	 * UAR register table follows the process private structure. BlueFlame
+	 * registers for Tx queues are stored in the table.
+	 */
+	alloc_sz = sizeof(struct mlx5_proc_priv) +
+		   priv->txqs_n * sizeof(void *);
+	proc = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, alloc_sz,
+			   RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+	if (proc == NULL) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	proc->uar_table_sz = priv->txqs_n;
+	dev->process_private = proc;
+	return 0;
+}
+
+/**
+ * Un-initialize process private data structure.
+ *
+ * Frees dev->process_private and clears the pointer; a device without
+ * process private data is left untouched.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
+{
+	if (dev->process_private != NULL) {
+		mlx5_free(dev->process_private);
+		dev->process_private = NULL;
+	}
+}
+
+/**
+ * DPDK callback to close the device.
+ *
+ * Destroy all queues and objects, free memory.
+ *
+ * In a secondary process only the process private data is released and the
+ * port is released. In the primary process nothing is done when the private
+ * data has no shared context attached.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * Always 0; leftovers found by the verification helpers are only logged.
+ */
+int
+mlx5_dev_close(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int i;
+ int ret;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ /* Check if process_private released. */
+ if (!dev->process_private)
+ return 0;
+ mlx5_tx_uar_uninit_secondary(dev);
+ mlx5_proc_priv_uninit(dev);
+ rte_eth_dev_release_port(dev);
+ return 0;
+ }
+ /* No shared context: nothing left to release for this port. */
+ if (!priv->sh)
+ return 0;
+ DRV_LOG(DEBUG, "port %u closing device \"%s\"",
+ dev->data->port_id,
+ ((priv->sh->ctx != NULL) ?
+ mlx5_os_get_ctx_device_name(priv->sh->ctx) : ""));
+ /*
+ * If default mreg copy action is removed at the stop stage,
+ * the search will return none and nothing will be done anymore.
+ */
+ mlx5_flow_stop_default(dev);
+ mlx5_traffic_disable(dev);
+ /*
+ * If all the flows are already flushed in the device stop stage,
+ * then this will return directly without any action.
+ */
+ mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, true);
+ mlx5_action_handle_flush(dev);
+ mlx5_flow_meter_flush(dev, NULL);
+ /* Prevent crashes when queues are still in use. */
+ dev->rx_pkt_burst = removed_rx_burst;
+ dev->tx_pkt_burst = removed_tx_burst;
+ rte_wmb();
+ /* Disable datapath on secondary process. */
+ mlx5_mp_os_req_stop_rxtx(dev);
+ /* Free the eCPRI flex parser resource. */
+ mlx5_flex_parser_ecpri_release(dev);
+ if (priv->rxqs != NULL) {
+ /* XXX race condition if mlx5_rx_burst() is still running. */
+ rte_delay_us_sleep(1000);
+ for (i = 0; (i != priv->rxqs_n); ++i)
+ mlx5_rxq_release(dev, i);
+ priv->rxqs_n = 0;
+ priv->rxqs = NULL;
+ }
+ if (priv->representor) {
+ /* Each representor has a dedicated interrupts handler */
+ mlx5_free(dev->intr_handle);
+ dev->intr_handle = NULL;
+ }
+ if (priv->txqs != NULL) {
+ /* XXX race condition if mlx5_tx_burst() is still running. */
+ rte_delay_us_sleep(1000);
+ for (i = 0; (i != priv->txqs_n); ++i)
+ mlx5_txq_release(dev, i);
+ priv->txqs_n = 0;
+ priv->txqs = NULL;
+ }
+ mlx5_proc_priv_uninit(dev);
+ if (priv->q_counters) {
+ mlx5_devx_cmd_destroy(priv->q_counters);
+ priv->q_counters = NULL;
+ }
+ if (priv->drop_queue.hrxq)
+ mlx5_drop_action_destroy(dev);
+ if (priv->mreg_cp_tbl)
+ mlx5_hlist_destroy(priv->mreg_cp_tbl);
+ mlx5_mprq_free_mp(dev);
+ if (priv->sh->ct_mng)
+ mlx5_flow_aso_ct_mng_close(priv->sh);
+ mlx5_os_free_shared_dr(priv);
+ if (priv->rss_conf.rss_key != NULL)
+ mlx5_free(priv->rss_conf.rss_key);
+ if (priv->reta_idx != NULL)
+ mlx5_free(priv->reta_idx);
+ if (priv->config.vf)
+ mlx5_os_mac_addr_flush(dev);
+ if (priv->nl_socket_route >= 0)
+ close(priv->nl_socket_route);
+ if (priv->nl_socket_rdma >= 0)
+ close(priv->nl_socket_rdma);
+ if (priv->vmwa_context)
+ mlx5_vlan_vmwa_exit(priv->vmwa_context);
+ /* Verification pass: leftovers are reported, never treated as errors. */
+ ret = mlx5_hrxq_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
+ dev->data->port_id);
+ ret = mlx5_ind_table_obj_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some indirection table still remain",
+ dev->data->port_id);
+ ret = mlx5_rxq_obj_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
+ dev->data->port_id);
+ ret = mlx5_rxq_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Rx queues still remain",
+ dev->data->port_id);
+ ret = mlx5_txq_obj_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
+ dev->data->port_id);
+ ret = mlx5_txq_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some Tx queues still remain",
+ dev->data->port_id);
+ ret = mlx5_flow_verify(dev);
+ if (ret)
+ DRV_LOG(WARNING, "port %u some flows still remain",
+ dev->data->port_id);
+ if (priv->hrxqs)
+ mlx5_list_destroy(priv->hrxqs);
+ /*
+ * Free the shared context in last turn, because the cleanup
+ * routines above may use some shared fields, like
+ * mlx5_os_mac_addr_flush() uses ibdev_path for retrieving
+ * ifindex if Netlink fails.
+ */
+ mlx5_free_shared_dev_ctx(priv->sh);
+ if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+ /* Set to 1 as soon as another port of the same domain is found. */
+ unsigned int c = 0;
+ uint16_t port_id;
+
+ MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
+ struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+
+ if (!opriv ||
+ opriv->domain_id != priv->domain_id ||
+ &rte_eth_devices[port_id] == dev)
+ continue;
+ ++c;
+ break;
+ }
+ /* This port was the last user of the switch domain. */
+ if (!c)
+ claim_zero(rte_eth_switch_domain_free(priv->domain_id));
+ }
+ memset(priv, 0, sizeof(*priv));
+ priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+ /*
+ * Reset mac_addrs to NULL such that it is not freed as part of
+ * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
+ * it is freed when dev_private is freed.
+ */
+ dev->data->mac_addrs = NULL;
+ return 0;
+}
+
+const struct eth_dev_ops mlx5_dev_ops = {
+ .dev_configure = mlx5_dev_configure,
+ .dev_start = mlx5_dev_start,
+ .dev_stop = mlx5_dev_stop,
+ .dev_set_link_down = mlx5_set_link_down,
+ .dev_set_link_up = mlx5_set_link_up,
+ .dev_close = mlx5_dev_close,
+ .promiscuous_enable = mlx5_promiscuous_enable,
+ .promiscuous_disable = mlx5_promiscuous_disable,
+ .allmulticast_enable = mlx5_allmulticast_enable,
+ .allmulticast_disable = mlx5_allmulticast_disable,
+ .link_update = mlx5_link_update,
+ .stats_get = mlx5_stats_get,
+ .stats_reset = mlx5_stats_reset,
+ .xstats_get = mlx5_xstats_get,
+ .xstats_reset = mlx5_xstats_reset,
+ .xstats_get_names = mlx5_xstats_get_names,
+ .fw_version_get = mlx5_fw_version_get,
+ .dev_infos_get = mlx5_dev_infos_get,
+ .representor_info_get = mlx5_representor_info_get,
+ .read_clock = mlx5_txpp_read_clock,
+ .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
+ .vlan_filter_set = mlx5_vlan_filter_set,
+ .rx_queue_setup = mlx5_rx_queue_setup,
+ .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
+ .tx_queue_setup = mlx5_tx_queue_setup,
+ .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
+ .rx_queue_release = mlx5_rx_queue_release,
+ .tx_queue_release = mlx5_tx_queue_release,
+ .rx_queue_start = mlx5_rx_queue_start,
+ .rx_queue_stop = mlx5_rx_queue_stop,
+ .tx_queue_start = mlx5_tx_queue_start,
+ .tx_queue_stop = mlx5_tx_queue_stop,
+ .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
+ .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
+ .mac_addr_remove = mlx5_mac_addr_remove,
+ .mac_addr_add = mlx5_mac_addr_add,
+ .mac_addr_set = mlx5_mac_addr_set,
+ .set_mc_addr_list = mlx5_set_mc_addr_list,
+ .mtu_set = mlx5_dev_set_mtu,
+ .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
+ .vlan_offload_set = mlx5_vlan_offload_set,
+ .reta_update = mlx5_dev_rss_reta_update,
+ .reta_query = mlx5_dev_rss_reta_query,
+ .rss_hash_update = mlx5_rss_hash_update,
+ .rss_hash_conf_get = mlx5_rss_hash_conf_get,
+ .flow_ops_get = mlx5_flow_ops_get,
+ .rxq_info_get = mlx5_rxq_info_get,
+ .txq_info_get = mlx5_txq_info_get,
+ .rx_burst_mode_get = mlx5_rx_burst_mode_get,
+ .tx_burst_mode_get = mlx5_tx_burst_mode_get,
+ .rx_queue_intr_enable = mlx5_rx_intr_enable,
+ .rx_queue_intr_disable = mlx5_rx_intr_disable,
+ .is_removed = mlx5_is_removed,
+ .udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
+ .get_module_info = mlx5_get_module_info,
+ .get_module_eeprom = mlx5_get_module_eeprom,
+ .hairpin_cap_get = mlx5_hairpin_cap_get,
+ .mtr_ops_get = mlx5_flow_meter_ops_get,
+ .hairpin_bind = mlx5_hairpin_bind,
+ .hairpin_unbind = mlx5_hairpin_unbind,
+ .hairpin_get_peer_ports = mlx5_hairpin_get_peer_ports,
+ .hairpin_queue_peer_update = mlx5_hairpin_queue_peer_update,
+ .hairpin_queue_peer_bind = mlx5_hairpin_queue_peer_bind,
+ .hairpin_queue_peer_unbind = mlx5_hairpin_queue_peer_unbind,
+ .get_monitor_addr = mlx5_get_monitor_addr,