diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 33ca36d29a..1521ecfd8f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -32,7 +32,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -138,6 +137,9 @@
 /* Device parameter to configure the maximum number of dump files per queue. */
 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
 
+/* Configure timeout of LRO session (in microseconds). */
+#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
+
 #ifndef HAVE_IBV_MLX5_MOD_MPW
 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
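
The devarg added above is consumed by mlx5_args_check() further down and stored in config->lro.timeout. Per the spawn logic later in this patch, a zero or absent value falls back to the minimal LRO timer period reported by the HCA. A usage sketch, with a hypothetical PCI address and testpmd options:

    testpmd -w 0000:03:00.0,lro_timeout_usec=32 -- --enable-lro
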
@@ -166,6 +168,7 @@ struct mlx5_dev_spawn_data {
 	uint32_t ifindex; /**< Network interface index. */
 	uint32_t max_port; /**< IB device maximal port index. */
 	uint32_t ibv_port; /**< IB device physical port index. */
+	int pf_bond; /**< Bonding device PF index; < 0 if there is no bonding. */
 	struct mlx5_switch_info info; /**< Switch information. */
 	struct ibv_device *ibv_dev; /**< Associated IB device. */
 	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
@@ -266,6 +269,37 @@ mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
 	memset(&sh->cmng, 0, sizeof(sh->cmng));
 }
 
+/**
+ * Extract pdn of PD object using DV API.
+ *
+ * @param[in] pd
+ *   Pointer to the verbs PD object.
+ * @param[out] pdn
+ *   Pointer to the PD object number variable.
+ *
+ * @return
+ *   0 on success, error value otherwise.
+ */
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+static int
+mlx5_get_pdn(struct ibv_pd *pd, uint32_t *pdn)
+{
+	struct mlx5dv_obj obj;
+	struct mlx5dv_pd pd_info;
+	int ret = 0;
+
+	obj.pd.in = pd;
+	obj.pd.out = &pd_info;
+	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+	if (ret) {
+		DRV_LOG(DEBUG, "Failed to get PD object info");
+		return ret;
+	}
+	*pdn = pd_info.pdn;
+	return 0;
+}
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+
 /**
  * Allocate shared IB device context. If there is multiport device the
  * master and representors will share this context, if there is single
@@ -339,7 +373,6 @@ mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
 		sizeof(sh->ibdev_name));
 	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
 		sizeof(sh->ibdev_path));
-	sh->pci_dev = spawn->pci_dev;
 	pthread_mutex_init(&sh->intr_mutex, NULL);
 	/*
 	 * Setting port_id to max unallowed value means
@@ -354,6 +387,13 @@ mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
 		err = ENOMEM;
 		goto error;
 	}
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	err = mlx5_get_pdn(sh->pd, &sh->pdn);
+	if (err) {
+		DRV_LOG(ERR, "Failed to extract pdn from PD");
+		goto error;
+	}
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
 	/*
 	 * Once the device is added to the list of memory event
 	 * callback, its global MR cache table cannot be expanded
@@ -365,12 +405,18 @@ mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
 	 */
 	err = mlx5_mr_btree_init(&sh->mr.cache,
 				 MLX5_MR_BTREE_CACHE_N * 2,
-				 sh->pci_dev->device.numa_node);
+				 spawn->pci_dev->device.numa_node);
 	if (err) {
 		err = rte_errno;
 		goto error;
	}
 	mlx5_flow_counters_mng_init(sh);
+	/* Add device to memory callback list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
+			 sh, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	/* Add context to the global device list. */
 	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
 exit:
 	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
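
Registration of the memory event callback now happens once per shared IB context instead of once per port; the matching per-port registration is removed from mlx5_dev_spawn() further down. A sketch of the reader side this list serves, assuming the same mlx5_shared_data layout and driver-internal types; the handler name is hypothetical (the real one is the MR event callback in mlx5_mr.c):

static void
mem_event_walk(const void *addr, size_t len)
{
	struct mlx5_ibv_shared *sh;

	/* Readers take the read lock; context allocation/release
	 * take the write lock around the list updates above. */
	rte_rwlock_read_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_FOREACH(sh, &mlx5_shared_data->mem_event_cb_list, mem_event_cb)
		handle_mem_event(sh, addr, len); /* hypothetical handler */
	rte_rwlock_read_unlock(&mlx5_shared_data->mem_event_rwlock);
}
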
@@ -420,6 +466,11 @@ mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
 		goto exit;
 	/* Release created Memory Regions. */
 	mlx5_mr_release(sh);
+	/* Remove from memory callback device list. */
+	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
+	LIST_REMOVE(sh, mem_event_cb);
+	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
+	/* Remove context from the global device list. */
 	LIST_REMOVE(sh, next);
 	/*
 	 * Ensure there is no async event handler installed.
@@ -497,6 +548,7 @@ mlx5_alloc_shared_dr(struct mlx5_priv *priv)
 		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
 	}
 #endif
+	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
 	sh->dv_refcnt++;
 	priv->dr_shared = 1;
 	return 0;
@@ -519,6 +571,10 @@ error:
 		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
 		sh->esw_drop_action = NULL;
 	}
+	if (sh->pop_vlan_action) {
+		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+		sh->pop_vlan_action = NULL;
+	}
 	return err;
 #else
 	(void)priv;
@@ -564,6 +620,10 @@ mlx5_free_shared_dr(struct mlx5_priv *priv)
 		sh->esw_drop_action = NULL;
 	}
 #endif
+	if (sh->pop_vlan_action) {
+		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+		sh->pop_vlan_action = NULL;
+	}
 	pthread_mutex_destroy(&sh->dv_mutex);
 #else
 	(void)priv;
@@ -692,6 +752,31 @@ mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
 	rte_free(ptr);
 }
 
+/**
+ * DPDK callback to add UDP tunnel port.
+ *
+ * @param[in] dev
+ *   A pointer to the Ethernet device.
+ * @param[in] udp_tunnel
+ *   A pointer to the UDP tunnel data.
+ *
+ * @return
+ *   0 on valid UDP ports and tunnels, -ENOTSUP otherwise.
+ */
+int
+mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
+			 struct rte_eth_udp_tunnel *udp_tunnel)
+{
+	assert(udp_tunnel != NULL);
+	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
+	    udp_tunnel->udp_port == 4789)
+		return 0;
+	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
+	    udp_tunnel->udp_port == 4790)
+		return 0;
+	return -ENOTSUP;
+}
+
 /**
  * Initialize process private data structure.
  *
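
From the application side, mlx5_udp_tunnel_port_add() above is reached through rte_eth_dev_udp_tunnel_port_add(); only the IANA-assigned ports are acknowledged, anything else is rejected with -ENOTSUP. A minimal caller sketch, port_id hypothetical:

	struct rte_eth_udp_tunnel tunnel = {
		.udp_port = 4789,	/* IANA-assigned VXLAN port. */
		.prot_type = RTE_TUNNEL_TYPE_VXLAN,
	};

	if (rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel) != 0)
		printf("UDP tunnel port rejected\n");
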
@@ -786,11 +871,6 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	}
 	mlx5_proc_priv_uninit(dev);
 	mlx5_mprq_free_mp(dev);
-	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
-	assert(priv->sh);
-	LIST_REMOVE(priv->sh, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 	mlx5_free_shared_dr(priv);
 	if (priv->rss_conf.rss_key != NULL)
 		rte_free(priv->rss_conf.rss_key);
@@ -802,6 +882,8 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->vmwa_context)
+		mlx5_vlan_vmwa_exit(priv->vmwa_context);
 	if (priv->sh) {
 		/*
 		 * Free the shared context in last turn, because the cleanup
@@ -812,17 +894,17 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		mlx5_free_shared_ibctx(priv->sh);
 		priv->sh = NULL;
 	}
-	ret = mlx5_hrxq_ibv_verify(dev);
+	ret = mlx5_hrxq_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
 			dev->data->port_id);
-	ret = mlx5_ind_table_ibv_verify(dev);
+	ret = mlx5_ind_table_obj_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some indirection table still remain",
 			dev->data->port_id);
-	ret = mlx5_rxq_ibv_verify(dev);
+	ret = mlx5_rxq_obj_verify(dev);
 	if (ret)
-		DRV_LOG(WARNING, "port %u some Verbs Rx queue still remain",
+		DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
 			dev->data->port_id);
 	ret = mlx5_rxq_verify(dev);
 	if (ret)
@@ -844,7 +926,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		unsigned int c = 0;
 		uint16_t port_id;
 
-		RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) {
+		MLX5_ETH_FOREACH_DEV(port_id) {
 			struct mlx5_priv *opriv =
 				rte_eth_devices[port_id].data->dev_private;
 
@@ -853,6 +935,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 			    &rte_eth_devices[port_id] == dev)
 				continue;
 			++c;
+			break;
 		}
 		if (!c)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
@@ -913,6 +996,9 @@ const struct eth_dev_ops mlx5_dev_ops = {
 	.rx_queue_intr_enable = mlx5_rx_intr_enable,
 	.rx_queue_intr_disable = mlx5_rx_intr_disable,
 	.is_removed = mlx5_is_removed,
+	.udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
+	.get_module_info = mlx5_get_module_info,
+	.get_module_eeprom = mlx5_get_module_eeprom,
 };
 
 /* Available operations from secondary process. */
@@ -926,6 +1012,8 @@ static const struct eth_dev_ops mlx5_dev_sec_ops = {
 	.dev_infos_get = mlx5_dev_infos_get,
 	.rx_descriptor_status = mlx5_rx_descriptor_status,
 	.tx_descriptor_status = mlx5_tx_descriptor_status,
+	.get_module_info = mlx5_get_module_info,
+	.get_module_eeprom = mlx5_get_module_eeprom,
 };
 
 /* Available operations in flow isolated mode. */
@@ -969,6 +1057,8 @@ const struct eth_dev_ops mlx5_dev_ops_isolate = {
 	.rx_queue_intr_enable = mlx5_rx_intr_enable,
 	.rx_queue_intr_disable = mlx5_rx_intr_disable,
 	.is_removed = mlx5_is_removed,
+	.get_module_info = mlx5_get_module_info,
+	.get_module_eeprom = mlx5_get_module_eeprom,
 };
 
 /**
@@ -1052,6 +1142,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->mr_ext_memseg_en = !!tmp;
 	} else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
 		config->max_dump_files_num = tmp;
+	} else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
+		config->lro.timeout = tmp;
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1100,6 +1192,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_MR_EXT_MEMSEG_EN,
 		MLX5_REPRESENTOR,
 		MLX5_MAX_DUMP_FILES_NUM,
+		MLX5_LRO_TIMEOUT_USEC,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
@@ -1209,8 +1302,6 @@ mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
 	switch (spawn->pci_dev->id.device_id) {
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
 	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
-	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
-	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
 		if (config->txq_inline_min <
 		    (int)MLX5_INLINE_HSIZE_L2) {
 			DRV_LOG(DEBUG,
@@ -1296,6 +1387,7 @@ mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
 		 * and PFC control may be broken, so disable feature.
 		 */
 		config->hw_vlan_insert = 0;
+		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
 		break;
 	default:
 		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
@@ -1305,6 +1397,126 @@ exit:
 	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
 }
 
+/**
+ * Allocate page of door-bells and register it using DevX API.
+ *
+ * @param [in] dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   Pointer to new page on success, NULL otherwise.
+ */
+static struct mlx5_devx_dbr_page *
+mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_devx_dbr_page *page;
+
+	/* Allocate space for door-bell page and management data. */
+	page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
+				 RTE_CACHE_LINE_SIZE, dev->device->numa_node);
+	if (!page) {
+		DRV_LOG(ERR, "port %u cannot allocate dbr page",
+			dev->data->port_id);
+		return NULL;
+	}
+	/* Register allocated memory. */
+	page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
+					      MLX5_DBR_PAGE_SIZE, 0);
+	if (!page->umem) {
+		DRV_LOG(ERR, "port %u cannot umem reg dbr page",
+			dev->data->port_id);
+		rte_free(page);
+		return NULL;
+	}
+	return page;
+}
+
+/**
+ * Find the next available door-bell, allocate new page if needed.
+ *
+ * @param [in] dev
+ *   Pointer to Ethernet device.
+ * @param [out] dbr_page
+ *   Door-bell page the returned record belongs to.
+ *
+ * @return
+ *   Door-bell address offset on success, a negative error value otherwise.
+ */
+int64_t
+mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_devx_dbr_page *page = NULL;
+	uint32_t i, j;
+
+	LIST_FOREACH(page, &priv->dbrpgs, next)
+		if (page->dbr_count < MLX5_DBR_PER_PAGE)
+			break;
+	if (!page) { /* No page with free door-bell exists. */
+		page = mlx5_alloc_dbr_page(dev);
+		if (!page) /* Failed to allocate new page. */
+			return (-1);
+		LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
+	}
+	/* Loop to find bitmap part with clear bit. */
+	for (i = 0;
+	     i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
+	     i++)
+		; /* Empty. */
+	assert(i < (MLX5_DBR_PER_PAGE / 64));
+	/* Find the first clear bit. */
+	j = rte_bsf64(~page->dbr_bitmap[i]);
+	page->dbr_bitmap[i] |= (UINT64_C(1) << j);
+	page->dbr_count++;
+	*dbr_page = page;
+	return (((i * 64) + j) * MLX5_DBR_SIZE);
+}
+
+/**
+ * Release a door-bell record.
+ *
+ * @param [in] dev
+ *   Pointer to Ethernet device.
+ * @param [in] umem_id
+ *   UMEM ID of page containing the door-bell record to release.
+ * @param [in] offset
+ *   Offset of door-bell record in page.
+ *
+ * @return
+ *   0 on success, a negative error value otherwise.
+ */
+int32_t
+mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_devx_dbr_page *page = NULL;
+	int ret = 0;
+
+	LIST_FOREACH(page, &priv->dbrpgs, next)
+		/* Find the page this address belongs to. */
+		if (page->umem->umem_id == umem_id)
+			break;
+	if (!page)
+		return -EINVAL;
+	page->dbr_count--;
+	if (!page->dbr_count) {
+		/* Page not used, free it and remove from list. */
+		LIST_REMOVE(page, next);
+		if (page->umem)
+			ret = -mlx5_glue->devx_umem_dereg(page->umem);
+		rte_free(page);
+	} else {
+		/* Mark in bitmap that this door-bell is not in use. */
+		offset /= MLX5_DBR_SIZE;
+		int i = offset / 64;
+		int j = offset % 64;
+
+		page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
+	}
+	return ret;
+}
+
 /**
  * Spawn an Ethernet device from Verbs information.
  *
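
The bitmap arithmetic in mlx5_get_dbr()/mlx5_release_dbr() above is easy to get wrong: with 64 records tracked per uint64_t word, a plain (1 << j) shift is undefined behavior once j reaches 32, hence the 64-bit constants in the set/clear operations. A standalone sketch of the same find-first-clear scheme, constants hypothetical:

#include <stdint.h>
#include <rte_common.h> /* rte_bsf64() */

#define RECORDS_PER_PAGE 512 /* hypothetical */
#define BITMAP_WORDS (RECORDS_PER_PAGE / 64)

/* Claim the first free record; return its index or -1 if full. */
static int
bitmap_claim(uint64_t bitmap[BITMAP_WORDS])
{
	unsigned int i, j;

	for (i = 0; i < BITMAP_WORDS && bitmap[i] == UINT64_MAX; i++)
		; /* Skip fully used words. */
	if (i == BITMAP_WORDS)
		return -1;
	j = rte_bsf64(~bitmap[i]);	/* First clear bit in the word. */
	bitmap[i] |= UINT64_C(1) << j;	/* 64-bit shift, not (1 << j). */
	return i * 64 + j;
}
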
@@ -1351,6 +1563,9 @@
 	int own_domain_id = 0;
 	uint16_t port_id;
 	unsigned int i;
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+	struct mlx5dv_devx_port devx_port;
+#endif
 
 	/* Determine if this port representor is supposed to be spawned. */
 	if (switch_info->representor && dpdk_dev->devargs) {
@@ -1373,11 +1588,23 @@
 		}
 	}
 	/* Build device name. */
-	if (!switch_info->representor)
-		strlcpy(name, dpdk_dev->name, sizeof(name));
-	else
-		snprintf(name, sizeof(name), "%s_representor_%u",
-			 dpdk_dev->name, switch_info->port_name);
+	if (spawn->pf_bond < 0) {
+		/* Single device. */
+		if (!switch_info->representor)
+			strlcpy(name, dpdk_dev->name, sizeof(name));
+		else
+			snprintf(name, sizeof(name), "%s_representor_%u",
+				 dpdk_dev->name, switch_info->port_name);
+	} else {
+		/* Bonding device. */
+		if (!switch_info->representor)
+			snprintf(name, sizeof(name), "%s_%s",
+				 dpdk_dev->name, spawn->ibv_dev->name);
+		else
+			snprintf(name, sizeof(name), "%s_%s_representor_%u",
+				 dpdk_dev->name, spawn->ibv_dev->name,
+				 switch_info->port_name);
+	}
 	/* check if the device is already spawned */
 	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
 		rte_errno = EEXIST;
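
To illustrate the naming scheme above with hypothetical names: a bonding master probed as PCI device 0000:82:00.0 through IB device mlx5_bond_0 is registered as "0000:82:00.0_mlx5_bond_0", and its VF 2 representor as "0000:82:00.0_mlx5_bond_0_representor_2"; without bonding the names remain "0000:82:00.0" and "0000:82:00.0_representor_2" as before.
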
@@ -1417,6 +1644,9 @@
 	if (!sh)
 		return NULL;
 	config.devx = sh->devx;
+#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
+	config.dest_tir = 1;
+#endif
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -1541,6 +1771,7 @@
 	}
 	priv->sh = sh;
 	priv->ibv_port = spawn->ibv_port;
+	priv->pci_dev = spawn->pci_dev;
 	priv->mtu = RTE_ETHER_MTU;
 #ifndef RTE_ARCH_64
 	/* Initialize UAR access locks for 32bit implementations. */
@@ -1555,8 +1786,56 @@
 	priv->representor = !!switch_info->representor;
 	priv->master = !!switch_info->master;
 	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+	priv->vport_meta_tag = 0;
+	priv->vport_meta_mask = 0;
+#ifdef HAVE_MLX5DV_DR_DEVX_PORT
+	/*
+	 * The DevX port query API is implemented. E-Switch may use
+	 * either vport or reg_c[0] metadata register to match on
+	 * vport index. The engaged part of metadata register is
+	 * defined by mask.
+	 */
+	devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
+			      MLX5DV_DEVX_PORT_MATCH_REG_C_0;
+	err = mlx5dv_query_devx_port(sh->ctx, spawn->ibv_port, &devx_port);
+	if (err) {
+		DRV_LOG(WARNING, "can't query devx port %d on device %s",
+			spawn->ibv_port, spawn->ibv_dev->name);
+		devx_port.comp_mask = 0;
+	}
+	if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
+		priv->vport_meta_tag = devx_port.reg_c_0.value;
+		priv->vport_meta_mask = devx_port.reg_c_0.mask;
+		if (!priv->vport_meta_mask) {
+			DRV_LOG(ERR, "vport zero mask for port %d"
+				" on bonding device %s",
+				spawn->ibv_port, spawn->ibv_dev->name);
+			err = ENOTSUP;
+			goto error;
+		}
+		if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
+			DRV_LOG(ERR, "invalid vport tag for port %d"
+				" on bonding device %s",
+				spawn->ibv_port, spawn->ibv_dev->name);
+			err = ENOTSUP;
+			goto error;
+		}
+	} else if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
+		priv->vport_id = devx_port.vport_num;
+	} else if (spawn->pf_bond >= 0) {
+		DRV_LOG(ERR, "can't deduce vport index for port %d"
+			" on bonding device %s",
+			spawn->ibv_port, spawn->ibv_dev->name);
+		err = ENOTSUP;
+		goto error;
+	} else {
+		/* Deduce the vport index in the compatible (legacy) way. */
+		priv->vport_id = switch_info->representor ?
+				 switch_info->port_name + 1 : -1;
+	}
+#else
 	/*
-	 * Currently we support single E-Switch per PF configurations
+	 * Kernel/rdma-core support single E-Switch per PF configurations
 	 * only and vport_id field contains the vport index for
 	 * associated VF, which is deduced from representor port name.
 	 * For example, let's have the IB device port 10, it has
@@ -1568,18 +1847,20 @@
 	 */
 	priv->vport_id = switch_info->representor ?
 			 switch_info->port_name + 1 : -1;
-	/* representor_id field keeps the unmodified port/VF index. */
+#endif
+	/* representor_id field keeps the unmodified VF index. */
 	priv->representor_id = switch_info->representor ?
 			       switch_info->port_name : -1;
 	/*
 	 * Look for sibling devices in order to reuse their switch domain
 	 * if any, otherwise allocate one.
 	 */
-	RTE_ETH_FOREACH_DEV_OF(port_id, dpdk_dev) {
+	MLX5_ETH_FOREACH_DEV(port_id) {
 		const struct mlx5_priv *opriv =
 			rte_eth_devices[port_id].data->dev_private;
 
 		if (!opriv ||
+		    opriv->sh != priv->sh ||
 		    opriv->domain_id ==
 		    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
 			continue;
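
The two sanity checks above encode a simple invariant: the reg_c[0] mask must be non-zero and the tag must not carry bits outside of it. For example, reg_c_0.value = 0x8 with mask = 0xF is accepted, while value = 0x18 with mask = 0xF fails the tag & ~mask test and the spawn is aborted with ENOTSUP.
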
+ */ + if (!config.lro.timeout) + config.lro.timeout = + config.hca_attr.lro_timer_supported_periods[0]; + DRV_LOG(DEBUG, "LRO session timeout set to %d usec", + config.lro.timeout); + } + } if (config.mprq.enabled && mprq) { if (config.mprq.stride_num_n > mprq_max_stride_num_n || config.mprq.stride_num_n < mprq_min_stride_num_n) { @@ -1706,6 +2017,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; eth_dev->data->representor_id = priv->representor_id; } + /* + * Store associated network device interface index. This index + * is permanent throughout the lifetime of device. So, we may store + * the ifindex here and use the cached value further. + */ + assert(spawn->ifindex); + priv->if_index = spawn->ifindex; eth_dev->data->dev_private = priv; priv->dev_data = eth_dev->data; eth_dev->data->mac_addrs = priv->mac; @@ -1774,23 +2092,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, * Verbs context returned by ibv_open_device(). */ mlx5_link_update(eth_dev, 0); -#ifdef HAVE_IBV_DEVX_OBJ - if (config.devx) { - priv->counter_fallback = 0; - err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); - if (err) { - err = -err; - goto error; - } - if (!config.hca_attr.flow_counters_dump) - priv->counter_fallback = 1; -#ifndef HAVE_IBV_DEVX_ASYNC - priv->counter_fallback = 1; -#endif - if (priv->counter_fallback) - DRV_LOG(INFO, "Use fall-back DV counter management\n"); - } -#endif #ifdef HAVE_MLX5DV_DR_ESWITCH if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && (switch_info->representor || switch_info->master))) @@ -1802,6 +2103,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, mlx5_set_min_inline(spawn, &config); /* Store device configuration on private structure. */ priv->config = config; + /* Create context for virtual machine VLAN workaround. */ + priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex); if (config.dv_flow_en) { err = mlx5_alloc_shared_dr(priv); if (err) @@ -1814,11 +2117,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, goto error; } priv->config.flow_prio = err; - /* Add device to memory callback list. */ - rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); - LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, - sh, mem_event_cb); - rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); return eth_dev; error: if (priv) { @@ -1828,6 +2126,8 @@ error: close(priv->nl_socket_route); if (priv->nl_socket_rdma >= 0) close(priv->nl_socket_rdma); + if (priv->vmwa_context) + mlx5_vlan_vmwa_exit(priv->vmwa_context); if (own_domain_id) claim_zero(rte_eth_switch_domain_free(priv->domain_id)); rte_free(priv); @@ -1884,6 +2184,105 @@ mlx5_dev_spawn_data_cmp(const void *a, const void *b) return si_a->port_name - si_b->port_name; } +/** + * Match PCI information for possible slaves of bonding device. + * + * @param[in] ibv_dev + * Pointer to Infiniband device structure. + * @param[in] pci_dev + * Pointer to PCI device structure to match PCI address. + * @param[in] nl_rdma + * Netlink RDMA group socket handle. + * + * @return + * negative value if no bonding device found, otherwise + * positive index of slave PF in bonding. + */ +static int +mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, + const struct rte_pci_device *pci_dev, + int nl_rdma) +{ + char ifname[IF_NAMESIZE + 1]; + unsigned int ifindex; + unsigned int np, i; + FILE *file = NULL; + int pf = -1; + + /* + * Try to get master device name. If something goes + * wrong suppose the lack of kernel support and no + * bonding devices. 
+ */ + if (nl_rdma < 0) + return -1; + if (!strstr(ibv_dev->name, "bond")) + return -1; + np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); + if (!np) + return -1; + /* + * The Master device might not be on the predefined + * port (not on port index 1, it is not garanted), + * we have to scan all Infiniband device port and + * find master. + */ + for (i = 1; i <= np; ++i) { + /* Check whether Infiniband port is populated. */ + ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); + if (!ifindex) + continue; + if (!if_indextoname(ifindex, ifname)) + continue; + /* Try to read bonding slave names from sysfs. */ + MKSTR(slaves, + "/sys/class/net/%s/master/bonding/slaves", ifname); + file = fopen(slaves, "r"); + if (file) + break; + } + if (!file) + return -1; + /* Use safe format to check maximal buffer length. */ + assert(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); + while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { + char tmp_str[IF_NAMESIZE + 32]; + struct rte_pci_addr pci_addr; + struct mlx5_switch_info info; + + /* Process slave interface names in the loop. */ + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s", ifname); + if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { + DRV_LOG(WARNING, "can not get PCI address" + " for netdev \"%s\"", ifname); + continue; + } + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + /* Slave interface PCI address match found. */ + fclose(file); + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s/phys_port_name", ifname); + file = fopen(tmp_str, "rb"); + if (!file) + break; + info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; + if (fscanf(file, "%32s", tmp_str) == 1) + mlx5_translate_port_name(tmp_str, &info); + if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || + info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) + pf = info.port_name; + break; + } + if (file) + fclose(file); + return pf; +} + /** * DPDK callback to register a PCI device. * @@ -1920,6 +2319,13 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, * Actually this is the number of iterations to spawn. */ unsigned int ns = 0; + /* + * Bonding device + * < 0 - no bonding device (single one) + * >= 0 - bonding device (value is slave PF index) + */ + int bd = -1; + struct mlx5_dev_spawn_data *list = NULL; struct mlx5_dev_config dev_config; int ret; @@ -1942,15 +2348,40 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, * matching ones, gathering into the list. */ struct ibv_device *ibv_match[ret + 1]; - int nl_route = -1; - int nl_rdma = -1; + int nl_route = mlx5_nl_init(NETLINK_ROUTE); + int nl_rdma = mlx5_nl_init(NETLINK_RDMA); unsigned int i; while (ret-- > 0) { struct rte_pci_addr pci_addr; DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); - if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr)) + bd = mlx5_device_bond_pci_match + (ibv_list[ret], pci_dev, nl_rdma); + if (bd >= 0) { + /* + * Bonding device detected. Only one match is allowed, + * the bonding is supported over multi-port IB device, + * there should be no matches on representor PCI + * functions or non VF LAG bonding devices with + * specified address. 
+ */ + if (nd) { + DRV_LOG(ERR, + "multiple PCI match on bonding device" + "\"%s\" found", ibv_list[ret]->name); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + DRV_LOG(INFO, "PCI information matches for" + " slave %d bonding device \"%s\"", + bd, ibv_list[ret]->name); + ibv_match[nd++] = ibv_list[ret]; + break; + } + if (mlx5_dev_to_pci_addr + (ibv_list[ret]->ibdev_path, &pci_addr)) continue; if (pci_dev->addr.domain != pci_addr.domain || pci_dev->addr.bus != pci_addr.bus || @@ -1964,7 +2395,6 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, ibv_match[nd] = NULL; if (!nd) { /* No device matches, just complain and bail out. */ - mlx5_glue->free_device_list(ibv_list); DRV_LOG(WARNING, "no Verbs device matches PCI device " PCI_PRI_FMT "," " are kernel drivers loaded?", @@ -1972,10 +2402,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, pci_dev->addr.devid, pci_dev->addr.function); rte_errno = ENOENT; ret = -rte_errno; - return ret; + goto exit; } - nl_route = mlx5_nl_init(NETLINK_ROUTE); - nl_rdma = mlx5_nl_init(NETLINK_RDMA); if (nd == 1) { /* * Found single matching device may have multiple ports. @@ -1987,14 +2415,42 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, if (!np) DRV_LOG(WARNING, "can not get IB device \"%s\"" " ports number", ibv_match[0]->name); + if (bd >= 0 && !np) { + DRV_LOG(ERR, "can not get ports" + " for bonding device"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } } +#ifndef HAVE_MLX5DV_DR_DEVX_PORT + if (bd >= 0) { + /* + * This may happen if there is VF LAG kernel support and + * application is compiled with older rdma_core library. + */ + DRV_LOG(ERR, + "No kernel/verbs support for VF LAG bonding found."); + rte_errno = ENOTSUP; + ret = -rte_errno; + goto exit; + } +#endif /* * Now we can determine the maximal * amount of devices to be spawned. */ - struct mlx5_dev_spawn_data list[np ? np : nd]; - - if (np > 1) { + list = rte_zmalloc("device spawn data", + sizeof(struct mlx5_dev_spawn_data) * + (np ? np : nd), + RTE_CACHE_LINE_SIZE); + if (!list) { + DRV_LOG(ERR, "spawn data array allocation failure"); + rte_errno = ENOMEM; + ret = -rte_errno; + goto exit; + } + if (bd >= 0 || np > 1) { /* * Single IB device with multiple ports found, * it may be E-Switch master device and representors. 
@@ -2003,12 +2459,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, assert(nl_rdma >= 0); assert(ns == 0); assert(nd == 1); + assert(np); for (i = 1; i <= np; ++i) { list[ns].max_port = np; list[ns].ibv_port = i; list[ns].ibv_dev = ibv_match[0]; list[ns].eth_dev = NULL; list[ns].pci_dev = pci_dev; + list[ns].pf_bond = bd; list[ns].ifindex = mlx5_nl_ifindex (nl_rdma, list[ns].ibv_dev->name, i); if (!list[ns].ifindex) { @@ -2038,6 +2496,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, (list[ns].ifindex, &list[ns].info); } + if (!ret && bd >= 0) { + switch (list[ns].info.name_type) { + case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: + if (list[ns].info.port_name == bd) + ns++; + break; + case MLX5_PHYS_PORT_NAME_TYPE_PFVF: + if (list[ns].info.pf_num == bd) + ns++; + break; + default: + break; + } + continue; + } if (!ret && (list[ns].info.representor ^ list[ns].info.master)) ns++; @@ -2076,6 +2549,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, list[ns].ibv_dev = ibv_match[i]; list[ns].eth_dev = NULL; list[ns].pci_dev = pci_dev; + list[ns].pf_bond = -1; list[ns].ifindex = 0; if (nl_rdma >= 0) list[ns].ifindex = mlx5_nl_ifindex @@ -2191,6 +2665,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF: + case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF: dev_config.vf = 1; break; default: @@ -2240,17 +2716,39 @@ exit: /* * Do the routine cleanup: * - close opened Netlink sockets + * - free allocated spawn data array * - free the Infiniband device list */ if (nl_rdma >= 0) close(nl_rdma); if (nl_route >= 0) close(nl_route); + if (list) + rte_free(list); assert(ibv_list); mlx5_glue->free_device_list(ibv_list); return ret; } +uint16_t +mlx5_eth_find_next(uint16_t port_id) +{ + while (port_id < RTE_MAX_ETHPORTS) { + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + + if (dev->state != RTE_ETH_DEV_UNUSED && + dev->device && + dev->device->driver && + dev->device->driver->name && + !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME)) + break; + port_id++; + } + if (port_id >= RTE_MAX_ETHPORTS) + return RTE_MAX_ETHPORTS; + return port_id; +} + /** * DPDK callback to remove a PCI device. *