X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5.c;h=7e0d7ec21930fc0ca5f771313dcf4ffc9f8ea039;hb=c345c7d1acf43b4d30e1ecdd5a8cd3402234a6aa;hp=be3fb2a3a6f5453c80f0e0c7940a8ca8135fb163;hpb=a62ec9916168a2b09cc4596a23a2c966c46591bc;p=dpdk.git diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index be3fb2a3a6..7e0d7ec219 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -168,6 +168,7 @@ struct mlx5_dev_spawn_data { uint32_t ifindex; /**< Network interface index. */ uint32_t max_port; /**< IB device maximal port index. */ uint32_t ibv_port; /**< IB device physical port index. */ + int pf_bond; /**< bonding device PF index. < 0 - no bonding */ struct mlx5_switch_info info; /**< Switch information. */ struct ibv_device *ibv_dev; /**< Associated IB device. */ struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ @@ -925,7 +926,7 @@ mlx5_dev_close(struct rte_eth_dev *dev) unsigned int c = 0; uint16_t port_id; - RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) { + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { struct mlx5_priv *opriv = rte_eth_devices[port_id].data->dev_private; @@ -934,6 +935,7 @@ mlx5_dev_close(struct rte_eth_dev *dev) &rte_eth_devices[port_id] == dev) continue; ++c; + break; } if (!c) claim_zero(rte_eth_switch_domain_free(priv->domain_id)); @@ -1515,6 +1517,53 @@ mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset) return ret; } +/** + * Check sibling device configurations. + * + * Sibling devices sharing the Infiniband device context + * should have compatible configurations. This regards + * representors and bonding slaves. + * + * @param priv + * Private device descriptor. + * @param config + * Configuration of the device is going to be created. + * + * @return + * 0 on success, EINVAL otherwise + */ +static int +mlx5_dev_check_sibling_config(struct mlx5_priv *priv, + struct mlx5_dev_config *config) +{ + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_dev_config *sh_conf = NULL; + uint16_t port_id; + + assert(sh); + /* Nothing to compare for the single/first device. */ + if (sh->refcnt == 1) + return 0; + /* Find the device with shared context. */ + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { + struct mlx5_priv *opriv = + rte_eth_devices[port_id].data->dev_private; + + if (opriv && opriv != priv && opriv->sh == sh) { + sh_conf = &opriv->config; + break; + } + } + if (!sh_conf) + return 0; + if (sh_conf->dv_flow_en ^ config->dv_flow_en) { + DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch" + " for shared %s context", sh->ibdev_name); + rte_errno = EINVAL; + return rte_errno; + } + return 0; +} /** * Spawn an Ethernet device from Verbs information. * @@ -1561,6 +1610,9 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, int own_domain_id = 0; uint16_t port_id; unsigned int i; +#ifdef HAVE_MLX5DV_DR_DEVX_PORT + struct mlx5dv_devx_port devx_port; +#endif /* Determine if this port representor is supposed to be spawned. */ if (switch_info->representor && dpdk_dev->devargs) { @@ -1583,11 +1635,23 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, } } /* Build device name. */ - if (!switch_info->representor) - strlcpy(name, dpdk_dev->name, sizeof(name)); - else - snprintf(name, sizeof(name), "%s_representor_%u", - dpdk_dev->name, switch_info->port_name); + if (spawn->pf_bond < 0) { + /* Single device. */ + if (!switch_info->representor) + strlcpy(name, dpdk_dev->name, sizeof(name)); + else + snprintf(name, sizeof(name), "%s_representor_%u", + dpdk_dev->name, switch_info->port_name); + } else { + /* Bonding device. */ + if (!switch_info->representor) + snprintf(name, sizeof(name), "%s_%s", + dpdk_dev->name, spawn->ibv_dev->name); + else + snprintf(name, sizeof(name), "%s_%s_representor_%u", + dpdk_dev->name, spawn->ibv_dev->name, + switch_info->port_name); + } /* check if the device is already spawned */ if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { rte_errno = EEXIST; @@ -1769,8 +1833,57 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, priv->representor = !!switch_info->representor; priv->master = !!switch_info->master; priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + priv->vport_meta_tag = 0; + priv->vport_meta_mask = 0; + priv->pf_bond = spawn->pf_bond; +#ifdef HAVE_MLX5DV_DR_DEVX_PORT + /* + * The DevX port query API is implemented. E-Switch may use + * either vport or reg_c[0] metadata register to match on + * vport index. The engaged part of metadata register is + * defined by mask. + */ + devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT | + MLX5DV_DEVX_PORT_MATCH_REG_C_0; + err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port, &devx_port); + if (err) { + DRV_LOG(WARNING, "can't query devx port %d on device %s\n", + spawn->ibv_port, spawn->ibv_dev->name); + devx_port.comp_mask = 0; + } + if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) { + priv->vport_meta_tag = devx_port.reg_c_0.value; + priv->vport_meta_mask = devx_port.reg_c_0.mask; + if (!priv->vport_meta_mask) { + DRV_LOG(ERR, "vport zero mask for port %d" + " on bonding device %s\n", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } + if (priv->vport_meta_tag & ~priv->vport_meta_mask) { + DRV_LOG(ERR, "invalid vport tag for port %d" + " on bonding device %s\n", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } + } else if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) { + priv->vport_id = devx_port.vport_num; + } else if (spawn->pf_bond >= 0) { + DRV_LOG(ERR, "can't deduce vport index for port %d" + " on bonding device %s\n", + spawn->ibv_port, spawn->ibv_dev->name); + err = ENOTSUP; + goto error; + } else { + /* Suppose vport index in compatible way. */ + priv->vport_id = switch_info->representor ? + switch_info->port_name + 1 : -1; + } +#else /* - * Currently we support single E-Switch per PF configurations + * Kernel/rdma_core support single E-Switch per PF configurations * only and vport_id field contains the vport index for * associated VF, which is deduced from representor port name. * For example, let's have the IB device port 10, it has @@ -1782,18 +1895,20 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, */ priv->vport_id = switch_info->representor ? switch_info->port_name + 1 : -1; - /* representor_id field keeps the unmodified port/VF index. */ +#endif + /* representor_id field keeps the unmodified VF index. */ priv->representor_id = switch_info->representor ? switch_info->port_name : -1; /* * Look for sibling devices in order to reuse their switch domain * if any, otherwise allocate one. */ - RTE_ETH_FOREACH_DEV_OF(port_id, dpdk_dev) { + MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) { const struct mlx5_priv *opriv = rte_eth_devices[port_id].data->dev_private; if (!opriv || + opriv->sh != priv->sh || opriv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) continue; @@ -1817,6 +1932,9 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, strerror(rte_errno)); goto error; } + err = mlx5_dev_check_sibling_config(priv, &config); + if (err) + goto error; config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & IBV_DEVICE_RAW_IP_CSUM); DRV_LOG(DEBUG, "checksum offloading is %ssupported", @@ -2117,6 +2235,105 @@ mlx5_dev_spawn_data_cmp(const void *a, const void *b) return si_a->port_name - si_b->port_name; } +/** + * Match PCI information for possible slaves of bonding device. + * + * @param[in] ibv_dev + * Pointer to Infiniband device structure. + * @param[in] pci_dev + * Pointer to PCI device structure to match PCI address. + * @param[in] nl_rdma + * Netlink RDMA group socket handle. + * + * @return + * negative value if no bonding device found, otherwise + * positive index of slave PF in bonding. + */ +static int +mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev, + const struct rte_pci_device *pci_dev, + int nl_rdma) +{ + char ifname[IF_NAMESIZE + 1]; + unsigned int ifindex; + unsigned int np, i; + FILE *file = NULL; + int pf = -1; + + /* + * Try to get master device name. If something goes + * wrong suppose the lack of kernel support and no + * bonding devices. + */ + if (nl_rdma < 0) + return -1; + if (!strstr(ibv_dev->name, "bond")) + return -1; + np = mlx5_nl_portnum(nl_rdma, ibv_dev->name); + if (!np) + return -1; + /* + * The Master device might not be on the predefined + * port (not on port index 1, it is not garanted), + * we have to scan all Infiniband device port and + * find master. + */ + for (i = 1; i <= np; ++i) { + /* Check whether Infiniband port is populated. */ + ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i); + if (!ifindex) + continue; + if (!if_indextoname(ifindex, ifname)) + continue; + /* Try to read bonding slave names from sysfs. */ + MKSTR(slaves, + "/sys/class/net/%s/master/bonding/slaves", ifname); + file = fopen(slaves, "r"); + if (file) + break; + } + if (!file) + return -1; + /* Use safe format to check maximal buffer length. */ + assert(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE); + while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) { + char tmp_str[IF_NAMESIZE + 32]; + struct rte_pci_addr pci_addr; + struct mlx5_switch_info info; + + /* Process slave interface names in the loop. */ + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s", ifname); + if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) { + DRV_LOG(WARNING, "can not get PCI address" + " for netdev \"%s\"", ifname); + continue; + } + if (pci_dev->addr.domain != pci_addr.domain || + pci_dev->addr.bus != pci_addr.bus || + pci_dev->addr.devid != pci_addr.devid || + pci_dev->addr.function != pci_addr.function) + continue; + /* Slave interface PCI address match found. */ + fclose(file); + snprintf(tmp_str, sizeof(tmp_str), + "/sys/class/net/%s/phys_port_name", ifname); + file = fopen(tmp_str, "rb"); + if (!file) + break; + info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET; + if (fscanf(file, "%32s", tmp_str) == 1) + mlx5_translate_port_name(tmp_str, &info); + if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY || + info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) + pf = info.port_name; + break; + } + if (file) + fclose(file); + return pf; +} + /** * DPDK callback to register a PCI device. * @@ -2153,6 +2370,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, * Actually this is the number of iterations to spawn. */ unsigned int ns = 0; + /* + * Bonding device + * < 0 - no bonding device (single one) + * >= 0 - bonding device (value is slave PF index) + */ + int bd = -1; struct mlx5_dev_spawn_data *list = NULL; struct mlx5_dev_config dev_config; int ret; @@ -2184,6 +2407,30 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_addr pci_addr; DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); + bd = mlx5_device_bond_pci_match + (ibv_list[ret], pci_dev, nl_rdma); + if (bd >= 0) { + /* + * Bonding device detected. Only one match is allowed, + * the bonding is supported over multi-port IB device, + * there should be no matches on representor PCI + * functions or non VF LAG bonding devices with + * specified address. + */ + if (nd) { + DRV_LOG(ERR, + "multiple PCI match on bonding device" + "\"%s\" found", ibv_list[ret]->name); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + DRV_LOG(INFO, "PCI information matches for" + " slave %d bonding device \"%s\"", + bd, ibv_list[ret]->name); + ibv_match[nd++] = ibv_list[ret]; + break; + } if (mlx5_dev_to_pci_addr (ibv_list[ret]->ibdev_path, &pci_addr)) continue; @@ -2219,7 +2466,27 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, if (!np) DRV_LOG(WARNING, "can not get IB device \"%s\"" " ports number", ibv_match[0]->name); + if (bd >= 0 && !np) { + DRV_LOG(ERR, "can not get ports" + " for bonding device"); + rte_errno = ENOENT; + ret = -rte_errno; + goto exit; + } + } +#ifndef HAVE_MLX5DV_DR_DEVX_PORT + if (bd >= 0) { + /* + * This may happen if there is VF LAG kernel support and + * application is compiled with older rdma_core library. + */ + DRV_LOG(ERR, + "No kernel/verbs support for VF LAG bonding found."); + rte_errno = ENOTSUP; + ret = -rte_errno; + goto exit; } +#endif /* * Now we can determine the maximal * amount of devices to be spawned. @@ -2234,7 +2501,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, ret = -rte_errno; goto exit; } - if (np > 1) { + if (bd >= 0 || np > 1) { /* * Single IB device with multiple ports found, * it may be E-Switch master device and representors. @@ -2243,12 +2510,14 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, assert(nl_rdma >= 0); assert(ns == 0); assert(nd == 1); + assert(np); for (i = 1; i <= np; ++i) { list[ns].max_port = np; list[ns].ibv_port = i; list[ns].ibv_dev = ibv_match[0]; list[ns].eth_dev = NULL; list[ns].pci_dev = pci_dev; + list[ns].pf_bond = bd; list[ns].ifindex = mlx5_nl_ifindex (nl_rdma, list[ns].ibv_dev->name, i); if (!list[ns].ifindex) { @@ -2278,6 +2547,21 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, (list[ns].ifindex, &list[ns].info); } + if (!ret && bd >= 0) { + switch (list[ns].info.name_type) { + case MLX5_PHYS_PORT_NAME_TYPE_UPLINK: + if (list[ns].info.port_name == bd) + ns++; + break; + case MLX5_PHYS_PORT_NAME_TYPE_PFVF: + if (list[ns].info.pf_num == bd) + ns++; + break; + default: + break; + } + continue; + } if (!ret && (list[ns].info.representor ^ list[ns].info.master)) ns++; @@ -2316,6 +2600,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, list[ns].ibv_dev = ibv_match[i]; list[ns].eth_dev = NULL; list[ns].pci_dev = pci_dev; + list[ns].pf_bond = -1; list[ns].ifindex = 0; if (nl_rdma >= 0) list[ns].ifindex = mlx5_nl_ifindex @@ -2496,6 +2781,41 @@ exit: return ret; } +/** + * Look for the ethernet device belonging to mlx5 driver. + * + * @param[in] port_id + * port_id to start looking for device. + * @param[in] pci_dev + * Pointer to the hint PCI device. When device is being probed + * the its siblings (master and preceding representors might + * not have assigned driver yet (because the mlx5_pci_probe() + * is not completed yet, for this case match on hint PCI + * device may be used to detect sibling device. + * + * @return + * port_id of found device, RTE_MAX_ETHPORT if not found. + */ +uint16_t +mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev) +{ + while (port_id < RTE_MAX_ETHPORTS) { + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; + + if (dev->state != RTE_ETH_DEV_UNUSED && + dev->device && + (dev->device == &pci_dev->device || + (dev->device->driver && + dev->device->driver->name && + !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME)))) + break; + port_id++; + } + if (port_id >= RTE_MAX_ETHPORTS) + return RTE_MAX_ETHPORTS; + return port_id; +} + /** * DPDK callback to remove a PCI device. *