1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2020 Mellanox Technologies, Ltd
9 #ifdef RTE_IBVERBS_LINK_DLOPEN
15 #include <rte_errno.h>
16 #include <rte_string_fns.h>
17 #include <rte_bus_pci.h>
18 #include <rte_bus_auxiliary.h>
20 #include "mlx5_common.h"
22 #include "mlx5_common_log.h"
23 #include "mlx5_common_private.h"
24 #include "mlx5_common_defs.h"
25 #include "mlx5_common_os.h"
26 #include "mlx5_glue.h"
/*
 * Pointer to the rdma-core glue table. The Verbs/DV entry points used in
 * this file (alloc_pd, dealloc_pd, dv_init_obj, get_device_list,
 * free_device_list, open_device, dv_open_device, fork_init) are reached
 * through it, and mlx5_glue_constructor() compares its version string
 * against MLX5_GLUE_VERSION.
 */
29 const struct mlx5_glue *mlx5_glue;
/*
 * Read the PCI address of a device from its sysfs uevent file.
 *
 * Opens "<dev_path>/device/uevent" and scans it line by line. A line on
 * which four hexadecimal fields convert is taken as the address; the last
 * field is stored in pci_addr->function.
 * NOTE(review): the other three destinations are presumably the domain, bus
 * and devid members of pci_addr - confirm.
 */
33 mlx5_get_pci_addr(const char *dev_path, struct rte_pci_addr *pci_addr)
/* MKSTR presumably formats the sysfs file name into a local string "path". */
38 MKSTR(path, "%s/device/uevent", dev_path);
40 file = fopen(path, "rb");
/* fgets() returns its buffer on success and NULL at end of file or on error. */
45 while (fgets(line, sizeof(line), file) == line) {
46 size_t len = strlen(line);
/*
 * A line that fills the whole buffer did not fit. The remainder is read
 * one character at a time, each character overwriting the last buffer
 * slot, until that slot holds the newline.
 */
48 /* Truncate long lines. */
49 if (len == (sizeof(line) - 1)) {
50 while (line[(len - 1)] != '\n') {
51 int ret = fgetc(file);
55 line[(len - 1)] = ret;
57 /* No match for long lines. */
/*
 * The pattern is <hex32>:<hex8>:<hex8>.<hex8> followed by a newline;
 * exactly four converted fields means the whole address was parsed.
 */
60 /* Extract information. */
63 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
67 &pci_addr->function) == 4) {
80 * Extract port name, as a number, from sysfs or netlink information.
82 * @param[in] port_name_in
83 * String representing the port name.
84 * @param[out] port_info_out
85 * Port information, including port name as a number and port name
89 * port_name field set according to recognized name format.
/*
 * Classify a port name string and fill in port_info_out.
 *
 * Formats are tried in this order: an optional controller prefix "c<N>",
 * then "pf<N>vf<M>" or "pf<N>sf<M>", "p<N>", "pf<N>", and finally a plain
 * number. The last assignment in the function marks the name as
 * MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN.
 */
92 mlx5_translate_port_name(const char *port_name_in,
93 struct mlx5_switch_info *port_info_out)
95 char ctrl = 0, pf_c1, pf_c2, vf_c1, vf_c2, eol;
/* Optional controller prefix: the letter 'c' followed by a number. */
99 sc_items = sscanf(port_name_in, "%c%d",
100 &ctrl, &port_info_out->ctrl_num);
101 if (sc_items == 2 && ctrl == 'c') {
102 port_name_in++; /* 'c' */
/*
 * snprintf(NULL, 0, ...) writes nothing and only reports how many
 * characters the decimal form of ctrl_num needs; that count is used as
 * the number of digits to skip in the input.
 * NOTE(review): this equals what sscanf() consumed only when the number
 * was written without sign, leading zeros or leading white space.
 */
103 port_name_in += snprintf(NULL, 0, "%d",
104 port_info_out->ctrl_num);
106 /* Check for port-name as a string of the form pf0vf0 or pf0sf0 */
/*
 * The trailing %c (eol) converts only when something follows the second
 * number, so exactly 6 conversions means the name ends right after it.
 */
107 sc_items = sscanf(port_name_in, "%c%c%d%c%c%d%c",
108 &pf_c1, &pf_c2, &port_info_out->pf_num,
109 &vf_c1, &vf_c2, &port_info_out->port_name, &eol);
110 if (sc_items == 6 && pf_c1 == 'p' && pf_c2 == 'f') {
111 if (vf_c1 == 'v' && vf_c2 == 'f') {
112 /* Kernel ver >= 5.0 or OFED ver >= 4.6 */
113 port_info_out->name_type =
114 MLX5_PHYS_PORT_NAME_TYPE_PFVF;
117 if (vf_c1 == 's' && vf_c2 == 'f') {
118 /* Kernel ver >= 5.11 or OFED ver >= 5.1 */
119 port_info_out->name_type =
120 MLX5_PHYS_PORT_NAME_TYPE_PFSF;
125 * Check for port-name as a string of the form p0
126 * (support kernel ver >= 5.0, or OFED ver >= 4.6).
/* Same end-of-string check: 2 conversions means nothing follows the number. */
128 sc_items = sscanf(port_name_in, "%c%d%c",
129 &pf_c1, &port_info_out->port_name, &eol);
130 if (sc_items == 2 && pf_c1 == 'p') {
131 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
135 * Check for port-name as a string of the form pf0
136 * (support kernel ver >= 5.7 for HPF representor on BF).
138 sc_items = sscanf(port_name_in, "%c%c%d%c",
139 &pf_c1, &pf_c2, &port_info_out->pf_num, &eol);
140 if (sc_items == 3 && pf_c1 == 'p' && pf_c2 == 'f') {
/* The pf<N> form has no second number; port_name is set to -1. */
141 port_info_out->port_name = -1;
142 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
145 /* Check for port-name as a number (support kernel ver < 5.0). */
/*
 * Base 0 lets strtol() accept decimal, octal (leading 0) and hexadecimal
 * (leading 0x) input.
 */
147 port_info_out->port_name = strtol(port_name_in, &end, 0);
/* The legacy form is accepted only if the conversion consumed the whole string. */
149 (size_t)(end - port_name_in) == strlen(port_name_in)) {
150 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
153 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
/*
 * Look up a network interface name for an IB device through sysfs.
 *
 * Walks the entries of "<ibdev_path>/device/net", reads a port identifier
 * file for each of them and copies the selected entry name into ifname.
 * ifname must have room for IF_NAMESIZE bytes, the bound used for the final
 * copy.
 */
157 mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname)
/* dev_type selects the file that is read: 0 for "dev_port", non-zero for "dev_id". */
161 unsigned int dev_type = 0;
/* All-ones start value, compared against each port value that is read. */
162 unsigned int dev_port_prev = ~0u;
/* Candidate interface name; the empty string means nothing was selected. */
163 char match[IF_NAMESIZE] = "";
165 MLX5_ASSERT(ibdev_path);
167 MKSTR(path, "%s/device/net", ibdev_path);
175 while ((dent = readdir(dir)) != NULL) {
176 char *name = dent->d_name;
178 unsigned int dev_port;
/* Detect the "." and ".." directory entries (presumably to skip them). */
181 if ((name[0] == '.') &&
182 ((name[1] == '\0') ||
183 ((name[1] == '.') && (name[2] == '\0'))))
186 MKSTR(path, "%s/device/net/%s/%s",
188 (dev_type ? "dev_id" : "dev_port"));
190 file = fopen(path, "rb");
195 * Switch to dev_id when dev_port does not exist as
196 * is the case with Linux kernel versions < 3.15.
/* dev_id is parsed as hexadecimal, dev_port as unsigned decimal. */
207 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
212 * Switch to dev_id when dev_port returns the same value for
213 * all ports. May happen when using a MOFED release older than
214 * 3.0 with a Linux kernel >= 3.15.
216 if (dev_port == dev_port_prev)
218 dev_port_prev = dev_port;
/* strlcpy() keeps match NUL-terminated within IF_NAMESIZE bytes. */
220 strlcpy(match, name, IF_NAMESIZE);
223 if (match[0] == '\0') {
/*
 * match is at most IF_NAMESIZE - 1 characters plus its terminator (see the
 * strlcpy() above), so this bounded copy includes the terminating NUL.
 */
227 strncpy(ifname, match, IF_NAMESIZE);
234 * Suffix RTE_EAL_PMD_PATH with "-glue".
236 * This function performs a sanity check on RTE_EAL_PMD_PATH before
237 * suffixing its last component.
240 * Output buffer, should be large enough otherwise NULL is returned.
245 * Pointer to @p buf or @p NULL in case suffix cannot be appended.
/*
 * Write RTE_EAL_PMD_PATH with "-glue" appended to its last component into
 * buf, after checking that last component against a short list of
 * rejected names.
 */
248 mlx5_glue_path(char *buf, size_t size)
/* Last path components to which no suffix may be appended. */
250 static const char *const bad[] = { "/", ".", "..", NULL };
251 const char *path = RTE_EAL_PMD_PATH;
252 size_t len = strlen(path);
/* Trailing '/' characters are not part of the last component. */
256 while (len && path[len - 1] == '/')
/* Move off back to the first character of the last component. */
258 for (off = len; off && path[off - 1] != '/'; --off)
/*
 * Only len - off characters are compared, so this is a prefix comparison
 * against each rejected name.
 * NOTE(review): as a consequence a last component "." also matches the
 * entry "..", and an empty last component matches every entry.
 */
260 for (i = 0; bad[i]; ++i)
261 if (!strncmp(path + off, bad[i], (int)(len - off)))
/* "%.*s" prints only the first len characters of path, then "-glue". */
263 i = snprintf(buf, size, "%.*s-glue", (int)len, path);
/* snprintf() reports the length it needed; a value >= size means truncation. */
264 if (i == -1 || (size_t)i >= size)
268 RTE_LOG(ERR, PMD, "unable to append \"-glue\" to last component of"
269 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"), please"
270 " re-configure DPDK");
/*
 * Locate and load the rdma-core glue library with dlopen(), then resolve
 * its "mlx5_glue" symbol.
 *
 * Candidate directory lists come from the MLX5_GLUE_PATH environment
 * variable and from the "-glue" variant of RTE_EAL_PMD_PATH.
 */
275 mlx5_glue_dlopen(void)
/*
 * Room for RTE_EAL_PMD_PATH without its terminator plus "-glue" with its
 * terminator.
 */
277 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
280 char const *path[] = {
282 * A basic security check is necessary before trusting
283 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
285 (geteuid() == getuid() && getegid() == getgid() ?
286 getenv("MLX5_GLUE_PATH") : NULL),
288 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
289 * variant, otherwise let dlopen() look up libraries on its
293 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
/* Go through the candidates until one dlopen() call succeeds. */
299 while (!handle && i != RTE_DIM(path)) {
/*
 * A candidate may hold several directories separated by ':' or ';'.
 * end marks where the current directory stops: at the next separator
 * or at the end of the string.
 */
308 end = strpbrk(path[i], ":;");
310 end = path[i] + strlen(path[i]);
/*
 * File name is <directory>[/]MLX5_GLUE; the '/' is left out when the
 * directory part is empty or already ends with one.
 */
316 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
318 (!len || *(end - 1) == '/') ? "" : "/");
/* Compare the buffer size with the formatted length plus terminator. */
321 if (sizeof(name) != (size_t)ret + 1)
323 DRV_LOG(DEBUG, "Looking for rdma-core glue as "
/* RTLD_LAZY: symbols are resolved when first used. */
325 handle = dlopen(name, RTLD_LAZY);
336 DRV_LOG(WARNING, "Cannot load glue library: %s", dlmsg);
/* The library exports its function table under the name "mlx5_glue". */
339 sym = dlsym(handle, "mlx5_glue");
344 DRV_LOG(ERR, "Cannot resolve glue symbol: %s", dlmsg);
359 * Initialization routine for run-time dependency on rdma-core.
/*
 * Prepare the process for using rdma-core: export the environment variables
 * it reads, load the glue through mlx5_glue_dlopen(), check the glue
 * version string and call fork_init().
 */
362 mlx5_glue_constructor(void)
365 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
366 * huge pages. Calling ibv_fork_init() during init allows
367 * applications to use fork() safely for purposes other than
368 * using this PMD, which is not supported in forked processes.
/* Last argument 1: an existing value is overwritten. */
370 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
371 /* Match the size of Rx completion entry to the size of a cacheline. */
372 if (RTE_CACHE_LINE_SIZE == 128)
/* Last argument 0: a value already set in the environment is kept. */
373 setenv("MLX5_CQE_SIZE", "128", 0);
375 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
376 * cleanup all the Verbs resources even when the device was removed.
378 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
381 if (mlx5_glue_dlopen() != 0)
385 #ifdef RTE_LIBRTE_MLX5_DEBUG
386 /* Glue structure must not contain any NULL pointers. */
/*
 * The structure is walked as an array of sizeof(*mlx5_glue) / sizeof(void *)
 * pointers.
 * NOTE(review): this relies on struct mlx5_glue consisting only of
 * pointer-sized members - confirm against its declaration.
 */
390 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
391 MLX5_ASSERT(((const void *const *)mlx5_glue)[i]);
/* A non-zero strcmp() result means the loaded glue has a different version. */
394 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
396 DRV_LOG(ERR, "rdma-core glue \"%s\" mismatch: \"%s\" is "
397 "required", mlx5_glue->version, MLX5_GLUE_VERSION);
400 mlx5_glue->fork_init();
404 DRV_LOG(WARNING, "Cannot initialize MLX5 common due to missing"
405 " run-time dependency on rdma-core libraries (libibverbs,"
411 * Allocate Protection Domain object and extract its pdn using DV API.
414 * Pointer to the mlx5 device.
417 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Allocate a protection domain on cdev->ctx, store it in cdev->pd and, when
 * DevX is in use and DV support is compiled in, store its number in
 * cdev->pdn.
 */
420 mlx5_os_pd_create(struct mlx5_common_device *cdev)
422 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
423 struct mlx5dv_obj obj;
424 struct mlx5dv_pd pd_info;
428 cdev->pd = mlx5_glue->alloc_pd(cdev->ctx);
429 if (cdev->pd == NULL) {
430 DRV_LOG(ERR, "Failed to allocate PD.");
/* Fall back to -ENOMEM when the failed call left errno at zero. */
431 return errno ? -errno : -ENOMEM;
/* Without DevX the steps below are presumably skipped - confirm. */
433 if (cdev->config.devx == 0)
435 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
/* Ask the DV layer to describe the PD; the result lands in pd_info. */
436 obj.pd.in = cdev->pd;
437 obj.pd.out = &pd_info;
438 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
440 DRV_LOG(ERR, "Fail to get PD object info.");
/* Release the PD allocated above on this error path. */
441 mlx5_glue->dealloc_pd(cdev->pd);
445 cdev->pdn = pd_info.pdn;
/* Build without HAVE_IBV_FLOW_DV_SUPPORT: the PD number cannot be queried. */
448 DRV_LOG(ERR, "Cannot get pdn - no DV support.");
450 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
/*
 * Find the Verbs device whose PCI address, as read from sysfs by
 * mlx5_get_pci_addr(), equals addr.
 */
453 static struct ibv_device *
454 mlx5_os_get_ibv_device(const struct rte_pci_addr *addr)
/* n is filled in by get_device_list() (presumably with the device count). */
457 struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
458 struct ibv_device *ibv_match = NULL;
460 if (ibv_list == NULL) {
465 struct rte_pci_addr paddr;
467 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
/* paddr receives the PCI address found under the candidate sysfs path. */
468 if (mlx5_get_pci_addr(ibv_list[n]->ibdev_path, &paddr) != 0)
/* A non-zero comparison result means the addresses differ. */
470 if (rte_pci_addr_cmp(addr, &paddr) != 0)
472 ibv_match = ibv_list[n];
475 if (ibv_match == NULL) {
477 "No Verbs device matches PCI device " PCI_PRI_FMT ","
478 " are kernel drivers loaded?",
479 addr->domain, addr->bus, addr->devid, addr->function);
/*
 * NOTE(review): ibv_match was taken from this list and has not been opened
 * here. The ibv_get_device_list(3) manual warns about pointers to unopened
 * devices after the list is freed - confirm the lifetime the callers rely
 * on.
 */
482 mlx5_glue->free_device_list(ibv_list);
/*
 * Queries the current RoCE enable state of the device named by addr over a
 * generic Netlink socket and requests it to be switched off when it is on.
 */
486 /* Try to disable ROCE by Netlink/Devlink. */
488 mlx5_nl_roce_disable(const char *addr)
490 int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
/* The devlink family id is passed to the get/set calls below. */
497 devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
498 if (devlink_id < 0) {
501 "Failed to get devlink id for ROCE operations by Netlink.");
504 ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
506 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
/* enable == 0: RoCE is reported as already off. */
509 } else if (!enable) {
510 DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
/* Last argument 0 requests the disabled state. */
513 ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
515 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
517 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
/*
 * Reads the current value of the roce_enable attribute of the PCI device
 * named by addr and, when it is non-zero, reopens the attribute for writing
 * and writes "0" to it.
 */
523 /* Try to disable ROCE by sysfs. */
525 mlx5_sys_roce_disable(const char *addr)
531 MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
/* First pass: open read-only to fetch the current setting. */
532 file_o = fopen(file_p, "rb");
537 ret = fscanf(file_o, "%d", &enable);
/* enable == 0: the attribute already reads as disabled. */
542 } else if (!enable) {
544 DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
/* Second pass: the same file is reopened for writing. */
548 file_o = fopen(file_p, "wb");
553 fprintf(file_o, "0\n");
557 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
559 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
/*
 * Disable RoCE for a device, addressed by the PCI address string derived
 * from dev.
 */
565 mlx5_roce_disable(const struct rte_device *dev)
567 char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
/* A negative result means no PCI address string could be produced. */
569 if (mlx5_dev_to_pci_str(dev, pci_addr, sizeof(pci_addr)) < 0)
571 /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
/*
 * Short-circuit evaluation: the sysfs method runs only when the Netlink
 * method returned non-zero, and the condition holds only if both failed.
 */
572 if (mlx5_nl_roce_disable(pci_addr) != 0 &&
573 mlx5_sys_roce_disable(pci_addr) != 0)
/*
 * Look up the Verbs device that backs an rte_device. PCI devices are
 * searched by PCI address; the other lookup shown is the one for auxiliary
 * devices.
 */
578 static struct ibv_device *
579 mlx5_os_get_ibv_dev(const struct rte_device *dev)
581 struct ibv_device *ibv;
583 if (mlx5_dev_is_pci(dev))
584 ibv = mlx5_os_get_ibv_device(&RTE_DEV_TO_PCI_CONST(dev)->addr);
/* Lookup through the auxiliary bus representation of the device. */
586 ibv = mlx5_get_aux_ibv_device(RTE_DEV_TO_AUXILIARY_CONST(dev));
589 DRV_LOG(ERR, "Verbs device not found: %s", dev->name);
/*
 * vDPA variant of the Verbs device lookup: RoCE is disabled first, then the
 * lookup is repeated a bounded number of times with a pause between
 * attempts.
 */
594 static struct ibv_device *
595 mlx5_vdpa_get_ibv_dev(const struct rte_device *dev)
597 struct ibv_device *ibv;
600 if (mlx5_roce_disable(dev) != 0) {
601 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
605 /* Wait for the IB device to appear again after reload. */
/* At most MLX5_VDPA_MAX_RETRIES lookups are made. */
606 for (retry = MLX5_VDPA_MAX_RETRIES; retry > 0; --retry) {
607 ibv = mlx5_os_get_ibv_dev(dev);
/* Pause for MLX5_VDPA_USEC microseconds before trying again. */
610 usleep(MLX5_VDPA_USEC);
613 "Cannot get IB device after disabling RoCE for \"%s\", retries exceed %d.",
614 dev->name, MLX5_VDPA_MAX_RETRIES);
/*
 * Record the current state of the MLX5_SHUT_UP_BF environment variable and
 * then set it according to dbnc. mlx5_os_open_device() passes the result of
 * this call to mlx5_restore_doorbell_mapping_env() afterwards.
 */
620 mlx5_config_doorbell_mapping_env(int dbnc)
625 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
626 /* Get environment variable to store. */
627 env = getenv(MLX5_SHUT_UP_BF);
/*
 * value is MLX5_ARG_UNSET when the variable is absent, 0 when it is exactly
 * "0" and 1 for any other string (presumably this is the value returned).
 */
628 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
/* No explicit request: use the built-in default string. */
629 if (dbnc == MLX5_ARG_UNSET)
630 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
/* Explicit request: "1" for MLX5_TXDB_NCACHED, "0" for anything else. */
632 setenv(MLX5_SHUT_UP_BF,
633 dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
/*
 * Put the MLX5_SHUT_UP_BF environment variable back into the state
 * described by value: removed when value is MLX5_ARG_UNSET, otherwise set
 * to "1" or "0".
 * NOTE(review): the variable is restored as "1" or "0" only, so a different
 * original string is not reproduced verbatim.
 */
638 mlx5_restore_doorbell_mapping_env(int value)
640 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
641 /* Restore the original environment variable state. */
642 if (value == MLX5_ARG_UNSET)
643 unsetenv(MLX5_SHUT_UP_BF);
645 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
649 * Function API to open IB device.
653 * Pointer to the mlx5 device.
655 * Chosen classes come from device arguments.
658 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Locate the Verbs device for cdev->dev and open a context on it, trying
 * the DV open call first and the plain Verbs open call as a fallback. The
 * doorbell mapping environment variable is adjusted around the open calls
 * and restored afterwards on both the success and the failure path.
 */
661 mlx5_os_open_device(struct mlx5_common_device *cdev, uint32_t classes)
663 struct ibv_device *ibv;
664 struct ibv_context *ctx = NULL;
/* When the vDPA class is requested, the lookup that first disables RoCE is used. */
667 if (classes & MLX5_CLASS_VDPA)
668 ibv = mlx5_vdpa_get_ibv_dev(cdev->dev);
670 ibv = mlx5_os_get_ibv_dev(cdev->dev);
673 DRV_LOG(INFO, "Dev information matches for device \"%s\".", ibv->name);
675 * Configure environment variable "MLX5_BF_SHUT_UP" before the device
676 * creation. The rdma_core library checks the variable at device
677 * creation and stores the result internally.
/*
 * NOTE(review): the helper called here operates on the MLX5_SHUT_UP_BF
 * macro; the name quoted in the text above is spelled differently -
 * confirm which one is intended.
 */
679 dbmap_env = mlx5_config_doorbell_mapping_env(cdev->config.dbnc);
680 /* Try to open IB device with DV first, then usual Verbs. */
682 ctx = mlx5_glue->dv_open_device(ibv);
/* Marks DevX as available in the device configuration. */
684 cdev->config.devx = 1;
685 DRV_LOG(DEBUG, "DevX is supported.");
/*
 * Equality test, not a bit test: the plain Verbs fallback is attempted only
 * when MLX5_CLASS_ETH is the sole class requested.
 */
686 } else if (classes == MLX5_CLASS_ETH) {
687 /* The environment variable is still configured. */
688 ctx = mlx5_glue->open_device(ibv);
691 DRV_LOG(DEBUG, "DevX is NOT supported.");
695 /* The device is created, no need for environment. */
696 mlx5_restore_doorbell_mapping_env(dbmap_env);
697 /* Hint libmlx5 to use PMD allocator for data plane resources */
698 mlx5_set_context_attr(cdev->dev, ctx);
/* Fall back to ENODEV when the failed open left errno at zero. */
702 rte_errno = errno ? errno : ENODEV;
703 /* The device creation is failed, no need for environment. */
704 mlx5_restore_doorbell_mapping_env(dbmap_env);
705 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
709 mlx5_get_device_guid(const struct rte_pci_addr *dev, uint8_t *guid, size_t len)
712 char cur_ifname[IF_NAMESIZE + 1];
718 if (guid == NULL || len < sizeof(u_int64_t) + 1)
720 memset(guid, 0, len);
721 snprintf(tmp, sizeof(tmp), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/net",
722 dev->domain, dev->bus, dev->devid, dev->function);
726 /* Traverse to identify PF interface */
729 if (ptr == NULL || ptr->d_type != DT_DIR) {
733 } while (strchr(ptr->d_name, '.') || strchr(ptr->d_name, '_') ||
734 strchr(ptr->d_name, 'v'));
735 snprintf(cur_ifname, sizeof(cur_ifname), "%s", ptr->d_name);
737 snprintf(tmp + strlen(tmp), sizeof(tmp) - strlen(tmp),
738 "/%s/phys_switch_id", cur_ifname);
739 /* Older OFED like 5.3 doesn't support read */
740 id_file = fopen(tmp, "r");
743 ret = fscanf(id_file, "%16s", guid);