net/mlx5: add port representor awareness
author Adrien Mazarguil <adrien.mazarguil@6wind.com>
Tue, 10 Jul 2018 16:04:52 +0000 (18:04 +0200)
committer Shahaf Shuler <shahafs@mellanox.com>
Wed, 11 Jul 2018 13:37:14 +0000 (15:37 +0200)
The current PCI probing method is not aware of Verbs port representors,
which appear as standard Verbs devices bound to the same PCI address and
cannot be distinguished.

The problem is that, more often than not, the wrong Verbs device is used,
resulting in unexpected traffic.

This patch makes the driver discard representors to only use the master
device. If unable to identify it (e.g. kernel drivers not recent enough),
either:

- There is only one matching device which isn't identified as a
  representor, in that case use it.
- Otherwise log an error and do not probe the device.

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Reviewed-by: Xueming Li <xuemingl@mellanox.com>
drivers/net/mlx5/Makefile
drivers/net/mlx5/mlx5.c
drivers/net/mlx5/mlx5.h
drivers/net/mlx5/mlx5_nl.c

index 955861a..9e27496 100644 (file)
@@ -152,6 +152,51 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
                infiniband/verbs.h \
                enum IBV_FLOW_SPEC_ACTION_COUNT \
                $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NL_NLDEV \
+               rdma/rdma_netlink.h \
+               enum RDMA_NL_NLDEV \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NLDEV_CMD_GET \
+               rdma/rdma_netlink.h \
+               enum RDMA_NLDEV_CMD_GET \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NLDEV_CMD_PORT_GET \
+               rdma/rdma_netlink.h \
+               enum RDMA_NLDEV_CMD_PORT_GET \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NLDEV_ATTR_DEV_INDEX \
+               rdma/rdma_netlink.h \
+               enum RDMA_NLDEV_ATTR_DEV_INDEX \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NLDEV_ATTR_DEV_NAME \
+               rdma/rdma_netlink.h \
+               enum RDMA_NLDEV_ATTR_DEV_NAME \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NLDEV_ATTR_PORT_INDEX \
+               rdma/rdma_netlink.h \
+               enum RDMA_NLDEV_ATTR_PORT_INDEX \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX \
+               rdma/rdma_netlink.h \
+               enum RDMA_NLDEV_ATTR_NDEV_INDEX \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_IFLA_PHYS_SWITCH_ID \
+               linux/if_link.h \
+               enum IFLA_PHYS_SWITCH_ID \
+               $(AUTOCONF_OUTPUT)
+       $Q sh -- '$<' '$@' \
+               HAVE_IFLA_PHYS_PORT_NAME \
+               linux/if_link.h \
+               enum IFLA_PHYS_PORT_NAME \
+               $(AUTOCONF_OUTPUT)
 
 # Create mlx5_autoconf.h or update it in case it differs from the new one.
 
index 3a44b04..958d60a 100644 (file)
@@ -13,6 +13,7 @@
 #include <errno.h>
 #include <net/if.h>
 #include <sys/mman.h>
+#include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
 /* Verbs header. */
@@ -274,8 +275,10 @@ mlx5_dev_close(struct rte_eth_dev *dev)
                mlx5_socket_uninit(dev);
        if (priv->config.vf)
                mlx5_nl_mac_addr_flush(dev);
-       if (priv->nl_socket >= 0)
-               close(priv->nl_socket);
+       if (priv->nl_socket_route >= 0)
+               close(priv->nl_socket_route);
+       if (priv->nl_socket_rdma >= 0)
+               close(priv->nl_socket_rdma);
        ret = mlx5_hrxq_ibv_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -876,6 +879,10 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
        priv->device_attr = attr;
        priv->pd = pd;
        priv->mtu = ETHER_MTU;
+       /* Some internal functions rely on Netlink sockets, open them now. */
+       priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
+       priv->nl_socket_route = mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE);
+       priv->nl_sn = 0;
        err = mlx5_args(&config, dpdk_dev->devargs);
        if (err) {
                err = rte_errno;
@@ -1010,14 +1017,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
        eth_dev->dev_ops = &mlx5_dev_ops;
        /* Register MAC address. */
        claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
-       priv->nl_socket = -1;
-       priv->nl_sn = 0;
-       if (vf && config.vf_nl_en) {
-               priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
-               if (priv->nl_socket < 0)
-                       priv->nl_socket = -1;
+       if (vf && config.vf_nl_en)
                mlx5_nl_mac_addr_sync(eth_dev);
-       }
        TAILQ_INIT(&priv->flows);
        TAILQ_INIT(&priv->ctrl_flows);
        /* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1078,8 +1079,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
        rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
        return eth_dev;
 error:
-       if (priv)
+       if (priv) {
+               if (priv->nl_socket_route >= 0)
+                       close(priv->nl_socket_route);
+               if (priv->nl_socket_rdma >= 0)
+                       close(priv->nl_socket_rdma);
                rte_free(priv);
+       }
        if (pd)
                claim_zero(mlx5_glue->dealloc_pd(pd));
        if (eth_dev)
@@ -1110,6 +1116,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 {
        struct ibv_device **ibv_list;
        struct rte_eth_dev *eth_dev = NULL;
+       unsigned int n = 0;
        int vf;
        int ret;
 
@@ -1121,6 +1128,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
                return -rte_errno;
        }
+
+       struct ibv_device *ibv_match[ret + 1];
+
        while (ret-- > 0) {
                struct rte_pci_addr pci_addr;
 
@@ -1132,10 +1142,81 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                    pci_dev->addr.devid != pci_addr.devid ||
                    pci_dev->addr.function != pci_addr.function)
                        continue;
-               DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+               DRV_LOG(INFO, "PCI information matches for device \"%s\"",
                        ibv_list[ret]->name);
+               ibv_match[n++] = ibv_list[ret];
+       }
+       ibv_match[n] = NULL;
+
+       unsigned int ifindex[n];
+       struct mlx5_switch_info info[n];
+       int nl_route = n ? mlx5_nl_init(0, NETLINK_ROUTE) : -1;
+       int nl_rdma = n ? mlx5_nl_init(0, NETLINK_RDMA) : -1;
+       unsigned int i;
+
+       /*
+        * The existence of several matching entries (n > 1) means port
+        * representors have been instantiated. No existing Verbs call nor
+        * /sys entries can tell them apart, this can only be done through
+        * Netlink calls assuming kernel drivers are recent enough to
+        * support them.
+        *
+        * In the event of identification failure through Netlink, either:
+        *
+        * 1. No device matches (n == 0), complain and bail out.
+        * 2. A single IB device matches (n == 1) and is not a representor,
+        *    assume no switch support.
+        * 3. Otherwise no safe assumptions can be made; complain louder and
+        *    bail out.
+        */
+       for (i = 0; i != n; ++i) {
+               if (nl_rdma < 0)
+                       ifindex[i] = 0;
+               else
+                       ifindex[i] = mlx5_nl_ifindex(nl_rdma,
+                                                    ibv_match[i]->name);
+               if (nl_route < 0 ||
+                   !ifindex[i] ||
+                   mlx5_nl_switch_info(nl_route, ifindex[i], &info[i])) {
+                       ifindex[i] = 0;
+                       memset(&info[i], 0, sizeof(info[i]));
+                       continue;
+               }
+       }
+       if (nl_rdma >= 0)
+               close(nl_rdma);
+       if (nl_route >= 0)
+               close(nl_route);
+       /* Look for master device. */
+       for (i = 0; i != n; ++i) {
+               if (!info[i].master)
+                       continue;
+               /* Make it the first entry. */
+               if (i == 0)
+                       break;
+               ibv_match[n] = ibv_match[0];
+               ibv_match[0] = ibv_match[i];
+               ibv_match[n] = NULL;
                break;
        }
+       if (n && i == n) {
+               if (n == 1 && !info[0].representor) {
+                       /* Case #2. */
+                       DRV_LOG(INFO, "no switch support detected");
+               } else if (n == 1) {
+                       /* Case #3. */
+                       DRV_LOG(ERR,
+                               "device looks like a port representor, this is"
+                               " not supported yet");
+                       n = 0;
+               } else {
+                       /* Case #3. */
+                       DRV_LOG(ERR,
+                               "unable to tell which of the matching devices"
+                               " is the master (lack of kernel support?)");
+                       n = 0;
+               }
+       }
        switch (pci_dev->id.device_id) {
        case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
        case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
@@ -1146,10 +1227,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
        default:
                vf = 0;
        }
-       if (ret >= 0)
-               eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+       if (n)
+               eth_dev = mlx5_dev_spawn(&pci_dev->device, ibv_match[0], vf);
        mlx5_glue->free_device_list(ibv_list);
-       if (!ret) {
+       if (!n) {
                DRV_LOG(WARNING,
                        "no Verbs device matches PCI device " PCI_PRI_FMT ","
                        " are kernel drivers loaded?",
index f55ff4a..7040462 100644 (file)
@@ -53,6 +53,14 @@ enum {
        PCI_DEVICE_ID_MELLANOX_CONNECTX5BF = 0xa2d2,
 };
 
+/**
+ * Switch information returned by mlx5_nl_switch_info().
+ *
+ * Filled from the IFLA_PHYS_SWITCH_ID and IFLA_PHYS_PORT_NAME attributes of
+ * a network interface; every field is zero when neither is reported.
+ */
+struct mlx5_switch_info {
+       uint32_t master:1; /**< Master device (switch ID without port name). */
+       uint32_t representor:1; /**< Representor device (switch ID and port name). */
+       int32_t port_name; /**< Representor port name, parsed as an integer. */
+       uint64_t switch_id; /**< Switch identifier, first byte most significant. */
+};
+
 LIST_HEAD(mlx5_dev_list, priv);
 
 /* Shared memory between primary and secondary processes. */
@@ -195,7 +203,8 @@ struct priv {
        struct mlx5_dev_config config; /* Device configuration. */
        struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
        /* Context for Verbs allocator. */
-       int nl_socket; /* Netlink socket. */
+       int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
+       int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
        uint32_t nl_sn; /* Netlink message sequence number. */
 };
 
@@ -342,7 +351,7 @@ int mlx5_socket_connect(struct rte_eth_dev *priv);
 
 /* mlx5_nl.c */
 
-int mlx5_nl_init(uint32_t nlgroups);
+int mlx5_nl_init(uint32_t nlgroups, int protocol);
 int mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
                         uint32_t index);
 int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
@@ -351,5 +360,8 @@ void mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev);
 void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev);
 int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable);
 int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable);
+unsigned int mlx5_nl_ifindex(int nl, const char *name);
+int mlx5_nl_switch_info(int nl, unsigned int ifindex,
+                       struct mlx5_switch_info *info);
 
 #endif /* RTE_PMD_MLX5_H_ */
index dca8583..008cd2c 100644 (file)
@@ -3,10 +3,21 @@
  * Copyright 2018 Mellanox Technologies, Ltd
  */
 
+#include <errno.h>
+#include <linux/if_link.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <rdma/rdma_netlink.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
 #include <unistd.h>
 
+#include <rte_errno.h>
+
 #include "mlx5.h"
 #include "mlx5_utils.h"
 
        ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
 #endif
 
+/*
+ * The following definitions are normally found in rdma/rdma_netlink.h,
+ * however they are so recent that most systems do not expose them yet.
+ *
+ * Each fallback value is only defined when the matching HAVE_* macro is
+ * absent. Those macros are presumably produced by the build-time header
+ * checks (one per enumerator) -- confirm against the generated autoconf
+ * header.
+ */
+#ifndef HAVE_RDMA_NL_NLDEV
+#define RDMA_NL_NLDEV 5
+#endif
+#ifndef HAVE_RDMA_NLDEV_CMD_GET
+#define RDMA_NLDEV_CMD_GET 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
+#define RDMA_NLDEV_CMD_PORT_GET 5
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
+#define RDMA_NLDEV_ATTR_DEV_INDEX 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
+#define RDMA_NLDEV_ATTR_DEV_NAME 2
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
+#define RDMA_NLDEV_ATTR_PORT_INDEX 3
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
+#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
+#endif
+
+/*
+ * These are normally found in linux/if_link.h; same fallback scheme as
+ * above.
+ */
+#ifndef HAVE_IFLA_PHYS_SWITCH_ID
+#define IFLA_PHYS_SWITCH_ID 36
+#endif
+#ifndef HAVE_IFLA_PHYS_PORT_NAME
+#define IFLA_PHYS_PORT_NAME 38
+#endif
+
 /* Add/remove MAC address through Netlink */
 struct mlx5_nl_mac_addr {
        struct ether_addr (*mac)[];
@@ -34,18 +79,27 @@ struct mlx5_nl_mac_addr {
        int mac_n; /**< Number of addresses in the array. */
 };
 
+/**
+ * Data structure used by mlx5_nl_ifindex_cb().
+ *
+ * The callback only writes the output fields for messages whose device name
+ * attribute equals @p name; other messages leave them untouched.
+ */
+struct mlx5_nl_ifindex_data {
+       const char *name; /**< IB device name (in). */
+       uint32_t ibindex; /**< IB device index (out). */
+       uint32_t ifindex; /**< Network interface index (out). */
+};
+
 /**
  * Opens a Netlink socket.
  *
  * @param nl_groups
  *   Netlink group value (e.g. RTMGRP_LINK).
+ * @param protocol
+ *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
  *
  * @return
  *   A file descriptor on success, a negative errno value otherwise and
  *   rte_errno is set.
  */
 int
-mlx5_nl_init(uint32_t nl_groups)
+mlx5_nl_init(uint32_t nl_groups, int protocol)
 {
        int fd;
        int sndbuf_size = MLX5_SEND_BUF_SIZE;
@@ -56,7 +110,7 @@ mlx5_nl_init(uint32_t nl_groups)
        };
        int ret;
 
-       fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+       fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
        if (fd == -1) {
                rte_errno = errno;
                return -rte_errno;
@@ -334,9 +388,9 @@ mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
        int ret;
        uint32_t sn = priv->nl_sn++;
 
-       if (priv->nl_socket == -1)
+       if (priv->nl_socket_route == -1)
                return 0;
-       fd = priv->nl_socket;
+       fd = priv->nl_socket_route;
        ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
                              sizeof(struct ifinfomsg));
        if (ret < 0)
@@ -398,9 +452,9 @@ mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
        int ret;
        uint32_t sn = priv->nl_sn++;
 
-       if (priv->nl_socket == -1)
+       if (priv->nl_socket_route == -1)
                return 0;
-       fd = priv->nl_socket;
+       fd = priv->nl_socket_route;
        memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
        req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
                RTA_ALIGN(req.rta.rta_len);
@@ -569,9 +623,9 @@ mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
        int ret;
 
        assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
-       if (priv->nl_socket < 0)
+       if (priv->nl_socket_route < 0)
                return 0;
-       fd = priv->nl_socket;
+       fd = priv->nl_socket_route;
        ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
        if (ret < 0)
                return ret;
@@ -625,3 +679,241 @@ mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
                        strerror(rte_errno));
        return ret;
 }
+
+/**
+ * Process network interface information from Netlink message.
+ *
+ * Walks the attributes of a RDMA_NLDEV_CMD_GET or RDMA_NLDEV_CMD_PORT_GET
+ * message. When the message carries a device name equal to the one requested
+ * through @p arg, the IB device index and network interface index found in
+ * that same message are stored in @p arg; messages describing other devices
+ * leave it untouched.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback (struct mlx5_nl_ifindex_data).
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set
+ *   (EINVAL for an unexpected message type or a malformed attribute).
+ */
+static int
+mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg)
+{
+       struct mlx5_nl_ifindex_data *data = arg;
+       size_t off = NLMSG_HDRLEN;
+       uint32_t ibindex = 0;
+       uint32_t ifindex = 0;
+       int found = 0;
+
+       if (nh->nlmsg_type !=
+           RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
+           nh->nlmsg_type !=
+           RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
+               goto error;
+       while (off < nh->nlmsg_len) {
+               struct nlattr *na = (void *)((uintptr_t)nh + off);
+               void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
+               size_t left = nh->nlmsg_len - off;
+               size_t len;
+
+               /*
+                * Reject truncated or undersized attributes. A zero length
+                * would never advance the offset (endless loop) and anything
+                * below the header size has no valid payload.
+                */
+               if (left < (size_t)NLA_HDRLEN ||
+                   na->nla_len < NLA_HDRLEN ||
+                   na->nla_len > left)
+                       goto error;
+               len = na->nla_len - NLA_HDRLEN;
+               switch (na->nla_type) {
+               case RDMA_NLDEV_ATTR_DEV_INDEX:
+                       if (len < sizeof(ibindex))
+                               goto error;
+                       /* Payload alignment is not guaranteed, copy it. */
+                       memcpy(&ibindex, payload, sizeof(ibindex));
+                       break;
+               case RDMA_NLDEV_ATTR_DEV_NAME:
+                       /* The name must be terminated inside its payload. */
+                       if (!memchr(payload, '\0', len))
+                               goto error;
+                       if (!strcmp(payload, data->name))
+                               found = 1;
+                       break;
+               case RDMA_NLDEV_ATTR_NDEV_INDEX:
+                       if (len < sizeof(ifindex))
+                               goto error;
+                       memcpy(&ifindex, payload, sizeof(ifindex));
+                       break;
+               default:
+                       break;
+               }
+               off += NLA_ALIGN(na->nla_len);
+       }
+       /* Attribute order is not assumed, hence the late commit. */
+       if (found) {
+               data->ibindex = ibindex;
+               data->ifindex = ifindex;
+       }
+       return 0;
+error:
+       rte_errno = EINVAL;
+       return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * Two requests are issued on the socket: a dump of all IB devices to find
+ * the index of the one called @p name, then a port query (port 1) on that
+ * device to retrieve the associated network interface index.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ *
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name)
+{
+       static const uint32_t pindex = 1;
+       uint32_t seq = random();
+       struct mlx5_nl_ifindex_data data = {
+               .name = name,
+               /*
+                * Determined during first pass. Zero cannot be used as the
+                * "not found" marker since it may be a legitimate IB device
+                * index, use an out of range sentinel instead.
+                */
+               .ibindex = UINT32_MAX,
+               .ifindex = 0, /* Determined during second pass. */
+       };
+       union {
+               struct nlmsghdr nh;
+               uint8_t buf[NLMSG_HDRLEN +
+                           NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+                           NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
+       } req = {
+               .nh = {
+                       .nlmsg_len = NLMSG_LENGTH(0),
+                       .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                                      RDMA_NLDEV_CMD_GET),
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+               },
+       };
+       struct nlattr *na;
+       int ret;
+
+       /* First pass: dump IB devices and look for the requested name. */
+       ret = mlx5_nl_send(nl, &req.nh, seq);
+       if (ret < 0)
+               return 0;
+       ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+       if (ret < 0)
+               return 0;
+       if (data.ibindex == UINT32_MAX)
+               goto error;
+       /* Second pass: query the port of that device, reusing the buffer. */
+       ++seq;
+       req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                            RDMA_NLDEV_CMD_PORT_GET);
+       req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
+       na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
+       na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+       na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
+       memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+              &data.ibindex, sizeof(data.ibindex));
+       na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
+       na->nla_len = NLA_HDRLEN + sizeof(pindex);
+       na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
+       memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+              &pindex, sizeof(pindex));
+       ret = mlx5_nl_send(nl, &req.nh, seq);
+       if (ret < 0)
+               return 0;
+       ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+       if (ret < 0)
+               return 0;
+       /* Interface index 0 is what this function returns on failure. */
+       if (!data.ifindex)
+               goto error;
+       return data.ifindex;
+error:
+       rte_errno = ENODEV;
+       return 0;
+}
+
+/**
+ * Process switch information from Netlink message.
+ *
+ * Walks the attributes following the struct ifinfomsg of a RTM_NEWLINK
+ * message, looking for IFLA_PHYS_SWITCH_ID and IFLA_PHYS_PORT_NAME. An
+ * interface with a switch ID is reported as master when it has no port name
+ * and as representor when it has one.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback (struct mlx5_switch_info),
+ *   entirely overwritten on success and left untouched on failure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set
+ *   (EINVAL for an unexpected message type, a malformed attribute or a
+ *   port name that is not an integer fitting in 32 bits).
+ */
+static int
+mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
+{
+       struct mlx5_switch_info info = {
+               .master = 0,
+               .representor = 0,
+               .port_name = 0,
+               .switch_id = 0,
+       };
+       size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+       bool port_name_set = false;
+       bool switch_id_set = false;
+
+       if (nh->nlmsg_type != RTM_NEWLINK)
+               goto error;
+       while (off < nh->nlmsg_len) {
+               struct rtattr *ra = (void *)((uintptr_t)nh + off);
+               void *payload = RTA_DATA(ra);
+               size_t left = nh->nlmsg_len - off;
+               size_t len;
+               size_t i;
+               long value;
+               char *end;
+
+               /*
+                * Reject truncated or undersized attributes. A zero length
+                * would never advance the offset (endless loop) and a length
+                * below the header size yields a negative payload size.
+                */
+               if (left < sizeof(*ra) ||
+                   ra->rta_len < sizeof(*ra) ||
+                   ra->rta_len > left)
+                       goto error;
+               len = RTA_PAYLOAD(ra);
+               switch (ra->rta_type) {
+               case IFLA_PHYS_PORT_NAME:
+                       /* The name must be terminated inside its payload. */
+                       if (!memchr(payload, '\0', len))
+                               goto error;
+                       errno = 0;
+                       value = strtol(payload, &end, 0);
+                       /* Whole string must parse and fit in port_name. */
+                       if (errno || *end != '\0' ||
+                           value < INT32_MIN || value > INT32_MAX)
+                               goto error;
+                       info.port_name = value;
+                       port_name_set = true;
+                       break;
+               case IFLA_PHYS_SWITCH_ID:
+                       /*
+                        * Fold ID bytes, first byte most significant. Only
+                        * the last 8 bytes survive for longer identifiers.
+                        */
+                       info.switch_id = 0;
+                       for (i = 0; i < len; ++i) {
+                               info.switch_id <<= 8;
+                               info.switch_id |= ((uint8_t *)payload)[i];
+                       }
+                       switch_id_set = true;
+                       break;
+               default:
+                       break;
+               }
+               off += RTA_ALIGN(ra->rta_len);
+       }
+       info.master = switch_id_set && !port_name_set;
+       info.representor = switch_id_set && port_name_set;
+       memcpy(arg, &info, sizeof(info));
+       return 0;
+error:
+       rte_errno = EINVAL;
+       return -rte_errno;
+}
+
+/**
+ * Query switch information for a network interface.
+ *
+ * Sends a single RTM_GETLINK request for @p ifindex and lets
+ * mlx5_nl_switch_info_cb() decode the reply.
+ *
+ * @param nl
+ *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
+ * @param ifindex
+ *   Network interface index.
+ * @param[out] info
+ *   Switch information object, populated in case of success.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
+{
+       uint32_t sn = random();
+       struct {
+               struct nlmsghdr nh;
+               struct ifinfomsg ifm;
+       } msg = {
+               .nh = {
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+                       .nlmsg_type = RTM_GETLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+               },
+               .ifm = {
+                       .ifi_family = AF_UNSPEC,
+                       .ifi_index = ifindex,
+               },
+       };
+       int ret = mlx5_nl_send(nl, &msg.nh, sn);
+
+       /* Nothing to receive when the request could not be sent. */
+       if (ret < 0)
+               return ret;
+       return mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
+}