common/mlx5: share Netlink commands
author: Matan Azrad <matan@mellanox.com>
Wed, 29 Jan 2020 12:38:49 +0000 (12:38 +0000)
committer: Ferruh Yigit <ferruh.yigit@intel.com>
Wed, 5 Feb 2020 08:51:20 +0000 (09:51 +0100)
Move Netlink mechanism and its dependencies from net/mlx5 to
common/mlx5 in order to be ready to use by other mlx5 drivers.

The dependencies are BITFIELD defines, the ppc64 compilation workaround
for bool type and the function mlx5_translate_port_name.

Update build mechanism accordingly.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
15 files changed:
drivers/common/mlx5/Makefile
drivers/common/mlx5/meson.build
drivers/common/mlx5/mlx5_common.c
drivers/common/mlx5/mlx5_common.h
drivers/common/mlx5/mlx5_nl.c [new file with mode: 0644]
drivers/common/mlx5/mlx5_nl.h [new file with mode: 0644]
drivers/common/mlx5/rte_common_mlx5_version.map
drivers/net/mlx5/Makefile
drivers/net/mlx5/meson.build
drivers/net/mlx5/mlx5.h
drivers/net/mlx5/mlx5_defs.h
drivers/net/mlx5/mlx5_ethdev.c
drivers/net/mlx5/mlx5_nl.c [deleted file]
drivers/net/mlx5/mlx5_nl.h [deleted file]
drivers/net/mlx5/mlx5_vlan.c

index b9e9803..6a14b7d 100644 (file)
@@ -15,6 +15,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_glue.c
 endif
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_devx_cmds.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_common.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
 
 ifeq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y)
 INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
@@ -41,7 +42,7 @@ else
 LDLIBS += -libverbs -lmlx5
 endif
 
-LDLIBS += -lrte_eal -lrte_pci -lrte_kvargs
+LDLIBS += -lrte_eal -lrte_pci -lrte_kvargs -lrte_net
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual -DNDEBUG -UPEDANTIC
index b88822e..34cb7b9 100644 (file)
@@ -42,6 +42,7 @@ if build
        sources = files(
                'mlx5_devx_cmds.c',
                'mlx5_common.c',
+               'mlx5_nl.c',
        )
        if not pmd_dlopen
                sources += files('mlx5_glue.c')
index 57d72b4..99d15cd 100644 (file)
@@ -105,6 +105,61 @@ mlx5_class_get(struct rte_devargs *devargs)
        return ret;
 }
 
+/**
+ * Extract port name, as a number, from sysfs or netlink information.
+ *
+ * Three formats are tried in order: "pf<N>vf<M>", "p<N>" and a plain
+ * number (any base accepted by strtol() with base 0).
+ *
+ * @param[in] port_name_in
+ *   String representing the port name.
+ * @param[out] port_info_out
+ *   Port information. The name_type field is always set. The port_name
+ *   field (and pf_num for the pf<N>vf<M> format) is written only when the
+ *   format is recognized; otherwise these fields are left untouched.
+ */
+void
+mlx5_translate_port_name(const char *port_name_in,
+                        struct mlx5_switch_info *port_info_out)
+{
+       char pf_c1, pf_c2, vf_c1, vf_c2;
+       char *end;
+       int pf_num;
+       int port_num;
+       long legacy_num;
+       int sc_items;
+
+       /*
+        * Check for port-name as a string of the form pf0vf0
+        * (support kernel ver >= 5.0 or OFED ver >= 4.6).
+        * Values are scanned into locals so that a partial match does not
+        * clobber the output structure, and so that "%d" is always given
+        * a pointer to int.
+        */
+       sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
+                         &pf_c1, &pf_c2, &pf_num,
+                         &vf_c1, &vf_c2, &port_num);
+       if (sc_items == 6 &&
+           pf_c1 == 'p' && pf_c2 == 'f' &&
+           vf_c1 == 'v' && vf_c2 == 'f') {
+               port_info_out->pf_num = pf_num;
+               port_info_out->port_name = port_num;
+               port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
+               return;
+       }
+       /*
+        * Check for port-name as a string of the form p0
+        * (support kernel ver >= 5.0, or OFED ver >= 4.6).
+        */
+       sc_items = sscanf(port_name_in, "%c%d", &pf_c1, &port_num);
+       if (sc_items == 2 && pf_c1 == 'p') {
+               port_info_out->port_name = port_num;
+               port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
+               return;
+       }
+       /*
+        * Check for port-name as a number (support kernel ver < 5.0).
+        * The whole string must be consumed and at least one character
+        * must have been converted, so an empty string is not a number.
+        */
+       errno = 0;
+       legacy_num = strtol(port_name_in, &end, 0);
+       if (!errno && end != port_name_in && *end == '\0') {
+               port_info_out->port_name = (int32_t)legacy_num;
+               port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
+               return;
+       }
+       port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
+}
+
 #ifdef RTE_IBVERBS_LINK_DLOPEN
 
 /**
index 2988f4b..d9c2d26 100644 (file)
 #include "mlx5_prm.h"
 
 
+/*
+ * Compilation workaround for PPC64 when AltiVec is fully enabled, e.g. std=c11.
+ * Otherwise there would be a type conflict between stdbool and altivec.
+ * Applied only when __PPC64__ is defined and __APPLE_ALTIVEC__ is not.
+ */
+#if defined(__PPC64__) && !defined(__APPLE_ALTIVEC__)
+#undef bool
+/* redefine as in stdbool.h */
+#define bool _Bool
+#endif
+
+/*
+ * Bit-field manipulation.
+ *
+ * BITFIELD_DECLARE()/BITFIELD_DEFINE() create an array of "type" elements
+ * large enough to hold "size" bits. BITFIELD_SET(), BITFIELD_RESET() and
+ * BITFIELD_ISSET() operate on bit "b" of bit-field "bf".
+ *
+ * "bf" may be the array itself or a pointer to its first element, so the
+ * accessors cannot derive the capacity from sizeof(bf): no bound check is
+ * done here and callers must make sure "b" is within the declared size.
+ * The mask is built from a 64-bit constant so that shifting stays defined
+ * for 64-bit elements on targets where size_t is 32 bits wide.
+ * "b" is evaluated more than once.
+ */
+#define BITFIELD_DECLARE(bf, type, size) \
+       type bf[(((size_t)(size) / (sizeof(type) * CHAR_BIT)) + \
+                !!((size_t)(size) % (sizeof(type) * CHAR_BIT)))]
+#define BITFIELD_DEFINE(bf, type, size) \
+       BITFIELD_DECLARE((bf), type, (size)) = { 0 }
+#define BITFIELD_SET(bf, b) \
+       ((void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] |= \
+               ((uint64_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))
+#define BITFIELD_RESET(bf, b) \
+       ((void)((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] &= \
+               ~((uint64_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT)))))
+#define BITFIELD_ISSET(bf, b) \
+       (!!(((bf)[((b) / (sizeof((bf)[0]) * CHAR_BIT))] & \
+            ((uint64_t)1 << ((b) % (sizeof((bf)[0]) * CHAR_BIT))))))
+
 /*
  * Helper macros to work around __VA_ARGS__ limitations in a C99 compliant
  * manner.
@@ -112,6 +141,33 @@ enum {
        PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF = 0x101e,
 };
 
+/* Maximum number of simultaneous unicast MAC addresses. */
+#define MLX5_MAX_UC_MAC_ADDRESSES 128
+/* Maximum number of simultaneous multicast MAC addresses. */
+#define MLX5_MAX_MC_MAC_ADDRESSES 128
+/* Maximum number of simultaneous MAC addresses (unicast plus multicast). */
+#define MLX5_MAX_MAC_ADDRESSES \
+       (MLX5_MAX_UC_MAC_ADDRESSES + MLX5_MAX_MC_MAC_ADDRESSES)
+
+/*
+ * Recognized Infiniband device physical port name types, as reported by
+ * mlx5_translate_port_name().
+ */
+enum mlx5_nl_phys_port_name_type {
+       MLX5_PHYS_PORT_NAME_TYPE_NOTSET = 0, /* Not set. */
+       MLX5_PHYS_PORT_NAME_TYPE_LEGACY, /* Plain number, kernel ver < 5.0 */
+       MLX5_PHYS_PORT_NAME_TYPE_UPLINK, /* p0, kernel ver >= 5.0 */
+       MLX5_PHYS_PORT_NAME_TYPE_PFVF, /* pf0vf0, kernel ver >= 5.0 */
+       MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN, /* Unrecognized. */
+};
+
+/** Switch information returned by mlx5_nl_switch_info(). */
+struct mlx5_switch_info {
+       uint32_t master:1; /**< Master device. */
+       uint32_t representor:1; /**< Representor device. */
+       enum mlx5_nl_phys_port_name_type name_type; /**< Port name type. */
+       int32_t pf_num; /**< PF number (valid for pfxvfx format only). */
+       int32_t port_name; /**< Representor port name. */
+       uint64_t switch_id; /**< Switch identifier. */
+};
+
 /* CQE status. */
 enum mlx5_cqe_status {
        MLX5_CQE_STATUS_SW_OWN = -1,
@@ -159,6 +215,9 @@ enum mlx5_class {
        MLX5_CLASS_VDPA,
        MLX5_CLASS_INVALID,
 };
+
 enum mlx5_class mlx5_class_get(struct rte_devargs *devargs);
+void mlx5_translate_port_name(const char *port_name_in,
+                             struct mlx5_switch_info *port_info_out);
 
 #endif /* RTE_PMD_MLX5_COMMON_H_ */
diff --git a/drivers/common/mlx5/mlx5_nl.c b/drivers/common/mlx5/mlx5_nl.c
new file mode 100644 (file)
index 0000000..3162743
--- /dev/null
@@ -0,0 +1,1336 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <errno.h>
+#include <linux/if_link.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <rdma/rdma_netlink.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdalign.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_errno.h>
+#include <rte_atomic.h>
+
+#include "mlx5_nl.h"
+#include "mlx5_common_utils.h"
+
+/* Size of the buffer to receive kernel messages */
+#define MLX5_NL_BUF_SIZE (32 * 1024)
+/* Send buffer size for the Netlink socket (SO_SNDBUF in mlx5_nl_init()). */
+#define MLX5_SEND_BUF_SIZE 32768
+/*
+ * Receive buffer size for the Netlink socket (SO_RCVBUF in mlx5_nl_init()),
+ * also the size of the on-stack buffer used by mlx5_nl_recv().
+ */
+#define MLX5_RECV_BUF_SIZE 32768
+
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
+/*
+ * Define MLX5_NDA_RTA like NDA_RTA is defined in iproute2 sources: it yields
+ * the first attribute located after an aligned struct ndmsg at address r.
+ *
+ * see in iproute2 sources file include/libnetlink.h
+ */
+#ifndef MLX5_NDA_RTA
+#define MLX5_NDA_RTA(r) \
+       ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
+#endif
+/*
+ * Define NLMSG_TAIL as defined in iproute2 sources: it yields the address
+ * located nlmsg_len bytes (rounded up by NLMSG_ALIGN) after the start of nmsg.
+ *
+ * see in iproute2 sources file include/libnetlink.h
+ */
+#ifndef NLMSG_TAIL
+#define NLMSG_TAIL(nmsg) \
+       ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+#endif
+/*
+ * The following definitions are normally found in rdma/rdma_netlink.h,
+ * however they are so recent that most systems do not expose them yet.
+ * Each fallback value below is used only when the matching HAVE_* macro
+ * is not defined.
+ */
+#ifndef HAVE_RDMA_NL_NLDEV
+#define RDMA_NL_NLDEV 5
+#endif
+#ifndef HAVE_RDMA_NLDEV_CMD_GET
+#define RDMA_NLDEV_CMD_GET 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
+#define RDMA_NLDEV_CMD_PORT_GET 5
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
+#define RDMA_NLDEV_ATTR_DEV_INDEX 1
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
+#define RDMA_NLDEV_ATTR_DEV_NAME 2
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
+#define RDMA_NLDEV_ATTR_PORT_INDEX 3
+#endif
+#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
+#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
+#endif
+
+/*
+ * These are normally found in linux/if_link.h. As above, each fallback is
+ * used only when the matching HAVE_* macro is not defined.
+ */
+#ifndef HAVE_IFLA_NUM_VF
+#define IFLA_NUM_VF 21
+#endif
+#ifndef HAVE_IFLA_EXT_MASK
+#define IFLA_EXT_MASK 29
+#endif
+#ifndef HAVE_IFLA_PHYS_SWITCH_ID
+#define IFLA_PHYS_SWITCH_ID 36
+#endif
+#ifndef HAVE_IFLA_PHYS_PORT_NAME
+#define IFLA_PHYS_PORT_NAME 38
+#endif
+
+/*
+ * Context handed to mlx5_nl_mac_addr_cb(), which appends every MAC address
+ * found in the parsed Netlink messages to the array.
+ */
+struct mlx5_nl_mac_addr {
+       struct rte_ether_addr (*mac)[];
+       /**< Array receiving the MAC addresses handled by the device. */
+       int mac_n; /**< Number of addresses stored in the array so far. */
+};
+
+/* Bits recorded in the flags field of struct mlx5_nl_ifindex_data. */
+#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
+#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
+#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
+#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
+
+/** Data structure used by mlx5_nl_cmdget_cb(). */
+struct mlx5_nl_ifindex_data {
+       const char *name; /**< IB device name (in). */
+       uint32_t flags; /**< found attribute flags (out). */
+       uint32_t ibindex; /**< IB device index (out). */
+       uint32_t ifindex; /**< Network interface index (out). */
+       uint32_t portnum; /**< IB device max port number (out). */
+};
+
+/*
+ * Counter the Netlink sequence numbers are drawn from. It is only used
+ * through MLX5_NL_SN_GENERATE in this file, hence the internal linkage:
+ * a generic name such as "atomic_sn" must not leak into the global
+ * namespace.
+ */
+static rte_atomic32_t atomic_sn = RTE_ATOMIC32_INIT(0);
+
+/* Generate Netlink sequence number. */
+#define MLX5_NL_SN_GENERATE ((uint32_t)rte_atomic32_add_return(&atomic_sn, 1))
+
+/**
+ * Create a Netlink socket, size its buffers and bind it.
+ *
+ * @param protocol
+ *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
+ *
+ * @return
+ *   A file descriptor on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int
+mlx5_nl_init(int protocol)
+{
+       struct sockaddr_nl local = {
+               .nl_family = AF_NETLINK,
+       };
+       int sndbuf_size = MLX5_SEND_BUF_SIZE;
+       int rcvbuf_size = MLX5_RECV_BUF_SIZE;
+       int fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
+
+       if (fd == -1) {
+               rte_errno = errno;
+               return -rte_errno;
+       }
+       /* The first failing step stops the chain, errno still describes it. */
+       if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size,
+                      sizeof(int)) == -1 ||
+           setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size,
+                      sizeof(int)) == -1 ||
+           bind(fd, (struct sockaddr *)&local, sizeof(local)) == -1) {
+               rte_errno = errno;
+               close(fd);
+               return -rte_errno;
+       }
+       return fd;
+}
+
+/**
+ * Send a request made of a Netlink header and a separate payload to the
+ * kernel on the Netlink socket.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] nh
+ *   The Netlink message header sent to the kernel; its nlmsg_pid and
+ *   nlmsg_seq fields are overwritten.
+ * @param[in] sn
+ *   Sequence number.
+ * @param[in] req
+ *   Pointer to the request structure, sent right after the header.
+ * @param[in] len
+ *   Length of the request in bytes.
+ *
+ * @return
+ *   The number of sent bytes on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+static int
+mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
+               int len)
+{
+       struct sockaddr_nl kernel = {
+               .nl_family = AF_NETLINK,
+       };
+       struct iovec parts[2] = {
+               { .iov_base = nh, .iov_len = sizeof(*nh), },
+               { .iov_base = req, .iov_len = len, },
+       };
+       struct msghdr msg = {
+               .msg_name = &kernel,
+               .msg_namelen = sizeof(kernel),
+               .msg_iov = parts,
+               .msg_iovlen = sizeof(parts) / sizeof(parts[0]),
+       };
+       int sent;
+
+       nh->nlmsg_seq = sn;
+       nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+       sent = sendmsg(nlsk_fd, &msg, 0);
+       if (sent >= 0)
+               return sent;
+       rte_errno = errno;
+       return -rte_errno;
+}
+
+/**
+ * Send a complete, contiguous Netlink message to the kernel.
+ *
+ * @param[in] nlsk_fd
+ *   The Netlink socket file descriptor used for communication.
+ * @param[in] nh
+ *   The Netlink message sent to the kernel; nlmsg_len bytes starting at
+ *   this header are transmitted, nlmsg_pid and nlmsg_seq are overwritten.
+ * @param[in] sn
+ *   Sequence number.
+ *
+ * @return
+ *   The number of sent bytes on success, a negative errno value otherwise and
+ *   rte_errno is set.
+ */
+static int
+mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
+{
+       struct sockaddr_nl kernel = {
+               .nl_family = AF_NETLINK,
+       };
+       struct iovec whole = {
+               .iov_base = nh,
+               .iov_len = nh->nlmsg_len,
+       };
+       struct msghdr msg = {
+               .msg_name = &kernel,
+               .msg_namelen = sizeof(kernel),
+               .msg_iov = &whole,
+               .msg_iovlen = 1,
+       };
+       int sent;
+
+       nh->nlmsg_seq = sn;
+       nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+       sent = sendmsg(nlsk_fd, &msg, 0);
+       if (sent >= 0)
+               return sent;
+       rte_errno = errno;
+       return -rte_errno;
+}
+
+/**
+ * Receive a message from the kernel on the Netlink socket, following
+ * mlx5_nl_send().
+ *
+ * Datagrams whose first header does not carry the expected sequence number
+ * are dropped. Reception interrupted by a signal (EINTR) is retried. A
+ * datagram too short to hold a Netlink header is reported as EBADMSG
+ * instead of being parsed.
+ *
+ * @param[in] nlsk_fd
+ *   The Netlink socket file descriptor used for communication.
+ * @param[in] sn
+ *   Sequence number.
+ * @param[in] cb
+ *   The callback function to call for each Netlink message received,
+ *   may be NULL. A negative return value stops the reception.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
+            void *arg)
+{
+       struct sockaddr_nl sa;
+       char buf[MLX5_RECV_BUF_SIZE];
+       struct iovec iov = {
+               .iov_base = buf,
+               .iov_len = sizeof(buf),
+       };
+       struct msghdr msg = {
+               .msg_name = &sa,
+               .msg_namelen = sizeof(sa),
+               .msg_iov = &iov,
+               /* One message at a time */
+               .msg_iovlen = 1,
+       };
+       int multipart = 0;
+       int ret = 0;
+
+       do {
+               struct nlmsghdr *nh = (struct nlmsghdr *)buf;
+               int recv_bytes;
+
+               /* Wait for a datagram answering this request. */
+               for (;;) {
+                       /* recvmsg() updates msg_namelen, re-arm it. */
+                       msg.msg_namelen = sizeof(sa);
+                       recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+                       if (recv_bytes == -1) {
+                               if (errno == EINTR)
+                                       continue;
+                               rte_errno = errno;
+                               return -rte_errno;
+                       }
+                       /* Never read a header that was not received. */
+                       if (recv_bytes < (int)sizeof(*nh)) {
+                               rte_errno = EBADMSG;
+                               return -rte_errno;
+                       }
+                       if (nh->nlmsg_seq == sn)
+                               break;
+               }
+               for (;
+                    NLMSG_OK(nh, (unsigned int)recv_bytes);
+                    nh = NLMSG_NEXT(nh, recv_bytes)) {
+                       if (nh->nlmsg_type == NLMSG_ERROR) {
+                               struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+                               if (err_data->error < 0) {
+                                       rte_errno = -err_data->error;
+                                       return -rte_errno;
+                               }
+                               /* Ack message. */
+                               return 0;
+                       }
+                       /* Multi-part msgs and their trailing DONE message. */
+                       if (nh->nlmsg_flags & NLM_F_MULTI) {
+                               if (nh->nlmsg_type == NLMSG_DONE)
+                                       return 0;
+                               multipart = 1;
+                       }
+                       if (cb) {
+                               ret = cb(nh, arg);
+                               if (ret < 0)
+                                       return ret;
+                       }
+               }
+       } while (multipart);
+       return ret;
+}
+
+/**
+ * Netlink callback collecting bridge MAC addresses.
+ *
+ * Every NDA_LLADDR attribute of the message is appended to the array
+ * referenced by the callback context.
+ *
+ * @param nh
+ *   Pointer to Netlink Message Header.
+ * @param arg
+ *   Pointer to the struct mlx5_nl_mac_addr registered with this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
+{
+       struct mlx5_nl_mac_addr *ctx = arg;
+       struct ndmsg *ndm = NLMSG_DATA(nh);
+       struct rtattr *rta;
+       int remaining = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*ndm));
+
+       for (rta = MLX5_NDA_RTA(ndm);
+            RTA_OK(rta, remaining);
+            rta = RTA_NEXT(rta, remaining)) {
+               if (rta->rta_type != NDA_LLADDR)
+                       continue;
+               /* The array is taken to hold MLX5_MAX_MAC_ADDRESSES entries. */
+               if (ctx->mac_n == MLX5_MAX_MAC_ADDRESSES) {
+                       DRV_LOG(WARNING,
+                               "not enough room to finalize the"
+                               " request");
+                       rte_errno = ENOMEM;
+                       return -rte_errno;
+               }
+#ifndef NDEBUG
+               char m[18];
+
+               rte_ether_format_addr(m, 18, RTA_DATA(rta));
+               DRV_LOG(DEBUG, "bridge MAC address %s", m);
+#endif
+               memcpy(&(*ctx->mac)[ctx->mac_n++],
+                      RTA_DATA(rta), RTE_ETHER_ADDR_LEN);
+       }
+       return 0;
+}
+
+/**
+ * Get bridge MAC addresses.
+ *
+ * Sends an RTM_GETNEIGH dump request for the interface and gathers the
+ * answers through mlx5_nl_mac_addr_cb().
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor. When it is -1 nothing is done, 0 is
+ *   returned and @p mac_n is left untouched.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param[out] mac
+ *   Pointer to the array table of MAC addresses to fill.
+ *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
+ * @param[out] mac_n
+ *   Number of entries filled in MAC array.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
+                     struct rte_ether_addr (*mac)[], int *mac_n)
+{
+       struct {
+               struct nlmsghdr hdr;
+               struct ifinfomsg ifm;
+       } req = {
+               .hdr = {
+                       .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+                       .nlmsg_type = RTM_GETNEIGH,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+               },
+               .ifm = {
+                       .ifi_index = iface_idx,
+                       .ifi_family = PF_BRIDGE,
+               },
+       };
+       struct mlx5_nl_mac_addr ctx = {
+               .mac_n = 0,
+               .mac = mac,
+       };
+       const uint32_t seq = MLX5_NL_SN_GENERATE;
+       int rc;
+
+       if (nlsk_fd == -1)
+               return 0;
+       rc = mlx5_nl_request(nlsk_fd, &req.hdr, seq, &req.ifm,
+                            sizeof(req.ifm));
+       if (rc >= 0)
+               rc = mlx5_nl_recv(nlsk_fd, seq, mlx5_nl_mac_addr_cb, &ctx);
+       if (rc >= 0) {
+               *mac_n = ctx.mac_n;
+               return 0;
+       }
+       DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
+               iface_idx, strerror(rte_errno));
+       return -rte_errno;
+}
+
+/**
+ * Add or remove an entry of the MAC address neighbour table with Netlink.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor. When it is -1 nothing is done and 0 is
+ *   returned.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param mac
+ *   MAC address to consider.
+ * @param add
+ *   1 to add the MAC address, 0 to remove the MAC address.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
+                       struct rte_ether_addr *mac, int add)
+{
+       struct {
+               struct nlmsghdr hdr;
+               struct ndmsg ndm;
+               struct rtattr rta;
+               uint8_t buffer[RTE_ETHER_ADDR_LEN];
+       } req = {
+               .hdr = {
+                       .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+                               NLM_F_EXCL | NLM_F_ACK,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
+               },
+               .ndm = {
+                       .ndm_ifindex = iface_idx,
+                       .ndm_family = PF_BRIDGE,
+                       .ndm_flags = NTF_SELF,
+                       .ndm_state = NUD_NOARP | NUD_PERMANENT,
+               },
+               .rta = {
+                       .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
+                       .rta_type = NDA_LLADDR,
+               },
+       };
+       const uint32_t seq = MLX5_NL_SN_GENERATE;
+       int rc;
+
+       if (nlsk_fd == -1)
+               return 0;
+       /* The attribute payload is the buffer member following the rta. */
+       memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
+       req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
+               RTA_ALIGN(req.rta.rta_len);
+       rc = mlx5_nl_send(nlsk_fd, &req.hdr, seq);
+       if (rc >= 0)
+               rc = mlx5_nl_recv(nlsk_fd, seq, NULL, NULL);
+       if (rc >= 0)
+               return 0;
+       DRV_LOG(DEBUG,
+               "Interface %u cannot %s MAC address"
+               " %02X:%02X:%02X:%02X:%02X:%02X %s",
+               iface_idx,
+               add ? "add" : "remove",
+               mac->addr_bytes[0], mac->addr_bytes[1],
+               mac->addr_bytes[2], mac->addr_bytes[3],
+               mac->addr_bytes[4], mac->addr_bytes[5],
+               strerror(rte_errno));
+       return -rte_errno;
+}
+
+/**
+ * Modify the VF MAC address neighbour table with Netlink.
+ *
+ * The request nests IFLA_VF_MAC inside IFLA_VF_INFO inside
+ * IFLA_VFINFO_LIST and waits for the kernel acknowledgment.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor. A negative descriptor is rejected
+ *   with EBADF before any request is built.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param mac
+ *    MAC address to consider.
+ * @param vf_index
+ *    VF index.
+ *
+ * @return
+ *    0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
+                          struct rte_ether_addr *mac, int vf_index)
+{
+       int ret;
+       struct {
+               struct nlmsghdr hdr;
+               struct ifinfomsg ifm;
+               struct rtattr vf_list_rta;
+               struct rtattr vf_info_rta;
+               struct rtattr vf_mac_rta;
+               struct ifla_vf_mac ivm;
+       } req = {
+               .hdr = {
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+                       /*
+                        * NOTE(review): RTM_BASE presumably aliases
+                        * RTM_NEWLINK - confirm against linux/rtnetlink.h.
+                        */
+                       .nlmsg_type = RTM_BASE,
+               },
+               .ifm = {
+                       .ifi_index = iface_idx,
+               },
+               .vf_list_rta = {
+                       .rta_type = IFLA_VFINFO_LIST,
+                       .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
+               },
+               .vf_info_rta = {
+                       .rta_type = IFLA_VF_INFO,
+                       .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
+               },
+               .vf_mac_rta = {
+                       .rta_type = IFLA_VF_MAC,
+               },
+       };
+       struct ifla_vf_mac ivm = {
+               .vf = vf_index,
+       };
+       uint32_t sn;
+
+       /* Honour the documented contract: negative errno and rte_errno. */
+       if (nlsk_fd < 0) {
+               rte_errno = EBADF;
+               return -rte_errno;
+       }
+       sn = MLX5_NL_SN_GENERATE;
+       memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
+       /* The attribute payload is the ivm member following vf_mac_rta. */
+       memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
+
+       req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
+       req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
+               RTA_ALIGN(req.vf_list_rta.rta_len) +
+               RTA_ALIGN(req.vf_info_rta.rta_len) +
+               RTA_ALIGN(req.vf_mac_rta.rta_len);
+       /* Nested attributes span from their header to the message tail. */
+       req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
+                                              &req.vf_list_rta);
+       req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
+                                              &req.vf_info_rta);
+
+       ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
+       if (ret < 0)
+               goto error;
+       ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
+       if (ret < 0)
+               goto error;
+       return 0;
+error:
+       DRV_LOG(ERR,
+               "representor %u cannot set VF MAC address "
+               "%02X:%02X:%02X:%02X:%02X:%02X : %s",
+               vf_index,
+               mac->addr_bytes[0], mac->addr_bytes[1],
+               mac->addr_bytes[2], mac->addr_bytes[3],
+               mac->addr_bytes[4], mac->addr_bytes[5],
+               strerror(rte_errno));
+       return -rte_errno;
+}
+
+/**
+ * Add a MAC address.
+ *
+ * On success the bit @p index of @p mac_own is set. An address the kernel
+ * reports as already present (EEXIST) is not an error, but the bit is then
+ * left untouched.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param mac_own
+ *   BITFIELD_DECLARE array to store the mac.
+ * @param mac
+ *   MAC address to register.
+ * @param index
+ *   MAC address index, must be lower than MLX5_MAX_MAC_ADDRESSES.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
+                    uint64_t *mac_own, struct rte_ether_addr *mac,
+                    uint32_t index)
+{
+       int ret;
+
+       /*
+        * mac_own is received as a pointer, its capacity cannot be derived
+        * here. Assumes it was declared for MLX5_MAX_MAC_ADDRESSES bits -
+        * TODO confirm against callers.
+        */
+       if (index >= MLX5_MAX_MAC_ADDRESSES) {
+               rte_errno = EINVAL;
+               return -rte_errno;
+       }
+       ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
+       if (!ret)
+               BITFIELD_SET(mac_own, index);
+       if (ret == -EEXIST)
+               return 0;
+       return ret;
+}
+
+/**
+ * Remove a MAC address.
+ *
+ * The bit @p index of @p mac_own is cleared before the removal request is
+ * issued, whatever the outcome of that request.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param mac_own
+ *   BITFIELD_DECLARE array to store the mac.
+ * @param mac
+ *   MAC address to remove.
+ * @param index
+ *   MAC address index, must be lower than MLX5_MAX_MAC_ADDRESSES.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
+                       struct rte_ether_addr *mac, uint32_t index)
+{
+       /*
+        * mac_own is received as a pointer, its capacity cannot be derived
+        * here. Assumes it was declared for MLX5_MAX_MAC_ADDRESSES bits -
+        * TODO confirm against callers.
+        */
+       if (index >= MLX5_MAX_MAC_ADDRESSES) {
+               rte_errno = EINVAL;
+               return -rte_errno;
+       }
+       BITFIELD_RESET(mac_own, index);
+       return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
+}
+
+/**
+ * Synchronize Netlink bridge table to the internal table.
+ *
+ * Every bridge MAC address not yet present in @p mac_addrs is copied to
+ * the first all-zero entry of that array; addresses are silently dropped
+ * once no such entry is left.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param mac_addrs
+ *   Mac addresses array to sync.
+ * @param n
+ *   @p mac_addrs array size. Nothing is done when it is not positive.
+ */
+void
+mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
+                     struct rte_ether_addr *mac_addrs, int n)
+{
+       /*
+        * mlx5_nl_mac_addr_list() may store up to MLX5_MAX_MAC_ADDRESSES
+        * entries whatever the value of n, the scratch array must be
+        * sized accordingly and not from n.
+        */
+       struct rte_ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
+       int macs_n = 0;
+       int i;
+       int ret;
+
+       if (n <= 0)
+               return;
+       ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
+       if (ret)
+               return;
+       for (i = 0; i != macs_n; ++i) {
+               int j;
+
+               /* Verify the address is not in the array yet. */
+               for (j = 0; j != n; ++j)
+                       if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
+                               break;
+               if (j != n)
+                       continue;
+               /* Find the first entry available. */
+               for (j = 0; j != n; ++j) {
+                       if (rte_is_zero_ether_addr(&mac_addrs[j])) {
+                               mac_addrs[j] = macs[i];
+                               break;
+                       }
+               }
+       }
+}
+
+/**
+ * Flush all added MAC addresses.
+ *
+ * Entries are visited from the last one down to the first one; those whose
+ * bit is set in @p mac_own are removed through mlx5_nl_mac_addr_remove().
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param[in] mac_addrs
+ *   Mac addresses array to flush.
+ * @param n
+ *   @p mac_addrs array size.
+ * @param mac_own
+ *   BITFIELD_DECLARE array to store the mac.
+ */
+void
+mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
+                      struct rte_ether_addr *mac_addrs, int n,
+                      uint64_t *mac_own)
+{
+       int i = n - 1;
+
+       while (i >= 0) {
+               if (BITFIELD_ISSET(mac_own, i))
+                       mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own,
+                                               &mac_addrs[i], i);
+               --i;
+       }
+}
+
+/**
+ * Enable promiscuous / all multicast mode through Netlink.
+ *
+ * The RTM_NEWLINK request is only sent: no acknowledgment is requested nor
+ * read, so the result reflects the send operation alone.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor. When negative nothing is done and 0 is
+ *   returned.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param flags
+ *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
+ * @param enable
+ *   Nonzero to enable, disable otherwise.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
+                    int enable)
+{
+       struct {
+               struct nlmsghdr hdr;
+               struct ifinfomsg ifi;
+       } req = {
+               .hdr = {
+                       .nlmsg_flags = NLM_F_REQUEST,
+                       .nlmsg_type = RTM_NEWLINK,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+               },
+               .ifi = {
+                       .ifi_index = iface_idx,
+                       .ifi_change = flags,
+                       .ifi_flags = enable ? flags : 0,
+               },
+       };
+       const uint32_t seq = MLX5_NL_SN_GENERATE;
+       int rc;
+
+       assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
+       if (nlsk_fd < 0)
+               return 0;
+       /* The byte count returned by a successful send is of no interest. */
+       rc = mlx5_nl_send(nlsk_fd, &req.hdr, seq);
+       return rc < 0 ? rc : 0;
+}
+
+/**
+ * Enable or disable promiscuous mode through Netlink.
+ *
+ * Thin wrapper around mlx5_nl_device_flags() with IFF_PROMISC, logging
+ * a debug message on failure.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param enable
+ *   Nonzero to enable, disable otherwise.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
+{
+       const int rc = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC,
+                                           enable);
+
+       if (!rc)
+               return 0;
+       DRV_LOG(DEBUG,
+               "Interface %u cannot %s promisc mode: Netlink error %s",
+               iface_idx, enable ? "enable" : "disable",
+               strerror(rte_errno));
+       return rc;
+}
+
+/**
+ * Enable or disable all multicast mode through Netlink.
+ *
+ * Thin wrapper around mlx5_nl_device_flags() with IFF_ALLMULTI, logging
+ * a debug message on failure.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] iface_idx
+ *   Net device interface index.
+ * @param enable
+ *   Nonzero to enable, disable otherwise.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
+{
+       const int rc = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
+                                           enable);
+
+       if (!rc)
+               return 0;
+       DRV_LOG(DEBUG,
+               "Interface %u cannot %s allmulti : Netlink error %s",
+               iface_idx, enable ? "enable" : "disable",
+               strerror(rte_errno));
+       return rc;
+}
+
+/**
+ * Process network interface information from Netlink message.
+ *
+ * Walks the attributes of an RDMA_NLDEV_CMD_GET or RDMA_NLDEV_CMD_PORT_GET
+ * reply, collects the IB device index, net device index and port index,
+ * and publishes them to the query context only when the message carries
+ * the IB device name being looked up.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback; used as a
+ *   struct mlx5_nl_ifindex_data query context.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set
+ *   (EINVAL on unexpected message type or malformed attribute).
+ */
+static int
+mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
+{
+       struct mlx5_nl_ifindex_data *data = arg;
+       struct mlx5_nl_ifindex_data local = {
+               .flags = 0,
+       };
+       size_t off = NLMSG_HDRLEN;
+
+       if (nh->nlmsg_type !=
+           RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
+           nh->nlmsg_type !=
+           RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
+               goto error;
+       while (off < nh->nlmsg_len) {
+               struct nlattr *na = (void *)((uintptr_t)nh + off);
+               void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
+               size_t remaining = nh->nlmsg_len - off;
+
+               /*
+                * Reject attributes whose header does not fit in the
+                * message, that claim to be shorter than their own header
+                * (a zero length would never advance the offset and loop
+                * forever) or that overrun the message.
+                */
+               if (remaining < (size_t)NLA_HDRLEN ||
+                   na->nla_len < NLA_HDRLEN ||
+                   na->nla_len > remaining)
+                       goto error;
+               switch (na->nla_type) {
+               case RDMA_NLDEV_ATTR_DEV_INDEX:
+                       local.ibindex = *(uint32_t *)payload;
+                       local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
+                       break;
+               case RDMA_NLDEV_ATTR_DEV_NAME:
+                       if (!strcmp(payload, data->name))
+                               local.flags |= MLX5_NL_CMD_GET_IB_NAME;
+                       break;
+               case RDMA_NLDEV_ATTR_NDEV_INDEX:
+                       local.ifindex = *(uint32_t *)payload;
+                       local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
+                       break;
+               case RDMA_NLDEV_ATTR_PORT_INDEX:
+                       local.portnum = *(uint32_t *)payload;
+                       local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
+                       break;
+               default:
+                       break;
+               }
+               off += NLA_ALIGN(na->nla_len);
+       }
+       /*
+        * It is possible to have multiple messages for all
+        * Infiniband devices in the system with appropriate name.
+        * So we should gather parameters locally and copy to
+        * query context only in case of coinciding device name.
+        */
+       if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
+               data->flags = local.flags;
+               data->ibindex = local.ibindex;
+               data->ifindex = local.ifindex;
+               data->portnum = local.portnum;
+       }
+       return 0;
+error:
+       rte_errno = EINVAL;
+       return -rte_errno;
+}
+
+/**
+ * Get index of network interface associated with some IB device.
+ *
+ * This is the only somewhat safe method to avoid resorting to heuristics
+ * when faced with port representors. Unfortunately it requires at least
+ * Linux 4.17.
+ *
+ * The lookup is done in two passes over the same Netlink socket: a dump of
+ * all IB devices (RDMA_NLDEV_CMD_GET) to find the IB device index matching
+ * the name, then a RDMA_NLDEV_CMD_PORT_GET request for that device index
+ * and port to obtain the network interface index.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ * @param[in] pindex
+ *   IB device port index, starting from 1
+ * @return
+ *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
+ *   is set.
+ */
+unsigned int
+mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
+{
+       struct mlx5_nl_ifindex_data data = {
+               .name = name,
+               .flags = 0,
+               .ibindex = 0, /* Determined during first pass. */
+               .ifindex = 0, /* Determined during second pass. */
+       };
+       /* Request buffer sized for the header plus two u32 attributes. */
+       union {
+               struct nlmsghdr nh;
+               uint8_t buf[NLMSG_HDRLEN +
+                           NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
+                           NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
+       } req = {
+               .nh = {
+                       .nlmsg_len = NLMSG_LENGTH(0),
+                       .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                                      RDMA_NLDEV_CMD_GET),
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+               },
+       };
+       struct nlattr *na;
+       uint32_t sn = MLX5_NL_SN_GENERATE;
+       int ret;
+
+       /* First pass: header-only dump request, no attributes. */
+       ret = mlx5_nl_send(nl, &req.nh, sn);
+       if (ret < 0)
+               return 0;
+       ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+       if (ret < 0)
+               return 0;
+       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
+           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
+               goto error;
+       /* Second pass: reuse the buffer, now with a fresh sequence number. */
+       data.flags = 0;
+       sn = MLX5_NL_SN_GENERATE;
+       req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                            RDMA_NLDEV_CMD_PORT_GET);
+       req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+       req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
+       /* First attribute: IB device index found during the first pass. */
+       na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
+       na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
+       na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
+       memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+              &data.ibindex, sizeof(data.ibindex));
+       /* Second attribute: requested port index, placed right after. */
+       na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
+       na->nla_len = NLA_HDRLEN + sizeof(pindex);
+       na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
+       memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
+              &pindex, sizeof(pindex));
+       ret = mlx5_nl_send(nl, &req.nh, sn);
+       if (ret < 0)
+               return 0;
+       ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
+       if (ret < 0)
+               return 0;
+       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
+           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+           !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
+           !data.ifindex)
+               goto error;
+       return data.ifindex;
+error:
+       /* Reply was received but did not carry the expected attributes. */
+       rte_errno = ENODEV;
+       return 0;
+}
+
+/**
+ * Query how many physical ports the given IB device has.
+ *
+ * Dumps all IB devices (RDMA_NLDEV_CMD_GET) and lets mlx5_nl_cmdget_cb()
+ * pick the entry whose name matches.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ *
+ * @return
+ *   A valid (nonzero) number of ports on success, 0 otherwise
+ *   and rte_errno is set.
+ */
+unsigned int
+mlx5_nl_portnum(int nl, const char *name)
+{
+       /* Every one of these attributes must be present in the reply. */
+       const uint32_t required = MLX5_NL_CMD_GET_IB_NAME |
+                                 MLX5_NL_CMD_GET_IB_INDEX |
+                                 MLX5_NL_CMD_GET_PORT_INDEX;
+       struct mlx5_nl_ifindex_data data = {
+               .name = name,
+               .flags = 0,
+               .ifindex = 0,
+               .portnum = 0,
+       };
+       struct nlmsghdr req = {
+               .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+               .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                              RDMA_NLDEV_CMD_GET),
+               .nlmsg_len = NLMSG_LENGTH(0),
+       };
+       uint32_t seq = MLX5_NL_SN_GENERATE;
+
+       if (mlx5_nl_send(nl, &req, seq) < 0)
+               return 0;
+       if (mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data) < 0)
+               return 0;
+       if ((data.flags & required) != required) {
+               rte_errno = ENODEV;
+               return 0;
+       }
+       if (!data.portnum)
+               rte_errno = EINVAL;
+       return data.portnum;
+}
+
+/**
+ * Classify a port as E-Switch master or representor from the parameters
+ * gathered via Netlink.
+ *
+ * Nothing is returned; the master and representor flags of switch_info
+ * are updated in place according to the recognized port name type.
+ *
+ * @param[in] num_vf_set
+ *   flag of presence of number of VFs port attribute.
+ * @param[inout] switch_info
+ *   Port information, including port name as a number and port name
+ *   type if recognized
+ */
+static void
+mlx5_nl_check_switch_info(bool num_vf_set,
+                         struct mlx5_switch_info *switch_info)
+{
+       switch (switch_info->name_type) {
+       case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+       case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+               /*
+                * Name is either not recognized or not set (legacy naming
+                * schema for master): assume the master and decide on the
+                * presence of the number of VFs key.
+                */
+               switch_info->master = num_vf_set;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+               /* New uplink naming schema recognized. */
+               switch_info->master = 1;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+               /* Legacy representors naming schema. */
+               switch_info->representor = !num_vf_set;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+               /* New representors naming schema. */
+               switch_info->representor = 1;
+               break;
+       }
+}
+
+/**
+ * Process switch information from Netlink message.
+ *
+ * Parses the attributes of an RTM_NEWLINK reply (number of VFs presence,
+ * physical port name and physical switch ID), derives the master and
+ * representor flags when a switch ID is present and copies the result
+ * into the caller-provided object.
+ *
+ * @param nh
+ *   Pointer to Netlink message header.
+ * @param arg
+ *   Opaque data pointer for this callback; receives a copy of the
+ *   struct mlx5_switch_info built here.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set
+ *   (EINVAL on unexpected message type or malformed attribute).
+ */
+static int
+mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
+{
+       struct mlx5_switch_info info = {
+               .master = 0,
+               .representor = 0,
+               .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
+               .port_name = 0,
+               .switch_id = 0,
+       };
+       size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+       bool switch_id_set = false;
+       bool num_vf_set = false;
+
+       if (nh->nlmsg_type != RTM_NEWLINK)
+               goto error;
+       while (off < nh->nlmsg_len) {
+               struct rtattr *ra = (void *)((uintptr_t)nh + off);
+               void *payload = RTA_DATA(ra);
+               size_t remaining = nh->nlmsg_len - off;
+               unsigned int i;
+
+               /*
+                * Reject attributes whose header does not fit in the
+                * message, that are shorter than their own header (a zero
+                * length would loop forever and RTA_PAYLOAD() would
+                * underflow) or that overrun the message.
+                */
+               if (remaining < sizeof(struct rtattr) ||
+                   ra->rta_len < sizeof(struct rtattr) ||
+                   ra->rta_len > remaining)
+                       goto error;
+               switch (ra->rta_type) {
+               case IFLA_NUM_VF:
+                       num_vf_set = true;
+                       break;
+               case IFLA_PHYS_PORT_NAME:
+                       mlx5_translate_port_name((char *)payload, &info);
+                       break;
+               case IFLA_PHYS_SWITCH_ID:
+                       /* Fold the ID bytes into an integer, MSB first. */
+                       info.switch_id = 0;
+                       for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
+                               info.switch_id <<= 8;
+                               info.switch_id |= ((uint8_t *)payload)[i];
+                       }
+                       switch_id_set = true;
+                       break;
+               }
+               off += RTA_ALIGN(ra->rta_len);
+       }
+       if (switch_id_set) {
+               /* We have some E-Switch configuration. */
+               mlx5_nl_check_switch_info(num_vf_set, &info);
+       }
+       assert(!(info.master && info.representor));
+       memcpy(arg, &info, sizeof(info));
+       return 0;
+error:
+       rte_errno = EINVAL;
+       return -rte_errno;
+}
+
+/**
+ * Get switch information associated with network interface.
+ *
+ * Sends an RTM_GETLINK request with IFLA_EXT_MASK set for the given
+ * interface and lets mlx5_nl_switch_info_cb() fill the result.
+ *
+ * @param nl
+ *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
+ * @param ifindex
+ *   Network interface index.
+ * @param[out] info
+ *   Switch information object, populated in case of success. It is not
+ *   inspected when the Netlink exchange itself fails.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set
+ *   (ENODEV when the device is reported both as master and representor).
+ */
+int
+mlx5_nl_switch_info(int nl, unsigned int ifindex,
+                   struct mlx5_switch_info *info)
+{
+       struct {
+               struct nlmsghdr nh;
+               struct ifinfomsg info;
+               struct rtattr rta;
+               uint32_t extmask;
+       } req = {
+               .nh = {
+                       .nlmsg_len = NLMSG_LENGTH
+                                       (sizeof(req.info) +
+                                        RTA_LENGTH(sizeof(uint32_t))),
+                       .nlmsg_type = RTM_GETLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+               },
+               .info = {
+                       .ifi_family = AF_UNSPEC,
+                       .ifi_index = ifindex,
+               },
+               .rta = {
+                       .rta_type = IFLA_EXT_MASK,
+                       .rta_len = RTA_LENGTH(sizeof(uint32_t)),
+               },
+               .extmask = RTE_LE32(1),
+       };
+       uint32_t sn = MLX5_NL_SN_GENERATE;
+       int ret;
+
+       ret = mlx5_nl_send(nl, &req.nh, sn);
+       if (ret >= 0)
+               ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
+       /*
+        * On failure *info may never have been written by the callback,
+        * so do not read it and keep the original error code.
+        */
+       if (ret < 0)
+               return ret;
+       if (info->master && info->representor) {
+               DRV_LOG(ERR, "ifindex %u device is recognized as master"
+                            " and as representor", ifindex);
+               rte_errno = ENODEV;
+               ret = -rte_errno;
+       }
+       return ret;
+}
+
+/*
+ * Delete VLAN network device by ifindex.
+ *
+ * A zero ifindex is ignored. Failures are only logged as warnings,
+ * nothing is reported back to the caller.
+ *
+ * @param[in] vmwa
+ *   Context object initialized by mlx5_nl_vlan_vmwa_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete.
+ */
+void
+mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
+                     uint32_t ifindex)
+{
+       uint32_t seq = MLX5_NL_SN_GENERATE;
+       struct {
+               struct nlmsghdr nh;
+               struct ifinfomsg info;
+       } req = {
+               .info = {
+                       .ifi_index = ifindex,
+                       .ifi_family = AF_UNSPEC,
+               },
+               .nh = {
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+                       .nlmsg_type = RTM_DELLINK,
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+               },
+       };
+       int rc;
+
+       if (!ifindex)
+               return;
+       rc = mlx5_nl_send(vmwa->nl_socket, &req.nh, seq);
+       if (rc >= 0)
+               rc = mlx5_nl_recv(vmwa->nl_socket, seq, NULL, NULL);
+       if (rc < 0)
+               DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
+                       " ifindex %u, %d", ifindex, rc);
+}
+
+/* Set of subroutines to build Netlink message. */
+/* Return the first aligned position past the current end of the message. */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+       uint8_t *base = (uint8_t *)nlh;
+
+       return (struct nlattr *)(base + NLMSG_ALIGN(nlh->nlmsg_len));
+}
+
+/*
+ * Append one attribute at the tail of a Netlink message.
+ *
+ * The caller must provide a buffer large enough for the attribute; no
+ * bounds checking is done here. Padding bytes are not written, so the
+ * buffer is expected to be zeroed beforehand.
+ *
+ * @param nlh
+ *   Message being built, nlmsg_len is advanced past the new attribute.
+ * @param type
+ *   Attribute type.
+ * @param data
+ *   Attribute payload, may be NULL when alen is zero.
+ * @param alen
+ *   Payload length in bytes.
+ */
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+       struct nlattr *nla = nl_msg_tail(nlh);
+
+       nla->nla_type = type;
+       /*
+        * nla_len covers header and payload only; trailing padding is
+        * accounted for in the message length, not in the attribute.
+        */
+       nla->nla_len = NLA_HDRLEN + alen;
+       nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
+                        NLA_ALIGN(nla->nla_len);
+
+       if (alen)
+               memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
+}
+
+/*
+ * Open a nested attribute: emit an empty attribute of the given type and
+ * return it so that nl_attr_nest_end() can fix up its length later.
+ */
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+       struct nlattr *container = nl_msg_tail(nlh);
+
+       nl_attr_put(nlh, type, NULL, 0);
+       return container;
+}
+
+/* Close a nested attribute: its length spans up to the message tail. */
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{
+       uint8_t *first = (uint8_t *)nest;
+       uint8_t *last = (uint8_t *)nl_msg_tail(nlh);
+
+       nest->nla_len = last - first;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ *
+ * Builds an RTM_NEWLINK request for a "vlan" kind link on top of the base
+ * interface, named "<MLX5_VMWA_VLAN_DEVICE_PFX>.<ifindex>.<tag>". A Netlink
+ * failure is only logged; the device index is then looked up by name, so
+ * a pre-existing device with that name is also accepted.
+ *
+ * @param[in] vmwa
+ *   Context object initialized by mlx5_nl_vlan_vmwa_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ *
+ * @return
+ *   Interface index of the VLAN device, 0 if it could not be resolved.
+ */
+uint32_t
+mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
+                        uint32_t ifindex, uint16_t tag)
+{
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
+
+       /* Scratch buffer sized for the header, all attributes and slack. */
+       alignas(RTE_CACHE_LINE_SIZE)
+       uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+                   NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+                   NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+                   NLMSG_ALIGN(sizeof(uint32_t)) +
+                   NLMSG_ALIGN(sizeof(name)) +
+                   NLMSG_ALIGN(sizeof("vlan")) +
+                   NLMSG_ALIGN(sizeof(uint32_t)) +
+                   NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+       struct nlattr *na_info;
+       struct nlattr *na_vlan;
+       uint32_t sn = MLX5_NL_SN_GENERATE;
+       int ret;
+
+       /* Zero the buffer so that attribute padding bytes are defined. */
+       memset(buf, 0, sizeof(buf));
+       nlh = (struct nlmsghdr *)buf;
+       nlh->nlmsg_len = sizeof(struct nlmsghdr);
+       nlh->nlmsg_type = RTM_NEWLINK;
+       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+                          NLM_F_EXCL | NLM_F_ACK;
+       /* The ifinfomsg payload sits right after the Netlink header. */
+       ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+       nlh->nlmsg_len += sizeof(struct ifinfomsg);
+       ifm->ifi_family = AF_UNSPEC;
+       ifm->ifi_type = 0;
+       ifm->ifi_index = 0;
+       ifm->ifi_flags = IFF_UP;
+       ifm->ifi_change = 0xffffffff;
+       nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+       ret = snprintf(name, sizeof(name), "%s.%u.%u",
+                      MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
+       /* ret + 1 includes the terminating NUL of the generated name. */
+       nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+       /* IFLA_LINKINFO { IFLA_INFO_KIND, IFLA_INFO_DATA { IFLA_VLAN_ID } } */
+       na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+       nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+       na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+       nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+       nl_attr_nest_end(nlh, na_vlan);
+       nl_attr_nest_end(nlh, na_info);
+       assert(sizeof(buf) >= nlh->nlmsg_len);
+       ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
+       if (ret >= 0)
+               ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
+       if (ret < 0) {
+               DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
+                       ret);
+       }
+       /* Try to get ifindex of created or pre-existing device. */
+       ret = if_nametoindex(name);
+       if (!ret) {
+               DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
+                       errno);
+               return 0;
+       }
+       return ret;
+}
diff --git a/drivers/common/mlx5/mlx5_nl.h b/drivers/common/mlx5/mlx5_nl.h
new file mode 100644 (file)
index 0000000..8e66a98
--- /dev/null
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_NL_H_
+#define RTE_PMD_MLX5_NL_H_
+
+#include <linux/netlink.h>
+
+#include <rte_ether.h>
+
+#include "mlx5_common.h"
+
+
+/* VLAN netdev for VLAN workaround. */
+struct mlx5_nl_vlan_dev {
+       uint32_t refcnt; /**< Presumably a use count; confirm in users. */
+       uint32_t ifindex; /**< Own interface index. */
+};
+
+/*
+ * Array of VLAN devices created on the base of VF
+ * used for workaround in virtual environments.
+ */
+struct mlx5_nl_vlan_vmwa_context {
+       int nl_socket; /**< Netlink socket used for VLAN device requests. */
+       uint32_t vf_ifindex; /**< Presumably the base VF ifindex; confirm. */
+       /* NOTE(review): 4096 entries, presumably one per VLAN ID; confirm. */
+       struct mlx5_nl_vlan_dev vlan_dev[4096];
+};
+
+
+int mlx5_nl_init(int protocol);
+int mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
+                        struct rte_ether_addr *mac, uint32_t index);
+int mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx,
+                           uint64_t *mac_own, struct rte_ether_addr *mac,
+                           uint32_t index);
+void mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
+                          struct rte_ether_addr *mac_addrs, int n);
+void mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
+                           struct rte_ether_addr *mac_addrs, int n,
+                           uint64_t *mac_own);
+int mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable);
+int mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable);
+unsigned int mlx5_nl_portnum(int nl, const char *name);
+unsigned int mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex);
+int mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
+                              struct rte_ether_addr *mac, int vf_index);
+int mlx5_nl_switch_info(int nl, unsigned int ifindex,
+                       struct mlx5_switch_info *info);
+
+void mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
+                             uint32_t ifindex);
+uint32_t mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
+                                 uint32_t ifindex, uint16_t tag);
+
+#endif /* RTE_PMD_MLX5_NL_H_ */
index 3e7038b..f93f5cb 100644 (file)
@@ -28,4 +28,20 @@ DPDK_20.02 {
        mlx5_devx_get_out_command_status;
 
        mlx5_dev_to_pci_addr;
+
+       mlx5_nl_allmulti;
+       mlx5_nl_ifindex;
+       mlx5_nl_init;
+       mlx5_nl_mac_addr_add;
+       mlx5_nl_mac_addr_flush;
+       mlx5_nl_mac_addr_remove;
+       mlx5_nl_mac_addr_sync;
+       mlx5_nl_portnum;
+       mlx5_nl_promisc;
+       mlx5_nl_switch_info;
+       mlx5_nl_vf_mac_addr_modify;
+       mlx5_nl_vlan_vmwa_create;
+       mlx5_nl_vlan_vmwa_delete;
+
+       mlx5_translate_port_name;
 };
index dc6b3c8..d26afbb 100644 (file)
@@ -30,7 +30,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_meter.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_dv.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_verbs.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mp.c
-SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_utils.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 
index e10ef3a..d45be00 100644 (file)
@@ -19,7 +19,6 @@ sources = files(
        'mlx5_flow_verbs.c',
        'mlx5_mac.c',
        'mlx5_mr.c',
-       'mlx5_nl.c',
        'mlx5_rss.c',
        'mlx5_rxmode.c',
        'mlx5_rxq.c',
index 9864aa7..a7e7089 100644 (file)
 #include <mlx5_glue.h>
 #include <mlx5_devx_cmds.h>
 #include <mlx5_prm.h>
+#include <mlx5_nl.h>
 
 #include "mlx5_defs.h"
 #include "mlx5_utils.h"
 #include "mlx5_mr.h"
-#include "mlx5_nl.h"
 #include "mlx5_autoconf.h"
 
 /* Request types for IPC. */
index dc9b965..9b392ed 100644 (file)
 /* Reported driver name. */
 #define MLX5_DRIVER_NAME "net_mlx5"
 
-/* Maximum number of simultaneous unicast MAC addresses. */
-#define MLX5_MAX_UC_MAC_ADDRESSES 128
-/* Maximum number of simultaneous Multicast MAC addresses. */
-#define MLX5_MAX_MC_MAC_ADDRESSES 128
-/* Maximum number of simultaneous MAC addresses. */
-#define MLX5_MAX_MAC_ADDRESSES \
-       (MLX5_MAX_UC_MAC_ADDRESSES + MLX5_MAX_MC_MAC_ADDRESSES)
-
 /* Maximum number of simultaneous VLAN filters. */
 #define MLX5_MAX_VLAN_IDS 128
 
index 5484104..b765636 100644 (file)
@@ -1939,61 +1939,6 @@ mlx5_sysfs_check_switch_info(bool device_dir,
        }
 }
 
-/**
- * Extract port name, as a number, from sysfs or netlink information.
- *
- * @param[in] port_name_in
- *   String representing the port name.
- * @param[out] port_info_out
- *   Port information, including port name as a number and port name
- *   type if recognized
- *
- * @return
- *   port_name field set according to recognized name format.
- */
-void
-mlx5_translate_port_name(const char *port_name_in,
-                        struct mlx5_switch_info *port_info_out)
-{
-       char pf_c1, pf_c2, vf_c1, vf_c2;
-       char *end;
-       int sc_items;
-
-       /*
-        * Check for port-name as a string of the form pf0vf0
-        * (support kernel ver >= 5.0 or OFED ver >= 4.6).
-        */
-       sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
-                         &pf_c1, &pf_c2, &port_info_out->pf_num,
-                         &vf_c1, &vf_c2, &port_info_out->port_name);
-       if (sc_items == 6 &&
-           pf_c1 == 'p' && pf_c2 == 'f' &&
-           vf_c1 == 'v' && vf_c2 == 'f') {
-               port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
-               return;
-       }
-       /*
-        * Check for port-name as a string of the form p0
-        * (support kernel ver >= 5.0, or OFED ver >= 4.6).
-        */
-       sc_items = sscanf(port_name_in, "%c%d",
-                         &pf_c1, &port_info_out->port_name);
-       if (sc_items == 2 && pf_c1 == 'p') {
-               port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
-               return;
-       }
-       /* Check for port-name as a number (support kernel ver < 5.0 */
-       errno = 0;
-       port_info_out->port_name = strtol(port_name_in, &end, 0);
-       if (!errno &&
-           (size_t)(end - port_name_in) == strlen(port_name_in)) {
-               port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
-               return;
-       }
-       port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
-       return;
-}
-
 /**
  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
  *
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
deleted file mode 100644 (file)
index 6b8ca00..0000000
+++ /dev/null
@@ -1,1338 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright 2018 6WIND S.A.
- * Copyright 2018 Mellanox Technologies, Ltd
- */
-
-#include <errno.h>
-#include <linux/if_link.h>
-#include <linux/rtnetlink.h>
-#include <net/if.h>
-#include <rdma/rdma_netlink.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdalign.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <unistd.h>
-
-#include <rte_errno.h>
-#include <rte_atomic.h>
-#include <rte_ether.h>
-
-#include "mlx5.h"
-#include "mlx5_nl.h"
-#include "mlx5_utils.h"
-
-/* Size of the buffer to receive kernel messages */
-#define MLX5_NL_BUF_SIZE (32 * 1024)
-/* Send buffer size for the Netlink socket */
-#define MLX5_SEND_BUF_SIZE 32768
-/* Receive buffer size for the Netlink socket */
-#define MLX5_RECV_BUF_SIZE 32768
-
-/** Parameters of VLAN devices created by driver. */
-#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
-/*
- * Define NDA_RTA as defined in iproute2 sources.
- *
- * see in iproute2 sources file include/libnetlink.h
- */
-#ifndef MLX5_NDA_RTA
-#define MLX5_NDA_RTA(r) \
-       ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
-#endif
-/*
- * Define NLMSG_TAIL as defined in iproute2 sources.
- *
- * see in iproute2 sources file include/libnetlink.h
- */
-#ifndef NLMSG_TAIL
-#define NLMSG_TAIL(nmsg) \
-       ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
-#endif
-/*
- * The following definitions are normally found in rdma/rdma_netlink.h,
- * however they are so recent that most systems do not expose them yet.
- */
-#ifndef HAVE_RDMA_NL_NLDEV
-#define RDMA_NL_NLDEV 5
-#endif
-#ifndef HAVE_RDMA_NLDEV_CMD_GET
-#define RDMA_NLDEV_CMD_GET 1
-#endif
-#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
-#define RDMA_NLDEV_CMD_PORT_GET 5
-#endif
-#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
-#define RDMA_NLDEV_ATTR_DEV_INDEX 1
-#endif
-#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
-#define RDMA_NLDEV_ATTR_DEV_NAME 2
-#endif
-#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
-#define RDMA_NLDEV_ATTR_PORT_INDEX 3
-#endif
-#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
-#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
-#endif
-
-/* These are normally found in linux/if_link.h. */
-#ifndef HAVE_IFLA_NUM_VF
-#define IFLA_NUM_VF 21
-#endif
-#ifndef HAVE_IFLA_EXT_MASK
-#define IFLA_EXT_MASK 29
-#endif
-#ifndef HAVE_IFLA_PHYS_SWITCH_ID
-#define IFLA_PHYS_SWITCH_ID 36
-#endif
-#ifndef HAVE_IFLA_PHYS_PORT_NAME
-#define IFLA_PHYS_PORT_NAME 38
-#endif
-
-/* Add/remove MAC address through Netlink */
-struct mlx5_nl_mac_addr {
-       struct rte_ether_addr (*mac)[];
-       /**< MAC address handled by the device. */
-       int mac_n; /**< Number of addresses in the array. */
-};
-
-#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
-#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
-#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
-#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
-
-/** Data structure used by mlx5_nl_cmdget_cb(). */
-struct mlx5_nl_ifindex_data {
-       const char *name; /**< IB device name (in). */
-       uint32_t flags; /**< found attribute flags (out). */
-       uint32_t ibindex; /**< IB device index (out). */
-       uint32_t ifindex; /**< Network interface index (out). */
-       uint32_t portnum; /**< IB device max port number (out). */
-};
-
-rte_atomic32_t atomic_sn = RTE_ATOMIC32_INIT(0);
-
-/* Generate Netlink sequence number. */
-#define MLX5_NL_SN_GENERATE ((uint32_t)rte_atomic32_add_return(&atomic_sn, 1))
-
-/**
- * Opens a Netlink socket.
- *
- * @param protocol
- *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
- *
- * @return
- *   A file descriptor on success, a negative errno value otherwise and
- *   rte_errno is set.
- */
-int
-mlx5_nl_init(int protocol)
-{
-       int fd;
-       int sndbuf_size = MLX5_SEND_BUF_SIZE;
-       int rcvbuf_size = MLX5_RECV_BUF_SIZE;
-       struct sockaddr_nl local = {
-               .nl_family = AF_NETLINK,
-       };
-       int ret;
-
-       fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
-       if (fd == -1) {
-               rte_errno = errno;
-               return -rte_errno;
-       }
-       ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
-       if (ret == -1) {
-               rte_errno = errno;
-               goto error;
-       }
-       ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
-       if (ret == -1) {
-               rte_errno = errno;
-               goto error;
-       }
-       ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
-       if (ret == -1) {
-               rte_errno = errno;
-               goto error;
-       }
-       return fd;
-error:
-       close(fd);
-       return -rte_errno;
-}
-
-/**
- * Send a request message to the kernel on the Netlink socket.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] nh
- *   The Netlink message send to the kernel.
- * @param[in] ssn
- *   Sequence number.
- * @param[in] req
- *   Pointer to the request structure.
- * @param[in] len
- *   Length of the request in bytes.
- *
- * @return
- *   The number of sent bytes on success, a negative errno value otherwise and
- *   rte_errno is set.
- */
-static int
-mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
-               int len)
-{
-       struct sockaddr_nl sa = {
-               .nl_family = AF_NETLINK,
-       };
-       struct iovec iov[2] = {
-               { .iov_base = nh, .iov_len = sizeof(*nh), },
-               { .iov_base = req, .iov_len = len, },
-       };
-       struct msghdr msg = {
-               .msg_name = &sa,
-               .msg_namelen = sizeof(sa),
-               .msg_iov = iov,
-               .msg_iovlen = 2,
-       };
-       int send_bytes;
-
-       nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
-       nh->nlmsg_seq = sn;
-       send_bytes = sendmsg(nlsk_fd, &msg, 0);
-       if (send_bytes < 0) {
-               rte_errno = errno;
-               return -rte_errno;
-       }
-       return send_bytes;
-}
-
-/**
- * Send a message to the kernel on the Netlink socket.
- *
- * @param[in] nlsk_fd
- *   The Netlink socket file descriptor used for communication.
- * @param[in] nh
- *   The Netlink message send to the kernel.
- * @param[in] sn
- *   Sequence number.
- *
- * @return
- *   The number of sent bytes on success, a negative errno value otherwise and
- *   rte_errno is set.
- */
-static int
-mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
-{
-       struct sockaddr_nl sa = {
-               .nl_family = AF_NETLINK,
-       };
-       struct iovec iov = {
-               .iov_base = nh,
-               .iov_len = nh->nlmsg_len,
-       };
-       struct msghdr msg = {
-               .msg_name = &sa,
-               .msg_namelen = sizeof(sa),
-               .msg_iov = &iov,
-               .msg_iovlen = 1,
-       };
-       int send_bytes;
-
-       nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
-       nh->nlmsg_seq = sn;
-       send_bytes = sendmsg(nlsk_fd, &msg, 0);
-       if (send_bytes < 0) {
-               rte_errno = errno;
-               return -rte_errno;
-       }
-       return send_bytes;
-}
-
-/**
- * Receive a message from the kernel on the Netlink socket, following
- * mlx5_nl_send().
- *
- * @param[in] nlsk_fd
- *   The Netlink socket file descriptor used for communication.
- * @param[in] sn
- *   Sequence number.
- * @param[in] cb
- *   The callback function to call for each Netlink message received.
- * @param[in, out] arg
- *   Custom arguments for the callback.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
-            void *arg)
-{
-       struct sockaddr_nl sa;
-       char buf[MLX5_RECV_BUF_SIZE];
-       struct iovec iov = {
-               .iov_base = buf,
-               .iov_len = sizeof(buf),
-       };
-       struct msghdr msg = {
-               .msg_name = &sa,
-               .msg_namelen = sizeof(sa),
-               .msg_iov = &iov,
-               /* One message at a time */
-               .msg_iovlen = 1,
-       };
-       int multipart = 0;
-       int ret = 0;
-
-       do {
-               struct nlmsghdr *nh;
-               int recv_bytes = 0;
-
-               do {
-                       recv_bytes = recvmsg(nlsk_fd, &msg, 0);
-                       if (recv_bytes == -1) {
-                               rte_errno = errno;
-                               return -rte_errno;
-                       }
-                       nh = (struct nlmsghdr *)buf;
-               } while (nh->nlmsg_seq != sn);
-               for (;
-                    NLMSG_OK(nh, (unsigned int)recv_bytes);
-                    nh = NLMSG_NEXT(nh, recv_bytes)) {
-                       if (nh->nlmsg_type == NLMSG_ERROR) {
-                               struct nlmsgerr *err_data = NLMSG_DATA(nh);
-
-                               if (err_data->error < 0) {
-                                       rte_errno = -err_data->error;
-                                       return -rte_errno;
-                               }
-                               /* Ack message. */
-                               return 0;
-                       }
-                       /* Multi-part msgs and their trailing DONE message. */
-                       if (nh->nlmsg_flags & NLM_F_MULTI) {
-                               if (nh->nlmsg_type == NLMSG_DONE)
-                                       return 0;
-                               multipart = 1;
-                       }
-                       if (cb) {
-                               ret = cb(nh, arg);
-                               if (ret < 0)
-                                       return ret;
-                       }
-               }
-       } while (multipart);
-       return ret;
-}
-
-/**
- * Parse Netlink message to retrieve the bridge MAC address.
- *
- * @param nh
- *   Pointer to Netlink Message Header.
- * @param arg
- *   PMD data register with this callback.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
-{
-       struct mlx5_nl_mac_addr *data = arg;
-       struct ndmsg *r = NLMSG_DATA(nh);
-       struct rtattr *attribute;
-       int len;
-
-       len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
-       for (attribute = MLX5_NDA_RTA(r);
-            RTA_OK(attribute, len);
-            attribute = RTA_NEXT(attribute, len)) {
-               if (attribute->rta_type == NDA_LLADDR) {
-                       if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
-                               DRV_LOG(WARNING,
-                                       "not enough room to finalize the"
-                                       " request");
-                               rte_errno = ENOMEM;
-                               return -rte_errno;
-                       }
-#ifndef NDEBUG
-                       char m[18];
-
-                       rte_ether_format_addr(m, 18, RTA_DATA(attribute));
-                       DRV_LOG(DEBUG, "bridge MAC address %s", m);
-#endif
-                       memcpy(&(*data->mac)[data->mac_n++],
-                              RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
-               }
-       }
-       return 0;
-}
-
-/**
- * Get bridge MAC addresses.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param mac[out]
- *   Pointer to the array table of MAC addresses to fill.
- *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
- * @param mac_n[out]
- *   Number of entries filled in MAC array.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
-                     struct rte_ether_addr (*mac)[], int *mac_n)
-{
-       struct {
-               struct nlmsghdr hdr;
-               struct ifinfomsg ifm;
-       } req = {
-               .hdr = {
-                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
-                       .nlmsg_type = RTM_GETNEIGH,
-                       .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
-               },
-               .ifm = {
-                       .ifi_family = PF_BRIDGE,
-                       .ifi_index = iface_idx,
-               },
-       };
-       struct mlx5_nl_mac_addr data = {
-               .mac = mac,
-               .mac_n = 0,
-       };
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       if (nlsk_fd == -1)
-               return 0;
-       ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
-                             sizeof(struct ifinfomsg));
-       if (ret < 0)
-               goto error;
-       ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
-       if (ret < 0)
-               goto error;
-       *mac_n = data.mac_n;
-       return 0;
-error:
-       DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
-               iface_idx, strerror(rte_errno));
-       return -rte_errno;
-}
-
-/**
- * Modify the MAC address neighbour table with Netlink.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param mac
- *   MAC address to consider.
- * @param add
- *   1 to add the MAC address, 0 to remove the MAC address.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
-                       struct rte_ether_addr *mac, int add)
-{
-       struct {
-               struct nlmsghdr hdr;
-               struct ndmsg ndm;
-               struct rtattr rta;
-               uint8_t buffer[RTE_ETHER_ADDR_LEN];
-       } req = {
-               .hdr = {
-                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
-                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
-                               NLM_F_EXCL | NLM_F_ACK,
-                       .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
-               },
-               .ndm = {
-                       .ndm_family = PF_BRIDGE,
-                       .ndm_state = NUD_NOARP | NUD_PERMANENT,
-                       .ndm_ifindex = iface_idx,
-                       .ndm_flags = NTF_SELF,
-               },
-               .rta = {
-                       .rta_type = NDA_LLADDR,
-                       .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
-               },
-       };
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       if (nlsk_fd == -1)
-               return 0;
-       memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
-       req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
-               RTA_ALIGN(req.rta.rta_len);
-       ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
-       if (ret < 0)
-               goto error;
-       ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
-       if (ret < 0)
-               goto error;
-       return 0;
-error:
-       DRV_LOG(DEBUG,
-               "Interface %u cannot %s MAC address"
-               " %02X:%02X:%02X:%02X:%02X:%02X %s",
-               iface_idx,
-               add ? "add" : "remove",
-               mac->addr_bytes[0], mac->addr_bytes[1],
-               mac->addr_bytes[2], mac->addr_bytes[3],
-               mac->addr_bytes[4], mac->addr_bytes[5],
-               strerror(rte_errno));
-       return -rte_errno;
-}
-
-/**
- * Modify the VF MAC address neighbour table with Netlink.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param mac
- *    MAC address to consider.
- * @param vf_index
- *    VF index.
- *
- * @return
- *    0 on success, a negative errno value otherwise and rte_errno is set.
- */
-int
-mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
-                          struct rte_ether_addr *mac, int vf_index)
-{
-       int ret;
-       struct {
-               struct nlmsghdr hdr;
-               struct ifinfomsg ifm;
-               struct rtattr vf_list_rta;
-               struct rtattr vf_info_rta;
-               struct rtattr vf_mac_rta;
-               struct ifla_vf_mac ivm;
-       } req = {
-               .hdr = {
-                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
-                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
-                       .nlmsg_type = RTM_BASE,
-               },
-               .ifm = {
-                       .ifi_index = iface_idx,
-               },
-               .vf_list_rta = {
-                       .rta_type = IFLA_VFINFO_LIST,
-                       .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
-               },
-               .vf_info_rta = {
-                       .rta_type = IFLA_VF_INFO,
-                       .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
-               },
-               .vf_mac_rta = {
-                       .rta_type = IFLA_VF_MAC,
-               },
-       };
-       struct ifla_vf_mac ivm = {
-               .vf = vf_index,
-       };
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-
-       memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
-       memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
-
-       req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
-       req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
-               RTA_ALIGN(req.vf_list_rta.rta_len) +
-               RTA_ALIGN(req.vf_info_rta.rta_len) +
-               RTA_ALIGN(req.vf_mac_rta.rta_len);
-       req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
-                                              &req.vf_list_rta);
-       req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
-                                              &req.vf_info_rta);
-
-       if (nlsk_fd < 0)
-               return -1;
-       ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
-       if (ret < 0)
-               goto error;
-       ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
-       if (ret < 0)
-               goto error;
-       return 0;
-error:
-       DRV_LOG(ERR,
-               "representor %u cannot set VF MAC address "
-               "%02X:%02X:%02X:%02X:%02X:%02X : %s",
-               vf_index,
-               mac->addr_bytes[0], mac->addr_bytes[1],
-               mac->addr_bytes[2], mac->addr_bytes[3],
-               mac->addr_bytes[4], mac->addr_bytes[5],
-               strerror(rte_errno));
-       return -rte_errno;
-}
-
-/**
- * Add a MAC address.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param mac_own
- *   BITFIELD_DECLARE array to store the mac.
- * @param mac
- *   MAC address to register.
- * @param index
- *   MAC address index.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-int
-mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
-                    uint64_t *mac_own, struct rte_ether_addr *mac,
-                    uint32_t index)
-{
-       int ret;
-
-       ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
-       if (!ret)
-               BITFIELD_SET(mac_own, index);
-       if (ret == -EEXIST)
-               return 0;
-       return ret;
-}
-
-/**
- * Remove a MAC address.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param mac_own
- *   BITFIELD_DECLARE array to store the mac.
- * @param mac
- *   MAC address to remove.
- * @param index
- *   MAC address index.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-int
-mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
-                       struct rte_ether_addr *mac, uint32_t index)
-{
-       BITFIELD_RESET(mac_own, index);
-       return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
-}
-
-/**
- * Synchronize Netlink bridge table to the internal table.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param mac_addrs
- *   Mac addresses array to sync.
- * @param n
- *   @p mac_addrs array size.
- */
-void
-mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
-                     struct rte_ether_addr *mac_addrs, int n)
-{
-       struct rte_ether_addr macs[n];
-       int macs_n = 0;
-       int i;
-       int ret;
-
-       ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
-       if (ret)
-               return;
-       for (i = 0; i != macs_n; ++i) {
-               int j;
-
-               /* Verify the address is not in the array yet. */
-               for (j = 0; j != n; ++j)
-                       if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
-                               break;
-               if (j != n)
-                       continue;
-               /* Find the first entry available. */
-               for (j = 0; j != n; ++j) {
-                       if (rte_is_zero_ether_addr(&mac_addrs[j])) {
-                               mac_addrs[j] = macs[i];
-                               break;
-                       }
-               }
-       }
-}
-
-/**
- * Flush all added MAC addresses.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param[in] mac_addrs
- *   Mac addresses array to flush.
- * @param n
- *   @p mac_addrs array size.
- * @param mac_own
- *   BITFIELD_DECLARE array to store the mac.
- */
-void
-mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
-                      struct rte_ether_addr *mac_addrs, int n,
-                      uint64_t *mac_own)
-{
-       int i;
-
-       for (i = n - 1; i >= 0; --i) {
-               struct rte_ether_addr *m = &mac_addrs[i];
-
-               if (BITFIELD_ISSET(mac_own, i))
-                       mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
-                                               i);
-       }
-}
-
-/**
- * Enable promiscuous / all multicast mode through Netlink.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param flags
- *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
- * @param enable
- *   Nonzero to enable, disable otherwise.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
-                    int enable)
-{
-       struct {
-               struct nlmsghdr hdr;
-               struct ifinfomsg ifi;
-       } req = {
-               .hdr = {
-                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
-                       .nlmsg_type = RTM_NEWLINK,
-                       .nlmsg_flags = NLM_F_REQUEST,
-               },
-               .ifi = {
-                       .ifi_flags = enable ? flags : 0,
-                       .ifi_change = flags,
-                       .ifi_index = iface_idx,
-               },
-       };
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
-       if (nlsk_fd < 0)
-               return 0;
-       ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
-       if (ret < 0)
-               return ret;
-       return 0;
-}
-
-/**
- * Enable promiscuous mode through Netlink.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param enable
- *   Nonzero to enable, disable otherwise.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-int
-mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
-{
-       int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
-
-       if (ret)
-               DRV_LOG(DEBUG,
-                       "Interface %u cannot %s promisc mode: Netlink error %s",
-                       iface_idx, enable ? "enable" : "disable",
-                       strerror(rte_errno));
-       return ret;
-}
-
-/**
- * Enable all multicast mode through Netlink.
- *
- * @param[in] nlsk_fd
- *   Netlink socket file descriptor.
- * @param[in] iface_idx
- *   Net device interface index.
- * @param enable
- *   Nonzero to enable, disable otherwise.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-int
-mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
-{
-       int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
-                                      enable);
-
-       if (ret)
-               DRV_LOG(DEBUG,
-                       "Interface %u cannot %s allmulti : Netlink error %s",
-                       iface_idx, enable ? "enable" : "disable",
-                       strerror(rte_errno));
-       return ret;
-}
-
-/**
- * Process network interface information from Netlink message.
- *
- * @param nh
- *   Pointer to Netlink message header.
- * @param arg
- *   Opaque data pointer for this callback.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
-{
-       struct mlx5_nl_ifindex_data *data = arg;
-       struct mlx5_nl_ifindex_data local = {
-               .flags = 0,
-       };
-       size_t off = NLMSG_HDRLEN;
-
-       if (nh->nlmsg_type !=
-           RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
-           nh->nlmsg_type !=
-           RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
-               goto error;
-       while (off < nh->nlmsg_len) {
-               struct nlattr *na = (void *)((uintptr_t)nh + off);
-               void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
-
-               if (na->nla_len > nh->nlmsg_len - off)
-                       goto error;
-               switch (na->nla_type) {
-               case RDMA_NLDEV_ATTR_DEV_INDEX:
-                       local.ibindex = *(uint32_t *)payload;
-                       local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
-                       break;
-               case RDMA_NLDEV_ATTR_DEV_NAME:
-                       if (!strcmp(payload, data->name))
-                               local.flags |= MLX5_NL_CMD_GET_IB_NAME;
-                       break;
-               case RDMA_NLDEV_ATTR_NDEV_INDEX:
-                       local.ifindex = *(uint32_t *)payload;
-                       local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
-                       break;
-               case RDMA_NLDEV_ATTR_PORT_INDEX:
-                       local.portnum = *(uint32_t *)payload;
-                       local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
-                       break;
-               default:
-                       break;
-               }
-               off += NLA_ALIGN(na->nla_len);
-       }
-       /*
-        * It is possible to have multiple messages for all
-        * Infiniband devices in the system with appropriate name.
-        * So we should gather parameters locally and copy to
-        * query context only in case of coinciding device name.
-        */
-       if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
-               data->flags = local.flags;
-               data->ibindex = local.ibindex;
-               data->ifindex = local.ifindex;
-               data->portnum = local.portnum;
-       }
-       return 0;
-error:
-       rte_errno = EINVAL;
-       return -rte_errno;
-}
-
-/**
- * Get index of network interface associated with some IB device.
- *
- * This is the only somewhat safe method to avoid resorting to heuristics
- * when faced with port representors. Unfortunately it requires at least
- * Linux 4.17.
- *
- * @param nl
- *   Netlink socket of the RDMA kind (NETLINK_RDMA).
- * @param[in] name
- *   IB device name.
- * @param[in] pindex
- *   IB device port index, starting from 1
- * @return
- *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
- *   is set.
- */
-unsigned int
-mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
-{
-       struct mlx5_nl_ifindex_data data = {
-               .name = name,
-               .flags = 0,
-               .ibindex = 0, /* Determined during first pass. */
-               .ifindex = 0, /* Determined during second pass. */
-       };
-       union {
-               struct nlmsghdr nh;
-               uint8_t buf[NLMSG_HDRLEN +
-                           NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
-                           NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
-       } req = {
-               .nh = {
-                       .nlmsg_len = NLMSG_LENGTH(0),
-                       .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
-                                                      RDMA_NLDEV_CMD_GET),
-                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
-               },
-       };
-       struct nlattr *na;
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       ret = mlx5_nl_send(nl, &req.nh, sn);
-       if (ret < 0)
-               return 0;
-       ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
-       if (ret < 0)
-               return 0;
-       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
-               goto error;
-       data.flags = 0;
-       sn = MLX5_NL_SN_GENERATE;
-       req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
-                                            RDMA_NLDEV_CMD_PORT_GET);
-       req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-       req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
-       na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
-       na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
-       na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
-       memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
-              &data.ibindex, sizeof(data.ibindex));
-       na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
-       na->nla_len = NLA_HDRLEN + sizeof(pindex);
-       na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
-       memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
-              &pindex, sizeof(pindex));
-       ret = mlx5_nl_send(nl, &req.nh, sn);
-       if (ret < 0)
-               return 0;
-       ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
-       if (ret < 0)
-               return 0;
-       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
-           !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
-           !data.ifindex)
-               goto error;
-       return data.ifindex;
-error:
-       rte_errno = ENODEV;
-       return 0;
-}
-
-/**
- * Get the number of physical ports of given IB device.
- *
- * @param nl
- *   Netlink socket of the RDMA kind (NETLINK_RDMA).
- * @param[in] name
- *   IB device name.
- *
- * @return
- *   A valid (nonzero) number of ports on success, 0 otherwise
- *   and rte_errno is set.
- */
-unsigned int
-mlx5_nl_portnum(int nl, const char *name)
-{
-       struct mlx5_nl_ifindex_data data = {
-               .flags = 0,
-               .name = name,
-               .ifindex = 0,
-               .portnum = 0,
-       };
-       struct nlmsghdr req = {
-               .nlmsg_len = NLMSG_LENGTH(0),
-               .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
-                                              RDMA_NLDEV_CMD_GET),
-               .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
-       };
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       ret = mlx5_nl_send(nl, &req, sn);
-       if (ret < 0)
-               return 0;
-       ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
-       if (ret < 0)
-               return 0;
-       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
-           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
-           !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
-               rte_errno = ENODEV;
-               return 0;
-       }
-       if (!data.portnum)
-               rte_errno = EINVAL;
-       return data.portnum;
-}
-
-/**
- * Analyze gathered port parameters via Netlink to recognize master
- * and representor devices for E-Switch configuration.
- *
- * @param[in] num_vf_set
- *   flag of presence of number of VFs port attribute.
- * @param[inout] switch_info
- *   Port information, including port name as a number and port name
- *   type if recognized
- *
- * @return
- *   master and representor flags are set in switch_info according to
- *   recognized parameters (if any).
- */
-static void
-mlx5_nl_check_switch_info(bool num_vf_set,
-                         struct mlx5_switch_info *switch_info)
-{
-       switch (switch_info->name_type) {
-       case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
-               /*
-                * Name is not recognized, assume the master,
-                * check the number of VFs key presence.
-                */
-               switch_info->master = num_vf_set;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
-               /*
-                * Name is not set, this assumes the legacy naming
-                * schema for master, just check if there is a
-                * number of VFs key.
-                */
-               switch_info->master = num_vf_set;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
-               /* New uplink naming schema recognized. */
-               switch_info->master = 1;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
-               /* Legacy representors naming schema. */
-               switch_info->representor = !num_vf_set;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
-               /* New representors naming schema. */
-               switch_info->representor = 1;
-               break;
-       }
-}
-
-/**
- * Process switch information from Netlink message.
- *
- * @param nh
- *   Pointer to Netlink message header.
- * @param arg
- *   Opaque data pointer for this callback.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-static int
-mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
-{
-       struct mlx5_switch_info info = {
-               .master = 0,
-               .representor = 0,
-               .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
-               .port_name = 0,
-               .switch_id = 0,
-       };
-       size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-       bool switch_id_set = false;
-       bool num_vf_set = false;
-
-       if (nh->nlmsg_type != RTM_NEWLINK)
-               goto error;
-       while (off < nh->nlmsg_len) {
-               struct rtattr *ra = (void *)((uintptr_t)nh + off);
-               void *payload = RTA_DATA(ra);
-               unsigned int i;
-
-               if (ra->rta_len > nh->nlmsg_len - off)
-                       goto error;
-               switch (ra->rta_type) {
-               case IFLA_NUM_VF:
-                       num_vf_set = true;
-                       break;
-               case IFLA_PHYS_PORT_NAME:
-                       mlx5_translate_port_name((char *)payload, &info);
-                       break;
-               case IFLA_PHYS_SWITCH_ID:
-                       info.switch_id = 0;
-                       for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
-                               info.switch_id <<= 8;
-                               info.switch_id |= ((uint8_t *)payload)[i];
-                       }
-                       switch_id_set = true;
-                       break;
-               }
-               off += RTA_ALIGN(ra->rta_len);
-       }
-       if (switch_id_set) {
-               /* We have some E-Switch configuration. */
-               mlx5_nl_check_switch_info(num_vf_set, &info);
-       }
-       assert(!(info.master && info.representor));
-       memcpy(arg, &info, sizeof(info));
-       return 0;
-error:
-       rte_errno = EINVAL;
-       return -rte_errno;
-}
-
-/**
- * Get switch information associated with network interface.
- *
- * @param nl
- *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
- * @param ifindex
- *   Network interface index.
- * @param[out] info
- *   Switch information object, populated in case of success.
- *
- * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
- */
-int
-mlx5_nl_switch_info(int nl, unsigned int ifindex,
-                   struct mlx5_switch_info *info)
-{
-       struct {
-               struct nlmsghdr nh;
-               struct ifinfomsg info;
-               struct rtattr rta;
-               uint32_t extmask;
-       } req = {
-               .nh = {
-                       .nlmsg_len = NLMSG_LENGTH
-                                       (sizeof(req.info) +
-                                        RTA_LENGTH(sizeof(uint32_t))),
-                       .nlmsg_type = RTM_GETLINK,
-                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
-               },
-               .info = {
-                       .ifi_family = AF_UNSPEC,
-                       .ifi_index = ifindex,
-               },
-               .rta = {
-                       .rta_type = IFLA_EXT_MASK,
-                       .rta_len = RTA_LENGTH(sizeof(int32_t)),
-               },
-               .extmask = RTE_LE32(1),
-       };
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       ret = mlx5_nl_send(nl, &req.nh, sn);
-       if (ret >= 0)
-               ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
-       if (info->master && info->representor) {
-               DRV_LOG(ERR, "ifindex %u device is recognized as master"
-                            " and as representor", ifindex);
-               rte_errno = ENODEV;
-               ret = -rte_errno;
-       }
-       return ret;
-}
-
-/*
- * Delete VLAN network device by ifindex.
- *
- * Sends an RTM_DELLINK request (NLM_F_ACK requested) on the context socket
- * and then calls mlx5_nl_recv() for the reply. Nothing is sent when
- * @p ifindex is 0. A failure is only logged as a warning; no status is
- * returned to the caller.
- *
- * @param[in] vmwa
- *   Context object initialized by mlx5_nl_vlan_vmwa_init().
- * @param[in] ifindex
- *   Interface index of network device to delete.
- */
-void
-mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
-                     uint32_t ifindex)
-{
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-       struct {
-               struct nlmsghdr nh;
-               struct ifinfomsg info;
-       } req = {
-               .nh = {
-                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
-                       .nlmsg_type = RTM_DELLINK,
-                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
-               },
-               .info = {
-                       .ifi_family = AF_UNSPEC,
-                       .ifi_index = ifindex,
-               },
-       };
-
-       if (ifindex) { /* Index 0 means there is no device to delete. */
-               ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
-               if (ret >= 0)
-                       ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
-               if (ret < 0)
-                       DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
-                               " ifindex %u, %d", ifindex, ret);
-       }
-}
-
-/*
- * Set of subroutines to build Netlink message.
- *
- * None of these helpers takes a buffer size: the caller must provide a
- * buffer large enough for every attribute it appends.
- *
- * nl_msg_tail() returns the address at the message start plus nlmsg_len
- * rounded up with NLMSG_ALIGN(), which is where the next item is appended.
- *
- * nl_attr_put() writes one attribute of the given type at the tail, copies
- * alen payload bytes right after the attribute header (nothing is copied
- * when alen is 0) and grows nlmsg_len accordingly.
- *
- * nl_attr_nest_start() appends a payload-less attribute of the given type
- * and returns its address; nl_attr_nest_end() then sets that attribute's
- * nla_len to the distance from it to the current tail, so it covers
- * everything appended in between.
- */
-static struct nlattr *
-nl_msg_tail(struct nlmsghdr *nlh)
-{
-       return (struct nlattr *)
-               (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
-}
-
-static void
-nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
-{
-       struct nlattr *nla = nl_msg_tail(nlh);
-
-       nla->nla_type = type;
-       nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen); /* NOTE(review): stores the aligned size, padding included -- confirm against the nlattr layout. */
-       nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
-
-       if (alen)
-               memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
-}
-
-static struct nlattr *
-nl_attr_nest_start(struct nlmsghdr *nlh, int type)
-{
-       struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh); /* Taken before the put moves the tail. */
-
-       nl_attr_put(nlh, type, NULL, 0);
-       return nest;
-}
-
-static void
-nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
-{
-       nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
-}
-
-/*
- * Create network VLAN device with specified VLAN tag.
- *
- * Builds an RTM_NEWLINK request (NLM_F_CREATE | NLM_F_EXCL) for a link of
- * kind "vlan" on top of @p ifindex. The device name is formatted as
- * "<MLX5_VMWA_VLAN_DEVICE_PFX>.<ifindex>.<tag>". A Netlink failure is only
- * logged as a warning; the device is then looked up by name, so a device
- * that already exists is still reported.
- *
- * @param[in] vmwa
- *   Context object initialized by mlx5_nl_vlan_vmwa_init().
- * @param[in] ifindex
- *   Base network interface index.
- * @param[in] tag
- *   VLAN tag for VLAN network device to create.
- *
- * @return
- *   Interface index of the VLAN device, 0 if the name cannot be resolved.
- */
-uint32_t
-mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
-                        uint32_t ifindex, uint16_t tag)
-{
-       struct nlmsghdr *nlh;
-       struct ifinfomsg *ifm;
-       char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
-
-       alignas(RTE_CACHE_LINE_SIZE)
-       uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
-                   NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
-                   NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
-                   NLMSG_ALIGN(sizeof(uint32_t)) +
-                   NLMSG_ALIGN(sizeof(name)) +
-                   NLMSG_ALIGN(sizeof("vlan")) +
-                   NLMSG_ALIGN(sizeof(uint32_t)) +
-                   NLMSG_ALIGN(sizeof(uint16_t)) + 16];
-       struct nlattr *na_info;
-       struct nlattr *na_vlan;
-       uint32_t sn = MLX5_NL_SN_GENERATE;
-       int ret;
-
-       memset(buf, 0, sizeof(buf));
-       nlh = (struct nlmsghdr *)buf;
-       nlh->nlmsg_len = sizeof(struct nlmsghdr);
-       nlh->nlmsg_type = RTM_NEWLINK;
-       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
-                          NLM_F_EXCL | NLM_F_ACK;
-       ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
-       nlh->nlmsg_len += sizeof(struct ifinfomsg);
-       ifm->ifi_family = AF_UNSPEC;
-       ifm->ifi_type = 0;
-       ifm->ifi_index = 0;
-       ifm->ifi_flags = IFF_UP;
-       ifm->ifi_change = 0xffffffff;
-       nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
-       ret = snprintf(name, sizeof(name), "%s.%u.%u",
-                      MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
-       nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1); /* + 1 for the NUL. */
-       na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
-       nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
-       na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
-       nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
-       nl_attr_nest_end(nlh, na_vlan);
-       nl_attr_nest_end(nlh, na_info);
-       assert(sizeof(buf) >= nlh->nlmsg_len); /* NOTE(review): checked after buf is filled. */
-       ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
-       if (ret >= 0)
-               ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
-       if (ret < 0) {
-               DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
-                       ret);
-       }
-       /* Try to get ifindex of created or pre-existing device. */
-       ret = if_nametoindex(name);
-       if (!ret) {
-               DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
-                       errno);
-               return 0;
-       }
-       return ret;
-}
diff --git a/drivers/net/mlx5/mlx5_nl.h b/drivers/net/mlx5/mlx5_nl.h
deleted file mode 100644 (file)
index 9be87c0..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright 2019 Mellanox Technologies, Ltd
- */
-
-#ifndef RTE_PMD_MLX5_NL_H_
-#define RTE_PMD_MLX5_NL_H_
-
-#include <linux/netlink.h>
-
-
-/* Recognized Infiniband device physical port name types. */
-enum mlx5_nl_phys_port_name_type {
-       MLX5_PHYS_PORT_NAME_TYPE_NOTSET = 0, /* Not set. */
-       MLX5_PHYS_PORT_NAME_TYPE_LEGACY, /* Legacy format, kernel ver < 5.0 */
-       MLX5_PHYS_PORT_NAME_TYPE_UPLINK, /* p0, kernel ver >= 5.0 */
-       MLX5_PHYS_PORT_NAME_TYPE_PFVF, /* pf0vf0, kernel ver >= 5.0 */
-       MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN, /* Unrecognized. */
-};
-
-/**
- * Switch information returned by mlx5_nl_switch_info().
- *
- * mlx5_nl_switch_info() fails with ENODEV when both the master and the
- * representor flags are set, so callers see at most one of them.
- */
-struct mlx5_switch_info {
-       uint32_t master:1; /**< Master device. */
-       uint32_t representor:1; /**< Representor device. */
-       enum mlx5_nl_phys_port_name_type name_type; /**< Port name type. */
-       int32_t pf_num; /**< PF number (valid for pfxvfx format only). */
-       int32_t port_name; /**< Representor port name. */
-       uint64_t switch_id; /**< Switch identifier. */
-};
-
-/* VLAN netdev for VLAN workaround. */
-struct mlx5_nl_vlan_dev {
-       uint32_t refcnt; /**< Presumably a usage counter -- TODO confirm. */
-       uint32_t ifindex; /**< Own interface index. */
-};
-
-/*
- * Array of VLAN devices created on top of the VF,
- * used for the workaround in virtual environments.
- */
-struct mlx5_nl_vlan_vmwa_context {
-       int nl_socket; /**< Netlink socket used for VLAN device requests. */
-       uint32_t vf_ifindex; /**< Presumably the VF netdev index -- confirm. */
-       struct mlx5_nl_vlan_dev vlan_dev[4096]; /**< Presumably per VLAN ID. */
-};
-
-
-int mlx5_nl_init(int protocol);
-int mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
-                        struct rte_ether_addr *mac, uint32_t index);
-int mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx,
-                           uint64_t *mac_own, struct rte_ether_addr *mac,
-                           uint32_t index);
-void mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
-                          struct rte_ether_addr *mac_addrs, int n);
-void mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
-                           struct rte_ether_addr *mac_addrs, int n,
-                           uint64_t *mac_own);
-int mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable);
-int mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable);
-unsigned int mlx5_nl_portnum(int nl, const char *name);
-unsigned int mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex);
-int mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
-                              struct rte_ether_addr *mac, int vf_index);
-int mlx5_nl_switch_info(int nl, unsigned int ifindex,
-                       struct mlx5_switch_info *info); /* ENODEV if master and representor. */
-
-void mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
-                             uint32_t ifindex); /* No-op when ifindex is 0. */
-uint32_t mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
-                                 uint32_t ifindex, uint16_t tag); /* Returns 0 on failure. */
-
-#endif /* RTE_PMD_MLX5_NL_H_ */
index fc1a91c..8e63b67 100644 (file)
 
 #include <mlx5_glue.h>
 #include <mlx5_devx_cmds.h>
+#include <mlx5_nl.h>
 
 #include "mlx5.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_rxtx.h"
-#include "mlx5_nl.h"
 #include "mlx5_utils.h"
 
 /**