net/mlx5: set VF MAC address from host
[dpdk.git] / drivers / net / mlx5 / mlx5_nl.c
index 8a10109..e7ba034 100644 (file)
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdalign.h>
 #include <string.h>
 #include <sys/socket.h>
 #include <unistd.h>
 
 #include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
 
 #include "mlx5.h"
 #include "mlx5_utils.h"
@@ -28,6 +31,8 @@
 /* Receive buffer size for the Netlink socket */
 #define MLX5_RECV_BUF_SIZE 32768
 
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
 /*
  * Define NDA_RTA as defined in iproute2 sources.
  *
 #define MLX5_NDA_RTA(r) \
        ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
 #endif
-
+/*
+ * Define NLMSG_TAIL as defined in iproute2 sources.
+ *
+ * see in iproute2 sources file include/libnetlink.h
+ */
+#ifndef NLMSG_TAIL
+#define NLMSG_TAIL(nmsg) \
+       ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+#endif
 /*
  * The following definitions are normally found in rdma/rdma_netlink.h,
  * however they are so recent that most systems do not expose them yet.
 #endif
 
 /* These are normally found in linux/if_link.h. */
+#ifndef HAVE_IFLA_NUM_VF
+#define IFLA_NUM_VF 21
+#endif
+#ifndef HAVE_IFLA_EXT_MASK
+#define IFLA_EXT_MASK 29
+#endif
 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
 #define IFLA_PHYS_SWITCH_ID 36
 #endif
 
 /* Add/remove MAC address through Netlink */
 struct mlx5_nl_mac_addr {
-       struct ether_addr (*mac)[];
+       struct rte_ether_addr (*mac)[];
        /**< MAC address handled by the device. */
        int mac_n; /**< Number of addresses in the array. */
 };
 
-/** Data structure used by mlx5_nl_ifindex_cb(). */
+#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
+#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
+#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
+#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
+
+/** Data structure used by mlx5_nl_cmdget_cb(). */
 struct mlx5_nl_ifindex_data {
        const char *name; /**< IB device name (in). */
+       uint32_t flags; /**< found attribute flags (out). */
        uint32_t ibindex; /**< IB device index (out). */
        uint32_t ifindex; /**< Network interface index (out). */
+       uint32_t portnum; /**< IB device max port number (out). */
 };
 
 /**
@@ -333,11 +359,11 @@ mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
 #ifndef NDEBUG
                        char m[18];
 
-                       ether_format_addr(m, 18, RTA_DATA(attribute));
+                       rte_ether_format_addr(m, 18, RTA_DATA(attribute));
                        DRV_LOG(DEBUG, "bridge MAC address %s", m);
 #endif
                        memcpy(&(*data->mac)[data->mac_n++],
-                              RTA_DATA(attribute), ETHER_ADDR_LEN);
+                              RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
                }
        }
        return 0;
@@ -358,7 +384,7 @@ mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
+mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr (*mac)[],
                      int *mac_n)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -417,7 +443,7 @@ error:
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
+mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
                        int add)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -426,7 +452,7 @@ mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
                struct nlmsghdr hdr;
                struct ndmsg ndm;
                struct rtattr rta;
-               uint8_t buffer[ETHER_ADDR_LEN];
+               uint8_t buffer[RTE_ETHER_ADDR_LEN];
        } req = {
                .hdr = {
                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
@@ -442,7 +468,7 @@ mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
                },
                .rta = {
                        .rta_type = NDA_LLADDR,
-                       .rta_len = RTA_LENGTH(ETHER_ADDR_LEN),
+                       .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
                },
        };
        int fd;
@@ -452,7 +478,7 @@ mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
        if (priv->nl_socket_route == -1)
                return 0;
        fd = priv->nl_socket_route;
-       memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
+       memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
        req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
                RTA_ALIGN(req.rta.rta_len);
        ret = mlx5_nl_send(fd, &req.hdr, sn);
@@ -475,6 +501,94 @@ error:
        return -rte_errno;
 }
 
+/**
+ * Modify the VF MAC address neighbour table with Netlink.
+ *
+ * @param dev
+ *    Pointer to Ethernet device.
+ * @param mac
+ *    MAC address to consider.
+ * @param vf_index
+ *    VF index.
+ *
+ * @return
+ *    0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_vf_mac_addr_modify(struct rte_eth_dev *dev,
+                          struct rte_ether_addr *mac, int vf_index)
+{
+       int fd, ret;
+       struct mlx5_priv *priv = dev->data->dev_private;
+       unsigned int iface_idx = mlx5_ifindex(dev);
+       struct {
+               struct nlmsghdr hdr;
+               struct ifinfomsg ifm;
+               struct rtattr vf_list_rta;
+               struct rtattr vf_info_rta;
+               struct rtattr vf_mac_rta;
+               struct ifla_vf_mac ivm;
+       } req = {
+               .hdr = {
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+                       .nlmsg_type = RTM_BASE,
+               },
+               .ifm = {
+                       .ifi_index = iface_idx,
+               },
+               .vf_list_rta = {
+                       .rta_type = IFLA_VFINFO_LIST,
+                       .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
+               },
+               .vf_info_rta = {
+                       .rta_type = IFLA_VF_INFO,
+                       .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
+               },
+               .vf_mac_rta = {
+                       .rta_type = IFLA_VF_MAC,
+               },
+       };
+       uint32_t sn = priv->nl_sn++;
+       struct ifla_vf_mac ivm = {
+               .vf = vf_index,
+       };
+
+       memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
+       memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
+
+       req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
+       req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
+               RTA_ALIGN(req.vf_list_rta.rta_len) +
+               RTA_ALIGN(req.vf_info_rta.rta_len) +
+               RTA_ALIGN(req.vf_mac_rta.rta_len);
+       req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
+                                              &req.vf_list_rta);
+       req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
+                                              &req.vf_info_rta);
+
+       fd = priv->nl_socket_route;
+       if (fd < 0)
+               return -1;
+       ret = mlx5_nl_send(fd, &req.hdr, sn);
+       if (ret < 0)
+               goto error;
+       ret = mlx5_nl_recv(fd, sn, NULL, NULL);
+       if (ret < 0)
+               goto error;
+       return 0;
+error:
+       DRV_LOG(ERR,
+               "representor %u cannot set VF MAC address "
+               "%02X:%02X:%02X:%02X:%02X:%02X : %s",
+               vf_index,
+               mac->addr_bytes[0], mac->addr_bytes[1],
+               mac->addr_bytes[2], mac->addr_bytes[3],
+               mac->addr_bytes[4], mac->addr_bytes[5],
+               strerror(rte_errno));
+       return -rte_errno;
+}
+
 /**
  * Add a MAC address.
  *
@@ -489,7 +603,7 @@ error:
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
+mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
                     uint32_t index)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -517,7 +631,7 @@ mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
+mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
                        uint32_t index)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -535,7 +649,7 @@ mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
 void
 mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
 {
-       struct ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
+       struct rte_ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
        int macs_n = 0;
        int i;
        int ret;
@@ -548,14 +662,14 @@ mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
 
                /* Verify the address is not in the array yet. */
                for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
-                       if (is_same_ether_addr(&macs[i],
+                       if (rte_is_same_ether_addr(&macs[i],
                                               &dev->data->mac_addrs[j]))
                                break;
                if (j != MLX5_MAX_MAC_ADDRESSES)
                        continue;
                /* Find the first entry available. */
                for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
-                       if (is_zero_ether_addr(&dev->data->mac_addrs[j])) {
+                       if (rte_is_zero_ether_addr(&dev->data->mac_addrs[j])) {
                                dev->data->mac_addrs[j] = macs[i];
                                break;
                        }
@@ -576,7 +690,7 @@ mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
        int i;
 
        for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
-               struct ether_addr *m = &dev->data->mac_addrs[i];
+               struct rte_ether_addr *m = &dev->data->mac_addrs[i];
 
                if (BITFIELD_ISSET(priv->mac_own, i))
                        mlx5_nl_mac_addr_remove(dev, m, i);
@@ -689,13 +803,13 @@ mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg)
+mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 {
        struct mlx5_nl_ifindex_data *data = arg;
+       struct mlx5_nl_ifindex_data local = {
+               .flags = 0,
+       };
        size_t off = NLMSG_HDRLEN;
-       uint32_t ibindex = 0;
-       uint32_t ifindex = 0;
-       int found = 0;
 
        if (nh->nlmsg_type !=
            RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
@@ -710,23 +824,37 @@ mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg)
                        goto error;
                switch (na->nla_type) {
                case RDMA_NLDEV_ATTR_DEV_INDEX:
-                       ibindex = *(uint32_t *)payload;
+                       local.ibindex = *(uint32_t *)payload;
+                       local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
                        break;
                case RDMA_NLDEV_ATTR_DEV_NAME:
                        if (!strcmp(payload, data->name))
-                               found = 1;
+                               local.flags |= MLX5_NL_CMD_GET_IB_NAME;
                        break;
                case RDMA_NLDEV_ATTR_NDEV_INDEX:
-                       ifindex = *(uint32_t *)payload;
+                       local.ifindex = *(uint32_t *)payload;
+                       local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
+                       break;
+               case RDMA_NLDEV_ATTR_PORT_INDEX:
+                       local.portnum = *(uint32_t *)payload;
+                       local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
                        break;
                default:
                        break;
                }
                off += NLA_ALIGN(na->nla_len);
        }
-       if (found) {
-               data->ibindex = ibindex;
-               data->ifindex = ifindex;
+       /*
+        * It is possible to have multiple messages for all
+        * Infiniband devices in the system with appropriate name.
+        * So we should gather parameters locally and copy to
+        * query context only in case of coinciding device name.
+        */
+       if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
+               data->flags = local.flags;
+               data->ibindex = local.ibindex;
+               data->ifindex = local.ifindex;
+               data->portnum = local.portnum;
        }
        return 0;
 error:
@@ -745,18 +873,19 @@ error:
  *   Netlink socket of the RDMA kind (NETLINK_RDMA).
  * @param[in] name
  *   IB device name.
- *
+ * @param[in] pindex
+ *   IB device port index, starting from 1
  * @return
  *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
  *   is set.
  */
 unsigned int
-mlx5_nl_ifindex(int nl, const char *name)
+mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
 {
-       static const uint32_t pindex = 1;
        uint32_t seq = random();
        struct mlx5_nl_ifindex_data data = {
                .name = name,
+               .flags = 0,
                .ibindex = 0, /* Determined during first pass. */
                .ifindex = 0, /* Determined during second pass. */
        };
@@ -779,11 +908,13 @@ mlx5_nl_ifindex(int nl, const char *name)
        ret = mlx5_nl_send(nl, &req.nh, seq);
        if (ret < 0)
                return 0;
-       ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+       ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
        if (ret < 0)
                return 0;
-       if (!data.ibindex)
+       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
+           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
                goto error;
+       data.flags = 0;
        ++seq;
        req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
                                             RDMA_NLDEV_CMD_PORT_GET);
@@ -802,10 +933,13 @@ mlx5_nl_ifindex(int nl, const char *name)
        ret = mlx5_nl_send(nl, &req.nh, seq);
        if (ret < 0)
                return 0;
-       ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
+       ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
        if (ret < 0)
                return 0;
-       if (!data.ifindex)
+       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
+           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+           !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
+           !data.ifindex)
                goto error;
        return data.ifindex;
 error:
@@ -813,6 +947,53 @@ error:
        return 0;
 }
 
+/**
+ * Get the number of physical ports of given IB device.
+ *
+ * @param nl
+ *   Netlink socket of the RDMA kind (NETLINK_RDMA).
+ * @param[in] name
+ *   IB device name.
+ *
+ * @return
+ *   A valid (nonzero) number of ports on success, 0 otherwise
+ *   and rte_errno is set.
+ */
+unsigned int
+mlx5_nl_portnum(int nl, const char *name)
+{
+       uint32_t seq = random();
+       struct mlx5_nl_ifindex_data data = {
+               .flags = 0,
+               .name = name,
+               .ifindex = 0,
+               .portnum = 0,
+       };
+       struct nlmsghdr req = {
+               .nlmsg_len = NLMSG_LENGTH(0),
+               .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                              RDMA_NLDEV_CMD_GET),
+               .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+       };
+       int ret;
+
+       ret = mlx5_nl_send(nl, &req, seq);
+       if (ret < 0)
+               return 0;
+       ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
+       if (ret < 0)
+               return 0;
+       if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
+           !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
+           !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
+               rte_errno = ENODEV;
+               return 0;
+       }
+       if (!data.portnum)
+               rte_errno = EINVAL;
+       return data.portnum;
+}
+
 /**
  * Process switch information from Netlink message.
  *
@@ -830,13 +1011,13 @@ mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
        struct mlx5_switch_info info = {
                .master = 0,
                .representor = 0,
-               .port_name_new = 0,
+               .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
                .port_name = 0,
                .switch_id = 0,
        };
        size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-       bool port_name_set = false;
        bool switch_id_set = false;
+       bool num_vf_set = false;
 
        if (nh->nlmsg_type != RTM_NEWLINK)
                goto error;
@@ -848,10 +1029,11 @@ mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
                if (ra->rta_len > nh->nlmsg_len - off)
                        goto error;
                switch (ra->rta_type) {
+               case IFLA_NUM_VF:
+                       num_vf_set = true;
+                       break;
                case IFLA_PHYS_PORT_NAME:
-                       port_name_set =
-                               mlx5_translate_port_name((char *)payload,
-                                                        &info);
+                       mlx5_translate_port_name((char *)payload, &info);
                        break;
                case IFLA_PHYS_SWITCH_ID:
                        info.switch_id = 0;
@@ -864,8 +1046,11 @@ mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
                }
                off += RTA_ALIGN(ra->rta_len);
        }
-       info.master = switch_id_set && !port_name_set;
-       info.representor = switch_id_set && port_name_set;
+       if (switch_id_set) {
+               /* We have some E-Switch configuration. */
+               mlx5_nl_check_switch_info(num_vf_set, &info);
+       }
+       assert(!(info.master && info.representor));
        memcpy(arg, &info, sizeof(info));
        return 0;
 error:
@@ -893,9 +1078,13 @@ mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
        struct {
                struct nlmsghdr nh;
                struct ifinfomsg info;
+               struct rtattr rta;
+               uint32_t extmask;
        } req = {
                .nh = {
-                       .nlmsg_len = NLMSG_LENGTH(sizeof(req.info)),
+                       .nlmsg_len = NLMSG_LENGTH
+                                       (sizeof(req.info) +
+                                        RTA_LENGTH(sizeof(uint32_t))),
                        .nlmsg_type = RTM_GETLINK,
                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
                },
@@ -903,11 +1092,311 @@ mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
                        .ifi_family = AF_UNSPEC,
                        .ifi_index = ifindex,
                },
+               .rta = {
+                       .rta_type = IFLA_EXT_MASK,
+                       .rta_len = RTA_LENGTH(sizeof(int32_t)),
+               },
+               .extmask = RTE_LE32(1),
        };
        int ret;
 
        ret = mlx5_nl_send(nl, &req.nh, seq);
        if (ret >= 0)
                ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
+       if (info->master && info->representor) {
+               DRV_LOG(ERR, "ifindex %u device is recognized as master"
+                            " and as representor", ifindex);
+               rte_errno = ENODEV;
+               ret = -rte_errno;
+       }
+       return ret;
+}
+
+/*
+ * Delete VLAN network device by ifindex.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_vlan_vmwa_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete.
+ */
+static void
+mlx5_vlan_vmwa_delete(struct mlx5_vlan_vmwa_context *vmwa,
+                     uint32_t ifindex)
+{
+       int ret;
+       struct {
+               struct nlmsghdr nh;
+               struct ifinfomsg info;
+       } req = {
+               .nh = {
+                       .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+                       .nlmsg_type = RTM_DELLINK,
+                       .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+               },
+               .info = {
+                       .ifi_family = AF_UNSPEC,
+                       .ifi_index = ifindex,
+               },
+       };
+
+       if (ifindex) {
+               ++vmwa->nl_sn;
+               if (!vmwa->nl_sn)
+                       ++vmwa->nl_sn;
+               ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, vmwa->nl_sn);
+               if (ret >= 0)
+                       ret = mlx5_nl_recv(vmwa->nl_socket,
+                                          vmwa->nl_sn,
+                                          NULL, NULL);
+               if (ret < 0)
+                       DRV_LOG(WARNING, "netlink: error deleting"
+                                        " VLAN WA ifindex %u, %d",
+                                        ifindex, ret);
+       }
+}
+
+/* Set of subroutines to build Netlink message. */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+       return (struct nlattr *)
+               (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
+}
+
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+       struct nlattr *nla = nl_msg_tail(nlh);
+
+       nla->nla_type = type;
+       nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
+       nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
+
+       if (alen)
+               memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
+}
+
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+       struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
+
+       nl_attr_put(nlh, type, NULL, 0);
+       return nest;
+}
+
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{
+       nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_vlan_vmwa_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ */
+static uint32_t
+mlx5_vlan_vmwa_create(struct mlx5_vlan_vmwa_context *vmwa,
+                     uint32_t ifindex,
+                     uint16_t tag)
+{
+       struct nlmsghdr *nlh;
+       struct ifinfomsg *ifm;
+       char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
+
+       alignas(RTE_CACHE_LINE_SIZE)
+       uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+                   NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+                   NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+                   NLMSG_ALIGN(sizeof(uint32_t)) +
+                   NLMSG_ALIGN(sizeof(name)) +
+                   NLMSG_ALIGN(sizeof("vlan")) +
+                   NLMSG_ALIGN(sizeof(uint32_t)) +
+                   NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+       struct nlattr *na_info;
+       struct nlattr *na_vlan;
+       int ret;
+
+       memset(buf, 0, sizeof(buf));
+       ++vmwa->nl_sn;
+       if (!vmwa->nl_sn)
+               ++vmwa->nl_sn;
+       nlh = (struct nlmsghdr *)buf;
+       nlh->nlmsg_len = sizeof(struct nlmsghdr);
+       nlh->nlmsg_type = RTM_NEWLINK;
+       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+                          NLM_F_EXCL | NLM_F_ACK;
+       ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+       nlh->nlmsg_len += sizeof(struct ifinfomsg);
+       ifm->ifi_family = AF_UNSPEC;
+       ifm->ifi_type = 0;
+       ifm->ifi_index = 0;
+       ifm->ifi_flags = IFF_UP;
+       ifm->ifi_change = 0xffffffff;
+       nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+       ret = snprintf(name, sizeof(name), "%s.%u.%u",
+                      MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
+       nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+       na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+       nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+       na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+       nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+       nl_attr_nest_end(nlh, na_vlan);
+       nl_attr_nest_end(nlh, na_info);
+       assert(sizeof(buf) >= nlh->nlmsg_len);
+       ret = mlx5_nl_send(vmwa->nl_socket, nlh, vmwa->nl_sn);
+       if (ret >= 0)
+               ret = mlx5_nl_recv(vmwa->nl_socket, vmwa->nl_sn, NULL, NULL);
+       if (ret < 0) {
+               DRV_LOG(WARNING,
+                       "netlink: VLAN %s create failure (%d)",
+                       name, ret);
+       }
+       // Try to get ifindex of created or pre-existing device.
+       ret = if_nametoindex(name);
+       if (!ret) {
+               DRV_LOG(WARNING,
+                       "VLAN %s failed to get index (%d)",
+                       name, errno);
+               return 0;
+       }
        return ret;
 }
+
+/*
+ * Release VLAN network device, created for VM workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to release.
+ */
+void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
+                           struct mlx5_vf_vlan *vlan)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
+       struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
+
+       assert(vlan->created);
+       assert(priv->vmwa_context);
+       if (!vlan->created || !vmwa)
+               return;
+       vlan->created = 0;
+       assert(vlan_dev[vlan->tag].refcnt);
+       if (--vlan_dev[vlan->tag].refcnt == 0 &&
+           vlan_dev[vlan->tag].ifindex) {
+               mlx5_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
+               vlan_dev[vlan->tag].ifindex = 0;
+       }
+}
+
+/**
+ * Acquire VLAN interface with specified tag for VM workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to acquire.
+ */
+void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
+                           struct mlx5_vf_vlan *vlan)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
+       struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
+
+       assert(!vlan->created);
+       assert(priv->vmwa_context);
+       if (vlan->created || !vmwa)
+               return;
+       if (vlan_dev[vlan->tag].refcnt == 0) {
+               assert(!vlan_dev[vlan->tag].ifindex);
+               vlan_dev[vlan->tag].ifindex =
+                       mlx5_vlan_vmwa_create(vmwa,
+                                             vmwa->vf_ifindex,
+                                             vlan->tag);
+       }
+       if (vlan_dev[vlan->tag].ifindex) {
+               vlan_dev[vlan->tag].refcnt++;
+               vlan->created = 1;
+       }
+}
+
+/*
+ * Create per ethernet device VLAN VM workaround context
+ */
+struct mlx5_vlan_vmwa_context *
+mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
+                   uint32_t ifindex)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_dev_config *config = &priv->config;
+       struct mlx5_vlan_vmwa_context *vmwa;
+       enum rte_hypervisor hv_type;
+
+       /* Do not engage workaround over PF. */
+       if (!config->vf)
+               return NULL;
+       /* Check whether there is desired virtual environment */
+       hv_type = rte_hypervisor_get();
+       switch (hv_type) {
+       case RTE_HYPERVISOR_UNKNOWN:
+       case RTE_HYPERVISOR_VMWARE:
+               /*
+                * The "white list" of configurations
+                * to engage the workaround.
+                */
+               break;
+       default:
+               /*
+                * The configuration is not found in the "white list".
+                * We should not engage the VLAN workaround.
+                */
+               return NULL;
+       }
+       vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
+       if (!vmwa) {
+               DRV_LOG(WARNING,
+                       "Can not allocate memory"
+                       " for VLAN workaround context");
+               return NULL;
+       }
+       vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+       if (vmwa->nl_socket < 0) {
+               DRV_LOG(WARNING,
+                       "Can not create Netlink socket"
+                       " for VLAN workaround context");
+               rte_free(vmwa);
+               return NULL;
+       }
+       vmwa->nl_sn = random();
+       vmwa->vf_ifindex = ifindex;
+       vmwa->dev = dev;
+       /* Cleanup for existing VLAN devices. */
+       return vmwa;
+}
+
+/*
+ * Destroy per ethernet device VLAN VM workaround context
+ */
+void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *vmwa)
+{
+       unsigned int i;
+
+       /* Delete all remaining VLAN devices. */
+       for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
+               if (vmwa->vlan_dev[i].ifindex)
+                       mlx5_vlan_vmwa_delete(vmwa, vmwa->vlan_dev[i].ifindex);
+       }
+       if (vmwa->nl_socket >= 0)
+               close(vmwa->nl_socket);
+       rte_free(vmwa);
+}