1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/netlink.h>
9 #include <linux/rtnetlink.h>
11 #include <rdma/rdma_netlink.h>
17 #include <sys/socket.h>
20 #include <rte_errno.h>
21 #include <rte_malloc.h>
22 #include <rte_hypervisor.h>
25 #include "mlx5_utils.h"
27 /* Size of the buffer to receive kernel messages */
28 #define MLX5_NL_BUF_SIZE (32 * 1024)
29 /* Send buffer size for the Netlink socket */
30 #define MLX5_SEND_BUF_SIZE 32768
31 /* Receive buffer size for the Netlink socket */
32 #define MLX5_RECV_BUF_SIZE 32768
34 /** Parameters of VLAN devices created by driver. */
35 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
37 * Define NDA_RTA as defined in iproute2 sources.
39 * see in iproute2 sources file include/libnetlink.h
42 #define MLX5_NDA_RTA(r) \
43 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
47 * The following definitions are normally found in rdma/rdma_netlink.h,
48 * however they are so recent that most systems do not expose them yet.
50 #ifndef HAVE_RDMA_NL_NLDEV
51 #define RDMA_NL_NLDEV 5
53 #ifndef HAVE_RDMA_NLDEV_CMD_GET
54 #define RDMA_NLDEV_CMD_GET 1
56 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
57 #define RDMA_NLDEV_CMD_PORT_GET 5
59 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
60 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
62 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
63 #define RDMA_NLDEV_ATTR_DEV_NAME 2
65 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
66 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
68 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
69 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
72 /* These are normally found in linux/if_link.h. */
73 #ifndef HAVE_IFLA_NUM_VF
74 #define IFLA_NUM_VF 21
76 #ifndef HAVE_IFLA_EXT_MASK
77 #define IFLA_EXT_MASK 29
79 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
80 #define IFLA_PHYS_SWITCH_ID 36
82 #ifndef HAVE_IFLA_PHYS_PORT_NAME
83 #define IFLA_PHYS_PORT_NAME 38
86 /* Add/remove MAC address through Netlink */
/*
 * Accumulator handed to mlx5_nl_mac_addr_cb() as its opaque argument: the
 * callback copies each NDA_LLADDR payload into (*mac)[mac_n] and then
 * increments mac_n, after comparing mac_n with MLX5_MAX_MAC_ADDRESSES.
 */
87 struct mlx5_nl_mac_addr {
88 struct rte_ether_addr (*mac)[];
89 /**< MAC address handled by the device. */
90 int mac_n; /**< Number of addresses in the array. */
93 /** Data structure used by mlx5_nl_cmdget_cb(). */
/*
 * Shared between mlx5_nl_ifindex(), mlx5_nl_portnum() and the parser
 * mlx5_nl_cmdget_cb(): the caller provides the structure and the callback
 * writes ibindex, ifindex and portnum from the attributes it parsed.
 */
94 struct mlx5_nl_ifindex_data {
95 const char *name; /**< IB device name (in). */
96 uint32_t ibindex; /**< IB device index (out). */
97 uint32_t ifindex; /**< Network interface index (out). */
98 uint32_t portnum; /**< IB device max port number. */
102 * Opens a Netlink socket.
105 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
108 * A file descriptor on success, a negative errno value otherwise and
/*
 * Create and bind a Netlink socket for the given protocol.
 *
 * The socket is opened as SOCK_RAW | SOCK_CLOEXEC, its SO_SNDBUF and
 * SO_RCVBUF options are set from MLX5_SEND_BUF_SIZE and
 * MLX5_RECV_BUF_SIZE, and it is bound to a local AF_NETLINK address.
 * NOTE(review): the handling of each call's failure is not visible here;
 * confirm that the descriptor is closed on every error path.
 */
112 mlx5_nl_init(int protocol)
115 int sndbuf_size = MLX5_SEND_BUF_SIZE;
116 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
117 struct sockaddr_nl local = {
118 .nl_family = AF_NETLINK,
122 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
127 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
132 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
137 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
149 * Send a request message to the kernel on the Netlink socket.
152 * Netlink socket file descriptor.
154 * The Netlink message sent to the kernel.
158 * Pointer to the request structure.
160 * Length of the request in bytes.
163 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Send a Netlink header and a separately stored payload with one sendmsg().
 *
 * The message is gathered from two iovec segments: the header `nh`
 * (sizeof(*nh) bytes) followed by `len` bytes taken from `req`. The
 * destination is an AF_NETLINK address and nlmsg_pid is forced to 0.
 * NOTE(review): `sn` is presumably stored as the message sequence number;
 * the assignment is not visible here — confirm.
 */
167 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
170 struct sockaddr_nl sa = {
171 .nl_family = AF_NETLINK,
173 struct iovec iov[2] = {
/* Segment 0 is the bare header, segment 1 the request payload. */
174 { .iov_base = nh, .iov_len = sizeof(*nh), },
175 { .iov_base = req, .iov_len = len, },
177 struct msghdr msg = {
179 .msg_namelen = sizeof(sa),
185 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
187 send_bytes = sendmsg(nlsk_fd, &msg, 0);
188 if (send_bytes < 0) {
196 * Send a message to the kernel on the Netlink socket.
199 * The Netlink socket file descriptor used for communication.
201 * The Netlink message sent to the kernel.
206 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Send an already assembled Netlink message with one sendmsg().
 *
 * A single iovec of nh->nlmsg_len bytes is transmitted, so the caller
 * must have set nlmsg_len to the full message size (header, payload and
 * attributes). The destination is an AF_NETLINK address and nlmsg_pid is
 * forced to 0.
 * NOTE(review): `sn` is presumably stored as the message sequence number;
 * the assignment is not visible here — confirm.
 */
210 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
212 struct sockaddr_nl sa = {
213 .nl_family = AF_NETLINK,
217 .iov_len = nh->nlmsg_len,
219 struct msghdr msg = {
221 .msg_namelen = sizeof(sa),
227 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
229 send_bytes = sendmsg(nlsk_fd, &msg, 0);
230 if (send_bytes < 0) {
238 * Receive a message from the kernel on the Netlink socket, following
242 * The Netlink socket file descriptor used for communication.
246 * The callback function to call for each Netlink message received.
247 * @param[in, out] arg
248 * Custom arguments for the callback.
251 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Receive and process the kernel's reply to the request numbered `sn`.
 *
 * Datagrams are read with recvmsg() into a local buffer until one whose
 * first header carries nlmsg_seq == sn arrives; each new read overwrites
 * the previous one. The messages of the matching datagram are then walked
 * with NLMSG_OK()/NLMSG_NEXT(): an NLMSG_ERROR carrying a negative error
 * code stores the positive errno value in rte_errno, and multi-part
 * messages are recognised through NLM_F_MULTI and NLMSG_DONE. `cb` is the
 * per-message callback; callers in this file pass NULL when only the
 * acknowledgement matters.
 */
254 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
257 struct sockaddr_nl sa;
/* NOTE(review): MLX5_RECV_BUF_SIZE (32768) bytes of automatic storage. */
258 char buf[MLX5_RECV_BUF_SIZE];
261 .iov_len = sizeof(buf),
263 struct msghdr msg = {
265 .msg_namelen = sizeof(sa),
267 /* One message at a time */
278 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
279 if (recv_bytes == -1) {
283 nh = (struct nlmsghdr *)buf;
/* Keep reading until the reply to our own request shows up. */
284 } while (nh->nlmsg_seq != sn);
286 NLMSG_OK(nh, (unsigned int)recv_bytes);
287 nh = NLMSG_NEXT(nh, recv_bytes)) {
288 if (nh->nlmsg_type == NLMSG_ERROR) {
289 struct nlmsgerr *err_data = NLMSG_DATA(nh);
/* The kernel reports errors as negative errno values. */
291 if (err_data->error < 0) {
292 rte_errno = -err_data->error;
298 /* Multi-part msgs and their trailing DONE message. */
299 if (nh->nlmsg_flags & NLM_F_MULTI) {
300 if (nh->nlmsg_type == NLMSG_DONE)
315 * Parse Netlink message to retrieve the bridge MAC address.
318 * Pointer to Netlink Message Header.
320 * PMD data register with this callback.
323 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Netlink receive callback collecting link-layer addresses from a
 * neighbour message.
 *
 * Walks the rtattr list that follows the struct ndmsg header. For each
 * NDA_LLADDR attribute the table is first checked for being full
 * (mac_n == MLX5_MAX_MAC_ADDRESSES, reported through a log message);
 * the address is logged at DEBUG level and RTE_ETHER_ADDR_LEN bytes are
 * copied into the next slot of data->mac, post-incrementing data->mac_n.
 */
326 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
328 struct mlx5_nl_mac_addr *data = arg;
329 struct ndmsg *r = NLMSG_DATA(nh);
330 struct rtattr *attribute;
/* Bytes of attributes left once the ndmsg header is accounted for. */
333 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
334 for (attribute = MLX5_NDA_RTA(r);
335 RTA_OK(attribute, len);
336 attribute = RTA_NEXT(attribute, len)) {
337 if (attribute->rta_type == NDA_LLADDR) {
338 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
340 "not enough room to finalize the"
348 rte_ether_format_addr(m, 18, RTA_DATA(attribute));
349 DRV_LOG(DEBUG, "bridge MAC address %s", m);
351 memcpy(&(*data->mac)[data->mac_n++],
352 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
359 * Get bridge MAC addresses.
362 * Pointer to Ethernet device.
364 * Pointer to the array table of MAC addresses to fill.
365 * It must have room for MLX5_MAX_MAC_ADDRESSES entries.
367 * Number of entries filled in MAC array.
370 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Retrieve the bridge MAC addresses of the port's network interface.
 *
 * Builds an RTM_GETNEIGH request flagged NLM_F_DUMP | NLM_F_REQUEST whose
 * ifinfomsg selects family PF_BRIDGE and the index returned by
 * mlx5_ifindex(dev), sends it on priv->nl_socket_route and lets
 * mlx5_nl_mac_addr_cb() gather the answers into `data`. Each call consumes
 * one sequence number from priv->nl_sn. A failure is logged at DEBUG level
 * together with strerror(rte_errno).
 */
373 mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr (*mac)[],
376 struct mlx5_priv *priv = dev->data->dev_private;
377 unsigned int iface_idx = mlx5_ifindex(dev);
380 struct ifinfomsg ifm;
383 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
384 .nlmsg_type = RTM_GETNEIGH,
385 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
388 .ifi_family = PF_BRIDGE,
389 .ifi_index = iface_idx,
392 struct mlx5_nl_mac_addr data = {
398 uint32_t sn = priv->nl_sn++;
400 if (priv->nl_socket_route == -1)
402 fd = priv->nl_socket_route;
/* Header and ifinfomsg payload are passed as separate segments. */
403 ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
404 sizeof(struct ifinfomsg));
407 ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data);
413 DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
414 dev->data->port_id, strerror(rte_errno));
419 * Modify the MAC address neighbour table with Netlink.
422 * Pointer to Ethernet device.
424 * MAC address to consider.
426 * 1 to add the MAC address, 0 to remove the MAC address.
429 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Add or remove one MAC address in the interface's bridge neighbour table.
 *
 * The request is RTM_NEWNEIGH when `add` is nonzero and RTM_DELNEIGH
 * otherwise, for family PF_BRIDGE with NTF_SELF and state
 * NUD_NOARP | NUD_PERMANENT, and carries `mac` in a single NDA_LLADDR
 * attribute. NLM_F_ACK is set, and the reply is awaited through
 * mlx5_nl_recv() without a callback. On failure the address and
 * strerror(rte_errno) are logged.
 */
432 mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
435 struct mlx5_priv *priv = dev->data->dev_private;
436 unsigned int iface_idx = mlx5_ifindex(dev);
441 uint8_t buffer[RTE_ETHER_ADDR_LEN];
444 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
445 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
446 NLM_F_EXCL | NLM_F_ACK,
447 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
450 .ndm_family = PF_BRIDGE,
451 .ndm_state = NUD_NOARP | NUD_PERMANENT,
452 .ndm_ifindex = iface_idx,
453 .ndm_flags = NTF_SELF,
456 .rta_type = NDA_LLADDR,
457 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
462 uint32_t sn = priv->nl_sn++;
464 if (priv->nl_socket_route == -1)
466 fd = priv->nl_socket_route;
467 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
/* Extend the total length so that it covers the aligned attribute. */
468 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
469 RTA_ALIGN(req.rta.rta_len);
470 ret = mlx5_nl_send(fd, &req.hdr, sn);
/* No callback: only the kernel's acknowledgement is of interest. */
473 ret = mlx5_nl_recv(fd, sn, NULL, NULL);
479 "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
482 add ? "add" : "remove",
483 mac->addr_bytes[0], mac->addr_bytes[1],
484 mac->addr_bytes[2], mac->addr_bytes[3],
485 mac->addr_bytes[4], mac->addr_bytes[5],
486 strerror(rte_errno));
494 * Pointer to Ethernet device.
496 * MAC address to register.
501 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Register a MAC address through mlx5_nl_mac_addr_modify(dev, mac, 1) and
 * set bit `index` in priv->mac_own, the bitfield that
 * mlx5_nl_mac_addr_flush() later consults.
 * NOTE(review): the condition guarding BITFIELD_SET() is not visible here;
 * presumably the bit is set only when the request succeeded — confirm.
 */
504 mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
507 struct mlx5_priv *priv = dev->data->dev_private;
510 ret = mlx5_nl_mac_addr_modify(dev, mac, 1);
512 BITFIELD_SET(priv->mac_own, index);
519 * Remove a MAC address.
522 * Pointer to Ethernet device.
524 * MAC address to remove.
529 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Remove a MAC address through Netlink.
 *
 * Bit `index` of priv->mac_own is cleared before the request is issued, so
 * it ends up cleared whatever mlx5_nl_mac_addr_modify() returns; that
 * return value is passed straight back to the caller.
 */
532 mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
535 struct mlx5_priv *priv = dev->data->dev_private;
537 BITFIELD_RESET(priv->mac_own, index);
538 return mlx5_nl_mac_addr_modify(dev, mac, 0);
542 * Synchronize Netlink bridge table to the internal table.
545 * Pointer to Ethernet device.
/*
 * Import bridge MAC addresses into dev->data->mac_addrs.
 *
 * The addresses are fetched with mlx5_nl_mac_addr_list() into a local
 * table of MLX5_MAX_MAC_ADDRESSES entries. Each fetched address is looked
 * up in the device table; one that is not there yet is stored in the
 * first all-zero entry.
 */
548 mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
550 struct rte_ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
555 ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n);
558 for (i = 0; i != macs_n; ++i) {
561 /* Verify the address is not in the array yet. */
562 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
563 if (rte_is_same_ether_addr(&macs[i],
564 &dev->data->mac_addrs[j]))
/* j stopped short of the end: the address was found above. */
566 if (j != MLX5_MAX_MAC_ADDRESSES)
568 /* Find the first entry available. */
569 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
570 if (rte_is_zero_ether_addr(&dev->data->mac_addrs[j])) {
571 dev->data->mac_addrs[j] = macs[i];
579 * Flush all added MAC addresses.
582 * Pointer to Ethernet device.
/*
 * Remove every MAC address whose bit is set in priv->mac_own.
 *
 * Indexes are visited from MLX5_MAX_MAC_ADDRESSES - 1 down to 0 and
 * mlx5_nl_mac_addr_remove() is called for each flagged entry; its return
 * value is not checked.
 */
585 mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
587 struct mlx5_priv *priv = dev->data->dev_private;
590 for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
591 struct rte_ether_addr *m = &dev->data->mac_addrs[i];
593 if (BITFIELD_ISSET(priv->mac_own, i))
594 mlx5_nl_mac_addr_remove(dev, m, i);
599 * Enable promiscuous / all multicast mode through Netlink.
602 * Pointer to Ethernet device structure.
604 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
606 * Nonzero to enable, disable otherwise.
609 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Issue an RTM_NEWLINK request that sets or clears interface flags.
 *
 * ifi_flags carries `flags` when `enable` is nonzero and 0 otherwise, for
 * the interface index returned by mlx5_ifindex(dev). Only IFF_PROMISC and
 * IFF_ALLMULTI are accepted (asserted). The request is sent on
 * priv->nl_socket_route with the next priv->nl_sn sequence number; its
 * flags are NLM_F_REQUEST only, i.e. no acknowledgement is requested.
 */
612 mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
614 struct mlx5_priv *priv = dev->data->dev_private;
615 unsigned int iface_idx = mlx5_ifindex(dev);
618 struct ifinfomsg ifi;
621 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
622 .nlmsg_type = RTM_NEWLINK,
623 .nlmsg_flags = NLM_F_REQUEST,
626 .ifi_flags = enable ? flags : 0,
628 .ifi_index = iface_idx,
634 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
635 if (priv->nl_socket_route < 0)
637 fd = priv->nl_socket_route;
638 ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
645 * Enable promiscuous mode through Netlink.
648 * Pointer to Ethernet device structure.
650 * Nonzero to enable, disable otherwise.
653 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Toggle promiscuous mode: forwards to mlx5_nl_device_flags() with
 * IFF_PROMISC and logs the port, the attempted action and
 * strerror(rte_errno) when reporting a failure.
 */
656 mlx5_nl_promisc(struct rte_eth_dev *dev, int enable)
658 int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable);
662 "port %u cannot %s promisc mode: Netlink error %s",
663 dev->data->port_id, enable ? "enable" : "disable",
664 strerror(rte_errno));
669 * Enable all multicast mode through Netlink.
672 * Pointer to Ethernet device structure.
674 * Nonzero to enable, disable otherwise.
677 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Toggle all-multicast mode: forwards to mlx5_nl_device_flags() with
 * IFF_ALLMULTI and logs the port, the attempted action and
 * strerror(rte_errno) when reporting a failure.
 */
680 mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
682 int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable);
686 "port %u cannot %s allmulti mode: Netlink error %s",
687 dev->data->port_id, enable ? "enable" : "disable",
688 strerror(rte_errno));
693 * Process network interface information from Netlink message.
696 * Pointer to Netlink message header.
698 * Opaque data pointer for this callback.
701 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Netlink receive callback parsing RDMA_NLDEV_CMD_GET and
 * RDMA_NLDEV_CMD_PORT_GET replies.
 *
 * The message type is checked against those two commands first. The
 * attributes following the Netlink header are then walked one by one: the
 * device index, network device index and port index are read as 32-bit
 * values, and the device name is compared against data->name. The
 * collected values are finally stored in `data`.
 * NOTE(review): the effect of a name match and the condition under which
 * `data` is updated are not visible here — confirm.
 */
704 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
706 struct mlx5_nl_ifindex_data *data = arg;
707 size_t off = NLMSG_HDRLEN;
708 uint32_t ibindex = 0;
709 uint32_t ifindex = 0;
710 uint32_t portnum = 0;
713 if (nh->nlmsg_type !=
714 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
716 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
718 while (off < nh->nlmsg_len) {
719 struct nlattr *na = (void *)((uintptr_t)nh + off);
720 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
/* An attribute must not claim more bytes than the message has left. */
722 if (na->nla_len > nh->nlmsg_len - off)
724 switch (na->nla_type) {
725 case RDMA_NLDEV_ATTR_DEV_INDEX:
726 ibindex = *(uint32_t *)payload;
728 case RDMA_NLDEV_ATTR_DEV_NAME:
/* NOTE(review): strcmp() relies on the payload being NUL-terminated. */
729 if (!strcmp(payload, data->name))
732 case RDMA_NLDEV_ATTR_NDEV_INDEX:
733 ifindex = *(uint32_t *)payload;
735 case RDMA_NLDEV_ATTR_PORT_INDEX:
736 portnum = *(uint32_t *)payload;
/* NOTE(review): a zero nla_len would leave off unchanged — confirm it cannot occur. */
741 off += NLA_ALIGN(na->nla_len);
744 data->ibindex = ibindex;
745 data->ifindex = ifindex;
746 data->portnum = portnum;
755 * Get index of network interface associated with some IB device.
757 * This is the only somewhat safe method to avoid resorting to heuristics
758 * when faced with port representors. Unfortunately it requires at least
762 * Netlink socket of the RDMA kind (NETLINK_RDMA).
766 * IB device port index, starting from 1
768 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
/*
 * Resolve the network interface index behind port `pindex` of IB device
 * `name` using two RDMA Netlink exchanges.
 *
 * First pass: a dump request (NLM_F_DUMP) whose replies are parsed by
 * mlx5_nl_cmdget_cb() to determine data.ibindex. Second pass: an
 * RDMA_NLDEV_CMD_PORT_GET request carrying RDMA_NLDEV_ATTR_DEV_INDEX
 * (data.ibindex) and RDMA_NLDEV_ATTR_PORT_INDEX (pindex), parsed by the
 * same callback to determine data.ifindex. Both passes use the same
 * sequence number, drawn once from random().
 */
772 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
774 uint32_t seq = random();
775 struct mlx5_nl_ifindex_data data = {
777 .ibindex = 0, /* Determined during first pass. */
778 .ifindex = 0, /* Determined during second pass. */
/* Room for the header plus two attributes holding one 32-bit value each. */
782 uint8_t buf[NLMSG_HDRLEN +
783 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
784 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
787 .nlmsg_len = NLMSG_LENGTH(0),
788 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
790 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
796 ret = mlx5_nl_send(nl, &req.nh, seq);
799 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
/* Second pass: reuse the request buffer for a port query. */
805 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
806 RDMA_NLDEV_CMD_PORT_GET);
807 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
808 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
/* First attribute: the IB device index found by the first pass. */
809 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
810 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
811 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
812 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
813 &data.ibindex, sizeof(data.ibindex));
/* Second attribute: the requested port index. */
814 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
815 na->nla_len = NLA_HDRLEN + sizeof(pindex);
816 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
817 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
818 &pindex, sizeof(pindex));
819 ret = mlx5_nl_send(nl, &req.nh, seq);
822 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
834 * Get the number of physical ports of given IB device.
837 * Netlink socket of the RDMA kind (NETLINK_RDMA).
842 * A valid (nonzero) number of ports on success, 0 otherwise
843 * and rte_errno is set.
/*
 * Query the number of ports of IB device `name`.
 *
 * Sends a header-only dump request (NLM_F_DUMP, NLMSG_LENGTH(0)) with a
 * random() sequence number and lets mlx5_nl_cmdget_cb() parse the replies
 * into `data`.
 * NOTE(review): the returned value is presumably data.portnum; the return
 * statement is not visible here — confirm.
 */
846 mlx5_nl_portnum(int nl, const char *name)
848 uint32_t seq = random();
849 struct mlx5_nl_ifindex_data data = {
855 struct nlmsghdr req = {
856 .nlmsg_len = NLMSG_LENGTH(0),
857 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
859 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
863 ret = mlx5_nl_send(nl, &req, seq);
866 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
879 * Process switch information from Netlink message.
882 * Pointer to Netlink message header.
884 * Opaque data pointer for this callback.
887 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Netlink receive callback extracting switch information from an
 * RTM_NEWLINK message.
 *
 * The rtattr list that follows the struct ifinfomsg header is walked:
 * IFLA_PHYS_PORT_NAME is handed to mlx5_translate_port_name() and
 * IFLA_PHYS_SWITCH_ID is folded byte by byte into info.switch_id. The
 * result is post-processed by mlx5_nl_check_switch_info() and copied into
 * the object `arg` points to, which must be a struct mlx5_switch_info.
 */
890 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
892 struct mlx5_switch_info info = {
895 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
899 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
900 bool switch_id_set = false;
901 bool num_vf_set = false;
903 if (nh->nlmsg_type != RTM_NEWLINK)
905 while (off < nh->nlmsg_len) {
906 struct rtattr *ra = (void *)((uintptr_t)nh + off);
907 void *payload = RTA_DATA(ra);
/* An attribute must not claim more bytes than the message has left. */
910 if (ra->rta_len > nh->nlmsg_len - off)
912 switch (ra->rta_type) {
916 case IFLA_PHYS_PORT_NAME:
917 mlx5_translate_port_name((char *)payload, &info);
919 case IFLA_PHYS_SWITCH_ID:
/* Accumulate the ID with the first payload byte as most significant. */
921 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
922 info.switch_id <<= 8;
923 info.switch_id |= ((uint8_t *)payload)[i];
925 switch_id_set = true;
928 off += RTA_ALIGN(ra->rta_len);
931 /* We have some E-Switch configuration. */
932 mlx5_nl_check_switch_info(num_vf_set, &info);
934 assert(!(info.master && info.representor));
935 memcpy(arg, &info, sizeof(info));
943 * Get switch information associated with network interface.
946 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
948 * Network interface index.
950 * Switch information object, populated in case of success.
953 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Query switch information for interface `ifindex` with RTM_GETLINK.
 *
 * The request carries an IFLA_EXT_MASK attribute whose 32-bit value is
 * RTE_LE32(1) and asks for an acknowledgement. The reply is parsed by
 * mlx5_nl_switch_info_cb() directly into `info`. A result flagged both as
 * master and as representor is logged as an error.
 */
956 mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
958 uint32_t seq = random();
961 struct ifinfomsg info;
966 .nlmsg_len = NLMSG_LENGTH
968 RTA_LENGTH(sizeof(uint32_t))),
969 .nlmsg_type = RTM_GETLINK,
970 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
973 .ifi_family = AF_UNSPEC,
974 .ifi_index = ifindex,
977 .rta_type = IFLA_EXT_MASK,
/* NOTE(review): int32_t here, uint32_t in nlmsg_len above; same size, inconsistent spelling. */
978 .rta_len = RTA_LENGTH(sizeof(int32_t)),
980 .extmask = RTE_LE32(1),
984 ret = mlx5_nl_send(nl, &req.nh, seq);
986 ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
987 if (info->master && info->representor) {
988 DRV_LOG(ERR, "ifindex %u device is recognized as master"
989 " and as representor", ifindex);
997 * Delete VLAN network device by ifindex.
1000 * Context object initialized by mlx5_vlan_vmwa_init().
1001 * @param[in] ifindex
1002 * Interface index of network device to delete.
/*
 * Delete the network device `ifindex` with an RTM_DELLINK request.
 *
 * The request is sent on vmwa->nl_socket using vmwa->nl_sn as sequence
 * number and asks for an acknowledgement, which is awaited through
 * mlx5_nl_recv(). A failure is only reported with a WARNING log.
 */
1005 mlx5_vlan_vmwa_delete(struct mlx5_vlan_vmwa_context *vmwa,
1011 struct ifinfomsg info;
1014 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1015 .nlmsg_type = RTM_DELLINK,
1016 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1019 .ifi_family = AF_UNSPEC,
1020 .ifi_index = ifindex,
1028 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, vmwa->nl_sn);
1030 ret = mlx5_nl_recv(vmwa->nl_socket,
1034 DRV_LOG(WARNING, "netlink: error deleting"
1035 " VLAN WA ifindex %u, %d",
1040 /* Set of subroutines to build Netlink message. */
/*
 * Return the position where the next attribute of message `nlh` goes:
 * the message start plus its current length rounded up by NLMSG_ALIGN().
 */
1041 static struct nlattr *
1042 nl_msg_tail(struct nlmsghdr *nlh)
1044 return (struct nlattr *)
1045 (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
/*
 * Append one attribute of type `type` with `alen` payload bytes copied
 * from `data` at the tail of message `nlh`.
 *
 * nla_len is stored rounded up by NLMSG_ALIGN(), so it includes padding,
 * and nlmsg_len grows by that amount. The caller must guarantee that the
 * buffer behind `nlh` is large enough; no bound is checked here.
 * NOTE(review): nl_attr_nest_start() calls this with data == NULL and
 * alen == 0; confirm that the memcpy() below is guarded for that case.
 */
1049 nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
1051 struct nlattr *nla = nl_msg_tail(nlh);
1053 nla->nla_type = type;
1054 nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
1055 nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
1058 memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
/*
 * Open a nested attribute of type `type`: remember the current tail as
 * the nest header and append an attribute with an empty payload there.
 * The nest is closed later with nl_attr_nest_end().
 */
1061 static struct nlattr *
1062 nl_attr_nest_start(struct nlmsghdr *nlh, int type)
1064 struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
1066 nl_attr_put(nlh, type, NULL, 0);
/*
 * Close a nested attribute: its length becomes the distance from the nest
 * header to the current message tail, i.e. it covers everything appended
 * since nl_attr_nest_start().
 */
1071 nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
1073 nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
1077 * Create network VLAN device with specified VLAN tag.
1080 * Context object initialized by mlx5_vlan_vmwa_init().
1081 * @param[in] ifindex
1082 * Base network interface index.
1084 * VLAN tag for VLAN network device to create.
/*
 * Create a VLAN network device on top of interface `ifindex`.
 *
 * An RTM_NEWLINK message is assembled in a zeroed local buffer with the
 * nl_attr_*() helpers: IFLA_LINK (base ifindex), IFLA_IFNAME, and a nested
 * IFLA_LINKINFO holding IFLA_INFO_KIND "vlan" and a nested IFLA_INFO_DATA
 * with IFLA_VLAN_ID. It is sent on vmwa->nl_socket with vmwa->nl_sn and
 * the acknowledgement is awaited. The device index is then looked up by
 * name with if_nametoindex().
 */
1087 mlx5_vlan_vmwa_create(struct mlx5_vlan_vmwa_context *vmwa,
1091 struct nlmsghdr *nlh;
1092 struct ifinfomsg *ifm;
1093 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
/* Sized for the header, ifinfomsg, eight attribute headers and payloads. */
1095 alignas(RTE_CACHE_LINE_SIZE)
1096 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1097 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1098 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1099 NLMSG_ALIGN(sizeof(uint32_t)) +
1100 NLMSG_ALIGN(sizeof(name)) +
1101 NLMSG_ALIGN(sizeof("vlan")) +
1102 NLMSG_ALIGN(sizeof(uint32_t)) +
1103 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1104 struct nlattr *na_info;
1105 struct nlattr *na_vlan;
1108 memset(buf, 0, sizeof(buf));
1112 nlh = (struct nlmsghdr *)buf;
1113 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1114 nlh->nlmsg_type = RTM_NEWLINK;
1115 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1116 NLM_F_EXCL | NLM_F_ACK;
/* The ifinfomsg payload sits right after the Netlink header. */
1117 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1118 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1119 ifm->ifi_family = AF_UNSPEC;
1122 ifm->ifi_flags = IFF_UP;
1123 ifm->ifi_change = 0xffffffff;
1124 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
/* Device name is "<prefix>.<base ifindex>.<tag>". */
1125 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1126 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
/* ret + 1 so that the terminating NUL is part of the attribute. */
1127 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1128 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1129 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1130 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1131 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
/* Nests are closed innermost first. */
1132 nl_attr_nest_end(nlh, na_vlan);
1133 nl_attr_nest_end(nlh, na_info);
1134 assert(sizeof(buf) >= nlh->nlmsg_len);
1135 ret = mlx5_nl_send(vmwa->nl_socket, nlh, vmwa->nl_sn);
1137 ret = mlx5_nl_recv(vmwa->nl_socket, vmwa->nl_sn, NULL, NULL);
1140 "netlink: VLAN %s create failure (%d)",
1143 // Try to get ifindex of created or pre-existing device.
1144 ret = if_nametoindex(name);
1147 "VLAN %s failed to get index (%d)",
1155 * Release VLAN network device, created for VM workaround.
1158 * Ethernet device object, Netlink context provider.
1160 * Object representing the network device to release.
/*
 * Drop one reference on the VLAN device matching vlan->tag.
 *
 * Nothing is done unless vlan->created is set and a workaround context
 * exists. When the per-tag reference count reaches zero and a device
 * index is recorded, the device is deleted with mlx5_vlan_vmwa_delete()
 * and the recorded index is reset to 0.
 */
1162 void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
1163 struct mlx5_vf_vlan *vlan)
1165 struct mlx5_priv *priv = dev->data->dev_private;
1166 struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
/* NOTE(review): derived from vmwa before the !vmwa test below. */
1167 struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
1169 assert(vlan->created);
1170 assert(priv->vmwa_context);
1171 if (!vlan->created || !vmwa)
1174 assert(vlan_dev[vlan->tag].refcnt);
1175 if (--vlan_dev[vlan->tag].refcnt == 0 &&
1176 vlan_dev[vlan->tag].ifindex) {
1177 mlx5_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
1178 vlan_dev[vlan->tag].ifindex = 0;
1183 * Acquire VLAN interface with specified tag for VM workaround.
1186 * Ethernet device object, Netlink context provider.
1188 * Object representing the network device to acquire.
/*
 * Take one reference on the VLAN device matching vlan->tag, creating the
 * device first when the tag has no reference yet.
 *
 * Nothing is done when vlan->created is already set or no workaround
 * context exists. The reference count is incremented only when a device
 * index is recorded for the tag, i.e. not after a failed creation.
 */
1190 void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
1191 struct mlx5_vf_vlan *vlan)
1193 struct mlx5_priv *priv = dev->data->dev_private;
1194 struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
/* NOTE(review): derived from vmwa before the !vmwa test below. */
1195 struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
1197 assert(!vlan->created);
1198 assert(priv->vmwa_context);
1199 if (vlan->created || !vmwa)
1201 if (vlan_dev[vlan->tag].refcnt == 0) {
1202 assert(!vlan_dev[vlan->tag].ifindex);
1203 vlan_dev[vlan->tag].ifindex =
1204 mlx5_vlan_vmwa_create(vmwa,
1208 if (vlan_dev[vlan->tag].ifindex) {
1209 vlan_dev[vlan->tag].refcnt++;
1215 * Create per ethernet device VLAN VM workaround context
/*
 * Allocate and set up the VLAN workaround context of an Ethernet device.
 *
 * The hypervisor type reported by rte_hypervisor_get() is checked against
 * a list of supported configurations. The context is then allocated
 * zeroed with rte_zmalloc(), a NETLINK_ROUTE socket is opened with
 * mlx5_nl_init(), the sequence number is seeded from random() and
 * `ifindex` is recorded as vf_ifindex. Allocation and socket failures are
 * logged.
 */
1217 struct mlx5_vlan_vmwa_context *
1218 mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
1221 struct mlx5_priv *priv = dev->data->dev_private;
1222 struct mlx5_dev_config *config = &priv->config;
1223 struct mlx5_vlan_vmwa_context *vmwa;
1224 enum rte_hypervisor hv_type;
1226 /* Do not engage workaround over PF. */
1229 /* Check whether there is desired virtual environment */
1230 hv_type = rte_hypervisor_get();
1232 case RTE_HYPERVISOR_UNKNOWN:
1233 case RTE_HYPERVISOR_VMWARE:
1235 * The "white list" of configurations
1236 * to engage the workaround.
1241 * The configuration is not found in the "white list".
1242 * We should not engage the VLAN workaround.
/* Zero-filled allocation, aligned on sizeof(uint32_t). */
1246 vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
1249 "Can not allocate memory"
1250 " for VLAN workaround context");
1253 vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
1254 if (vmwa->nl_socket < 0) {
1256 "Can not create Netlink socket"
1257 " for VLAN workaround context");
1261 vmwa->nl_sn = random();
1262 vmwa->vf_ifindex = ifindex;
1264 /* Cleanup for existing VLAN devices. */
1269 * Destroy per ethernet device VLAN VM workaround context
/*
 * Tear down a VLAN workaround context: every entry of vmwa->vlan_dev that
 * still records a device index has its device deleted through
 * mlx5_vlan_vmwa_delete(), then the Netlink socket is closed when it is
 * valid (>= 0).
 */
1271 void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *vmwa)
1275 /* Delete all remaining VLAN devices. */
1276 for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
1277 if (vmwa->vlan_dev[i].ifindex)
1278 mlx5_vlan_vmwa_delete(vmwa, vmwa->vlan_dev[i].ifindex);
1280 if (vmwa->nl_socket >= 0)
1281 close(vmwa->nl_socket);