1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/netlink.h>
9 #include <linux/rtnetlink.h>
11 #include <rdma/rdma_netlink.h>
17 #include <sys/socket.h>
20 #include <rte_errno.h>
21 #include <rte_malloc.h>
22 #include <rte_hypervisor.h>
25 #include "mlx5_utils.h"
/*
 * Netlink buffer sizing. In this file MLX5_SEND_BUF_SIZE and
 * MLX5_RECV_BUF_SIZE are handed to setsockopt() by mlx5_nl_init(), and
 * MLX5_RECV_BUF_SIZE also sizes the on-stack receive buffer used by
 * mlx5_nl_recv().
 */
27 /* Size of the buffer to receive kernel messages */
/*
 * NOTE(review): same value as MLX5_RECV_BUF_SIZE, but not referenced by the
 * code visible in this file -- confirm it is still needed.
 */
28 #define MLX5_NL_BUF_SIZE (32 * 1024)
29 /* Send buffer size for the Netlink socket */
30 #define MLX5_SEND_BUF_SIZE 32768
31 /* Receive buffer size for the Netlink socket */
32 #define MLX5_RECV_BUF_SIZE 32768
34 /** Parameters of VLAN devices created by driver. */
/*
 * Name prefix of those devices: mlx5_vlan_vmwa_create() formats the
 * interface name as "<prefix>.<ifindex>.<tag>".
 */
35 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
37 * Define NDA_RTA as defined in iproute2 sources.
39 * See include/libnetlink.h in the iproute2 sources.
/*
 * Yields a pointer to the first struct rtattr located right after the
 * alignment-padded struct ndmsg that r points to. Used by
 * mlx5_nl_mac_addr_cb() to start walking neighbour attributes.
 */
42 #define MLX5_NDA_RTA(r) \
43 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
47 * The following definitions are normally found in rdma/rdma_netlink.h,
48 * however they are so recent that most systems do not expose them yet.
/*
 * Fallback values: each constant below is defined here only when the
 * matching HAVE_* macro is not defined (presumably provided by the build
 * system when the system headers already declare it -- confirm).
 * NOTE(review): the numeric values are expected to match the kernel UAPI
 * numbering; verify against rdma/rdma_netlink.h.
 */
50 #ifndef HAVE_RDMA_NL_NLDEV
51 #define RDMA_NL_NLDEV 5
53 #ifndef HAVE_RDMA_NLDEV_CMD_GET
54 #define RDMA_NLDEV_CMD_GET 1
56 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
57 #define RDMA_NLDEV_CMD_PORT_GET 5
59 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
60 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
62 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
63 #define RDMA_NLDEV_ATTR_DEV_NAME 2
65 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
66 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
68 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
69 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
72 /* These are normally found in linux/if_link.h. */
/*
 * Same fallback scheme for the link attributes; IFLA_EXT_MASK,
 * IFLA_PHYS_SWITCH_ID and IFLA_PHYS_PORT_NAME are used by the switch-info
 * request and callback in this file.
 */
73 #ifndef HAVE_IFLA_NUM_VF
74 #define IFLA_NUM_VF 21
76 #ifndef HAVE_IFLA_EXT_MASK
77 #define IFLA_EXT_MASK 29
79 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
80 #define IFLA_PHYS_SWITCH_ID 36
82 #ifndef HAVE_IFLA_PHYS_PORT_NAME
83 #define IFLA_PHYS_PORT_NAME 38
86 /* Add/remove MAC address through Netlink */
/*
 * Accumulator handed to mlx5_nl_mac_addr_cb(): every NDA_LLADDR attribute
 * the callback accepts is copied into (*mac)[mac_n] and mac_n is then
 * incremented.
 */
87 struct mlx5_nl_mac_addr {
88 struct rte_ether_addr (*mac)[];
89 /**< MAC address handled by the device. */
90 int mac_n; /**< Number of addresses in the array. */
/*
 * Bit flags stored in mlx5_nl_ifindex_data.flags by mlx5_nl_cmdget_cb(),
 * one per attribute it recognised in a message.
 */
/* Device name attribute matched the requested name. */
93 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
/* IB device index attribute was present. */
94 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
/* Network device index attribute was present. */
95 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
/* Port index attribute was present. */
96 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
98 /** Data structure used by mlx5_nl_cmdget_cb(). */
/*
 * flags is a bitwise OR of the MLX5_NL_CMD_GET_* values; the other output
 * fields are only meaningful when the corresponding flag is set.
 */
99 struct mlx5_nl_ifindex_data {
100 const char *name; /**< IB device name (in). */
101 uint32_t flags; /**< found attribute flags (out). */
102 uint32_t ibindex; /**< IB device index (out). */
103 uint32_t ifindex; /**< Network interface index (out). */
104 uint32_t portnum; /**< IB device max port number (out). */
108 * Opens a Netlink socket.
111 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
114 * A file descriptor on success, a negative errno value otherwise and
/*
 * Creates a close-on-exec AF_NETLINK raw socket for the given protocol,
 * requests send/receive buffer sizes of MLX5_SEND_BUF_SIZE and
 * MLX5_RECV_BUF_SIZE, and binds the socket to a local Netlink address.
 */
118 mlx5_nl_init(int protocol)
121 int sndbuf_size = MLX5_SEND_BUF_SIZE;
122 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
123 struct sockaddr_nl local = {
124 .nl_family = AF_NETLINK,
128 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
/* Ask the kernel for the configured socket buffer sizes. */
133 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
138 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
143 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
155 * Send a request message to the kernel on the Netlink socket.
158 * Netlink socket file descriptor.
160 * The Netlink message sent to the kernel.
164 * Pointer to the request structure.
166 * Length of the request in bytes.
169 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Transmits the Netlink header and the request payload as two iovec
 * segments of a single sendmsg() call: sizeof(*nh) bytes from nh followed
 * by len bytes from req.
 */
173 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
176 struct sockaddr_nl sa = {
177 .nl_family = AF_NETLINK,
179 struct iovec iov[2] = {
180 { .iov_base = nh, .iov_len = sizeof(*nh), },
181 { .iov_base = req, .iov_len = len, },
183 struct msghdr msg = {
185 .msg_namelen = sizeof(sa),
191 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
193 send_bytes = sendmsg(nlsk_fd, &msg, 0);
194 if (send_bytes < 0) {
202 * Send a message to the kernel on the Netlink socket.
205 * The Netlink socket file descriptor used for communication.
207 * The Netlink message sent to the kernel.
212 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Transmits one contiguous Netlink message. The number of bytes sent is
 * taken from nh->nlmsg_len, so the caller must have set that field to
 * cover the header and all attributes.
 */
216 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
218 struct sockaddr_nl sa = {
219 .nl_family = AF_NETLINK,
223 .iov_len = nh->nlmsg_len,
225 struct msghdr msg = {
227 .msg_namelen = sizeof(sa),
233 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
235 send_bytes = sendmsg(nlsk_fd, &msg, 0);
236 if (send_bytes < 0) {
244 * Receive a message from the kernel on the Netlink socket, following
248 * The Netlink socket file descriptor used for communication.
252 * The callback function to call for each Netlink message received.
253 * @param[in, out] arg
254 * Custom arguments for the callback.
257 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Waits for the reply to request sn and walks the Netlink messages found
 * in the receive buffer. Callers in this file pass cb == NULL when there
 * is no payload to parse (e.g. mlx5_nl_mac_addr_modify()).
 */
260 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
263 struct sockaddr_nl sa;
/* NOTE(review): MLX5_RECV_BUF_SIZE (32768) bytes of stack on every call. */
264 char buf[MLX5_RECV_BUF_SIZE];
267 .iov_len = sizeof(buf),
269 struct msghdr msg = {
271 .msg_namelen = sizeof(sa),
273 /* One message at a time */
/* Keep receiving until a buffer starts with sequence number sn. */
284 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
285 if (recv_bytes == -1) {
289 nh = (struct nlmsghdr *)buf;
/*
 * NOTE(review): nlmsg_seq appears to be read here before NLMSG_OK() has
 * verified that recv_bytes covers a complete header -- confirm.
 */
290 } while (nh->nlmsg_seq != sn);
292 NLMSG_OK(nh, (unsigned int)recv_bytes);
293 nh = NLMSG_NEXT(nh, recv_bytes)) {
/* A negative error code from the kernel is stored negated in rte_errno. */
294 if (nh->nlmsg_type == NLMSG_ERROR) {
295 struct nlmsgerr *err_data = NLMSG_DATA(nh);
297 if (err_data->error < 0) {
298 rte_errno = -err_data->error;
304 /* Multi-part msgs and their trailing DONE message. */
305 if (nh->nlmsg_flags & NLM_F_MULTI) {
306 if (nh->nlmsg_type == NLMSG_DONE)
321 * Parse Netlink message to retrieve the bridge MAC address.
324 * Pointer to Netlink Message Header.
326 * PMD data registered with this callback.
329 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Walks the route attributes that follow the struct ndmsg of one message
 * and appends every NDA_LLADDR payload to the caller's MAC array (arg is a
 * struct mlx5_nl_mac_addr).
 */
332 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
334 struct mlx5_nl_mac_addr *data = arg;
335 struct ndmsg *r = NLMSG_DATA(nh);
336 struct rtattr *attribute;
/* Bytes of attribute data that remain after the ndmsg header. */
339 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
340 for (attribute = MLX5_NDA_RTA(r);
341 RTA_OK(attribute, len);
342 attribute = RTA_NEXT(attribute, len)) {
343 if (attribute->rta_type == NDA_LLADDR) {
/* The destination array holds at most MLX5_MAX_MAC_ADDRESSES entries. */
344 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
346 "not enough room to finalize the"
/* 18 is presumably the size of m ("xx:xx:xx:xx:xx:xx" plus NUL) -- confirm. */
354 rte_ether_format_addr(m, 18, RTA_DATA(attribute));
355 DRV_LOG(DEBUG, "bridge MAC address %s", m);
/*
 * NOTE(review): RTE_ETHER_ADDR_LEN bytes are copied without a visible check
 * that the attribute payload is at least that long -- confirm.
 */
357 memcpy(&(*data->mac)[data->mac_n++],
358 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
365 * Get bridge MAC addresses.
368 * Pointer to Ethernet device.
370 * Pointer to the array table of MAC addresses to fill.
371 * Its size should be of MLX5_MAX_MAC_ADDRESSES.
373 * Number of entries filled in MAC array.
376 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Issues an RTM_GETNEIGH dump for the PF_BRIDGE family on the device's
 * interface and collects the returned addresses through
 * mlx5_nl_mac_addr_cb().
 */
379 mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr (*mac)[],
382 struct mlx5_priv *priv = dev->data->dev_private;
383 unsigned int iface_idx = mlx5_ifindex(dev);
386 struct ifinfomsg ifm;
389 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
390 .nlmsg_type = RTM_GETNEIGH,
391 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
394 .ifi_family = PF_BRIDGE,
395 .ifi_index = iface_idx,
398 struct mlx5_nl_mac_addr data = {
/* The sequence counter advances even if the socket check below fails. */
404 uint32_t sn = priv->nl_sn++;
406 if (priv->nl_socket_route == -1)
408 fd = priv->nl_socket_route;
/* Header and ifinfomsg payload are sent as separate segments. */
409 ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
410 sizeof(struct ifinfomsg));
413 ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data);
419 DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
420 dev->data->port_id, strerror(rte_errno));
425 * Modify the MAC address neighbour table with Netlink.
428 * Pointer to Ethernet device.
430 * MAC address to consider.
432 * 1 to add the MAC address, 0 to remove the MAC address.
435 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Builds an RTM_NEWNEIGH (add != 0) or RTM_DELNEIGH (add == 0) request for
 * the PF_BRIDGE family carrying one NDA_LLADDR attribute, sends it and
 * waits for the kernel's answer.
 */
438 mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
441 struct mlx5_priv *priv = dev->data->dev_private;
442 unsigned int iface_idx = mlx5_ifindex(dev);
/*
 * Storage for the attribute payload; presumably laid out right after the
 * rtattr header so that RTA_DATA(&req.rta) points into it -- confirm.
 */
447 uint8_t buffer[RTE_ETHER_ADDR_LEN];
450 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
/* NOTE(review): CREATE and EXCL are set for the delete request as well. */
451 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
452 NLM_F_EXCL | NLM_F_ACK,
453 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
456 .ndm_family = PF_BRIDGE,
457 .ndm_state = NUD_NOARP | NUD_PERMANENT,
458 .ndm_ifindex = iface_idx,
459 .ndm_flags = NTF_SELF,
462 .rta_type = NDA_LLADDR,
463 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
468 uint32_t sn = priv->nl_sn++;
470 if (priv->nl_socket_route == -1)
472 fd = priv->nl_socket_route;
473 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
/* Extend the message length to include the aligned attribute. */
474 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
475 RTA_ALIGN(req.rta.rta_len);
476 ret = mlx5_nl_send(fd, &req.hdr, sn);
/* No callback: only the status reported by mlx5_nl_recv() is used. */
479 ret = mlx5_nl_recv(fd, sn, NULL, NULL);
485 "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
488 add ? "add" : "remove",
489 mac->addr_bytes[0], mac->addr_bytes[1],
490 mac->addr_bytes[2], mac->addr_bytes[3],
491 mac->addr_bytes[4], mac->addr_bytes[5],
492 strerror(rte_errno));
500 * Pointer to Ethernet device.
502 * MAC address to register.
507 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Adds mac through mlx5_nl_mac_addr_modify() and records the entry in
 * priv->mac_own; mlx5_nl_mac_addr_flush() later removes exactly the
 * entries whose bit is set there.
 */
510 mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
513 struct mlx5_priv *priv = dev->data->dev_private;
516 ret = mlx5_nl_mac_addr_modify(dev, mac, 1);
/* NOTE(review): presumably done only when the add succeeded -- confirm. */
518 BITFIELD_SET(priv->mac_own, index);
525 * Remove a MAC address.
528 * Pointer to Ethernet device.
530 * MAC address to remove.
535 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Clears the ownership bit for index and then asks the kernel to delete
 * the address. The bit is cleared before the request is issued, so it
 * stays cleared even when the Netlink removal fails.
 */
538 mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
541 struct mlx5_priv *priv = dev->data->dev_private;
543 BITFIELD_RESET(priv->mac_own, index);
544 return mlx5_nl_mac_addr_modify(dev, mac, 0);
548 * Synchronize Netlink bridge table to the internal table.
551 * Pointer to Ethernet device.
/*
 * Reads the bridge MAC table through mlx5_nl_mac_addr_list() and merges it
 * into dev->data->mac_addrs: addresses already present are left alone,
 * new ones go into the first all-zero slot.
 */
554 mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
/* Scratch array sized for the maximum number of addresses. */
556 struct rte_ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
561 ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n);
564 for (i = 0; i != macs_n; ++i) {
567 /* Verify the address is not in the array yet. */
568 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
569 if (rte_is_same_ether_addr(&macs[i],
570 &dev->data->mac_addrs[j]))
/* j stopped short of the end, i.e. a matching entry was found. */
572 if (j != MLX5_MAX_MAC_ADDRESSES)
574 /* Find the first entry available. */
575 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
576 if (rte_is_zero_ether_addr(&dev->data->mac_addrs[j])) {
577 dev->data->mac_addrs[j] = macs[i];
585 * Flush all added MAC addresses.
588 * Pointer to Ethernet device.
/*
 * Removes, from the highest index down to 0, every MAC address whose bit
 * is set in priv->mac_own. The result of each removal is ignored.
 */
591 mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
593 struct mlx5_priv *priv = dev->data->dev_private;
/* NOTE(review): the i >= 0 test requires i to be a signed type -- confirm. */
596 for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
597 struct rte_ether_addr *m = &dev->data->mac_addrs[i];
599 if (BITFIELD_ISSET(priv->mac_own, i))
600 mlx5_nl_mac_addr_remove(dev, m, i);
605 * Enable promiscuous / all multicast mode through Netlink.
608 * Pointer to Ethernet device structure.
610 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
612 * Nonzero to enable, disable otherwise.
615 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends an RTM_NEWLINK request for the device's interface with ifi_flags
 * set to flags when enabling and to 0 when disabling. Only IFF_PROMISC and
 * IFF_ALLMULTI are accepted (asserted below).
 */
618 mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
620 struct mlx5_priv *priv = dev->data->dev_private;
621 unsigned int iface_idx = mlx5_ifindex(dev);
624 struct ifinfomsg ifi;
627 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
628 .nlmsg_type = RTM_NEWLINK,
/*
 * NOTE(review): no NLM_F_ACK is requested and no reply appears to be read
 * after the send, so a kernel-side failure would go unnoticed -- confirm.
 */
629 .nlmsg_flags = NLM_F_REQUEST,
632 .ifi_flags = enable ? flags : 0,
634 .ifi_index = iface_idx,
640 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
/* NOTE(review): the MAC address helpers test this socket with == -1. */
641 if (priv->nl_socket_route < 0)
643 fd = priv->nl_socket_route;
644 ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
651 * Enable promiscuous mode through Netlink.
654 * Pointer to Ethernet device structure.
656 * Nonzero to enable, disable otherwise.
659 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper around mlx5_nl_device_flags() for IFF_PROMISC; the log
 * message below reports the port, the requested action and rte_errno.
 */
662 mlx5_nl_promisc(struct rte_eth_dev *dev, int enable)
664 int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable);
668 "port %u cannot %s promisc mode: Netlink error %s",
669 dev->data->port_id, enable ? "enable" : "disable",
670 strerror(rte_errno));
675 * Enable all multicast mode through Netlink.
678 * Pointer to Ethernet device structure.
680 * Nonzero to enable, disable otherwise.
683 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper around mlx5_nl_device_flags() for IFF_ALLMULTI; the log
 * message below reports the port, the requested action and rte_errno.
 */
686 mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
688 int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable);
692 "port %u cannot %s allmulti mode: Netlink error %s",
693 dev->data->port_id, enable ? "enable" : "disable",
694 strerror(rte_errno));
699 * Process network interface information from Netlink message.
702 * Pointer to Netlink message header.
704 * Opaque data pointer for this callback.
707 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Parses the attributes of one RDMA nldev GET / PORT_GET message into a
 * local record and publishes it to the caller's struct
 * mlx5_nl_ifindex_data only if the message carried the requested device
 * name.
 */
710 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
712 struct mlx5_nl_ifindex_data *data = arg;
713 struct mlx5_nl_ifindex_data local = {
/* Attributes start right after the Netlink header. */
716 size_t off = NLMSG_HDRLEN;
718 if (nh->nlmsg_type !=
719 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
721 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
723 while (off < nh->nlmsg_len) {
724 struct nlattr *na = (void *)((uintptr_t)nh + off);
725 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
/*
 * Rejects attributes that claim to extend past the message.
 * NOTE(review): only the upper bound is checked here; an nla_len of 0
 * would leave off unchanged at the bottom of the loop -- confirm whether a
 * lower bound (>= NLA_HDRLEN) is enforced elsewhere.
 */
727 if (na->nla_len > nh->nlmsg_len - off)
729 switch (na->nla_type) {
730 case RDMA_NLDEV_ATTR_DEV_INDEX:
731 local.ibindex = *(uint32_t *)payload;
732 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
734 case RDMA_NLDEV_ATTR_DEV_NAME:
/* NOTE(review): assumes the name payload is NUL-terminated -- confirm. */
735 if (!strcmp(payload, data->name))
736 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
738 case RDMA_NLDEV_ATTR_NDEV_INDEX:
739 local.ifindex = *(uint32_t *)payload;
740 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
742 case RDMA_NLDEV_ATTR_PORT_INDEX:
743 local.portnum = *(uint32_t *)payload;
744 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
/* Advance to the next attribute, honouring alignment padding. */
749 off += NLA_ALIGN(na->nla_len);
752 * It is possible to have multiple messages for all
753 * Infiniband devices in the system with appropriate name.
754 * So we should gather parameters locally and copy to
755 * query context only in case of coinciding device name.
757 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
758 data->flags = local.flags;
759 data->ibindex = local.ibindex;
760 data->ifindex = local.ifindex;
761 data->portnum = local.portnum;
770 * Get index of network interface associated with some IB device.
772 * This is the only somewhat safe method to avoid resorting to heuristics
773 * when faced with port representors. Unfortunately it requires at least
777 * Netlink socket of the RDMA kind (NETLINK_RDMA).
781 * IB device port index, starting from 1
783 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
/*
 * Two-pass lookup over an RDMA Netlink socket: a device dump resolves the
 * IB device index for name, then a PORT_GET request for (device index,
 * pindex) yields the associated network interface index.
 */
787 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
/* Starting sequence number is taken from random(). */
789 uint32_t seq = random();
790 struct mlx5_nl_ifindex_data data = {
793 .ibindex = 0, /* Determined during first pass. */
794 .ifindex = 0, /* Determined during second pass. */
/* Room for the header plus the two 32-bit attributes of the second pass. */
798 uint8_t buf[NLMSG_HDRLEN +
799 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
800 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
803 .nlmsg_len = NLMSG_LENGTH(0),
804 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
806 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
/* First pass: dump request with an empty payload. */
812 ret = mlx5_nl_send(nl, &req.nh, seq);
815 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
/* The named device and its IB index must both have been reported. */
818 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
819 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
/* Second pass: reuse the request buffer for a non-dump PORT_GET. */
823 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
824 RDMA_NLDEV_CMD_PORT_GET);
825 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
826 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
/* First attribute: IB device index found during the first pass. */
827 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
828 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
829 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
830 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
831 &data.ibindex, sizeof(data.ibindex));
/* Second attribute: requested port index. */
832 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
833 na->nla_len = NLA_HDRLEN + sizeof(pindex);
834 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
835 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
836 &pindex, sizeof(pindex));
837 ret = mlx5_nl_send(nl, &req.nh, seq);
840 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
843 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
844 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
845 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
855 * Get the number of physical ports of given IB device.
858 * Netlink socket of the RDMA kind (NETLINK_RDMA).
863 * A valid (nonzero) number of ports on success, 0 otherwise
864 * and rte_errno is set.
/*
 * Sends a single header-only dump request over the RDMA Netlink socket
 * and parses the replies with mlx5_nl_cmdget_cb(); the reply for the named
 * device must carry name, device index and port index attributes.
 */
867 mlx5_nl_portnum(int nl, const char *name)
869 uint32_t seq = random();
870 struct mlx5_nl_ifindex_data data = {
876 struct nlmsghdr req = {
/* Header only: the request has no payload. */
877 .nlmsg_len = NLMSG_LENGTH(0),
878 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
880 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
884 ret = mlx5_nl_send(nl, &req, seq);
887 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
890 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
891 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
892 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
902 * Process switch information from Netlink message.
905 * Pointer to Netlink message header.
907 * Opaque data pointer for this callback.
910 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Extracts switch-related attributes from one RTM_NEWLINK message into a
 * local struct mlx5_switch_info and copies the result to arg.
 */
913 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
915 struct mlx5_switch_info info = {
918 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
/* Route attributes start after the header and the struct ifinfomsg. */
922 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
923 bool switch_id_set = false;
924 bool num_vf_set = false;
926 if (nh->nlmsg_type != RTM_NEWLINK)
928 while (off < nh->nlmsg_len) {
929 struct rtattr *ra = (void *)((uintptr_t)nh + off);
930 void *payload = RTA_DATA(ra);
/*
 * NOTE(review): only the upper bound is checked; an rta_len of 0 would not
 * advance off at the end of the loop -- confirm.
 */
933 if (ra->rta_len > nh->nlmsg_len - off)
935 switch (ra->rta_type) {
939 case IFLA_PHYS_PORT_NAME:
940 mlx5_translate_port_name((char *)payload, &info);
942 case IFLA_PHYS_SWITCH_ID:
/* Fold the payload bytes into switch_id, first byte most significant. */
944 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
945 info.switch_id <<= 8;
946 info.switch_id |= ((uint8_t *)payload)[i];
948 switch_id_set = true;
951 off += RTA_ALIGN(ra->rta_len);
954 /* We have some E-Switch configuration. */
955 mlx5_nl_check_switch_info(num_vf_set, &info);
/* A device must not be classified as both master and representor. */
957 assert(!(info.master && info.representor));
958 memcpy(arg, &info, sizeof(info));
966 * Get switch information associated with network interface.
969 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
971 * Network interface index.
973 * Switch information object, populated in case of success.
976 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends an RTM_GETLINK request for ifindex carrying an IFLA_EXT_MASK
 * attribute and lets mlx5_nl_switch_info_cb() fill *info from the reply.
 */
979 mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
981 uint32_t seq = random();
984 struct ifinfomsg info;
989 .nlmsg_len = NLMSG_LENGTH
991 RTA_LENGTH(sizeof(uint32_t))),
992 .nlmsg_type = RTM_GETLINK,
993 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
996 .ifi_family = AF_UNSPEC,
997 .ifi_index = ifindex,
1000 .rta_type = IFLA_EXT_MASK,
1001 .rta_len = RTA_LENGTH(sizeof(int32_t)),
/* Extension mask value 1; presumably requests VF information -- confirm. */
1003 .extmask = RTE_LE32(1),
1007 ret = mlx5_nl_send(nl, &req.nh, seq);
1009 ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
/* Being both master and representor is reported as an error. */
1010 if (info->master && info->representor) {
1011 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1012 " and as representor", ifindex);
1020 * Delete VLAN network device by ifindex.
1023 * Context object initialized by mlx5_vlan_vmwa_init().
1024 * @param[in] ifindex
1025 * Interface index of network device to delete.
/*
 * Sends an RTM_DELLINK request for ifindex on the workaround context's
 * Netlink socket and logs a warning when the deletion fails.
 */
1028 mlx5_vlan_vmwa_delete(struct mlx5_vlan_vmwa_context *vmwa,
1034 struct ifinfomsg info;
1037 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1038 .nlmsg_type = RTM_DELLINK,
1039 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1042 .ifi_family = AF_UNSPEC,
1043 .ifi_index = ifindex,
/* The sequence number comes from the context, not from random(). */
1051 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, vmwa->nl_sn);
1053 ret = mlx5_nl_recv(vmwa->nl_socket,
1057 DRV_LOG(WARNING, "netlink: error deleting"
1058 " VLAN WA ifindex %u, %d",
1063 /* Set of subroutines to build Netlink message. */
/*
 * Returns the address just past the current end of the message, i.e. nlh
 * plus its alignment-rounded nlmsg_len, typed as an attribute pointer.
 */
1064 static struct nlattr *
1065 nl_msg_tail(struct nlmsghdr *nlh)
1067 return (struct nlattr *)
1068 (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
/*
 * Appends one attribute of the given type with alen bytes of payload at
 * the tail of the message and grows nlh->nlmsg_len accordingly. There is
 * no capacity argument: the caller must guarantee the buffer behind nlh
 * is large enough.
 */
1072 nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
1074 struct nlattr *nla = nl_msg_tail(nlh);
1076 nla->nla_type = type;
/* The stored attribute length is rounded up, so it includes padding. */
1077 nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
1078 nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
/* The payload is placed directly after the attribute header. */
1081 memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
/*
 * Opens a nested attribute: remembers the current tail as the nest header
 * and appends an empty attribute of the given type there. The length is
 * fixed up later by nl_attr_nest_end().
 */
1084 static struct nlattr *
1085 nl_attr_nest_start(struct nlmsghdr *nlh, int type)
/* Must be captured before nl_attr_put() moves the tail. */
1087 struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
1089 nl_attr_put(nlh, type, NULL, 0);
/*
 * Closes a nested attribute by setting its length to the distance from
 * the nest header to the current tail of the message.
 */
1094 nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
1096 nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
1100 * Create network VLAN device with specified VLAN tag.
1103 * Context object initialized by mlx5_vlan_vmwa_init().
1104 * @param[in] ifindex
1105 * Base network interface index.
1107 * VLAN tag for VLAN network device to create.
/*
 * Builds and sends an RTM_NEWLINK request that creates a "vlan" kind
 * device named "<prefix>.<ifindex>.<tag>" on top of ifindex, then resolves
 * the device's interface index by name.
 */
1110 mlx5_vlan_vmwa_create(struct mlx5_vlan_vmwa_context *vmwa,
1114 struct nlmsghdr *nlh;
1115 struct ifinfomsg *ifm;
1116 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
/*
 * Message buffer sized for the header, the ifinfomsg and every attribute
 * appended below, plus some slack.
 */
1118 alignas(RTE_CACHE_LINE_SIZE)
1119 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1120 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1121 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1122 NLMSG_ALIGN(sizeof(uint32_t)) +
1123 NLMSG_ALIGN(sizeof(name)) +
1124 NLMSG_ALIGN(sizeof("vlan")) +
1125 NLMSG_ALIGN(sizeof(uint32_t)) +
1126 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1127 struct nlattr *na_info;
1128 struct nlattr *na_vlan;
1131 memset(buf, 0, sizeof(buf));
1135 nlh = (struct nlmsghdr *)buf;
1136 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1137 nlh->nlmsg_type = RTM_NEWLINK;
1138 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1139 NLM_F_EXCL | NLM_F_ACK;
/* The ifinfomsg sits right after the Netlink header. */
1140 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1141 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1142 ifm->ifi_family = AF_UNSPEC;
1145 ifm->ifi_flags = IFF_UP;
1146 ifm->ifi_change = 0xffffffff;
1147 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
/*
 * NOTE(review): the snprintf() result is used as the name length without a
 * truncation check, and Linux interface names are limited to IFNAMSIZ
 * bytes -- confirm that large ifindex/tag values cannot exceed that.
 */
1148 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1149 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
/* ret + 1 includes the terminating NUL in the attribute payload. */
1150 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
/* Nested layout: LINKINFO { INFO_KIND "vlan", INFO_DATA { VLAN_ID } }. */
1151 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1152 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1153 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1154 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1155 nl_attr_nest_end(nlh, na_vlan);
1156 nl_attr_nest_end(nlh, na_info);
/* Checked only after the attributes have already been written. */
1157 assert(sizeof(buf) >= nlh->nlmsg_len);
1158 ret = mlx5_nl_send(vmwa->nl_socket, nlh, vmwa->nl_sn);
1160 ret = mlx5_nl_recv(vmwa->nl_socket, vmwa->nl_sn, NULL, NULL);
1163 "netlink: VLAN %s create failure (%d)",
1166 // Try to get ifindex of created or pre-existing device.
1167 ret = if_nametoindex(name);
1170 "VLAN %s failed to get index (%d)",
1178 * Release VLAN network device, created for VM workaround.
1181 * Ethernet device object, Netlink context provider.
1183 * Object representing the network device to release.
/*
 * Drops one reference on the VLAN device indexed by vlan->tag; when the
 * count reaches zero and a device index is recorded, the device is deleted
 * and the recorded index is reset to 0.
 */
1185 void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
1186 struct mlx5_vf_vlan *vlan)
1188 struct mlx5_priv *priv = dev->data->dev_private;
1189 struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
/*
 * NOTE(review): this address is formed from vmwa before the !vmwa check
 * below -- consider initialising it after the check.
 */
1190 struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
1192 assert(vlan->created);
1193 assert(priv->vmwa_context);
/* Runtime guard for builds where the asserts are compiled out. */
1194 if (!vlan->created || !vmwa)
1197 assert(vlan_dev[vlan->tag].refcnt);
1198 if (--vlan_dev[vlan->tag].refcnt == 0 &&
1199 vlan_dev[vlan->tag].ifindex) {
1200 mlx5_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
1201 vlan_dev[vlan->tag].ifindex = 0;
1206 * Acquire VLAN interface with specified tag for VM workaround.
1209 * Ethernet device object, Netlink context provider.
1211 * Object representing the network device to acquire.
/*
 * Takes one reference on the VLAN device indexed by vlan->tag, creating
 * the device first when nobody holds a reference yet. The reference count
 * is incremented only when the entry holds a nonzero interface index.
 */
1213 void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
1214 struct mlx5_vf_vlan *vlan)
1216 struct mlx5_priv *priv = dev->data->dev_private;
1217 struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
/*
 * NOTE(review): this address is formed from vmwa before the !vmwa check
 * below -- consider initialising it after the check.
 */
1218 struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
1220 assert(!vlan->created);
1221 assert(priv->vmwa_context);
/* Runtime guard for builds where the asserts are compiled out. */
1222 if (vlan->created || !vmwa)
/* First user of this tag: the VLAN device has to be created. */
1224 if (vlan_dev[vlan->tag].refcnt == 0) {
1225 assert(!vlan_dev[vlan->tag].ifindex);
1226 vlan_dev[vlan->tag].ifindex =
1227 mlx5_vlan_vmwa_create(vmwa,
1231 if (vlan_dev[vlan->tag].ifindex) {
1232 vlan_dev[vlan->tag].refcnt++;
1238 * Create per ethernet device VLAN VM workaround context
/*
 * Allocates a zeroed workaround context, opens a NETLINK_ROUTE socket for
 * it and records the VF interface index. The hypervisor type reported by
 * rte_hypervisor_get() is checked against a short allow-list first.
 */
1240 struct mlx5_vlan_vmwa_context *
1241 mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
1244 struct mlx5_priv *priv = dev->data->dev_private;
1245 struct mlx5_dev_config *config = &priv->config;
1246 struct mlx5_vlan_vmwa_context *vmwa;
1247 enum rte_hypervisor hv_type;
1249 /* Do not engage workaround over PF. */
1252 /* Check whether there is desired virtual environment */
1253 hv_type = rte_hypervisor_get();
1255 case RTE_HYPERVISOR_UNKNOWN:
1256 case RTE_HYPERVISOR_VMWARE:
1258 * The "white list" of configurations
1259 * to engage the workaround.
1264 * The configuration is not found in the "white list".
1265 * We should not engage the VLAN workaround.
/* Zero-initialised allocation, aligned to sizeof(uint32_t). */
1269 vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
1272 "Can not allocate memory"
1273 " for VLAN workaround context");
1276 vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
1277 if (vmwa->nl_socket < 0) {
1279 "Can not create Netlink socket"
1280 " for VLAN workaround context");
/* The initial sequence number is taken from random(). */
1284 vmwa->nl_sn = random();
1285 vmwa->vf_ifindex = ifindex;
1287 /* Cleanup for existing VLAN devices. */
1292 * Destroy per ethernet device VLAN VM workaround context
/*
 * Deletes every VLAN device that still has a recorded interface index in
 * the context, then closes the context's Netlink socket if it is open.
 */
1294 void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *vmwa)
1298 /* Delete all remaining VLAN devices. */
1299 for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
1300 if (vmwa->vlan_dev[i].ifindex)
1301 mlx5_vlan_vmwa_delete(vmwa, vmwa->vlan_dev[i].ifindex);
/* A negative descriptor means the socket was never opened. */
1303 if (vmwa->nl_socket >= 0)
1304 close(vmwa->nl_socket);