1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/netlink.h>
9 #include <linux/rtnetlink.h>
11 #include <rdma/rdma_netlink.h>
17 #include <sys/socket.h>
20 #include <rte_errno.h>
21 #include <rte_malloc.h>
22 #include <rte_hypervisor.h>
25 #include "mlx5_utils.h"
/*
 * Netlink buffer sizes and the name prefix of the VLAN devices created by
 * this file. MLX5_SEND_BUF_SIZE and MLX5_RECV_BUF_SIZE are the values that
 * mlx5_nl_init() passes to SO_SNDBUF and SO_RCVBUF.
 * NOTE(review): MLX5_NL_BUF_SIZE is not referenced in the visible part of
 * this file; mlx5_nl_recv() sizes its buffer with MLX5_RECV_BUF_SIZE.
 */
27 /* Size of the buffer to receive kernel messages */
28 #define MLX5_NL_BUF_SIZE (32 * 1024)
29 /* Send buffer size for the Netlink socket */
30 #define MLX5_SEND_BUF_SIZE 32768
31 /* Receive buffer size for the Netlink socket */
32 #define MLX5_RECV_BUF_SIZE 32768
/* Prefix of the "<prefix>.<ifindex>.<tag>" names built by mlx5_vlan_vmwa_create(). */
34 /** Parameters of VLAN devices created by driver. */
35 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
37 * Define NDA_RTA as defined in iproute2 sources.
39 * see in iproute2 sources file include/libnetlink.h
/*
 * Yields a struct rtattr pointer located NLMSG_ALIGN(sizeof(struct ndmsg))
 * bytes after r. mlx5_nl_mac_addr_cb() uses it to find the first attribute
 * of a neighbour message.
 */
42 #define MLX5_NDA_RTA(r) \
43 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
46 * Define NLMSG_TAIL as defined in iproute2 sources.
48 * see in iproute2 sources file include/libnetlink.h
/*
 * Yields a struct rtattr pointer located NLMSG_ALIGN(nmsg->nlmsg_len) bytes
 * after nmsg, i.e. just past the aligned end of the message built so far.
 * NOTE(review): unlike MLX5_NDA_RTA this name carries no MLX5_ prefix, so it
 * would clash with any included header that defines the same macro - confirm.
 */
51 #define NLMSG_TAIL(nmsg) \
52 ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
55 * The following definitions are normally found in rdma/rdma_netlink.h,
56 * however they are so recent that most systems do not expose them yet.
/*
 * Fallback values for RDMA Netlink identifiers. Each one is guarded by an
 * #ifndef on a matching HAVE_* macro, so it only takes effect when that
 * macro is not defined.
 * NOTE(review): the HAVE_* macros are presumably produced by the build
 * system, and the matching #endif lines are not visible in this extract.
 */
58 #ifndef HAVE_RDMA_NL_NLDEV
59 #define RDMA_NL_NLDEV 5
61 #ifndef HAVE_RDMA_NLDEV_CMD_GET
62 #define RDMA_NLDEV_CMD_GET 1
64 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
65 #define RDMA_NLDEV_CMD_PORT_GET 5
/* Attribute types decoded by mlx5_nl_cmdget_cb(). */
67 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
68 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
70 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
71 #define RDMA_NLDEV_ATTR_DEV_NAME 2
73 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
74 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
76 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
77 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
80 /* These are normally found in linux/if_link.h. */
/*
 * Fallback values for link attribute types, each guarded by an #ifndef on a
 * matching HAVE_* macro. IFLA_EXT_MASK is sent by mlx5_nl_switch_info();
 * IFLA_PHYS_SWITCH_ID and IFLA_PHYS_PORT_NAME are decoded by
 * mlx5_nl_switch_info_cb().
 * NOTE(review): the matching #endif lines are not visible in this extract.
 */
81 #ifndef HAVE_IFLA_NUM_VF
82 #define IFLA_NUM_VF 21
84 #ifndef HAVE_IFLA_EXT_MASK
85 #define IFLA_EXT_MASK 29
87 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
88 #define IFLA_PHYS_SWITCH_ID 36
90 #ifndef HAVE_IFLA_PHYS_PORT_NAME
91 #define IFLA_PHYS_PORT_NAME 38
94 /* Add/remove MAC address through Netlink */
/*
 * Accumulator handed to mlx5_nl_mac_addr_cb() as its opaque argument: the
 * callback copies each NDA_LLADDR payload into (*mac)[mac_n] and then
 * increments mac_n, refusing to go past MLX5_MAX_MAC_ADDRESSES entries.
 */
95 struct mlx5_nl_mac_addr {
96 struct rte_ether_addr (*mac)[];
97 /**< MAC address handled by the device. */
98 int mac_n; /**< Number of addresses in the array. */
/*
 * Bit flags stored in mlx5_nl_ifindex_data.flags. mlx5_nl_cmdget_cb() sets
 * one bit per attribute it has decoded (IB_NAME only when the device name
 * matches), and the callers test these bits to decide whether a query
 * succeeded.
 */
101 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
102 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
103 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
104 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
106 /** Data structure used by mlx5_nl_cmdget_cb(). */
/*
 * Query context shared between mlx5_nl_ifindex() / mlx5_nl_portnum() and
 * mlx5_nl_cmdget_cb(). The callback overwrites the (out) fields only for a
 * message whose device name attribute equals name.
 */
107 struct mlx5_nl_ifindex_data {
108 const char *name; /**< IB device name (in). */
/* Combination of MLX5_NL_CMD_GET_* bits. */
109 uint32_t flags; /**< found attribute flags (out). */
110 uint32_t ibindex; /**< IB device index (out). */
111 uint32_t ifindex; /**< Network interface index (out). */
112 uint32_t portnum; /**< IB device max port number (out). */
116 * Opens a Netlink socket.
119 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
122 * A file descriptor on success, a negative errno value otherwise and
/*
 * Creates a raw, close-on-exec AF_NETLINK socket for the given protocol,
 * applies the send and receive buffer sizes, and binds it.
 * NOTE(review): the handling of each call's return value and the final
 * return statement are not visible in this extract.
 */
126 mlx5_nl_init(int protocol)
129 int sndbuf_size = MLX5_SEND_BUF_SIZE;
130 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
/* nl_family is the only member set in the visible part of the initializer. */
131 struct sockaddr_nl local = {
132 .nl_family = AF_NETLINK,
136 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
/* Request MLX5_SEND_BUF_SIZE / MLX5_RECV_BUF_SIZE bytes of socket buffering. */
141 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
146 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
151 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
163 * Send a request message to the kernel on the Netlink socket.
166 * Netlink socket file descriptor.
168 * The Netlink message sent to the kernel.
172 * Pointer to the request structure.
174 * Length of the request in bytes.
177 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Sends a Netlink header and a separately stored request payload to the
 * kernel with a single sendmsg() call.
 * NOTE(review): the remaining parameter line, the msghdr members other than
 * msg_namelen and the return statements are not visible in this extract.
 */
181 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
184 struct sockaddr_nl sa = {
185 .nl_family = AF_NETLINK,
/*
 * Two segments: the bare header (sizeof(*nh) bytes, not nh->nlmsg_len) and
 * then len bytes of payload taken from req.
 */
187 struct iovec iov[2] = {
188 { .iov_base = nh, .iov_len = sizeof(*nh), },
189 { .iov_base = req, .iov_len = len, },
191 struct msghdr msg = {
193 .msg_namelen = sizeof(sa),
199 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
/* A negative sendmsg() result is the failure case. */
201 send_bytes = sendmsg(nlsk_fd, &msg, 0);
202 if (send_bytes < 0) {
210 * Send a message to the kernel on the Netlink socket.
213 * The Netlink socket file descriptor used for communication.
215 * The Netlink message sent to the kernel.
220 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Sends one complete Netlink message to the kernel with sendmsg().
 * NOTE(review): the iovec base pointer, most msghdr members and the return
 * statements are not visible in this extract.
 */
224 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
226 struct sockaddr_nl sa = {
227 .nl_family = AF_NETLINK,
/*
 * The transmitted length is nh->nlmsg_len, so header and payload are
 * expected to be laid out contiguously by the caller.
 */
231 .iov_len = nh->nlmsg_len,
233 struct msghdr msg = {
235 .msg_namelen = sizeof(sa),
241 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
/* A negative sendmsg() result is the failure case. */
243 send_bytes = sendmsg(nlsk_fd, &msg, 0);
244 if (send_bytes < 0) {
252 * Receive a message from the kernel on the Netlink socket, following
256 * The Netlink socket file descriptor used for communication.
260 * The callback function to call for each Netlink message received.
261 * @param[in, out] arg
262 * Custom arguments for the callback.
265 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Reads kernel replies for sequence number sn and walks every Netlink
 * message contained in the accepted read.
 * NOTE(review): short lines (braces, returns, simple conditions and the
 * invocation of cb) are missing from this extract; callers in this file pass
 * cb == NULL when they only need the reply status.
 */
268 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
271 struct sockaddr_nl sa;
/* The receive buffer is an automatic array of MLX5_RECV_BUF_SIZE (32768) bytes. */
272 char buf[MLX5_RECV_BUF_SIZE];
275 .iov_len = sizeof(buf),
277 struct msghdr msg = {
279 .msg_namelen = sizeof(sa),
281 /* One message at a time */
/* A recvmsg() result of -1 is the failure case tested here. */
292 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
293 if (recv_bytes == -1) {
/*
 * Only the first header in the buffer is compared against sn; reading is
 * repeated until that header carries the expected sequence number.
 */
297 nh = (struct nlmsghdr *)buf;
298 } while (nh->nlmsg_seq != sn);
/* Step through all messages that fit in the received byte count. */
300 NLMSG_OK(nh, (unsigned int)recv_bytes);
301 nh = NLMSG_NEXT(nh, recv_bytes)) {
/* A negative code in an NLMSG_ERROR message is stored, negated, in rte_errno. */
302 if (nh->nlmsg_type == NLMSG_ERROR) {
303 struct nlmsgerr *err_data = NLMSG_DATA(nh);
305 if (err_data->error < 0) {
306 rte_errno = -err_data->error;
312 /* Multi-part msgs and their trailing DONE message. */
313 if (nh->nlmsg_flags & NLM_F_MULTI) {
314 if (nh->nlmsg_type == NLMSG_DONE)
329 * Parse Netlink message to retrieve the bridge MAC address.
332 * Pointer to Netlink Message Header.
334 * PMD data registered with this callback.
337 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Netlink callback that scans the attributes of a neighbour message and
 * appends every NDA_LLADDR payload to the MAC array referenced by arg.
 * arg is used as a struct mlx5_nl_mac_addr pointer.
 * NOTE(review): declarations of len and m, and the return statements, are
 * not visible in this extract.
 */
340 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
342 struct mlx5_nl_mac_addr *data = arg;
343 struct ndmsg *r = NLMSG_DATA(nh);
344 struct rtattr *attribute;
/* Number of attribute bytes that follow the ndmsg header. */
347 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
348 for (attribute = MLX5_NDA_RTA(r);
349 RTA_OK(attribute, len);
350 attribute = RTA_NEXT(attribute, len)) {
351 if (attribute->rta_type == NDA_LLADDR) {
/* The array is treated as full once it holds MLX5_MAX_MAC_ADDRESSES entries. */
352 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
354 "not enough room to finalize the"
/* NOTE(review): 18 is presumably the size of m, whose declaration is not visible - confirm. */
362 rte_ether_format_addr(m, 18, RTA_DATA(attribute));
363 DRV_LOG(DEBUG, "bridge MAC address %s", m);
/* Store the address in the next free slot and advance the count. */
365 memcpy(&(*data->mac)[data->mac_n++],
366 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
373 * Get bridge MAC addresses.
376 * Pointer to Ethernet device.
378 * Pointer to the array table of MAC addresses to fill.
379 * It must have room for MLX5_MAX_MAC_ADDRESSES entries.
381 * Number of entries filled in MAC array.
384 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Requests a dump (RTM_GETNEIGH with NLM_F_DUMP) of the PF_BRIDGE neighbour
 * entries of the device's interface over priv->nl_socket_route and collects
 * the returned addresses through mlx5_nl_mac_addr_cb().
 * NOTE(review): the last parameter, the return-value checks and the return
 * statements are not visible in this extract.
 */
387 mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr (*mac)[],
390 struct mlx5_priv *priv = dev->data->dev_private;
391 unsigned int iface_idx = mlx5_ifindex(dev);
394 struct ifinfomsg ifm;
397 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
398 .nlmsg_type = RTM_GETNEIGH,
399 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
402 .ifi_family = PF_BRIDGE,
403 .ifi_index = iface_idx,
406 struct mlx5_nl_mac_addr data = {
/* Each request consumes one sequence number from priv->nl_sn. */
412 uint32_t sn = priv->nl_sn++;
/* A socket value of -1 gets special handling; the action taken is not visible here. */
414 if (priv->nl_socket_route == -1)
416 fd = priv->nl_socket_route;
/* Header and ifinfomsg payload are handed to mlx5_nl_request() separately. */
417 ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
418 sizeof(struct ifinfomsg));
421 ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data);
/* The failure is reported at DEBUG level together with the rte_errno text. */
427 DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
428 dev->data->port_id, strerror(rte_errno));
433 * Modify the MAC address neighbour table with Netlink.
436 * Pointer to Ethernet device.
438 * MAC address to consider.
440 * 1 to add the MAC address, 0 to remove the MAC address.
443 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Adds (RTM_NEWNEIGH) or removes (RTM_DELNEIGH) a PF_BRIDGE neighbour entry
 * for mac on the device's interface, then reads the kernel reply.
 * NOTE(review): the last parameter, parts of the request structure, the
 * return-value checks and the return statements are not visible here.
 */
446 mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
449 struct mlx5_priv *priv = dev->data->dev_private;
450 unsigned int iface_idx = mlx5_ifindex(dev);
/*
 * Storage for the address bytes.
 * NOTE(review): presumably placed right after the rta member so that
 * RTA_DATA(&req.rta) points at it - confirm against the full structure.
 */
455 uint8_t buffer[RTE_ETHER_ADDR_LEN];
458 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
/* NLM_F_ACK asks the kernel for a reply, which is read below. */
459 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
460 NLM_F_EXCL | NLM_F_ACK,
461 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
464 .ndm_family = PF_BRIDGE,
465 .ndm_state = NUD_NOARP | NUD_PERMANENT,
466 .ndm_ifindex = iface_idx,
467 .ndm_flags = NTF_SELF,
470 .rta_type = NDA_LLADDR,
471 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
/* Each request consumes one sequence number from priv->nl_sn. */
476 uint32_t sn = priv->nl_sn++;
/* A socket value of -1 gets special handling; the action taken is not visible here. */
478 if (priv->nl_socket_route == -1)
480 fd = priv->nl_socket_route;
/* Fill the attribute payload, then extend the message by the aligned attribute size. */
481 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
482 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
483 RTA_ALIGN(req.rta.rta_len);
484 ret = mlx5_nl_send(fd, &req.hdr, sn);
/* No callback: only the status of the reply matters. */
487 ret = mlx5_nl_recv(fd, sn, NULL, NULL);
493 "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
496 add ? "add" : "remove",
497 mac->addr_bytes[0], mac->addr_bytes[1],
498 mac->addr_bytes[2], mac->addr_bytes[3],
499 mac->addr_bytes[4], mac->addr_bytes[5],
500 strerror(rte_errno));
505 * Modify the VF MAC address neighbour table with Netlink.
508 * Pointer to Ethernet device.
510 * MAC address to consider.
515 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Builds a route Netlink message carrying IFLA_VFINFO_LIST, IFLA_VF_INFO and
 * IFLA_VF_MAC attributes for the given MAC address, sends it on
 * priv->nl_socket_route and reads the reply.
 * NOTE(review): the initializer of the local ivm, the second argument of
 * both RTE_PTR_DIFF() calls, the return-value checks and the return
 * statements are not visible in this extract.
 */
518 mlx5_nl_vf_mac_addr_modify(struct rte_eth_dev *dev,
519 struct rte_ether_addr *mac, int vf_index)
522 struct mlx5_priv *priv = dev->data->dev_private;
523 unsigned int iface_idx = mlx5_ifindex(dev);
/* Request layout: link header followed by three attribute headers and the VF MAC payload. */
526 struct ifinfomsg ifm;
527 struct rtattr vf_list_rta;
528 struct rtattr vf_info_rta;
529 struct rtattr vf_mac_rta;
530 struct ifla_vf_mac ivm;
533 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
534 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
/*
 * NOTE(review): RTM_BASE presumably has the same numeric value as
 * RTM_NEWLINK in linux/rtnetlink.h; naming the intended message type
 * would be clearer - confirm.
 */
535 .nlmsg_type = RTM_BASE,
538 .ifi_index = iface_idx,
541 .rta_type = IFLA_VFINFO_LIST,
542 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
545 .rta_type = IFLA_VF_INFO,
546 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
549 .rta_type = IFLA_VF_MAC,
/* Each request consumes one sequence number from priv->nl_sn. */
552 uint32_t sn = priv->nl_sn++;
/* Local staging copy, distinct from the ivm member of the request. */
553 struct ifla_vf_mac ivm = {
557 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
/* Copy the staged value into the payload area that follows vf_mac_rta. */
558 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
560 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
/* Total length: base message plus the three aligned attributes. */
561 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
562 RTA_ALIGN(req.vf_list_rta.rta_len) +
563 RTA_ALIGN(req.vf_info_rta.rta_len) +
564 RTA_ALIGN(req.vf_mac_rta.rta_len);
/* The two outer attributes get their length from a distance to the message tail. */
565 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
567 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
570 fd = priv->nl_socket_route;
573 ret = mlx5_nl_send(fd, &req.hdr, sn);
/* No callback: only the status of the reply matters. */
576 ret = mlx5_nl_recv(fd, sn, NULL, NULL);
582 "representor %u cannot set VF MAC address "
583 "%02X:%02X:%02X:%02X:%02X:%02X : %s",
585 mac->addr_bytes[0], mac->addr_bytes[1],
586 mac->addr_bytes[2], mac->addr_bytes[3],
587 mac->addr_bytes[4], mac->addr_bytes[5],
588 strerror(rte_errno));
596 * Pointer to Ethernet device.
598 * MAC address to register.
603 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Registers mac through mlx5_nl_mac_addr_modify() with add = 1 and records
 * ownership of the entry by setting bit index in priv->mac_own.
 * NOTE(review): the last parameter and the condition under which the bit is
 * set are not visible in this extract.
 */
606 mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
609 struct mlx5_priv *priv = dev->data->dev_private;
612 ret = mlx5_nl_mac_addr_modify(dev, mac, 1);
614 BITFIELD_SET(priv->mac_own, index);
621 * Remove a MAC address.
624 * Pointer to Ethernet device.
626 * MAC address to remove.
631 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Clears the ownership bit for index in priv->mac_own and then removes mac
 * through mlx5_nl_mac_addr_modify() with add = 0, returning its result.
 * NOTE(review): the last parameter line is not visible in this extract.
 */
634 mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
637 struct mlx5_priv *priv = dev->data->dev_private;
/* The bit is reset before the Netlink request is issued, so it stays cleared even if that request fails. */
639 BITFIELD_RESET(priv->mac_own, index);
640 return mlx5_nl_mac_addr_modify(dev, mac, 0);
644 * Synchronize Netlink bridge table to the internal table.
647 * Pointer to Ethernet device.
/*
 * Fetches the bridge MAC addresses with mlx5_nl_mac_addr_list() and copies
 * each one that is not yet present in dev->data->mac_addrs into that table.
 * NOTE(review): local declarations, the check of ret and the loop exits
 * (break/continue) are not visible in this extract.
 */
650 mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
/* Scratch table sized for the maximum number of addresses. */
652 struct rte_ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
657 ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n);
660 for (i = 0; i != macs_n; ++i) {
663 /* Verify the address is not in the array yet. */
664 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
665 if (rte_is_same_ether_addr(&macs[i],
666 &dev->data->mac_addrs[j]))
/* j stopped before the end of the table, i.e. a matching entry was found. */
668 if (j != MLX5_MAX_MAC_ADDRESSES)
670 /* Find the first entry available. */
/* An all-zero address marks a free slot. */
671 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
672 if (rte_is_zero_ether_addr(&dev->data->mac_addrs[j])) {
673 dev->data->mac_addrs[j] = macs[i];
681 * Flush all added MAC addresses.
684 * Pointer to Ethernet device.
/*
 * Walks the MAC table from the last index down to 0 and removes, through
 * mlx5_nl_mac_addr_remove(), every entry whose bit is set in priv->mac_own.
 * The result of each removal is not checked.
 */
687 mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
689 struct mlx5_priv *priv = dev->data->dev_private;
/* NOTE(review): the i >= 0 condition needs i to be a signed type; its declaration is not visible here. */
692 for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
693 struct rte_ether_addr *m = &dev->data->mac_addrs[i];
695 if (BITFIELD_ISSET(priv->mac_own, i))
696 mlx5_nl_mac_addr_remove(dev, m, i);
701 * Enable promiscuous / all multicast mode through Netlink.
704 * Pointer to Ethernet device structure.
706 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
708 * Nonzero to enable, disable otherwise.
711 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends an RTM_NEWLINK request for the device's interface whose ifi_flags is
 * either the requested flags (enable) or 0 (disable).
 * NOTE(review): other ifinfomsg members, the handling of ret and the return
 * statements are not visible in this extract.
 */
714 mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
716 struct mlx5_priv *priv = dev->data->dev_private;
717 unsigned int iface_idx = mlx5_ifindex(dev);
720 struct ifinfomsg ifi;
723 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
724 .nlmsg_type = RTM_NEWLINK,
/* NLM_F_ACK is not set and no mlx5_nl_recv() call is visible in this function. */
725 .nlmsg_flags = NLM_F_REQUEST,
728 .ifi_flags = enable ? flags : 0,
730 .ifi_index = iface_idx,
/* Only IFF_PROMISC and IFF_ALLMULTI are accepted. */
736 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
/* Any negative socket value gets special handling; the action taken is not visible here. */
737 if (priv->nl_socket_route < 0)
739 fd = priv->nl_socket_route;
/* The sequence number is taken from priv->nl_sn, which is then incremented. */
740 ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
747 * Enable promiscuous mode through Netlink.
750 * Pointer to Ethernet device structure.
752 * Nonzero to enable, disable otherwise.
755 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper that calls mlx5_nl_device_flags() with IFF_PROMISC and
 * reports a failure in a log message naming the port and the requested
 * action.
 * NOTE(review): the test of ret, the log level and the return statement are
 * not visible in this extract.
 */
758 mlx5_nl_promisc(struct rte_eth_dev *dev, int enable)
760 int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable);
764 "port %u cannot %s promisc mode: Netlink error %s",
765 dev->data->port_id, enable ? "enable" : "disable",
766 strerror(rte_errno));
771 * Enable all multicast mode through Netlink.
774 * Pointer to Ethernet device structure.
776 * Nonzero to enable, disable otherwise.
779 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper that calls mlx5_nl_device_flags() with IFF_ALLMULTI and
 * reports a failure in a log message naming the port and the requested
 * action.
 * NOTE(review): the test of ret, the log level and the return statement are
 * not visible in this extract.
 */
782 mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
784 int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable);
788 "port %u cannot %s allmulti mode: Netlink error %s",
789 dev->data->port_id, enable ? "enable" : "disable",
790 strerror(rte_errno));
795 * Process network interface information from Netlink message.
798 * Pointer to Netlink message header.
800 * Opaque data pointer for this callback.
803 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Netlink callback for RDMA device queries. It decodes the attributes of one
 * message into a local record and publishes that record to the query
 * context (arg, a struct mlx5_nl_ifindex_data) only when the message's
 * device name equals data->name.
 * NOTE(review): break statements, error exits and returns are not visible in
 * this extract.
 */
806 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
808 struct mlx5_nl_ifindex_data *data = arg;
809 struct mlx5_nl_ifindex_data local = {
/* Attributes start right after the Netlink header. */
812 size_t off = NLMSG_HDRLEN;
/* Only replies to the GET and PORT_GET commands are of interest. */
814 if (nh->nlmsg_type !=
815 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
817 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
819 while (off < nh->nlmsg_len) {
820 struct nlattr *na = (void *)((uintptr_t)nh + off);
821 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
/*
 * Detects an attribute that claims more bytes than remain in the message.
 * NOTE(review): no lower bound on nla_len is visible; a zero length would
 * leave off unchanged at the bottom of the loop - confirm.
 */
823 if (na->nla_len > nh->nlmsg_len - off)
825 switch (na->nla_type) {
826 case RDMA_NLDEV_ATTR_DEV_INDEX:
827 local.ibindex = *(uint32_t *)payload;
828 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
/* The name flag is set only when the payload string equals the requested name. */
830 case RDMA_NLDEV_ATTR_DEV_NAME:
831 if (!strcmp(payload, data->name))
832 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
834 case RDMA_NLDEV_ATTR_NDEV_INDEX:
835 local.ifindex = *(uint32_t *)payload;
836 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
838 case RDMA_NLDEV_ATTR_PORT_INDEX:
839 local.portnum = *(uint32_t *)payload;
840 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
/* Advance to the next attribute, honouring attribute alignment. */
845 off += NLA_ALIGN(na->nla_len);
848 * It is possible to have multiple messages for all
849 * Infiniband devices in the system with appropriate name.
850 * So we should gather parameters locally and copy to
851 * query context only in case of coinciding device name.
853 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
854 data->flags = local.flags;
855 data->ibindex = local.ibindex;
856 data->ifindex = local.ifindex;
857 data->portnum = local.portnum;
866 * Get index of network interface associated with some IB device.
868 * This is the only somewhat safe method to avoid resorting to heuristics
869 * when faced with port representors. Unfortunately it requires at least
873 * Netlink socket of the RDMA kind (NETLINK_RDMA).
877 * IB device port index, starting from 1
879 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
/*
 * Resolves the network interface index of an IB device port in two passes
 * over the RDMA Netlink socket: a dump request that identifies the device by
 * name and yields its IB index, then an RDMA_NLDEV_CMD_PORT_GET request
 * carrying that index and the port index.
 * NOTE(review): the command of the first request, the checks of ret, the
 * statements between the two passes and the return statements are not
 * visible in this extract.
 */
883 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
/* The starting sequence number comes from random(). */
885 uint32_t seq = random();
886 struct mlx5_nl_ifindex_data data = {
889 .ibindex = 0, /* Determined during first pass. */
890 .ifindex = 0, /* Determined during second pass. */
/* Room for the header plus two attributes of 32 bits each. */
894 uint8_t buf[NLMSG_HDRLEN +
895 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
896 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
/* First pass: header-only dump request. */
899 .nlmsg_len = NLMSG_LENGTH(0),
900 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
902 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
908 ret = mlx5_nl_send(nl, &req.nh, seq);
911 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
/* Both the matching name and the IB index must have been reported. */
914 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
915 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
/* Second pass: non-dump PORT_GET request that uses the whole buffer. */
919 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
920 RDMA_NLDEV_CMD_PORT_GET);
921 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
922 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
/* First attribute: IB device index obtained in the first pass. */
923 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
924 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
925 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
926 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
927 &data.ibindex, sizeof(data.ibindex));
/* Second attribute: requested port index. */
928 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
929 na->nla_len = NLA_HDRLEN + sizeof(pindex);
930 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
931 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
932 &pindex, sizeof(pindex));
/* NOTE(review): whether seq is changed between the two passes is not visible here. */
933 ret = mlx5_nl_send(nl, &req.nh, seq);
936 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
/* The final result additionally requires the network interface index. */
939 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
940 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
941 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
951 * Get the number of physical ports of given IB device.
954 * Netlink socket of the RDMA kind (NETLINK_RDMA).
959 * A valid (nonzero) number of ports on success, 0 otherwise
960 * and rte_errno is set.
/*
 * Queries the RDMA Netlink socket with a header-only dump request and lets
 * mlx5_nl_cmdget_cb() collect the data of the device called name. The query
 * counts as successful only when name, IB index and port index were all
 * reported.
 * NOTE(review): the command of the request, the checks of ret and the return
 * statements are not visible in this extract.
 */
963 mlx5_nl_portnum(int nl, const char *name)
/* The sequence number comes from random(). */
965 uint32_t seq = random();
966 struct mlx5_nl_ifindex_data data = {
972 struct nlmsghdr req = {
973 .nlmsg_len = NLMSG_LENGTH(0),
974 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
976 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
980 ret = mlx5_nl_send(nl, &req, seq);
983 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
986 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
987 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
988 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
998 * Process switch information from Netlink message.
1001 * Pointer to Netlink message header.
1003 * Opaque data pointer for this callback.
1006 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Netlink callback that parses the attributes of an RTM_NEWLINK message,
 * extracting the physical port name and the physical switch id into a local
 * struct mlx5_switch_info, which is then copied to arg.
 * NOTE(review): some switch cases, break statements, error exits and returns
 * are not visible in this extract.
 */
1009 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1011 struct mlx5_switch_info info = {
1014 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
/* Attributes start after the Netlink header and the ifinfomsg structure. */
1018 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1019 bool switch_id_set = false;
1020 bool num_vf_set = false;
1022 if (nh->nlmsg_type != RTM_NEWLINK)
1024 while (off < nh->nlmsg_len) {
1025 struct rtattr *ra = (void *)((uintptr_t)nh + off);
1026 void *payload = RTA_DATA(ra);
/*
 * Detects an attribute that claims more bytes than remain in the message.
 * NOTE(review): no lower bound on rta_len is visible; a zero length would
 * leave off unchanged at the bottom of the loop - confirm.
 */
1029 if (ra->rta_len > nh->nlmsg_len - off)
1031 switch (ra->rta_type) {
/* The port name string is interpreted by mlx5_translate_port_name(). */
1035 case IFLA_PHYS_PORT_NAME:
1036 mlx5_translate_port_name((char *)payload, &info);
/* The switch id is assembled byte by byte, first payload byte most significant. */
1038 case IFLA_PHYS_SWITCH_ID:
1040 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1041 info.switch_id <<= 8;
1042 info.switch_id |= ((uint8_t *)payload)[i];
1044 switch_id_set = true;
/* Advance to the next attribute, honouring attribute alignment. */
1047 off += RTA_ALIGN(ra->rta_len);
1049 if (switch_id_set) {
1050 /* We have some E-Switch configuration. */
1051 mlx5_nl_check_switch_info(num_vf_set, &info);
/* A device must not be classified as both master and representor. */
1053 assert(!(info.master && info.representor));
1054 memcpy(arg, &info, sizeof(info));
1062 * Get switch information associated with network interface.
1065 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1067 * Network interface index.
1069 * Switch information object, populated in case of success.
1072 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends an RTM_GETLINK request with an IFLA_EXT_MASK attribute for ifindex
 * and lets mlx5_nl_switch_info_cb() fill *info from the reply.
 * NOTE(review): parts of the request structure, the checks of ret and the
 * return statements are not visible in this extract.
 */
1075 mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
/* The sequence number comes from random(). */
1077 uint32_t seq = random();
/* This member of the request structure shares its name with the info parameter. */
1080 struct ifinfomsg info;
1085 .nlmsg_len = NLMSG_LENGTH
1087 RTA_LENGTH(sizeof(uint32_t))),
1088 .nlmsg_type = RTM_GETLINK,
1089 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1092 .ifi_family = AF_UNSPEC,
1093 .ifi_index = ifindex,
1096 .rta_type = IFLA_EXT_MASK,
1097 .rta_len = RTA_LENGTH(sizeof(int32_t)),
/* NOTE(review): the value 1 presumably corresponds to RTEXT_FILTER_VF - confirm. */
1099 .extmask = RTE_LE32(1),
1103 ret = mlx5_nl_send(nl, &req.nh, seq);
1105 ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
/* Being both master and representor is contradictory and reported as an error. */
1106 if (info->master && info->representor) {
1107 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1108 " and as representor", ifindex);
1116 * Delete VLAN network device by ifindex.
1119 * Context object initialized by mlx5_vlan_vmwa_init().
1120 * @param[in] ifindex
1121 * Interface index of network device to delete.
/*
 * Asks the kernel to delete the network device with the given ifindex by
 * sending RTM_DELLINK on the context's Netlink socket and reading the reply.
 * A failure is only logged, at WARNING level.
 * NOTE(review): the ifindex parameter line, the sequence number handling,
 * the remaining mlx5_nl_recv() arguments and the conditions around the calls
 * are not visible in this extract.
 */
1124 mlx5_vlan_vmwa_delete(struct mlx5_vlan_vmwa_context *vmwa,
1130 struct ifinfomsg info;
1133 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1134 .nlmsg_type = RTM_DELLINK,
1135 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1138 .ifi_family = AF_UNSPEC,
1139 .ifi_index = ifindex,
1147 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, vmwa->nl_sn);
1149 ret = mlx5_nl_recv(vmwa->nl_socket,
1153 DRV_LOG(WARNING, "netlink: error deleting"
1154 " VLAN WA ifindex %u, %d",
1159 /* Set of subroutines to build Netlink message. */
/**
 * Locate the first free, aligned byte after the current message contents.
 *
 * @param[in] nlh
 *   Netlink message header; nlmsg_len holds the bytes used so far.
 *
 * @return
 *   Position at which the next attribute has to be written.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	return (struct nlattr *)
		(((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
}

/**
 * Append one attribute to a Netlink message.
 *
 * The caller must provide a buffer large enough for the aligned attribute.
 *
 * @param[in, out] nlh
 *   Netlink message header, nlmsg_len is advanced past the new attribute.
 * @param[in] type
 *   Attribute type.
 * @param[in] data
 *   Attribute payload, may be NULL when alen is 0.
 * @param[in] alen
 *   Payload length in bytes.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *nla = nl_msg_tail(nlh);

	nla->nla_type = type;
	/*
	 * nla_len covers the attribute header and the payload only. The
	 * padding that aligns the next attribute is not part of it, otherwise
	 * the kernel sees e.g. a 2-byte value as a 4-byte one.
	 */
	nla->nla_len = NLA_HDRLEN + alen;
	/* The message itself grows by the padded attribute size. */
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) +
			 NLA_ALIGN(nla->nla_len);
	/* Nest headers are written with no payload and a NULL data pointer. */
	if (alen > 0)
		memcpy((uint8_t *)nla + NLA_HDRLEN, data, alen);
}

/**
 * Open a nested attribute.
 *
 * @param[in, out] nlh
 *   Netlink message header.
 * @param[in] type
 *   Type of the nested attribute.
 *
 * @return
 *   The nest header, to be handed to nl_attr_nest_end() once all the
 *   inner attributes have been appended.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *nest = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return nest;
}

/**
 * Close a nested attribute.
 *
 * @param[in] nlh
 *   Netlink message header.
 * @param[in, out] nest
 *   Nest header returned by nl_attr_nest_start(); its length is set to
 *   span everything appended since then.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
}
1196 * Create network VLAN device with specified VLAN tag.
1199 * Context object initialized by mlx5_vlan_vmwa_init().
1200 * @param[in] ifindex
1201 * Base network interface index.
1203 * VLAN tag for VLAN network device to create.
/*
 * Builds and sends an RTM_NEWLINK request that creates a "vlan" kind device
 * named "<MLX5_VMWA_VLAN_DEVICE_PFX>.<ifindex>.<tag>" on top of the base
 * interface, then looks up the index of the resulting device by name.
 * NOTE(review): the remaining parameters, the checks of ret, the sequence
 * number handling and the return statements are not visible in this extract.
 */
1206 mlx5_vlan_vmwa_create(struct mlx5_vlan_vmwa_context *vmwa,
1210 struct nlmsghdr *nlh;
1211 struct ifinfomsg *ifm;
1212 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
/* Message buffer sized for the headers, eight attribute headers and all payloads, plus 16 spare bytes. */
1214 alignas(RTE_CACHE_LINE_SIZE)
1215 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1216 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1217 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1218 NLMSG_ALIGN(sizeof(uint32_t)) +
1219 NLMSG_ALIGN(sizeof(name)) +
1220 NLMSG_ALIGN(sizeof("vlan")) +
1221 NLMSG_ALIGN(sizeof(uint32_t)) +
1222 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1223 struct nlattr *na_info;
1224 struct nlattr *na_vlan;
1227 memset(buf, 0, sizeof(buf));
/* The message is assembled in place, starting with the Netlink header. */
1231 nlh = (struct nlmsghdr *)buf;
1232 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1233 nlh->nlmsg_type = RTM_NEWLINK;
1234 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1235 NLM_F_EXCL | NLM_F_ACK;
/* The ifinfomsg structure follows the header. */
1236 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1237 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1238 ifm->ifi_family = AF_UNSPEC;
1241 ifm->ifi_flags = IFF_UP;
1242 ifm->ifi_change = 0xffffffff;
1243 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
/*
 * The snprintf() result is used as the name length (plus terminator).
 * NOTE(review): no truncation check is visible; the name buffer leaves 32
 * bytes beyond the prefix for the two numbers and separators - confirm this
 * is always sufficient for the argument types.
 */
1244 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1245 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1246 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
/* IFLA_LINKINFO nest: device kind "vlan" plus an IFLA_INFO_DATA nest holding the VLAN id. */
1247 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1248 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1249 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1250 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1251 nl_attr_nest_end(nlh, na_vlan);
1252 nl_attr_nest_end(nlh, na_info);
/* The size check runs after the attributes have already been written into buf. */
1253 assert(sizeof(buf) >= nlh->nlmsg_len);
1254 ret = mlx5_nl_send(vmwa->nl_socket, nlh, vmwa->nl_sn);
1256 ret = mlx5_nl_recv(vmwa->nl_socket, vmwa->nl_sn, NULL, NULL);
1259 "netlink: VLAN %s create failure (%d)",
1262 /* Try to get ifindex of created or pre-existing device. */
1263 ret = if_nametoindex(name);
1266 "VLAN %s failed to get index (%d)",
1274 * Release VLAN network device, created for VM workaround.
1277 * Ethernet device object, Netlink context provider.
1279 * Object representing the network device to release.
/*
 * Drops one reference on the VLAN device identified by vlan->tag and deletes
 * that device through mlx5_vlan_vmwa_delete() once the count reaches zero.
 * NOTE(review): the early return and the statements that update vlan itself
 * are not visible in this extract.
 */
1281 void mlx5_vlan_vmwa_release(struct rte_eth_dev *dev,
1282 struct mlx5_vf_vlan *vlan)
1284 struct mlx5_priv *priv = dev->data->dev_private;
1285 struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
/* NOTE(review): this address is formed from vmwa before the !vmwa test below. */
1286 struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
1288 assert(vlan->created);
1289 assert(priv->vmwa_context);
/* Run-time guard for the same conditions the assertions check. */
1290 if (!vlan->created || !vmwa)
1293 assert(vlan_dev[vlan->tag].refcnt);
/* The device is deleted only when the last reference goes away and an ifindex is recorded. */
1294 if (--vlan_dev[vlan->tag].refcnt == 0 &&
1295 vlan_dev[vlan->tag].ifindex) {
1296 mlx5_vlan_vmwa_delete(vmwa, vlan_dev[vlan->tag].ifindex);
1297 vlan_dev[vlan->tag].ifindex = 0;
1302 * Acquire VLAN interface with specified tag for VM workaround.
1305 * Ethernet device object, Netlink context provider.
1307 * Object representing the network device to acquire.
/*
 * Takes one reference on the VLAN device identified by vlan->tag, creating
 * the device through mlx5_vlan_vmwa_create() when no reference exists yet.
 * NOTE(review): the early return, the arguments of the create call and the
 * statements that update vlan itself are not visible in this extract.
 */
1309 void mlx5_vlan_vmwa_acquire(struct rte_eth_dev *dev,
1310 struct mlx5_vf_vlan *vlan)
1312 struct mlx5_priv *priv = dev->data->dev_private;
1313 struct mlx5_vlan_vmwa_context *vmwa = priv->vmwa_context;
/* NOTE(review): this address is formed from vmwa before the !vmwa test below. */
1314 struct mlx5_vlan_dev *vlan_dev = &vmwa->vlan_dev[0];
1316 assert(!vlan->created);
1317 assert(priv->vmwa_context);
/* Run-time guard for the same conditions the assertions check. */
1318 if (vlan->created || !vmwa)
/* First user of this tag: the device has to be created. */
1320 if (vlan_dev[vlan->tag].refcnt == 0) {
1321 assert(!vlan_dev[vlan->tag].ifindex);
1322 vlan_dev[vlan->tag].ifindex =
1323 mlx5_vlan_vmwa_create(vmwa,
/* A reference is counted only when an ifindex is recorded for the tag. */
1327 if (vlan_dev[vlan->tag].ifindex) {
1328 vlan_dev[vlan->tag].refcnt++;
1334 * Create per ethernet device VLAN VM workaround context
/*
 * Allocates and initializes the VLAN workaround context of an Ethernet
 * device: a zeroed structure, a NETLINK_ROUTE socket, a random starting
 * sequence number and the given interface index.
 * NOTE(review): the ifindex parameter line, the conditions that skip the
 * workaround, the error exits and the return statements are not visible in
 * this extract.
 */
1336 struct mlx5_vlan_vmwa_context *
1337 mlx5_vlan_vmwa_init(struct rte_eth_dev *dev,
1340 struct mlx5_priv *priv = dev->data->dev_private;
1341 struct mlx5_dev_config *config = &priv->config;
1342 struct mlx5_vlan_vmwa_context *vmwa;
1343 enum rte_hypervisor hv_type;
1345 /* Do not engage workaround over PF. */
1348 /* Check whether there is desired virtual environment */
1349 hv_type = rte_hypervisor_get();
/* The two hypervisor types listed here are the ones that engage the workaround. */
1351 case RTE_HYPERVISOR_UNKNOWN:
1352 case RTE_HYPERVISOR_VMWARE:
1354 * The "white list" of configurations
1355 * to engage the workaround.
1360 * The configuration is not found in the "white list".
1361 * We should not engage the VLAN workaround.
/* Zero-initialized allocation, aligned to sizeof(uint32_t). */
1365 vmwa = rte_zmalloc(__func__, sizeof(*vmwa), sizeof(uint32_t));
1368 "Can not allocate memory"
1369 " for VLAN workaround context");
1372 vmwa->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
/* mlx5_nl_init() reports failure with a negative value. */
1373 if (vmwa->nl_socket < 0) {
1375 "Can not create Netlink socket"
1376 " for VLAN workaround context");
1380 vmwa->nl_sn = random();
1381 vmwa->vf_ifindex = ifindex;
1383 /* Cleanup for existing VLAN devices. */
1388 * Destroy per ethernet device VLAN VM workaround context
/*
 * Tears down a VLAN workaround context: deletes every VLAN device that still
 * has an ifindex recorded and closes the Netlink socket.
 * NOTE(review): the declaration of i and any release of the context memory
 * are not visible in this extract.
 */
1390 void mlx5_vlan_vmwa_exit(struct mlx5_vlan_vmwa_context *vmwa)
1394 /* Delete all remaining VLAN devices. */
1395 for (i = 0; i < RTE_DIM(vmwa->vlan_dev); i++) {
1396 if (vmwa->vlan_dev[i].ifindex)
1397 mlx5_vlan_vmwa_delete(vmwa, vmwa->vlan_dev[i].ifindex);
/* The socket is closed only when it holds a valid (non-negative) descriptor. */
1399 if (vmwa->nl_socket >= 0)
1400 close(vmwa->nl_socket);