1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
10 #include <rdma/rdma_netlink.h>
16 #include <sys/socket.h>
19 #include <rte_errno.h>
20 #include <rte_atomic.h>
23 #include "mlx5_common_utils.h"
/* The three buffer-size macros below all evaluate to 32768 bytes. */
25 /* Size of the buffer to receive kernel messages */
26 #define MLX5_NL_BUF_SIZE (32 * 1024)
27 /* Send buffer size for the Netlink socket */
28 #define MLX5_SEND_BUF_SIZE 32768
29 /* Receive buffer size for the Netlink socket */
30 #define MLX5_RECV_BUF_SIZE 32768
32 /** Parameters of VLAN devices created by driver. */
/* Name prefix; mlx5_nl_vlan_vmwa_create() formats names as "<prefix>.<ifindex>.<tag>". */
33 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
35 * Define NDA_RTA as defined in iproute2 sources.
37 * See file include/libnetlink.h in the iproute2 sources.
/* Address of the first rtattr behind an ndmsg at r: r plus the aligned size of struct ndmsg. */
40 #define MLX5_NDA_RTA(r) \
41 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
44 * Define NLMSG_TAIL as defined in iproute2 sources.
46 * See file include/libnetlink.h in the iproute2 sources.
/* Address located NLMSG_ALIGN(nlmsg_len) bytes past the start of header nmsg, as an rtattr pointer. */
49 #define NLMSG_TAIL(nmsg) \
50 ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
53 * The following definitions are normally found in rdma/rdma_netlink.h,
54 * however they are so recent that most systems do not expose them yet.
/* Each RDMA constant below is defined here only when its HAVE_* macro is not defined. */
56 #ifndef HAVE_RDMA_NL_NLDEV
57 #define RDMA_NL_NLDEV 5
59 #ifndef HAVE_RDMA_NLDEV_CMD_GET
60 #define RDMA_NLDEV_CMD_GET 1
62 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
63 #define RDMA_NLDEV_CMD_PORT_GET 5
/* Attribute types matched by mlx5_nl_cmdget_cb(). */
65 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
66 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
68 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
69 #define RDMA_NLDEV_ATTR_DEV_NAME 2
71 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
72 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
74 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
75 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
78 /* These are normally found in linux/if_link.h. */
/* Each IFLA constant below is defined here only when its HAVE_* macro is not defined. */
79 #ifndef HAVE_IFLA_NUM_VF
80 #define IFLA_NUM_VF 21
/* Attribute type placed in the request built by mlx5_nl_switch_info(). */
82 #ifndef HAVE_IFLA_EXT_MASK
83 #define IFLA_EXT_MASK 29
/* The two attribute types below are parsed by mlx5_nl_switch_info_cb(). */
85 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
86 #define IFLA_PHYS_SWITCH_ID 36
88 #ifndef HAVE_IFLA_PHYS_PORT_NAME
89 #define IFLA_PHYS_PORT_NAME 38
92 /* Add/remove MAC address through Netlink */
/*
 * Callback argument of mlx5_nl_mac_addr_cb(): the callback copies each
 * address it finds into (*mac)[mac_n] and then increments mac_n.
 */
93 struct mlx5_nl_mac_addr {
94 struct rte_ether_addr (*mac)[];
95 /**< MAC address handled by the device. */
96 int mac_n; /**< Number of addresses in the array. */
/*
 * Bits OR-ed into mlx5_nl_ifindex_data.flags by mlx5_nl_cmdget_cb().
 * IB_NAME is set only when the device-name attribute equals the requested
 * name; the other bits are set whenever the matching attribute is present.
 */
99 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
100 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
101 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
102 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
104 /** Data structure used by mlx5_nl_cmdget_cb(). */
/*
 * The (out) fields are overwritten by mlx5_nl_cmdget_cb() only for a message
 * whose device name matched `name`.
 */
105 struct mlx5_nl_ifindex_data {
106 const char *name; /**< IB device name (in). */
107 uint32_t flags; /**< found attribute flags (out). */
108 uint32_t ibindex; /**< IB device index (out). */
109 uint32_t ifindex; /**< Network interface index (out). */
110 uint32_t portnum; /**< IB device max port number (out). */
/*
 * Counter behind MLX5_NL_SN_GENERATE, starting at 0.
 * NOTE(review): declared without `static`, so the symbol has external
 * linkage -- confirm whether any other translation unit relies on it.
 */
113 rte_atomic32_t atomic_sn = RTE_ATOMIC32_INIT(0);
115 /* Generate Netlink sequence number. */
/* Each expansion calls rte_atomic32_add_return(&atomic_sn, 1) and casts the result to uint32_t. */
116 #define MLX5_NL_SN_GENERATE ((uint32_t)rte_atomic32_add_return(&atomic_sn, 1))
119 * Opens a Netlink socket.
122 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
125 * A file descriptor on success, a negative errno value otherwise and
/*
 * Visible steps: create a raw, close-on-exec AF_NETLINK socket for
 * `protocol`, set its SO_SNDBUF and SO_RCVBUF sizes, then bind it to a
 * local Netlink address.
 */
129 mlx5_nl_init(int protocol)
132 int sndbuf_size = MLX5_SEND_BUF_SIZE;
133 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
134 struct sockaddr_nl local = {
135 .nl_family = AF_NETLINK,
139 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
/* Requested buffer sizes come from MLX5_SEND_BUF_SIZE / MLX5_RECV_BUF_SIZE. */
144 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
149 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
154 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
166 * Send a request message to the kernel on the Netlink socket.
169 * Netlink socket file descriptor.
171 * The Netlink message sent to the kernel.
175 * Pointer to the request structure.
177 * Length of the request in bytes.
180 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Sends the Netlink header and a separate request payload to the kernel
 * with one sendmsg() call.
 */
184 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
187 struct sockaddr_nl sa = {
188 .nl_family = AF_NETLINK,
/* Two segments: sizeof(*nh) bytes of header, then `len` bytes taken from req. */
190 struct iovec iov[2] = {
191 { .iov_base = nh, .iov_len = sizeof(*nh), },
192 { .iov_base = req, .iov_len = len, },
194 struct msghdr msg = {
196 .msg_namelen = sizeof(sa),
202 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
204 send_bytes = sendmsg(nlsk_fd, &msg, 0);
/* A negative sendmsg() result takes the error path. */
205 if (send_bytes < 0) {
213 * Send a message to the kernel on the Netlink socket.
216 * The Netlink socket file descriptor used for communication.
218 * The Netlink message sent to the kernel.
223 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Sends an already assembled Netlink message to the kernel with one
 * sendmsg() call.
 */
227 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
229 struct sockaddr_nl sa = {
230 .nl_family = AF_NETLINK,
/* The transmitted length is the total length recorded in the header. */
234 .iov_len = nh->nlmsg_len,
236 struct msghdr msg = {
238 .msg_namelen = sizeof(sa),
244 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
246 send_bytes = sendmsg(nlsk_fd, &msg, 0);
/* A negative sendmsg() result takes the error path. */
247 if (send_bytes < 0) {
255 * Receive a message from the kernel on the Netlink socket, following
259 * The Netlink socket file descriptor used for communication.
263 * The callback function to call for each Netlink message received.
264 * @param[in, out] arg
265 * Custom arguments for the callback.
268 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Reads replies from the Netlink socket until one carries sequence number
 * `sn`, then walks every message contained in the received data.
 */
271 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
274 struct sockaddr_nl sa;
/* NOTE(review): MLX5_RECV_BUF_SIZE (32768) bytes of automatic storage per call -- confirm this is acceptable for all callers. */
275 char buf[MLX5_RECV_BUF_SIZE];
278 .iov_len = sizeof(buf),
280 struct msghdr msg = {
282 .msg_namelen = sizeof(sa),
284 /* One message at a time */
295 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
296 if (recv_bytes == -1) {
300 nh = (struct nlmsghdr *)buf;
/* Data whose first header has a different sequence number is dropped and the read repeats. */
301 } while (nh->nlmsg_seq != sn);
/* Iterate over the messages contained in the received bytes. */
303 NLMSG_OK(nh, (unsigned int)recv_bytes);
304 nh = NLMSG_NEXT(nh, recv_bytes)) {
305 if (nh->nlmsg_type == NLMSG_ERROR) {
306 struct nlmsgerr *err_data = NLMSG_DATA(nh);
/* A negative kernel error code is stored, negated, in rte_errno. */
308 if (err_data->error < 0) {
309 rte_errno = -err_data->error;
315 /* Multi-part msgs and their trailing DONE message. */
316 if (nh->nlmsg_flags & NLM_F_MULTI) {
317 if (nh->nlmsg_type == NLMSG_DONE)
332 * Parse Netlink message to retrieve the bridge MAC address.
335 * Pointer to Netlink Message Header.
337 * PMD data register with this callback.
340 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_nl_recv() callback: scans the attributes of a neighbour message and
 * appends every NDA_LLADDR address to the array described by `arg`
 * (a struct mlx5_nl_mac_addr).
 */
343 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
345 struct mlx5_nl_mac_addr *data = arg;
346 struct ndmsg *r = NLMSG_DATA(nh);
347 struct rtattr *attribute;
/* Bytes of attribute data that follow the ndmsg header. */
350 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
351 for (attribute = MLX5_NDA_RTA(r);
352 RTA_OK(attribute, len);
353 attribute = RTA_NEXT(attribute, len)) {
354 if (attribute->rta_type == NDA_LLADDR) {
/* The destination array holds at most MLX5_MAX_MAC_ADDRESSES entries. */
355 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
357 "not enough room to finalize the"
365 rte_ether_format_addr(m, 18, RTA_DATA(attribute));
366 DRV_LOG(DEBUG, "bridge MAC address %s", m);
/* Copy RTE_ETHER_ADDR_LEN bytes into the next free slot and advance the count. */
368 memcpy(&(*data->mac)[data->mac_n++],
369 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
376 * Get bridge MAC addresses.
379 * Netlink socket file descriptor.
380 * @param[in] iface_idx
381 * Net device interface index.
383 * Pointer to the array table of MAC addresses to fill.
384 * Its size should be of MLX5_MAX_MAC_ADDRESSES.
386 * Number of entries filled in MAC array.
389 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Issues an RTM_GETNEIGH dump for family PF_BRIDGE on interface iface_idx
 * and collects the reported addresses through mlx5_nl_mac_addr_cb().
 */
392 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
393 struct rte_ether_addr (*mac)[], int *mac_n)
397 struct ifinfomsg ifm;
400 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
401 .nlmsg_type = RTM_GETNEIGH,
402 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
405 .ifi_family = PF_BRIDGE,
406 .ifi_index = iface_idx,
409 struct mlx5_nl_mac_addr data = {
413 uint32_t sn = MLX5_NL_SN_GENERATE;
/* Header and ifinfomsg payload are passed separately to mlx5_nl_request(). */
418 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
419 sizeof(struct ifinfomsg));
422 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
428 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
429 iface_idx, strerror(rte_errno));
434 * Modify the MAC address neighbour table with Netlink.
437 * Netlink socket file descriptor.
438 * @param[in] iface_idx
439 * Net device interface index.
441 * MAC address to consider.
443 * 1 to add the MAC address, 0 to remove the MAC address.
446 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Builds an RTM_NEWNEIGH (add != 0) or RTM_DELNEIGH (add == 0) request for
 * family PF_BRIDGE carrying `mac` as an NDA_LLADDR attribute, sends it and
 * waits for the kernel reply.
 */
449 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
450 struct rte_ether_addr *mac, int add)
456 uint8_t buffer[RTE_ETHER_ADDR_LEN];
459 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
/* The same flag set is used for both the add and the remove request. */
460 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
461 NLM_F_EXCL | NLM_F_ACK,
462 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
465 .ndm_family = PF_BRIDGE,
466 .ndm_state = NUD_NOARP | NUD_PERMANENT,
467 .ndm_ifindex = iface_idx,
468 .ndm_flags = NTF_SELF,
471 .rta_type = NDA_LLADDR,
472 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
475 uint32_t sn = MLX5_NL_SN_GENERATE;
/* Place the address in the attribute payload, then extend the message length by the aligned attribute size. */
480 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
481 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
482 RTA_ALIGN(req.rta.rta_len);
483 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
/* No callback: only the reply status is of interest. */
486 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
492 "Interface %u cannot %s MAC address"
493 " %02X:%02X:%02X:%02X:%02X:%02X %s",
495 add ? "add" : "remove",
496 mac->addr_bytes[0], mac->addr_bytes[1],
497 mac->addr_bytes[2], mac->addr_bytes[3],
498 mac->addr_bytes[4], mac->addr_bytes[5],
499 strerror(rte_errno));
504 * Modify the VF MAC address neighbour table with Netlink.
507 * Netlink socket file descriptor.
508 * @param[in] iface_idx
509 * Net device interface index.
511 * MAC address to consider.
516 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Builds a link request whose attributes nest IFLA_VF_MAC inside
 * IFLA_VF_INFO inside IFLA_VFINFO_LIST, sends it and waits for the reply.
 */
519 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
520 struct rte_ether_addr *mac, int vf_index)
525 struct ifinfomsg ifm;
526 struct rtattr vf_list_rta;
527 struct rtattr vf_info_rta;
528 struct rtattr vf_mac_rta;
529 struct ifla_vf_mac ivm;
532 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
533 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
/* NOTE(review): the message type is written as RTM_BASE -- confirm this is the intended request type. */
534 .nlmsg_type = RTM_BASE,
537 .ifi_index = iface_idx,
540 .rta_type = IFLA_VFINFO_LIST,
541 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
544 .rta_type = IFLA_VF_INFO,
545 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
548 .rta_type = IFLA_VF_MAC,
551 struct ifla_vf_mac ivm = {
554 uint32_t sn = MLX5_NL_SN_GENERATE;
/* Fill the local ivm with the address, then copy it into the payload of the IFLA_VF_MAC attribute. */
556 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
557 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
559 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
/* Total length: header plus the three aligned attributes. */
560 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
561 RTA_ALIGN(req.vf_list_rta.rta_len) +
562 RTA_ALIGN(req.vf_info_rta.rta_len) +
563 RTA_ALIGN(req.vf_mac_rta.rta_len);
/* The lengths of the two enclosing attributes are recomputed from the message tail. */
564 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
566 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
571 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
574 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
580 "representor %u cannot set VF MAC address "
581 "%02X:%02X:%02X:%02X:%02X:%02X : %s",
583 mac->addr_bytes[0], mac->addr_bytes[1],
584 mac->addr_bytes[2], mac->addr_bytes[3],
585 mac->addr_bytes[4], mac->addr_bytes[5],
586 strerror(rte_errno));
594 * Netlink socket file descriptor.
595 * @param[in] iface_idx
596 * Net device interface index.
598 * BITFIELD_DECLARE array to store the mac.
600 * MAC address to register.
605 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Adds `mac` via mlx5_nl_mac_addr_modify(..., 1) and marks `index` in the
 * mac_own bitfield.
 */
608 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
609 uint64_t *mac_own, struct rte_ether_addr *mac,
614 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
/* Presumably executed only when the add succeeded; the guarding condition is not visible in this excerpt -- TODO confirm. */
616 BITFIELD_SET(mac_own, index);
623 * Remove a MAC address.
626 * Netlink socket file descriptor.
627 * @param[in] iface_idx
628 * Net device interface index.
630 * BITFIELD_DECLARE array to store the mac.
632 * MAC address to remove.
637 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Clears `index` in the mac_own bitfield, then asks the kernel to remove
 * `mac` via mlx5_nl_mac_addr_modify(..., 0).
 */
640 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
641 struct rte_ether_addr *mac, uint32_t index)
/* The bit is cleared before the removal request is issued, whatever its outcome. */
643 BITFIELD_RESET(mac_own, index);
644 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
648 * Synchronize Netlink bridge table to the internal table.
651 * Netlink socket file descriptor.
652 * @param[in] iface_idx
653 * Net device interface index.
655 * MAC addresses array to sync.
657 * @p mac_addrs array size.
/*
 * Retrieves the bridge MAC addresses of iface_idx and stores those not yet
 * present in mac_addrs into the first all-zero entry of that array.
 */
660 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
661 struct rte_ether_addr *mac_addrs, int n)
/* NOTE(review): variable-length array sized by the caller-supplied n -- confirm n is always small and positive. */
663 struct rte_ether_addr macs[n];
668 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
671 for (i = 0; i != macs_n; ++i) {
674 /* Verify the address is not in the array yet. */
675 for (j = 0; j != n; ++j)
676 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
680 /* Find the first entry available. */
681 for (j = 0; j != n; ++j) {
/* An all-zero address marks a free slot. */
682 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
683 mac_addrs[j] = macs[i];
691 * Flush all added MAC addresses.
694 * Netlink socket file descriptor.
695 * @param[in] iface_idx
696 * Net device interface index.
697 * @param[in] mac_addrs
698 * MAC addresses array to flush.
700 * @p mac_addrs array size.
702 * BITFIELD_DECLARE array to store the mac.
/*
 * Walks mac_addrs from the last entry to the first and removes every
 * address whose bit is set in mac_own.
 */
705 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
706 struct rte_ether_addr *mac_addrs, int n,
711 for (i = n - 1; i >= 0; --i) {
712 struct rte_ether_addr *m = &mac_addrs[i];
/* Entries whose bit is clear in mac_own are left untouched. */
714 if (BITFIELD_ISSET(mac_own, i))
715 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
721 * Enable promiscuous / all multicast mode through Netlink.
724 * Netlink socket file descriptor.
725 * @param[in] iface_idx
726 * Net device interface index.
728 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
730 * Nonzero to enable, disable otherwise.
733 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends an RTM_NEWLINK request for iface_idx whose ifi_flags is `flags`
 * when enabling and 0 when disabling.
 */
736 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
741 struct ifinfomsg ifi;
744 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
745 .nlmsg_type = RTM_NEWLINK,
746 .nlmsg_flags = NLM_F_REQUEST,
749 .ifi_flags = enable ? flags : 0,
751 .ifi_index = iface_idx,
754 uint32_t sn = MLX5_NL_SN_GENERATE;
/* Only IFF_PROMISC and IFF_ALLMULTI may be present in flags. */
757 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
760 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
767 * Enable promiscuous mode through Netlink.
770 * Netlink socket file descriptor.
771 * @param[in] iface_idx
772 * Net device interface index.
774 * Nonzero to enable, disable otherwise.
777 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Wrapper around mlx5_nl_device_flags() for IFF_PROMISC; the visible log message reports the failure case. */
780 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
782 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
786 "Interface %u cannot %s promisc mode: Netlink error %s",
787 iface_idx, enable ? "enable" : "disable",
788 strerror(rte_errno));
793 * Enable all multicast mode through Netlink.
796 * Netlink socket file descriptor.
797 * @param[in] iface_idx
798 * Net device interface index.
800 * Nonzero to enable, disable otherwise.
803 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Wrapper around mlx5_nl_device_flags() for IFF_ALLMULTI; the visible log message reports the failure case. */
806 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
808 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
813 "Interface %u cannot %s allmulti : Netlink error %s",
814 iface_idx, enable ? "enable" : "disable",
815 strerror(rte_errno));
820 * Process network interface information from Netlink message.
823 * Pointer to Netlink message header.
825 * Opaque data pointer for this callback.
828 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_nl_recv() callback: parses the attributes of an RDMA nldev GET or
 * PORT_GET message into a local record and copies that record into `arg`
 * (a struct mlx5_nl_ifindex_data) only if the device name matched.
 */
831 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
833 struct mlx5_nl_ifindex_data *data = arg;
834 struct mlx5_nl_ifindex_data local = {
/* Attributes start right after the Netlink header. */
837 size_t off = NLMSG_HDRLEN;
/* Only the two RDMA nldev message types below are handled. */
839 if (nh->nlmsg_type !=
840 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
842 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
844 while (off < nh->nlmsg_len) {
845 struct nlattr *na = (void *)((uintptr_t)nh + off);
846 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
/*
 * Guard against an attribute that claims more bytes than remain.
 * NOTE(review): no visible check rejects nla_len == 0, which would leave
 * `off` unchanged by the increment at the end of the loop -- confirm.
 */
848 if (na->nla_len > nh->nlmsg_len - off)
850 switch (na->nla_type) {
/* NOTE(review): the 32-bit reads below do not visibly verify that nla_len covers 4 payload bytes -- confirm. */
851 case RDMA_NLDEV_ATTR_DEV_INDEX:
852 local.ibindex = *(uint32_t *)payload;
853 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
855 case RDMA_NLDEV_ATTR_DEV_NAME:
/* Assumes the name payload is NUL-terminated -- TODO confirm. */
856 if (!strcmp(payload, data->name))
857 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
859 case RDMA_NLDEV_ATTR_NDEV_INDEX:
860 local.ifindex = *(uint32_t *)payload;
861 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
863 case RDMA_NLDEV_ATTR_PORT_INDEX:
864 local.portnum = *(uint32_t *)payload;
865 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
/* Advance to the next attribute, honouring attribute alignment. */
870 off += NLA_ALIGN(na->nla_len);
873 * It is possible to have multiple messages for all
874 * Infiniband devices in the system with appropriate name.
875 * So we should gather parameters locally and copy to
876 * query context only in case of coinciding device name.
878 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
879 data->flags = local.flags;
880 data->ibindex = local.ibindex;
881 data->ifindex = local.ifindex;
882 data->portnum = local.portnum;
891 * Get index of network interface associated with some IB device.
893 * This is the only somewhat safe method to avoid resorting to heuristics
894 * when faced with port representors. Unfortunately it requires at least
898 * Netlink socket of the RDMA kind (NETLINK_RDMA).
902 * IB device port index, starting from 1
904 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
/*
 * Two-pass query: the first request obtains the IB device index for
 * `name`; the second is a PORT_GET request carrying that index and
 * `pindex`, whose reply is parsed for the network interface index.
 */
908 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
910 struct mlx5_nl_ifindex_data data = {
913 .ibindex = 0, /* Determined during first pass. */
914 .ifindex = 0, /* Determined during second pass. */
/* Room for the header plus two attributes holding one 32-bit value each. */
918 uint8_t buf[NLMSG_HDRLEN +
919 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
920 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
/* First pass: header-only dump request. */
923 .nlmsg_len = NLMSG_LENGTH(0),
924 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
926 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
930 uint32_t sn = MLX5_NL_SN_GENERATE;
933 ret = mlx5_nl_send(nl, &req.nh, sn);
936 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
/* The first pass must have matched the name and produced an IB index. */
939 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
940 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
/* Second pass: new sequence number, PORT_GET type, no dump flag. */
943 sn = MLX5_NL_SN_GENERATE;
944 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
945 RDMA_NLDEV_CMD_PORT_GET);
946 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
947 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
/* First attribute: IB device index. */
948 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
949 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
950 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
951 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
952 &data.ibindex, sizeof(data.ibindex));
/* Second attribute: port index, placed after the aligned first attribute. */
953 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
954 na->nla_len = NLA_HDRLEN + sizeof(pindex);
955 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
956 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
957 &pindex, sizeof(pindex));
958 ret = mlx5_nl_send(nl, &req.nh, sn);
961 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
964 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
965 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
966 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
976 * Get the number of physical ports of given IB device.
979 * Netlink socket of the RDMA kind (NETLINK_RDMA).
984 * A valid (nonzero) number of ports on success, 0 otherwise
985 * and rte_errno is set.
/*
 * Sends a header-only dump request and returns the port number that
 * mlx5_nl_cmdget_cb() recorded for the device called `name`.
 */
988 mlx5_nl_portnum(int nl, const char *name)
990 struct mlx5_nl_ifindex_data data = {
996 struct nlmsghdr req = {
997 .nlmsg_len = NLMSG_LENGTH(0),
998 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1000 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1002 uint32_t sn = MLX5_NL_SN_GENERATE;
1005 ret = mlx5_nl_send(nl, &req, sn);
1008 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
/* Name match, IB index and port index must all have been found. */
1011 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1012 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1013 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1019 return data.portnum;
1023 * Analyze gathered port parameters via Netlink to recognize master
1024 * and representor devices for E-Switch configuration.
1026 * @param[in] num_vf_set
1027 * Flag telling whether the number-of-VFs port attribute is present.
1028 * @param[inout] switch_info
1029 * Port information, including port name as a number and port name
1030 * type if recognized
1033 * master and representor flags are set in switch_info according to
1034 * recognized parameters (if any).
/*
 * Sets switch_info->master or switch_info->representor from the port name
 * type and from whether the number-of-VFs attribute was seen.
 */
1037 mlx5_nl_check_switch_info(bool num_vf_set,
1038 struct mlx5_switch_info *switch_info)
1040 switch (switch_info->name_type) {
1041 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1043 * Name is not recognized, assume the master,
1044 * check the number of VFs key presence.
1046 switch_info->master = num_vf_set;
1048 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1050 * Name is not set, this assumes the legacy naming
1051 * schema for master, just check if there is a
1052 * number of VFs key.
1054 switch_info->master = num_vf_set;
1056 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1057 /* New uplink naming schema recognized. */
1058 switch_info->master = 1;
1060 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1061 /* Legacy representors naming schema. */
/* With a legacy name, the port is a representor only when no number-of-VFs attribute was seen. */
1062 switch_info->representor = !num_vf_set;
1064 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1065 /* New representors naming schema. */
1066 switch_info->representor = 1;
1072 * Process switch information from Netlink message.
1075 * Pointer to Netlink message header.
1077 * Opaque data pointer for this callback.
1080 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_nl_recv() callback: parses the attributes of an RTM_NEWLINK message
 * into a local mlx5_switch_info and copies it to `arg`.
 */
1083 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1085 struct mlx5_switch_info info = {
1088 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
/* Attributes start after the Netlink header and the ifinfomsg. */
1092 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1093 bool switch_id_set = false;
1094 bool num_vf_set = false;
1096 if (nh->nlmsg_type != RTM_NEWLINK)
1098 while (off < nh->nlmsg_len) {
1099 struct rtattr *ra = (void *)((uintptr_t)nh + off);
1100 void *payload = RTA_DATA(ra);
/*
 * Guard against an attribute that claims more bytes than remain.
 * NOTE(review): no visible check rejects rta_len == 0, which would leave
 * `off` unchanged by the increment at the end of the loop -- confirm.
 */
1103 if (ra->rta_len > nh->nlmsg_len - off)
1105 switch (ra->rta_type) {
1109 case IFLA_PHYS_PORT_NAME:
1110 mlx5_translate_port_name((char *)payload, &info);
1112 case IFLA_PHYS_SWITCH_ID:
/* Fold the payload bytes into switch_id, first byte most significant. */
1114 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1115 info.switch_id <<= 8;
1116 info.switch_id |= ((uint8_t *)payload)[i];
1118 switch_id_set = true;
1121 off += RTA_ALIGN(ra->rta_len);
1123 if (switch_id_set) {
1124 /* We have some E-Switch configuration. */
1125 mlx5_nl_check_switch_info(num_vf_set, &info);
/* A port must not be classified as both master and representor. */
1127 assert(!(info.master && info.representor));
1128 memcpy(arg, &info, sizeof(info));
1136 * Get switch information associated with network interface.
1139 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1141 * Network interface index.
1143 * Switch information object, populated in case of success.
1146 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends an RTM_GETLINK request for `ifindex` carrying an IFLA_EXT_MASK
 * attribute and lets mlx5_nl_switch_info_cb() fill `info` from the reply.
 */
1149 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1150 struct mlx5_switch_info *info)
1154 struct ifinfomsg info;
1159 .nlmsg_len = NLMSG_LENGTH
1161 RTA_LENGTH(sizeof(uint32_t))),
1162 .nlmsg_type = RTM_GETLINK,
1163 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1166 .ifi_family = AF_UNSPEC,
1167 .ifi_index = ifindex,
1170 .rta_type = IFLA_EXT_MASK,
1171 .rta_len = RTA_LENGTH(sizeof(int32_t)),
/* Extension mask value 1, stored little-endian via RTE_LE32. */
1173 .extmask = RTE_LE32(1),
1175 uint32_t sn = MLX5_NL_SN_GENERATE;
1178 ret = mlx5_nl_send(nl, &req.nh, sn);
1180 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
/* Being both master and representor is reported as an error. */
1181 if (info->master && info->representor) {
1182 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1183 " and as representor", ifindex);
1191 * Delete VLAN network device by ifindex.
1194 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1195 * @param[in] ifindex
1196 * Interface index of network device to delete.
/*
 * Sends an RTM_DELLINK request for `ifindex` on the context's Netlink
 * socket and waits for the reply; the visible log message is a warning.
 */
1199 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1202 uint32_t sn = MLX5_NL_SN_GENERATE;
1206 struct ifinfomsg info;
1209 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1210 .nlmsg_type = RTM_DELLINK,
1211 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1214 .ifi_family = AF_UNSPEC,
1215 .ifi_index = ifindex,
1220 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
/* No callback: only the reply status is of interest. */
1222 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1224 DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1225 " ifindex %u, %d", ifindex, ret);
1229 /* Set of subroutines to build Netlink message. */
/* Returns the address NLMSG_ALIGN(nlh->nlmsg_len) bytes past the start of the header, i.e. where the next attribute goes. */
1230 static struct nlattr *
1231 nl_msg_tail(struct nlmsghdr *nlh)
1233 return (struct nlattr *)
1234 (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
/*
 * Appends an attribute of type `type` with `alen` payload bytes taken from
 * `data` at the tail of the message and grows nlmsg_len accordingly.
 * The caller must provide enough room behind the message; no bound is
 * checked in the visible code.
 */
1238 nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
1240 struct nlattr *nla = nl_msg_tail(nlh);
1242 nla->nla_type = type;
/*
 * NOTE(review): nla_len is stored rounded up to the alignment. The Netlink
 * attribute header normally carries header + payload length without
 * padding -- confirm whether including the padding is intended.
 */
1243 nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
1244 nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
/* nl_attr_nest_start() calls this with data == NULL and alen == 0; any guard on this copy is not visible in this excerpt. */
1247 memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
/*
 * Opens a nested attribute: remembers the current tail as the nest header
 * and appends an attribute of `type` with no payload.
 */
1250 static struct nlattr *
1251 nl_attr_nest_start(struct nlmsghdr *nlh, int type)
1253 struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
1255 nl_attr_put(nlh, type, NULL, 0);
/* Closes a nested attribute by setting its length to the distance from the nest header to the current message tail. */
1260 nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
1262 nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
1266 * Create network VLAN device with specified VLAN tag.
1269 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1270 * @param[in] ifindex
1271 * Base network interface index.
1273 * VLAN tag for VLAN network device to create.
1276 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1277 uint32_t ifindex, uint16_t tag)
1279 struct nlmsghdr *nlh;
1280 struct ifinfomsg *ifm;
1281 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1283 alignas(RTE_CACHE_LINE_SIZE)
1284 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1285 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1286 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1287 NLMSG_ALIGN(sizeof(uint32_t)) +
1288 NLMSG_ALIGN(sizeof(name)) +
1289 NLMSG_ALIGN(sizeof("vlan")) +
1290 NLMSG_ALIGN(sizeof(uint32_t)) +
1291 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1292 struct nlattr *na_info;
1293 struct nlattr *na_vlan;
1294 uint32_t sn = MLX5_NL_SN_GENERATE;
1297 memset(buf, 0, sizeof(buf));
1298 nlh = (struct nlmsghdr *)buf;
1299 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1300 nlh->nlmsg_type = RTM_NEWLINK;
1301 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1302 NLM_F_EXCL | NLM_F_ACK;
1303 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1304 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1305 ifm->ifi_family = AF_UNSPEC;
1308 ifm->ifi_flags = IFF_UP;
1309 ifm->ifi_change = 0xffffffff;
1310 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1311 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1312 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1313 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1314 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1315 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1316 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1317 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1318 nl_attr_nest_end(nlh, na_vlan);
1319 nl_attr_nest_end(nlh, na_info);
1320 assert(sizeof(buf) >= nlh->nlmsg_len);
1321 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1323 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1325 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1328 // Try to get ifindex of created or pre-existing device.
1329 ret = if_nametoindex(name);
1331 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,