1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
10 #include <rdma/rdma_netlink.h>
16 #include <sys/socket.h>
19 #include <rte_errno.h>
20 #include <rte_atomic.h>
21 #include <rte_ether.h>
25 #include "mlx5_utils.h"
27 /* Size of the buffer to receive kernel messages */
28 #define MLX5_NL_BUF_SIZE (32 * 1024)
29 /* Send buffer size for the Netlink socket */
30 #define MLX5_SEND_BUF_SIZE 32768
31 /* Receive buffer size for the Netlink socket */
32 #define MLX5_RECV_BUF_SIZE 32768
34 /** Parameters of VLAN devices created by driver. */
35 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
37 * Define NDA_RTA as defined in iproute2 sources.
39 * See file include/libnetlink.h in the iproute2 sources.
42 #define MLX5_NDA_RTA(r) \
43 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
46 * Define NLMSG_TAIL as defined in iproute2 sources.
48 * See file include/libnetlink.h in the iproute2 sources.
51 #define NLMSG_TAIL(nmsg) \
52 ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
55 * The following definitions are normally found in rdma/rdma_netlink.h,
56 * however they are so recent that most systems do not expose them yet.
58 #ifndef HAVE_RDMA_NL_NLDEV
59 #define RDMA_NL_NLDEV 5
61 #ifndef HAVE_RDMA_NLDEV_CMD_GET
62 #define RDMA_NLDEV_CMD_GET 1
64 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
65 #define RDMA_NLDEV_CMD_PORT_GET 5
67 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
68 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
70 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
71 #define RDMA_NLDEV_ATTR_DEV_NAME 2
73 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
74 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
76 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
77 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
80 /* These are normally found in linux/if_link.h. */
81 #ifndef HAVE_IFLA_NUM_VF
82 #define IFLA_NUM_VF 21
84 #ifndef HAVE_IFLA_EXT_MASK
85 #define IFLA_EXT_MASK 29
87 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
88 #define IFLA_PHYS_SWITCH_ID 36
90 #ifndef HAVE_IFLA_PHYS_PORT_NAME
91 #define IFLA_PHYS_PORT_NAME 38
94 /* Add/remove MAC address through Netlink */
95 struct mlx5_nl_mac_addr {
96 struct rte_ether_addr (*mac)[];
97 /**< MAC address handled by the device. */
98 int mac_n; /**< Number of addresses in the array. */
101 #define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
102 #define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
103 #define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
104 #define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
106 /** Data structure used by mlx5_nl_cmdget_cb(). */
107 struct mlx5_nl_ifindex_data {
108 const char *name; /**< IB device name (in). */
109 uint32_t flags; /**< found attribute flags (out). */
110 uint32_t ibindex; /**< IB device index (out). */
111 uint32_t ifindex; /**< Network interface index (out). */
112 uint32_t portnum; /**< IB device max port number (out). */
115 rte_atomic32_t atomic_sn = RTE_ATOMIC32_INIT(0);
117 /* Generate Netlink sequence number. */
118 #define MLX5_NL_SN_GENERATE ((uint32_t)rte_atomic32_add_return(&atomic_sn, 1))
121 * Opens a Netlink socket.
124 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
127 * A file descriptor on success, a negative errno value otherwise and
131 mlx5_nl_init(int protocol)
134 int sndbuf_size = MLX5_SEND_BUF_SIZE;
135 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
136 struct sockaddr_nl local = {
137 .nl_family = AF_NETLINK,
141 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
146 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
151 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
156 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
168 * Send a request message to the kernel on the Netlink socket.
171 * Netlink socket file descriptor.
173 * The Netlink message sent to the kernel.
177 * Pointer to the request structure.
179 * Length of the request in bytes.
182 * The number of sent bytes on success, a negative errno value otherwise and
186 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
189 struct sockaddr_nl sa = {
190 .nl_family = AF_NETLINK,
192 struct iovec iov[2] = {
193 { .iov_base = nh, .iov_len = sizeof(*nh), },
194 { .iov_base = req, .iov_len = len, },
196 struct msghdr msg = {
198 .msg_namelen = sizeof(sa),
204 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
206 send_bytes = sendmsg(nlsk_fd, &msg, 0);
207 if (send_bytes < 0) {
215 * Send a message to the kernel on the Netlink socket.
218 * The Netlink socket file descriptor used for communication.
220 * The Netlink message sent to the kernel.
225 * The number of sent bytes on success, a negative errno value otherwise and
229 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
231 struct sockaddr_nl sa = {
232 .nl_family = AF_NETLINK,
236 .iov_len = nh->nlmsg_len,
238 struct msghdr msg = {
240 .msg_namelen = sizeof(sa),
246 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
248 send_bytes = sendmsg(nlsk_fd, &msg, 0);
249 if (send_bytes < 0) {
257 * Receive a message from the kernel on the Netlink socket, following
261 * The Netlink socket file descriptor used for communication.
265 * The callback function to call for each Netlink message received.
266 * @param[in, out] arg
267 * Custom arguments for the callback.
270 * 0 on success, a negative errno value otherwise and rte_errno is set.
273 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
276 struct sockaddr_nl sa;
277 char buf[MLX5_RECV_BUF_SIZE];
280 .iov_len = sizeof(buf),
282 struct msghdr msg = {
284 .msg_namelen = sizeof(sa),
286 /* One message at a time */
297 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
298 if (recv_bytes == -1) {
302 nh = (struct nlmsghdr *)buf;
303 } while (nh->nlmsg_seq != sn);
305 NLMSG_OK(nh, (unsigned int)recv_bytes);
306 nh = NLMSG_NEXT(nh, recv_bytes)) {
307 if (nh->nlmsg_type == NLMSG_ERROR) {
308 struct nlmsgerr *err_data = NLMSG_DATA(nh);
310 if (err_data->error < 0) {
311 rte_errno = -err_data->error;
317 /* Multi-part msgs and their trailing DONE message. */
318 if (nh->nlmsg_flags & NLM_F_MULTI) {
319 if (nh->nlmsg_type == NLMSG_DONE)
334 * Parse Netlink message to retrieve the bridge MAC address.
337 * Pointer to Netlink Message Header.
339 * PMD data registered with this callback.
342 * 0 on success, a negative errno value otherwise and rte_errno is set.
345 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
347 struct mlx5_nl_mac_addr *data = arg;
348 struct ndmsg *r = NLMSG_DATA(nh);
349 struct rtattr *attribute;
352 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
353 for (attribute = MLX5_NDA_RTA(r);
354 RTA_OK(attribute, len);
355 attribute = RTA_NEXT(attribute, len)) {
356 if (attribute->rta_type == NDA_LLADDR) {
357 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
359 "not enough room to finalize the"
367 rte_ether_format_addr(m, 18, RTA_DATA(attribute));
368 DRV_LOG(DEBUG, "bridge MAC address %s", m);
370 memcpy(&(*data->mac)[data->mac_n++],
371 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
378 * Get bridge MAC addresses.
381 * Netlink socket file descriptor.
382 * @param[in] iface_idx
383 * Net device interface index.
385 * Pointer to the array table of MAC addresses to fill.
386 * Its size should be MLX5_MAX_MAC_ADDRESSES.
388 * Number of entries filled in MAC array.
391 * 0 on success, a negative errno value otherwise and rte_errno is set.
394 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
395 struct rte_ether_addr (*mac)[], int *mac_n)
399 struct ifinfomsg ifm;
402 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
403 .nlmsg_type = RTM_GETNEIGH,
404 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
407 .ifi_family = PF_BRIDGE,
408 .ifi_index = iface_idx,
411 struct mlx5_nl_mac_addr data = {
415 uint32_t sn = MLX5_NL_SN_GENERATE;
420 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
421 sizeof(struct ifinfomsg));
424 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
430 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
431 iface_idx, strerror(rte_errno));
436 * Modify the MAC address neighbour table with Netlink.
439 * Netlink socket file descriptor.
440 * @param[in] iface_idx
441 * Net device interface index.
443 * MAC address to consider.
445 * 1 to add the MAC address, 0 to remove the MAC address.
448 * 0 on success, a negative errno value otherwise and rte_errno is set.
451 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
452 struct rte_ether_addr *mac, int add)
458 uint8_t buffer[RTE_ETHER_ADDR_LEN];
461 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
462 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
463 NLM_F_EXCL | NLM_F_ACK,
464 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
467 .ndm_family = PF_BRIDGE,
468 .ndm_state = NUD_NOARP | NUD_PERMANENT,
469 .ndm_ifindex = iface_idx,
470 .ndm_flags = NTF_SELF,
473 .rta_type = NDA_LLADDR,
474 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
477 uint32_t sn = MLX5_NL_SN_GENERATE;
482 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
483 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
484 RTA_ALIGN(req.rta.rta_len);
485 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
488 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
494 "Interface %u cannot %s MAC address"
495 " %02X:%02X:%02X:%02X:%02X:%02X %s",
497 add ? "add" : "remove",
498 mac->addr_bytes[0], mac->addr_bytes[1],
499 mac->addr_bytes[2], mac->addr_bytes[3],
500 mac->addr_bytes[4], mac->addr_bytes[5],
501 strerror(rte_errno));
506 * Modify the VF MAC address neighbour table with Netlink.
509 * Netlink socket file descriptor.
510 * @param[in] iface_idx
511 * Net device interface index.
513 * MAC address to consider.
518 * 0 on success, a negative errno value otherwise and rte_errno is set.
521 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
522 struct rte_ether_addr *mac, int vf_index)
527 struct ifinfomsg ifm;
528 struct rtattr vf_list_rta;
529 struct rtattr vf_info_rta;
530 struct rtattr vf_mac_rta;
531 struct ifla_vf_mac ivm;
534 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
535 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
536 .nlmsg_type = RTM_BASE,
539 .ifi_index = iface_idx,
542 .rta_type = IFLA_VFINFO_LIST,
543 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
546 .rta_type = IFLA_VF_INFO,
547 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
550 .rta_type = IFLA_VF_MAC,
553 struct ifla_vf_mac ivm = {
556 uint32_t sn = MLX5_NL_SN_GENERATE;
558 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
559 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
561 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
562 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
563 RTA_ALIGN(req.vf_list_rta.rta_len) +
564 RTA_ALIGN(req.vf_info_rta.rta_len) +
565 RTA_ALIGN(req.vf_mac_rta.rta_len);
566 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
568 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
573 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
576 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
582 "representor %u cannot set VF MAC address "
583 "%02X:%02X:%02X:%02X:%02X:%02X : %s",
585 mac->addr_bytes[0], mac->addr_bytes[1],
586 mac->addr_bytes[2], mac->addr_bytes[3],
587 mac->addr_bytes[4], mac->addr_bytes[5],
588 strerror(rte_errno));
596 * Netlink socket file descriptor.
597 * @param[in] iface_idx
598 * Net device interface index.
600 * BITFIELD_DECLARE array to store the mac.
602 * MAC address to register.
607 * 0 on success, a negative errno value otherwise and rte_errno is set.
610 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
611 uint64_t *mac_own, struct rte_ether_addr *mac,
616 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
618 BITFIELD_SET(mac_own, index);
625 * Remove a MAC address.
628 * Netlink socket file descriptor.
629 * @param[in] iface_idx
630 * Net device interface index.
632 * BITFIELD_DECLARE array to store the mac.
634 * MAC address to remove.
639 * 0 on success, a negative errno value otherwise and rte_errno is set.
642 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
643 struct rte_ether_addr *mac, uint32_t index)
645 BITFIELD_RESET(mac_own, index);
646 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
650 * Synchronize Netlink bridge table to the internal table.
653 * Netlink socket file descriptor.
654 * @param[in] iface_idx
655 * Net device interface index.
657 * Mac addresses array to sync.
659 * @p mac_addrs array size.
662 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
663 struct rte_ether_addr *mac_addrs, int n)
665 struct rte_ether_addr macs[n];
670 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
673 for (i = 0; i != macs_n; ++i) {
676 /* Verify the address is not in the array yet. */
677 for (j = 0; j != n; ++j)
678 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
682 /* Find the first entry available. */
683 for (j = 0; j != n; ++j) {
684 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
685 mac_addrs[j] = macs[i];
693 * Flush all added MAC addresses.
696 * Netlink socket file descriptor.
697 * @param[in] iface_idx
698 * Net device interface index.
699 * @param[in] mac_addrs
700 * Mac addresses array to flush.
702 * @p mac_addrs array size.
704 * BITFIELD_DECLARE array to store the mac.
707 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
708 struct rte_ether_addr *mac_addrs, int n,
713 for (i = n - 1; i >= 0; --i) {
714 struct rte_ether_addr *m = &mac_addrs[i];
716 if (BITFIELD_ISSET(mac_own, i))
717 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
723 * Enable promiscuous / all multicast mode through Netlink.
726 * Netlink socket file descriptor.
727 * @param[in] iface_idx
728 * Net device interface index.
730 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
732 * Nonzero to enable, disable otherwise.
735 * 0 on success, a negative errno value otherwise and rte_errno is set.
738 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
743 struct ifinfomsg ifi;
746 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
747 .nlmsg_type = RTM_NEWLINK,
748 .nlmsg_flags = NLM_F_REQUEST,
751 .ifi_flags = enable ? flags : 0,
753 .ifi_index = iface_idx,
756 uint32_t sn = MLX5_NL_SN_GENERATE;
759 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
762 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
769 * Enable promiscuous mode through Netlink.
772 * Netlink socket file descriptor.
773 * @param[in] iface_idx
774 * Net device interface index.
776 * Nonzero to enable, disable otherwise.
779 * 0 on success, a negative errno value otherwise and rte_errno is set.
782 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
784 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
788 "Interface %u cannot %s promisc mode: Netlink error %s",
789 iface_idx, enable ? "enable" : "disable",
790 strerror(rte_errno));
795 * Enable all multicast mode through Netlink.
798 * Netlink socket file descriptor.
799 * @param[in] iface_idx
800 * Net device interface index.
802 * Nonzero to enable, disable otherwise.
805 * 0 on success, a negative errno value otherwise and rte_errno is set.
808 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
810 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
815 "Interface %u cannot %s allmulti : Netlink error %s",
816 iface_idx, enable ? "enable" : "disable",
817 strerror(rte_errno));
822 * Process network interface information from Netlink message.
825 * Pointer to Netlink message header.
827 * Opaque data pointer for this callback.
830 * 0 on success, a negative errno value otherwise and rte_errno is set.
833 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
835 struct mlx5_nl_ifindex_data *data = arg;
836 struct mlx5_nl_ifindex_data local = {
839 size_t off = NLMSG_HDRLEN;
841 if (nh->nlmsg_type !=
842 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
844 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
846 while (off < nh->nlmsg_len) {
847 struct nlattr *na = (void *)((uintptr_t)nh + off);
848 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
850 if (na->nla_len > nh->nlmsg_len - off)
852 switch (na->nla_type) {
853 case RDMA_NLDEV_ATTR_DEV_INDEX:
854 local.ibindex = *(uint32_t *)payload;
855 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
857 case RDMA_NLDEV_ATTR_DEV_NAME:
858 if (!strcmp(payload, data->name))
859 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
861 case RDMA_NLDEV_ATTR_NDEV_INDEX:
862 local.ifindex = *(uint32_t *)payload;
863 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
865 case RDMA_NLDEV_ATTR_PORT_INDEX:
866 local.portnum = *(uint32_t *)payload;
867 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
872 off += NLA_ALIGN(na->nla_len);
875 * It is possible to have multiple messages for all
876 * Infiniband devices in the system with appropriate name.
877 * So we should gather parameters locally and copy to
878 * query context only in case of coinciding device name.
880 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
881 data->flags = local.flags;
882 data->ibindex = local.ibindex;
883 data->ifindex = local.ifindex;
884 data->portnum = local.portnum;
893 * Get index of network interface associated with some IB device.
895 * This is the only somewhat safe method to avoid resorting to heuristics
896 * when faced with port representors. Unfortunately it requires at least
900 * Netlink socket of the RDMA kind (NETLINK_RDMA).
904 * IB device port index, starting from 1
906 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
910 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
912 struct mlx5_nl_ifindex_data data = {
915 .ibindex = 0, /* Determined during first pass. */
916 .ifindex = 0, /* Determined during second pass. */
920 uint8_t buf[NLMSG_HDRLEN +
921 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
922 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
925 .nlmsg_len = NLMSG_LENGTH(0),
926 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
928 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
932 uint32_t sn = MLX5_NL_SN_GENERATE;
935 ret = mlx5_nl_send(nl, &req.nh, sn);
938 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
941 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
942 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
945 sn = MLX5_NL_SN_GENERATE;
946 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
947 RDMA_NLDEV_CMD_PORT_GET);
948 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
949 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
950 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
951 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
952 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
953 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
954 &data.ibindex, sizeof(data.ibindex));
955 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
956 na->nla_len = NLA_HDRLEN + sizeof(pindex);
957 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
958 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
959 &pindex, sizeof(pindex));
960 ret = mlx5_nl_send(nl, &req.nh, sn);
963 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
966 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
967 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
968 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
978 * Get the number of physical ports of given IB device.
981 * Netlink socket of the RDMA kind (NETLINK_RDMA).
986 * A valid (nonzero) number of ports on success, 0 otherwise
987 * and rte_errno is set.
990 mlx5_nl_portnum(int nl, const char *name)
992 struct mlx5_nl_ifindex_data data = {
998 struct nlmsghdr req = {
999 .nlmsg_len = NLMSG_LENGTH(0),
1000 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1001 RDMA_NLDEV_CMD_GET),
1002 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1004 uint32_t sn = MLX5_NL_SN_GENERATE;
1007 ret = mlx5_nl_send(nl, &req, sn);
1010 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1013 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1014 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1015 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1021 return data.portnum;
1025 * Analyze gathered port parameters via Netlink to recognize master
1026 * and representor devices for E-Switch configuration.
1028 * @param[in] num_vf_set
1029 * Flag telling whether the number of VFs port attribute is present.
1030 * @param[in, out] switch_info
1031 * Port information, including port name as a number and port name
1032 * type if recognized
1035 * master and representor flags are set in switch_info according to
1036 * recognized parameters (if any).
1039 mlx5_nl_check_switch_info(bool num_vf_set,
1040 struct mlx5_switch_info *switch_info)
1042 switch (switch_info->name_type) {
1043 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1045 * Name is not recognized, assume the master,
1046 * check the number of VFs key presence.
1048 switch_info->master = num_vf_set;
1050 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1052 * Name is not set, this assumes the legacy naming
1053 * schema for master, just check if there is a
1054 * number of VFs key.
1056 switch_info->master = num_vf_set;
1058 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1059 /* New uplink naming schema recognized. */
1060 switch_info->master = 1;
1062 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1063 /* Legacy representors naming schema. */
1064 switch_info->representor = !num_vf_set;
1066 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1067 /* New representors naming schema. */
1068 switch_info->representor = 1;
1074 * Process switch information from Netlink message.
1077 * Pointer to Netlink message header.
1079 * Opaque data pointer for this callback.
1082 * 0 on success, a negative errno value otherwise and rte_errno is set.
1085 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1087 struct mlx5_switch_info info = {
1090 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1094 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1095 bool switch_id_set = false;
1096 bool num_vf_set = false;
1098 if (nh->nlmsg_type != RTM_NEWLINK)
1100 while (off < nh->nlmsg_len) {
1101 struct rtattr *ra = (void *)((uintptr_t)nh + off);
1102 void *payload = RTA_DATA(ra);
1105 if (ra->rta_len > nh->nlmsg_len - off)
1107 switch (ra->rta_type) {
1111 case IFLA_PHYS_PORT_NAME:
1112 mlx5_translate_port_name((char *)payload, &info);
1114 case IFLA_PHYS_SWITCH_ID:
1116 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1117 info.switch_id <<= 8;
1118 info.switch_id |= ((uint8_t *)payload)[i];
1120 switch_id_set = true;
1123 off += RTA_ALIGN(ra->rta_len);
1125 if (switch_id_set) {
1126 /* We have some E-Switch configuration. */
1127 mlx5_nl_check_switch_info(num_vf_set, &info);
1129 assert(!(info.master && info.representor));
1130 memcpy(arg, &info, sizeof(info));
1138 * Get switch information associated with network interface.
1141 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1143 * Network interface index.
1145 * Switch information object, populated in case of success.
1148 * 0 on success, a negative errno value otherwise and rte_errno is set.
1151 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1152 struct mlx5_switch_info *info)
1156 struct ifinfomsg info;
1161 .nlmsg_len = NLMSG_LENGTH
1163 RTA_LENGTH(sizeof(uint32_t))),
1164 .nlmsg_type = RTM_GETLINK,
1165 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1168 .ifi_family = AF_UNSPEC,
1169 .ifi_index = ifindex,
1172 .rta_type = IFLA_EXT_MASK,
1173 .rta_len = RTA_LENGTH(sizeof(int32_t)),
1175 .extmask = RTE_LE32(1),
1177 uint32_t sn = MLX5_NL_SN_GENERATE;
1180 ret = mlx5_nl_send(nl, &req.nh, sn);
1182 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1183 if (info->master && info->representor) {
1184 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1185 " and as representor", ifindex);
1193 * Delete VLAN network device by ifindex.
1196 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1197 * @param[in] ifindex
1198 * Interface index of network device to delete.
1201 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1204 uint32_t sn = MLX5_NL_SN_GENERATE;
1208 struct ifinfomsg info;
1211 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1212 .nlmsg_type = RTM_DELLINK,
1213 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1216 .ifi_family = AF_UNSPEC,
1217 .ifi_index = ifindex,
1222 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1224 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1226 DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1227 " ifindex %u, %d", ifindex, ret);
1231 /* Set of subroutines to build Netlink message. */
1232 static struct nlattr *
1233 nl_msg_tail(struct nlmsghdr *nlh)
1235 return (struct nlattr *)
1236 (((uint8_t *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
1240 nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
1242 struct nlattr *nla = nl_msg_tail(nlh);
1244 nla->nla_type = type;
1245 nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
1246 nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
1249 memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
1252 static struct nlattr *
1253 nl_attr_nest_start(struct nlmsghdr *nlh, int type)
1255 struct nlattr *nest = (struct nlattr *)nl_msg_tail(nlh);
1257 nl_attr_put(nlh, type, NULL, 0);
1262 nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
1264 nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
1268 * Create network VLAN device with specified VLAN tag.
1271 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1272 * @param[in] ifindex
1273 * Base network interface index.
1275 * VLAN tag for VLAN network device to create.
1278 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1279 uint32_t ifindex, uint16_t tag)
1281 struct nlmsghdr *nlh;
1282 struct ifinfomsg *ifm;
1283 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1285 alignas(RTE_CACHE_LINE_SIZE)
1286 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1287 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1288 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1289 NLMSG_ALIGN(sizeof(uint32_t)) +
1290 NLMSG_ALIGN(sizeof(name)) +
1291 NLMSG_ALIGN(sizeof("vlan")) +
1292 NLMSG_ALIGN(sizeof(uint32_t)) +
1293 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1294 struct nlattr *na_info;
1295 struct nlattr *na_vlan;
1296 uint32_t sn = MLX5_NL_SN_GENERATE;
1299 memset(buf, 0, sizeof(buf));
1300 nlh = (struct nlmsghdr *)buf;
1301 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1302 nlh->nlmsg_type = RTM_NEWLINK;
1303 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1304 NLM_F_EXCL | NLM_F_ACK;
1305 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1306 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1307 ifm->ifi_family = AF_UNSPEC;
1310 ifm->ifi_flags = IFF_UP;
1311 ifm->ifi_change = 0xffffffff;
1312 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1313 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1314 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1315 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1316 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1317 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1318 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1319 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1320 nl_attr_nest_end(nlh, na_vlan);
1321 nl_attr_nest_end(nlh, na_info);
1322 assert(sizeof(buf) >= nlh->nlmsg_len);
1323 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1325 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1327 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1330 // Try to get ifindex of created or pre-existing device.
1331 ret = if_nametoindex(name);
1333 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,