1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/netlink.h>
9 #include <linux/rtnetlink.h>
11 #include <rdma/rdma_netlink.h>
16 #include <sys/socket.h>
19 #include <rte_errno.h>
22 #include "mlx5_utils.h"
24 /* Size of the buffer to receive kernel messages */
/* NOTE(review): not referenced in the visible part of this file - confirm use. */
25 #define MLX5_NL_BUF_SIZE (32 * 1024)
26 /* Send buffer size for the Netlink socket */
/* Applied through SO_SNDBUF in mlx5_nl_init(). */
27 #define MLX5_SEND_BUF_SIZE 32768
28 /* Receive buffer size for the Netlink socket */
/* Applied through SO_RCVBUF in mlx5_nl_init(); also sizes the buffer in mlx5_nl_recv(). */
29 #define MLX5_RECV_BUF_SIZE 32768
32 * Define NDA_RTA as defined in iproute2 sources.
34 * See file include/libnetlink.h in the iproute2 sources.
/*
 * Given a pointer r to a struct ndmsg, yield a pointer to the struct rtattr
 * located NLMSG_ALIGN(sizeof(struct ndmsg)) bytes past r, i.e. the first
 * attribute following the neighbour message header.
 */
37 #define MLX5_NDA_RTA(r) \
38 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
42 * The following definitions are normally found in rdma/rdma_netlink.h,
43 * however they are so recent that most systems do not expose them yet.
/*
 * Fallback values for RDMA Netlink identifiers: each constant below is
 * defined here only when the matching HAVE_* macro is not defined.
 */
45 #ifndef HAVE_RDMA_NL_NLDEV
46 #define RDMA_NL_NLDEV 5
48 #ifndef HAVE_RDMA_NLDEV_CMD_GET
49 #define RDMA_NLDEV_CMD_GET 1
51 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
52 #define RDMA_NLDEV_CMD_PORT_GET 5
54 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
55 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
57 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
58 #define RDMA_NLDEV_ATTR_DEV_NAME 2
60 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
61 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
63 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
64 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
67 /* These are normally found in linux/if_link.h. */
/* Each value is defined here only when its HAVE_* guard macro is not defined. */
68 #ifndef HAVE_IFLA_NUM_VF
69 #define IFLA_NUM_VF 21
71 #ifndef HAVE_IFLA_EXT_MASK
72 #define IFLA_EXT_MASK 29
74 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
75 #define IFLA_PHYS_SWITCH_ID 36
77 #ifndef HAVE_IFLA_PHYS_PORT_NAME
78 #define IFLA_PHYS_PORT_NAME 38
81 /* Add/remove MAC address through Netlink */
/*
 * Accumulator handed to mlx5_nl_mac_addr_cb(): every NDA_LLADDR attribute
 * found is copied to (*mac)[mac_n] and mac_n is then incremented.
 */
82 struct mlx5_nl_mac_addr {
83 struct ether_addr (*mac)[];
84 /**< MAC address handled by the device. */
85 int mac_n; /**< Number of addresses in the array. */
88 /** Data structure used by mlx5_nl_ifindex_cb(). */
/*
 * name is compared with strcmp() against RDMA_NLDEV_ATTR_DEV_NAME payloads;
 * ibindex and ifindex are assigned by the callback from the parsed message.
 */
89 struct mlx5_nl_ifindex_data {
90 const char *name; /**< IB device name (in). */
91 uint32_t ibindex; /**< IB device index (out). */
92 uint32_t ifindex; /**< Network interface index (out). */
96 * Opens a Netlink socket.
99 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
102 * A file descriptor on success, a negative errno value otherwise and
/*
 * Visible steps: create a raw AF_NETLINK socket for the requested protocol,
 * set its send and receive buffer sizes, then bind it to a local
 * AF_NETLINK address. Error handling between the calls is not shown here.
 */
106 mlx5_nl_init(int protocol)
109 int sndbuf_size = MLX5_SEND_BUF_SIZE;
110 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
111 struct sockaddr_nl local = {
112 .nl_family = AF_NETLINK,
/* SOCK_CLOEXEC: the descriptor is closed automatically on exec(). */
116 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
/* Request the socket buffer sizes defined by MLX5_SEND/RECV_BUF_SIZE. */
121 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
126 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
131 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
143 * Send a request message to the kernel on the Netlink socket.
146 * Netlink socket file descriptor.
148 * The Netlink message sent to the kernel.
152 * Pointer to the request structure.
154 * Length of the request in bytes.
157 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Sends the Netlink header and the request payload as two separate iovec
 * segments in a single sendmsg() call, so the caller does not need to lay
 * them out contiguously.
 */
161 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
164 struct sockaddr_nl sa = {
165 .nl_family = AF_NETLINK,
/* Segment 0 is the bare header, segment 1 the len-byte request body. */
167 struct iovec iov[2] = {
168 { .iov_base = nh, .iov_len = sizeof(*nh), },
169 { .iov_base = req, .iov_len = len, },
171 struct msghdr msg = {
173 .msg_namelen = sizeof(sa),
179 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
181 send_bytes = sendmsg(nlsk_fd, &msg, 0);
/* A negative sendmsg() result is the failure case. */
182 if (send_bytes < 0) {
190 * Send a message to the kernel on the Netlink socket.
193 * The Netlink socket file descriptor used for communication.
195 * The Netlink message sent to the kernel.
200 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Sends an already assembled Netlink message: the iovec length is taken
 * from nh->nlmsg_len, so header and payload must be contiguous starting
 * at nh and nlmsg_len must already cover both.
 */
204 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
206 struct sockaddr_nl sa = {
207 .nl_family = AF_NETLINK,
211 .iov_len = nh->nlmsg_len,
213 struct msghdr msg = {
215 .msg_namelen = sizeof(sa),
221 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
223 send_bytes = sendmsg(nlsk_fd, &msg, 0);
/* A negative sendmsg() result is the failure case. */
224 if (send_bytes < 0) {
232 * Receive a message from the kernel on the Netlink socket, following
236 * The Netlink socket file descriptor used for communication.
240 * The callback function to call for each Netlink message received.
241 * @param[in, out] arg
242 * Custom arguments for the callback.
245 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Reads datagrams until one whose first header carries sequence number sn
 * arrives, then walks every Netlink message in that datagram. Error
 * messages with a negative code are converted into rte_errno.
 */
248 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
251 struct sockaddr_nl sa;
/* Receive buffer of MLX5_RECV_BUF_SIZE (32768) bytes, declared locally. */
252 char buf[MLX5_RECV_BUF_SIZE];
255 .iov_len = sizeof(buf),
257 struct msghdr msg = {
259 .msg_namelen = sizeof(sa),
261 /* One message at a time */
272 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
273 if (recv_bytes == -1) {
/* Only the first header of the datagram is checked against sn. */
277 nh = (struct nlmsghdr *)buf;
278 } while (nh->nlmsg_seq != sn);
280 NLMSG_OK(nh, (unsigned int)recv_bytes);
281 nh = NLMSG_NEXT(nh, recv_bytes)) {
/* The error payload holds a negated errno value; flip the sign for rte_errno. */
282 if (nh->nlmsg_type == NLMSG_ERROR) {
283 struct nlmsgerr *err_data = NLMSG_DATA(nh);
285 if (err_data->error < 0) {
286 rte_errno = -err_data->error;
292 /* Multi-part msgs and their trailing DONE message. */
293 if (nh->nlmsg_flags & NLM_F_MULTI) {
294 if (nh->nlmsg_type == NLMSG_DONE)
309 * Parse Netlink message to retrieve the bridge MAC address.
312 * Pointer to Netlink Message Header.
314 * PMD data registered with this callback.
317 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Callback for neighbour messages: scans the attributes that follow the
 * struct ndmsg and appends every NDA_LLADDR payload to the MAC array
 * described by arg (a struct mlx5_nl_mac_addr).
 */
320 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
322 struct mlx5_nl_mac_addr *data = arg;
323 struct ndmsg *r = NLMSG_DATA(nh);
324 struct rtattr *attribute;
/* Bytes of attribute data left after the Netlink and ndmsg headers. */
327 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
328 for (attribute = MLX5_NDA_RTA(r);
329 RTA_OK(attribute, len);
330 attribute = RTA_NEXT(attribute, len)) {
331 if (attribute->rta_type == NDA_LLADDR) {
/* The output array is treated as holding MLX5_MAX_MAC_ADDRESSES entries. */
332 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
334 "not enough room to finalize the"
/* NOTE(review): 18 presumably fits "xx:xx:xx:xx:xx:xx" plus the NUL - confirm size of m. */
342 ether_format_addr(m, 18, RTA_DATA(attribute));
343 DRV_LOG(DEBUG, "bridge MAC address %s", m);
/* Append the address and advance the fill count. */
345 memcpy(&(*data->mac)[data->mac_n++],
346 RTA_DATA(attribute), ETHER_ADDR_LEN);
353 * Get bridge MAC addresses.
356 * Pointer to Ethernet device.
358 * Pointer to the array table of MAC addresses to fill.
359 * It should have room for MLX5_MAX_MAC_ADDRESSES entries.
361 * Number of entries filled in MAC array.
364 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Issues an RTM_GETNEIGH dump request for the PF_BRIDGE family on the
 * device's interface index and collects the replies through
 * mlx5_nl_mac_addr_cb().
 */
367 mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
370 struct mlx5_priv *priv = dev->data->dev_private;
371 unsigned int iface_idx = mlx5_ifindex(dev);
374 struct ifinfomsg ifm;
377 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
378 .nlmsg_type = RTM_GETNEIGH,
379 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
382 .ifi_family = PF_BRIDGE,
383 .ifi_index = iface_idx,
386 struct mlx5_nl_mac_addr data = {
/* Take the next per-device sequence number for this exchange. */
392 uint32_t sn = priv->nl_sn++;
/* NOTE(review): -1 presumably means no route socket is open - confirm. */
394 if (priv->nl_socket_route == -1)
396 fd = priv->nl_socket_route;
/* Header and ifinfomsg body are passed separately to mlx5_nl_request(). */
397 ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
398 sizeof(struct ifinfomsg));
401 ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data);
407 DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
408 dev->data->port_id, strerror(rte_errno));
413 * Modify the MAC address neighbour table with Netlink.
416 * Pointer to Ethernet device.
418 * MAC address to consider.
420 * 1 to add the MAC address, 0 to remove the MAC address.
423 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Builds one RTM_NEWNEIGH (add) or RTM_DELNEIGH (remove) message for the
 * PF_BRIDGE family carrying the MAC address as an NDA_LLADDR attribute,
 * sends it and waits for the kernel's answer.
 */
426 mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
429 struct mlx5_priv *priv = dev->data->dev_private;
430 unsigned int iface_idx = mlx5_ifindex(dev);
435 uint8_t buffer[ETHER_ADDR_LEN];
438 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
439 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
440 NLM_F_EXCL | NLM_F_ACK,
441 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
444 .ndm_family = PF_BRIDGE,
445 .ndm_state = NUD_NOARP | NUD_PERMANENT,
446 .ndm_ifindex = iface_idx,
447 .ndm_flags = NTF_SELF,
450 .rta_type = NDA_LLADDR,
451 .rta_len = RTA_LENGTH(ETHER_ADDR_LEN),
/* Take the next per-device sequence number for this exchange. */
456 uint32_t sn = priv->nl_sn++;
458 if (priv->nl_socket_route == -1)
460 fd = priv->nl_socket_route;
/* Store the MAC as the attribute payload. */
461 memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
/* Grow the total length to cover the aligned headers plus the aligned attribute. */
462 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
463 RTA_ALIGN(req.rta.rta_len);
464 ret = mlx5_nl_send(fd, &req.hdr, sn);
/* No callback: only the status reported by mlx5_nl_recv() is of interest. */
467 ret = mlx5_nl_recv(fd, sn, NULL, NULL);
473 "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
476 add ? "add" : "remove",
477 mac->addr_bytes[0], mac->addr_bytes[1],
478 mac->addr_bytes[2], mac->addr_bytes[3],
479 mac->addr_bytes[4], mac->addr_bytes[5],
480 strerror(rte_errno));
488 * Pointer to Ethernet device.
490 * MAC address to register.
495 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Adds the address through mlx5_nl_mac_addr_modify(dev, mac, 1) and marks
 * index in the priv->mac_own bitfield.
 * NOTE(review): the bit is presumably set only on success; the guarding
 * condition is not visible here - confirm.
 */
498 mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
501 struct mlx5_priv *priv = dev->data->dev_private;
504 ret = mlx5_nl_mac_addr_modify(dev, mac, 1);
506 BITFIELD_SET(priv->mac_own, index);
513 * Remove a MAC address.
516 * Pointer to Ethernet device.
518 * MAC address to remove.
523 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Clears index in the priv->mac_own bitfield, then returns the result of
 * mlx5_nl_mac_addr_modify(dev, mac, 0). The bit is cleared before the
 * Netlink request is issued.
 */
526 mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
529 struct mlx5_priv *priv = dev->data->dev_private;
531 BITFIELD_RESET(priv->mac_own, index);
532 return mlx5_nl_mac_addr_modify(dev, mac, 0);
536 * Synchronize Netlink bridge table to the internal table.
539 * Pointer to Ethernet device.
/*
 * Fetches the bridge MAC list with mlx5_nl_mac_addr_list() and, for each
 * address, looks for an identical entry in dev->data->mac_addrs; addresses
 * are stored into the first all-zero slot found by the second scan.
 */
542 mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
544 struct ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
549 ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n);
552 for (i = 0; i != macs_n; ++i) {
555 /* Verify the address is not in the array yet. */
556 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
557 if (is_same_ether_addr(&macs[i],
558 &dev->data->mac_addrs[j]))
/* j short of the table size means the scan above stopped on a match. */
560 if (j != MLX5_MAX_MAC_ADDRESSES)
562 /* Find the first entry available. */
563 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
/* An all-zero address marks a free slot. */
564 if (is_zero_ether_addr(&dev->data->mac_addrs[j])) {
565 dev->data->mac_addrs[j] = macs[i];
573 * Flush all added MAC addresses.
576 * Pointer to Ethernet device.
/*
 * Walks the MAC table from the last index down to 0 and calls
 * mlx5_nl_mac_addr_remove() for every index whose bit is set in
 * priv->mac_own. The return value of each removal is not used.
 */
579 mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
581 struct mlx5_priv *priv = dev->data->dev_private;
/* NOTE(review): i must be signed for "i >= 0" to terminate; declaration not visible here. */
584 for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
585 struct ether_addr *m = &dev->data->mac_addrs[i];
587 if (BITFIELD_ISSET(priv->mac_own, i))
588 mlx5_nl_mac_addr_remove(dev, m, i);
593 * Enable promiscuous / all multicast mode through Netlink.
596 * Pointer to Ethernet device structure.
598 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
600 * Nonzero to enable, disable otherwise.
603 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Builds an RTM_NEWLINK request for the device's interface index whose
 * ifi_flags is "flags" when enabling and 0 when disabling, then sends it
 * on the route socket using the next sequence number.
 */
606 mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
608 struct mlx5_priv *priv = dev->data->dev_private;
609 unsigned int iface_idx = mlx5_ifindex(dev);
612 struct ifinfomsg ifi;
615 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
616 .nlmsg_type = RTM_NEWLINK,
617 .nlmsg_flags = NLM_F_REQUEST,
620 .ifi_flags = enable ? flags : 0,
622 .ifi_index = iface_idx,
/* Only IFF_PROMISC and IFF_ALLMULTI bits are accepted (assert-only check). */
628 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
/* Any negative descriptor value is tested here, not just -1. */
629 if (priv->nl_socket_route < 0)
631 fd = priv->nl_socket_route;
/* No NLM_F_ACK is requested and no mlx5_nl_recv() call is visible after the send. */
632 ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
639 * Enable promiscuous mode through Netlink.
642 * Pointer to Ethernet device structure.
644 * Nonzero to enable, disable otherwise.
647 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper: forwards to mlx5_nl_device_flags() with IFF_PROMISC.
 * The log message below reports the port id, the requested action and
 * strerror(rte_errno); the condition guarding it is not visible here.
 */
650 mlx5_nl_promisc(struct rte_eth_dev *dev, int enable)
652 int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable);
656 "port %u cannot %s promisc mode: Netlink error %s",
657 dev->data->port_id, enable ? "enable" : "disable",
658 strerror(rte_errno));
663 * Enable all multicast mode through Netlink.
666 * Pointer to Ethernet device structure.
668 * Nonzero to enable, disable otherwise.
671 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper: forwards to mlx5_nl_device_flags() with IFF_ALLMULTI.
 * The log message below reports the port id, the requested action and
 * strerror(rte_errno); the condition guarding it is not visible here.
 */
674 mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
676 int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable);
680 "port %u cannot %s allmulti mode: Netlink error %s",
681 dev->data->port_id, enable ? "enable" : "disable",
682 strerror(rte_errno));
687 * Process network interface information from Netlink message.
690 * Pointer to Netlink message header.
692 * Opaque data pointer for this callback.
695 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Callback for RDMA Netlink replies: walks the nlattr list that starts
 * right after the Netlink header, picks up the IB device index and the
 * network interface index, compares the device name with data->name, and
 * stores both indices into the struct mlx5_nl_ifindex_data given as arg.
 */
698 mlx5_nl_ifindex_cb(struct nlmsghdr *nh, void *arg)
700 struct mlx5_nl_ifindex_data *data = arg;
/* Attributes begin immediately after the Netlink header. */
701 size_t off = NLMSG_HDRLEN;
702 uint32_t ibindex = 0;
703 uint32_t ifindex = 0;
/* Message type filter involving the CMD_GET and CMD_PORT_GET reply types. */
706 if (nh->nlmsg_type !=
707 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
709 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
711 while (off < nh->nlmsg_len) {
712 struct nlattr *na = (void *)((uintptr_t)nh + off);
713 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
/* The attribute claims more bytes than remain in the message. */
715 if (na->nla_len > nh->nlmsg_len - off)
717 switch (na->nla_type) {
718 case RDMA_NLDEV_ATTR_DEV_INDEX:
719 ibindex = *(uint32_t *)payload;
721 case RDMA_NLDEV_ATTR_DEV_NAME:
/* The payload is used as a NUL-terminated string here. */
722 if (!strcmp(payload, data->name))
725 case RDMA_NLDEV_ATTR_NDEV_INDEX:
726 ifindex = *(uint32_t *)payload;
/*
 * NOTE(review): an nla_len of 0 would leave off unchanged and the loop
 * would not advance - confirm a guard exists or that this cannot occur.
 */
731 off += NLA_ALIGN(na->nla_len);
734 data->ibindex = ibindex;
735 data->ifindex = ifindex;
744 * Get index of network interface associated with some IB device.
746 * This is the only somewhat safe method to avoid resorting to heuristics
747 * when faced with port representors. Unfortunately it requires at least
751 * Netlink socket of the RDMA kind (NETLINK_RDMA).
756 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
/*
 * Two-pass lookup over the RDMA Netlink socket: a first dump request is
 * parsed by mlx5_nl_ifindex_cb() to find the IB device index, then a
 * RDMA_NLDEV_CMD_PORT_GET request carrying that device index and a port
 * index is sent and parsed by the same callback.
 */
760 mlx5_nl_ifindex(int nl, const char *name)
/* Port index written into the second request. */
762 static const uint32_t pindex = 1;
/* Starting sequence number comes from random(). */
763 uint32_t seq = random();
764 struct mlx5_nl_ifindex_data data = {
766 .ibindex = 0, /* Determined during first pass. */
767 .ifindex = 0, /* Determined during second pass. */
/* Room for the header plus two attributes: device index and port index. */
771 uint8_t buf[NLMSG_HDRLEN +
772 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
773 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
/* First pass: header only (no payload), sent as a dump request. */
776 .nlmsg_len = NLMSG_LENGTH(0),
777 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
779 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
785 ret = mlx5_nl_send(nl, &req.nh, seq);
788 ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
/* Second pass: reuse the same request storage for a PORT_GET query. */
794 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
795 RDMA_NLDEV_CMD_PORT_GET);
796 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
797 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
/* First attribute: the IB device index found during the first pass. */
798 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
799 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
800 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
801 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
802 &data.ibindex, sizeof(data.ibindex));
/* Second attribute, placed at the next aligned offset: the port index. */
803 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
804 na->nla_len = NLA_HDRLEN + sizeof(pindex);
805 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
806 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
807 &pindex, sizeof(pindex));
808 ret = mlx5_nl_send(nl, &req.nh, seq);
811 ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
823 * Process switch information from Netlink message.
826 * Pointer to Netlink message header.
828 * Opaque data pointer for this callback.
831 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Callback for RTM_NEWLINK messages: walks the rtattr list that follows
 * the struct ifinfomsg, gathers the physical port name and switch ID,
 * derives the master/representor flags and copies the resulting
 * struct mlx5_switch_info into the memory pointed to by arg.
 */
834 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
836 struct mlx5_switch_info info = {
/* Attributes start after the Netlink header and the ifinfomsg body. */
843 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
844 bool port_name_set = false;
845 bool switch_id_set = false;
846 bool num_vf_set = false;
848 if (nh->nlmsg_type != RTM_NEWLINK)
850 while (off < nh->nlmsg_len) {
851 struct rtattr *ra = (void *)((uintptr_t)nh + off);
852 void *payload = RTA_DATA(ra);
/* The attribute claims more bytes than remain in the message. */
855 if (ra->rta_len > nh->nlmsg_len - off)
857 switch (ra->rta_type) {
861 case IFLA_PHYS_PORT_NAME:
863 mlx5_translate_port_name((char *)payload,
866 case IFLA_PHYS_SWITCH_ID:
/* Fold the payload bytes into one integer, first byte most significant. */
868 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
869 info.switch_id <<= 8;
870 info.switch_id |= ((uint8_t *)payload)[i];
872 switch_id_set = true;
875 off += RTA_ALIGN(ra->rta_len);
878 if (info.port_name_new) {
879 /* New representors naming scheme. */
/* A port_name of -1 selects master; any other value selects representor. */
881 info.master = (info.port_name == -1);
882 info.representor = (info.port_name != -1);
885 /* Legacy representors naming scheme. */
/* Master: no port name, or a VF count was seen. Representor: port name without VF count. */
886 info.master = (!port_name_set || num_vf_set);
887 info.representor = port_name_set && !num_vf_set;
/* The two roles are mutually exclusive (assert-only check). */
890 assert(!(info.master && info.representor));
/* arg must point to storage of at least sizeof(struct mlx5_switch_info). */
891 memcpy(arg, &info, sizeof(info));
899 * Get switch information associated with network interface.
902 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
904 * Network interface index.
906 * Switch information object, populated in case of success.
909 * 0 on success, a negative errno value otherwise and rte_errno is set.
912 mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
914 uint32_t seq = random();
917 struct ifinfomsg info;
922 .nlmsg_len = NLMSG_LENGTH
924 RTA_LENGTH(sizeof(uint32_t))),
925 .nlmsg_type = RTM_GETLINK,
926 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
929 .ifi_family = AF_UNSPEC,
930 .ifi_index = ifindex,
933 .rta_type = IFLA_EXT_MASK,
934 .rta_len = RTA_LENGTH(sizeof(int32_t)),
936 .extmask = RTE_LE32(1),
940 ret = mlx5_nl_send(nl, &req.nh, seq);
942 ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
943 if (info->master && info->representor) {
944 DRV_LOG(ERR, "ifindex %u device is recognized as master"
945 " and as representor", ifindex);