1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <linux/if_link.h>
8 #include <linux/netlink.h>
9 #include <linux/rtnetlink.h>
11 #include <rdma/rdma_netlink.h>
16 #include <sys/socket.h>
19 #include <rte_errno.h>
22 #include "mlx5_utils.h"
24 /* Size of the buffer to receive kernel messages */
/*
 * NOTE(review): MLX5_NL_BUF_SIZE is not referenced by any line visible in
 * this view; the receive buffer declared in mlx5_nl_recv() is sized with
 * MLX5_RECV_BUF_SIZE instead.
 */
25 #define MLX5_NL_BUF_SIZE (32 * 1024)
26 /* Send buffer size for the Netlink socket */
/* Passed to setsockopt(SO_SNDBUF) in mlx5_nl_init(). */
27 #define MLX5_SEND_BUF_SIZE 32768
28 /* Receive buffer size for the Netlink socket */
/*
 * Passed to setsockopt(SO_RCVBUF) in mlx5_nl_init() and used as the size of
 * the local receive array in mlx5_nl_recv().
 */
29 #define MLX5_RECV_BUF_SIZE 32768
32 * Define NDA_RTA as defined in iproute2 sources.
34 * See include/libnetlink.h in the iproute2 sources.
/*
 * Given a pointer r to a struct ndmsg, yield a struct rtattr pointer located
 * NLMSG_ALIGN(sizeof(struct ndmsg)) bytes past r. mlx5_nl_mac_addr_cb() uses
 * it as the starting point of its attribute walk.
 */
37 #define MLX5_NDA_RTA(r) \
38 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
42 * The following definitions are normally found in rdma/rdma_netlink.h,
43 * however they are so recent that most systems do not expose them yet.
/*
 * Fallback values for RDMA Netlink identifiers: each constant is defined here
 * only when the corresponding HAVE_* macro is not defined.
 * NOTE(review): the matching #endif lines are not visible in this view.
 */
/* Client identifier combined with a command through RDMA_NL_GET_TYPE(). */
45 #ifndef HAVE_RDMA_NL_NLDEV
46 #define RDMA_NL_NLDEV 5
/* Commands: accepted message types in mlx5_nl_cmdget_cb(). */
48 #ifndef HAVE_RDMA_NLDEV_CMD_GET
49 #define RDMA_NLDEV_CMD_GET 1
51 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
52 #define RDMA_NLDEV_CMD_PORT_GET 5
/* Attribute types matched by the switch in mlx5_nl_cmdget_cb(). */
54 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
55 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
57 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
58 #define RDMA_NLDEV_ATTR_DEV_NAME 2
60 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
61 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
63 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
64 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
67 /* These are normally found in linux/if_link.h. */
/*
 * Each constant below is defined only when its HAVE_* macro is not defined.
 * NOTE(review): the matching #endif lines are not visible in this view.
 */
/* NOTE(review): no visible line references IFLA_NUM_VF; confirm its user. */
68 #ifndef HAVE_IFLA_NUM_VF
69 #define IFLA_NUM_VF 21
/* Attribute type of the request built in mlx5_nl_switch_info(). */
71 #ifndef HAVE_IFLA_EXT_MASK
72 #define IFLA_EXT_MASK 29
/* Attribute types matched by the switch in mlx5_nl_switch_info_cb(). */
74 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
75 #define IFLA_PHYS_SWITCH_ID 36
77 #ifndef HAVE_IFLA_PHYS_PORT_NAME
78 #define IFLA_PHYS_PORT_NAME 38
81 /* Add/remove MAC address through Netlink */
/*
 * Accumulator handed to mlx5_nl_mac_addr_cb(): for every NDA_LLADDR attribute
 * the callback copies ETHER_ADDR_LEN bytes into (*mac)[mac_n] and then
 * increments mac_n.
 * NOTE(review): the closing line of this structure is not visible here.
 */
82 struct mlx5_nl_mac_addr {
/* Pointer to an array of unspecified length; the bound is enforced by the
 * callback, which compares mac_n with MLX5_MAX_MAC_ADDRESSES. */
83 struct ether_addr (*mac)[];
84 /**< MAC address handled by the device. */
85 int mac_n; /**< Number of addresses in the array. */
88 /** Data structure used by mlx5_nl_cmdget_cb(). */
/*
 * An object of this type is set up in mlx5_nl_ifindex() and passed as the
 * callback argument to mlx5_nl_recv().
 * NOTE(review): the closing line of this structure is not visible here.
 */
89 struct mlx5_nl_ifindex_data {
/* Compared with strcmp() against RDMA_NLDEV_ATTR_DEV_NAME payloads. */
90 const char *name; /**< IB device name (in). */
/* The three fields below are written by mlx5_nl_cmdget_cb() from the
 * DEV_INDEX, NDEV_INDEX and PORT_INDEX attributes respectively. */
91 uint32_t ibindex; /**< IB device index (out). */
92 uint32_t ifindex; /**< Network interface index (out). */
93 uint32_t portnum; /**< IB device max port number. */
97 * Opens a Netlink socket.
100 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
103 * A file descriptor on success, a negative errno value otherwise and
/*
 * Open a raw AF_NETLINK socket (close-on-exec) for the requested protocol,
 * set its send and receive buffer sizes, then bind it to a local sockaddr_nl.
 * NOTE(review): the declarations of fd and ret, the checks performed after
 * each call and the return statements are not visible in this view.
 */
107 mlx5_nl_init(int protocol)
110 int sndbuf_size = MLX5_SEND_BUF_SIZE;
111 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
/* Only nl_family is visibly initialized for the local address. */
112 struct sockaddr_nl local = {
113 .nl_family = AF_NETLINK,
117 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
/* Buffer sizes are handed to the kernel as plain int option values. */
122 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
127 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
132 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
144 * Send a request message to the kernel on the Netlink socket.
147 * Netlink socket file descriptor.
149 * The Netlink message sent to the kernel.
153 * Pointer to the request structure.
155 * Length of the request in bytes.
158 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Send a Netlink request made of two separate memory segments: the message
 * header nh, followed by len bytes taken from req.
 * NOTE(review): the end of the parameter list, the remaining msghdr fields
 * and the error/return path are not visible in this view; sn is presumably
 * stored in nh->nlmsg_seq on a line that is not shown -- confirm.
 */
162 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
165 struct sockaddr_nl sa = {
166 .nl_family = AF_NETLINK,
/* Scatter/gather list: header first, request payload second. */
168 struct iovec iov[2] = {
169 { .iov_base = nh, .iov_len = sizeof(*nh), },
170 { .iov_base = req, .iov_len = len, },
172 struct msghdr msg = {
174 .msg_namelen = sizeof(sa),
180 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
182 send_bytes = sendmsg(nlsk_fd, &msg, 0);
/* A negative sendmsg() result is treated as failure. */
183 if (send_bytes < 0) {
191 * Send a message to the kernel on the Netlink socket.
194 * The Netlink socket file descriptor used for communication.
196 * The Netlink message sent to the kernel.
201 * The number of sent bytes on success, a negative errno value otherwise and
/*
 * Send a complete Netlink message in a single segment of nh->nlmsg_len bytes.
 * NOTE(review): the iovec base address, the remaining msghdr fields and the
 * error/return path are not visible in this view; sn is presumably stored in
 * nh->nlmsg_seq on a line that is not shown -- confirm.
 */
205 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
207 struct sockaddr_nl sa = {
208 .nl_family = AF_NETLINK,
/* The length comes from the message header itself, so the caller must have
 * set nlmsg_len to cover the header and everything that follows it. */
212 .iov_len = nh->nlmsg_len,
214 struct msghdr msg = {
216 .msg_namelen = sizeof(sa),
222 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
224 send_bytes = sendmsg(nlsk_fd, &msg, 0);
/* A negative sendmsg() result is treated as failure. */
225 if (send_bytes < 0) {
233 * Receive a message from the kernel on the Netlink socket, following
237 * The Netlink socket file descriptor used for communication.
241 * The callback function to call for each Netlink message received.
242 * @param[in, out] arg
243 * Custom arguments for the callback.
246 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Receive Netlink replies on nlsk_fd and walk the messages they contain.
 * NOTE(review): the end of the parameter list, several declarations, the
 * loop headers, the place where cb is invoked and the return path are not
 * visible in this view.
 */
249 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
252 struct sockaddr_nl sa;
/* Local receive array of MLX5_RECV_BUF_SIZE bytes. */
253 char buf[MLX5_RECV_BUF_SIZE];
256 .iov_len = sizeof(buf),
258 struct msghdr msg = {
260 .msg_namelen = sizeof(sa),
262 /* One message at a time */
273 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
/* recvmsg() failure is detected by comparing with -1. */
274 if (recv_bytes == -1) {
278 nh = (struct nlmsghdr *)buf;
/* Keep receiving until the first header in buf carries sequence number sn. */
279 } while (nh->nlmsg_seq != sn);
/* Iterate over every well-formed message within the received bytes. */
281 NLMSG_OK(nh, (unsigned int)recv_bytes);
282 nh = NLMSG_NEXT(nh, recv_bytes)) {
283 if (nh->nlmsg_type == NLMSG_ERROR) {
284 struct nlmsgerr *err_data = NLMSG_DATA(nh);
/* A negative error field is negated and stored into rte_errno. */
286 if (err_data->error < 0) {
287 rte_errno = -err_data->error;
293 /* Multi-part msgs and their trailing DONE message. */
294 if (nh->nlmsg_flags & NLM_F_MULTI) {
295 if (nh->nlmsg_type == NLMSG_DONE)
310 * Parse Netlink message to retrieve the bridge MAC address.
313 * Pointer to Netlink Message Header.
315 * PMD data registered with this callback.
318 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_nl_recv() callback: scan the attributes of a neighbour message and
 * append every NDA_LLADDR payload to the table referenced by arg.
 * NOTE(review): the declarations of len and m, the logging call opened
 * before the string fragment below and the return path are not visible.
 */
321 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
323 struct mlx5_nl_mac_addr *data = arg;
324 struct ndmsg *r = NLMSG_DATA(nh);
325 struct rtattr *attribute;
/* Bytes left for attributes once the header and the ndmsg are accounted for. */
328 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
329 for (attribute = MLX5_NDA_RTA(r);
330 RTA_OK(attribute, len);
331 attribute = RTA_NEXT(attribute, len)) {
332 if (attribute->rta_type == NDA_LLADDR) {
/* The table is full once mac_n reaches MLX5_MAX_MAC_ADDRESSES. */
333 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
335 "not enough room to finalize the"
/* 18 is the buffer length given to ether_format_addr(); presumably the size
 * of m, whose declaration is not visible -- confirm. */
343 ether_format_addr(m, 18, RTA_DATA(attribute));
344 DRV_LOG(DEBUG, "bridge MAC address %s", m);
/* Copy the address into the next free slot and advance the count. */
346 memcpy(&(*data->mac)[data->mac_n++],
347 RTA_DATA(attribute), ETHER_ADDR_LEN);
354 * Get bridge MAC addresses.
357 * Pointer to Ethernet device.
359 * Pointer to the array table of MAC addresses to fill.
360 * Its size should be MLX5_MAX_MAC_ADDRESSES entries.
362 * Number of entries filled in MAC array.
365 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Issue an RTM_GETNEIGH dump for the PF_BRIDGE family on the device's
 * interface index and collect the answers through mlx5_nl_mac_addr_cb().
 * NOTE(review): the end of the parameter list, the initializer of data, the
 * result checks and the return path are not visible in this view.
 */
368 mlx5_nl_mac_addr_list(struct rte_eth_dev *dev, struct ether_addr (*mac)[],
371 struct mlx5_priv *priv = dev->data->dev_private;
372 unsigned int iface_idx = mlx5_ifindex(dev);
375 struct ifinfomsg ifm;
/* The header length covers the header plus one struct ifinfomsg. */
378 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
379 .nlmsg_type = RTM_GETNEIGH,
380 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
383 .ifi_family = PF_BRIDGE,
384 .ifi_index = iface_idx,
387 struct mlx5_nl_mac_addr data = {
/* Take the current per-device sequence number and advance it. */
393 uint32_t sn = priv->nl_sn++;
/* -1 marks the route socket as unavailable (action taken not visible). */
395 if (priv->nl_socket_route == -1)
397 fd = priv->nl_socket_route;
/* Header and ifinfomsg are sent as two segments by mlx5_nl_request(). */
398 ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
399 sizeof(struct ifinfomsg));
402 ret = mlx5_nl_recv(fd, sn, mlx5_nl_mac_addr_cb, &data);
408 DRV_LOG(DEBUG, "port %u cannot retrieve MAC address list %s",
409 dev->data->port_id, strerror(rte_errno));
414 * Modify the MAC address neighbour table with Netlink.
417 * Pointer to Ethernet device.
419 * MAC address to consider.
421 * 1 to add the MAC address, 0 to remove the MAC address.
424 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Build and send an RTM_NEWNEIGH or RTM_DELNEIGH request carrying one
 * NDA_LLADDR attribute, then wait for the kernel's reply.
 * NOTE(review): the end of the parameter list (including the add flag), the
 * request structure layout, the result checks, the logging call opened
 * before the string fragment below and the return path are not visible.
 */
427 mlx5_nl_mac_addr_modify(struct rte_eth_dev *dev, struct ether_addr *mac,
430 struct mlx5_priv *priv = dev->data->dev_private;
431 unsigned int iface_idx = mlx5_ifindex(dev);
/* Presumably the storage that RTA_DATA(&req.rta) points into -- confirm
 * against the full structure definition. */
436 uint8_t buffer[ETHER_ADDR_LEN];
439 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
/* NOTE(review): NLM_F_CREATE and NLM_F_EXCL are set whatever the value of
 * add, hence also for RTM_DELNEIGH requests. */
440 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
441 NLM_F_EXCL | NLM_F_ACK,
442 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
445 .ndm_family = PF_BRIDGE,
446 .ndm_state = NUD_NOARP | NUD_PERMANENT,
447 .ndm_ifindex = iface_idx,
448 .ndm_flags = NTF_SELF,
451 .rta_type = NDA_LLADDR,
452 .rta_len = RTA_LENGTH(ETHER_ADDR_LEN),
/* Take the current per-device sequence number and advance it. */
457 uint32_t sn = priv->nl_sn++;
/* -1 marks the route socket as unavailable (action taken not visible). */
459 if (priv->nl_socket_route == -1)
461 fd = priv->nl_socket_route;
462 memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
/* Grow the total length so that it also covers the aligned attribute. */
463 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
464 RTA_ALIGN(req.rta.rta_len);
465 ret = mlx5_nl_send(fd, &req.hdr, sn);
/* No callback is supplied for the reply. */
468 ret = mlx5_nl_recv(fd, sn, NULL, NULL);
474 "port %u cannot %s MAC address %02X:%02X:%02X:%02X:%02X:%02X"
477 add ? "add" : "remove",
478 mac->addr_bytes[0], mac->addr_bytes[1],
479 mac->addr_bytes[2], mac->addr_bytes[3],
480 mac->addr_bytes[4], mac->addr_bytes[5],
481 strerror(rte_errno));
489 * Pointer to Ethernet device.
491 * MAC address to register.
496 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Add a MAC address through mlx5_nl_mac_addr_modify() with its add argument
 * set to 1, and set bit index in priv->mac_own.
 * NOTE(review): the end of the parameter list, the condition guarding
 * BITFIELD_SET() and the return statement are not visible in this view.
 */
499 mlx5_nl_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac,
502 struct mlx5_priv *priv = dev->data->dev_private;
505 ret = mlx5_nl_mac_addr_modify(dev, mac, 1);
/* priv->mac_own is the bitfield consulted by mlx5_nl_mac_addr_flush(). */
507 BITFIELD_SET(priv->mac_own, index);
514 * Remove a MAC address.
517 * Pointer to Ethernet device.
519 * MAC address to remove.
524 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Remove a MAC address through mlx5_nl_mac_addr_modify() with its add
 * argument set to 0 and return that call's result.
 * NOTE(review): the end of the parameter list is not visible in this view.
 */
527 mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct ether_addr *mac,
530 struct mlx5_priv *priv = dev->data->dev_private;
/* The ownership bit is cleared before the request is issued, so it stays
 * cleared whatever the outcome of the Netlink operation. */
532 BITFIELD_RESET(priv->mac_own, index);
533 return mlx5_nl_mac_addr_modify(dev, mac, 0);
537 * Synchronize Netlink bridge table to the internal table.
540 * Pointer to Ethernet device.
/*
 * Fetch the bridge MAC addresses with mlx5_nl_mac_addr_list() and copy those
 * missing from dev->data->mac_addrs into the first zeroed entry found.
 * NOTE(review): the declarations of macs_n, i, j and ret, the handling of
 * ret and the statements ending each loop are not visible in this view.
 */
543 mlx5_nl_mac_addr_sync(struct rte_eth_dev *dev)
/* Local table with room for MLX5_MAX_MAC_ADDRESSES addresses. */
545 struct ether_addr macs[MLX5_MAX_MAC_ADDRESSES];
550 ret = mlx5_nl_mac_addr_list(dev, &macs, &macs_n);
553 for (i = 0; i != macs_n; ++i) {
556 /* Verify the address is not in the array yet. */
557 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j)
558 if (is_same_ether_addr(&macs[i],
559 &dev->data->mac_addrs[j]))
/* j stopping short of the table size means a matching entry was found. */
561 if (j != MLX5_MAX_MAC_ADDRESSES)
563 /* Find the first entry available. */
564 for (j = 0; j != MLX5_MAX_MAC_ADDRESSES; ++j) {
/* An all-zero address marks a free slot. */
565 if (is_zero_ether_addr(&dev->data->mac_addrs[j])) {
566 dev->data->mac_addrs[j] = macs[i];
574 * Flush all added MAC addresses.
577 * Pointer to Ethernet device.
/*
 * Walk the MAC address table from the last index down to 0 and remove every
 * entry whose bit is set in priv->mac_own.
 * NOTE(review): the declaration of i is not visible; it has to be a signed
 * type for the i >= 0 loop condition to ever become false.
 */
580 mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev)
582 struct mlx5_priv *priv = dev->data->dev_private;
585 for (i = MLX5_MAX_MAC_ADDRESSES - 1; i >= 0; --i) {
586 struct ether_addr *m = &dev->data->mac_addrs[i];
/* The result of mlx5_nl_mac_addr_remove() is not used here. */
588 if (BITFIELD_ISSET(priv->mac_own, i))
589 mlx5_nl_mac_addr_remove(dev, m, i);
594 * Enable promiscuous / all multicast mode through Netlink.
597 * Pointer to Ethernet device structure.
599 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
601 * Nonzero to enable, disable otherwise.
604 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Send an RTM_NEWLINK request that sets or clears the given interface flags
 * on the device's network interface.
 * NOTE(review): the remaining ifinfomsg initializers, the declarations of fd
 * and ret, the result handling and the return path are not visible.
 */
607 mlx5_nl_device_flags(struct rte_eth_dev *dev, uint32_t flags, int enable)
609 struct mlx5_priv *priv = dev->data->dev_private;
610 unsigned int iface_idx = mlx5_ifindex(dev);
613 struct ifinfomsg ifi;
616 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
617 .nlmsg_type = RTM_NEWLINK,
/* NOTE(review): NLM_F_ACK is not requested and no mlx5_nl_recv() call is
 * visible in this function -- confirm that kernel errors are not needed. */
618 .nlmsg_flags = NLM_F_REQUEST,
/* Requested flag values: the flags themselves to enable, 0 to disable. */
621 .ifi_flags = enable ? flags : 0,
623 .ifi_index = iface_idx,
/* Only IFF_PROMISC and IFF_ALLMULTI may be present in flags. */
629 assert(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
/* NOTE(review): this check uses < 0 whereas mlx5_nl_mac_addr_list() and
 * mlx5_nl_mac_addr_modify() compare the same field with -1. */
630 if (priv->nl_socket_route < 0)
632 fd = priv->nl_socket_route;
633 ret = mlx5_nl_send(fd, &req.hdr, priv->nl_sn++);
640 * Enable promiscuous mode through Netlink.
643 * Pointer to Ethernet device structure.
645 * Nonzero to enable, disable otherwise.
648 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper calling mlx5_nl_device_flags() with IFF_PROMISC.
 * NOTE(review): the condition and logging call that consume the message
 * fragment below, and the return statement, are not visible in this view.
 */
651 mlx5_nl_promisc(struct rte_eth_dev *dev, int enable)
653 int ret = mlx5_nl_device_flags(dev, IFF_PROMISC, enable);
/* Message arguments: port id, requested action, text for rte_errno. */
657 "port %u cannot %s promisc mode: Netlink error %s",
658 dev->data->port_id, enable ? "enable" : "disable",
659 strerror(rte_errno));
664 * Enable all multicast mode through Netlink.
667 * Pointer to Ethernet device structure.
669 * Nonzero to enable, disable otherwise.
672 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Thin wrapper calling mlx5_nl_device_flags() with IFF_ALLMULTI.
 * NOTE(review): the condition and logging call that consume the message
 * fragment below, and the return statement, are not visible in this view.
 */
675 mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable)
677 int ret = mlx5_nl_device_flags(dev, IFF_ALLMULTI, enable);
/* Message arguments: port id, requested action, text for rte_errno. */
681 "port %u cannot %s allmulti mode: Netlink error %s",
682 dev->data->port_id, enable ? "enable" : "disable",
683 strerror(rte_errno));
688 * Process network interface information from Netlink message.
691 * Pointer to Netlink message header.
693 * Opaque data pointer for this callback.
696 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_nl_recv() callback: walk the attributes of an RDMA nldev reply,
 * collect the device index, netdev index and port index, and store them in
 * the mlx5_nl_ifindex_data object passed as arg.
 * NOTE(review): the statements following each visible condition (including
 * what happens on a device name match), the condition guarding the final
 * stores and the return path are not visible in this view.
 */
699 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
701 struct mlx5_nl_ifindex_data *data = arg;
/* Attributes are scanned starting right after the Netlink header. */
702 size_t off = NLMSG_HDRLEN;
703 uint32_t ibindex = 0;
704 uint32_t ifindex = 0;
705 uint32_t portnum = 0;
/* Message type filter built from the CMD_GET and CMD_PORT_GET types. */
708 if (nh->nlmsg_type !=
709 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
711 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
713 while (off < nh->nlmsg_len) {
714 struct nlattr *na = (void *)((uintptr_t)nh + off);
715 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
/* Reject an attribute claiming more bytes than remain in the message. */
717 if (na->nla_len > nh->nlmsg_len - off)
719 switch (na->nla_type) {
720 case RDMA_NLDEV_ATTR_DEV_INDEX:
721 ibindex = *(uint32_t *)payload;
723 case RDMA_NLDEV_ATTR_DEV_NAME:
/* NOTE(review): strcmp() assumes the payload is NUL-terminated -- confirm. */
724 if (!strcmp(payload, data->name))
727 case RDMA_NLDEV_ATTR_NDEV_INDEX:
728 ifindex = *(uint32_t *)payload;
730 case RDMA_NLDEV_ATTR_PORT_INDEX:
731 portnum = *(uint32_t *)payload;
/* NOTE(review): no visible check rejects nla_len == 0, in which case off
 * would not advance -- confirm such input cannot reach this loop. */
736 off += NLA_ALIGN(na->nla_len);
/* Publish the collected values to the caller's object. */
739 data->ibindex = ibindex;
740 data->ifindex = ifindex;
741 data->portnum = portnum;
750 * Get index of network interface associated with some IB device.
752 * This is the only somewhat safe method to avoid resorting to heuristics
753 * when faced with port representors. Unfortunately it requires at least
757 * Netlink socket of the RDMA kind (NETLINK_RDMA).
761 * IB device port index, starting from 1
763 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
/*
 * Two-pass RDMA Netlink query: a dump request first, then an
 * RDMA_NLDEV_CMD_PORT_GET request carrying the IB device index and the port
 * index; both replies are processed by mlx5_nl_cmdget_cb().
 * NOTE(review): the request union/struct layout, the command of the first
 * request, the checks between the steps and the return path are not visible
 * in this view.
 */
767 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
/* The sequence number starts from a random() value. */
769 uint32_t seq = random();
770 struct mlx5_nl_ifindex_data data = {
772 .ibindex = 0, /* Determined during first pass. */
773 .ifindex = 0, /* Determined during second pass. */
/* Room for the Netlink header plus two attributes, each holding an aligned
 * 32-bit value. */
777 uint8_t buf[NLMSG_HDRLEN +
778 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
779 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
/* First request: header only, no payload. */
782 .nlmsg_len = NLMSG_LENGTH(0),
783 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
785 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
791 ret = mlx5_nl_send(nl, &req.nh, seq);
794 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
/* Second request: reuse the header, no longer a dump. */
800 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
801 RDMA_NLDEV_CMD_PORT_GET);
802 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
803 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
/* First attribute: IB device index obtained from the first pass. */
804 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
805 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
806 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
807 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
808 &data.ibindex, sizeof(data.ibindex));
/* Second attribute, placed after the aligned first one: port index. */
809 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
810 na->nla_len = NLA_HDRLEN + sizeof(pindex);
811 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
812 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
813 &pindex, sizeof(pindex));
/* NOTE(review): the same seq value is used for both exchanges, so a late
 * message from the first one would also satisfy the sequence check in
 * mlx5_nl_recv() -- consider a distinct number for the second pass. */
814 ret = mlx5_nl_send(nl, &req.nh, seq);
817 ret = mlx5_nl_recv(nl, seq, mlx5_nl_cmdget_cb, &data);
829 * Process switch information from Netlink message.
832 * Pointer to Netlink message header.
834 * Opaque data pointer for this callback.
837 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_nl_recv() callback: walk the attributes of an RTM_NEWLINK message,
 * derive the master/representor roles and the switch identifier, then copy
 * the resulting mlx5_switch_info object into arg.
 * NOTE(review): the initializer of info, some switch cases, the statements
 * following each visible condition and the return path are not visible in
 * this view.
 */
840 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
842 struct mlx5_switch_info info = {
/* Attributes are scanned starting after the header and the ifinfomsg. */
849 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
850 bool port_name_set = false;
851 bool switch_id_set = false;
852 bool num_vf_set = false;
854 if (nh->nlmsg_type != RTM_NEWLINK)
856 while (off < nh->nlmsg_len) {
857 struct rtattr *ra = (void *)((uintptr_t)nh + off);
858 void *payload = RTA_DATA(ra);
/* Reject an attribute claiming more bytes than remain in the message. */
861 if (ra->rta_len > nh->nlmsg_len - off)
863 switch (ra->rta_type) {
867 case IFLA_PHYS_PORT_NAME:
869 mlx5_translate_port_name((char *)payload,
872 case IFLA_PHYS_SWITCH_ID:
/* Fold the payload bytes into switch_id, first byte most significant. */
874 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
875 info.switch_id <<= 8;
876 info.switch_id |= ((uint8_t *)payload)[i];
878 switch_id_set = true;
/* NOTE(review): no visible check rejects rta_len == 0, in which case off
 * would not advance -- confirm such input cannot reach this loop. */
881 off += RTA_ALIGN(ra->rta_len);
884 if (info.port_name_new) {
885 /* New representors naming schema. */
/* A port_name of -1 designates the master, any other value a representor. */
887 info.master = (info.port_name == -1);
888 info.representor = (info.port_name != -1);
891 /* Legacy representors naming schema. */
/* Master when no port name was seen or a VF count was seen; representor
 * when a port name was seen without a VF count. */
892 info.master = (!port_name_set || num_vf_set);
893 info.representor = port_name_set && !num_vf_set;
/* The two roles are mutually exclusive. */
896 assert(!(info.master && info.representor));
897 memcpy(arg, &info, sizeof(info));
905 * Get switch information associated with network interface.
908 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
910 * Network interface index.
912 * Switch information object, populated in case of success.
915 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Send an RTM_GETLINK request with an IFLA_EXT_MASK attribute for ifindex and
 * let mlx5_nl_switch_info_cb() fill *info from the reply.
 * NOTE(review): the request structure layout, the check between send and
 * receive, and everything after the final log call are not visible in this
 * view.
 */
918 mlx5_nl_switch_info(int nl, unsigned int ifindex, struct mlx5_switch_info *info)
/* The sequence number is taken from random(). */
920 uint32_t seq = random();
/* This member shares its name with the info parameter; being accessed
 * through the request object, it does not hide the parameter. */
923 struct ifinfomsg info;
928 .nlmsg_len = NLMSG_LENGTH
930 RTA_LENGTH(sizeof(uint32_t))),
931 .nlmsg_type = RTM_GETLINK,
932 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
935 .ifi_family = AF_UNSPEC,
936 .ifi_index = ifindex,
939 .rta_type = IFLA_EXT_MASK,
940 .rta_len = RTA_LENGTH(sizeof(int32_t)),
/* Extended-information mask value 1, stored through RTE_LE32(). */
942 .extmask = RTE_LE32(1),
946 ret = mlx5_nl_send(nl, &req.nh, seq);
948 ret = mlx5_nl_recv(nl, seq, mlx5_nl_switch_info_cb, info);
/* NOTE(review): no visible guard ties this check to a successful ret;
 * confirm *info holds defined values when the send or receive failed. */
949 if (info->master && info->representor) {
950 DRV_LOG(ERR, "ifindex %u device is recognized as master"
951 " and as representor", ifindex);