net/mlx5: lay groundwork for switch offloads
authorAdrien Mazarguil <adrien.mazarguil@6wind.com>
Fri, 13 Jul 2018 09:40:37 +0000 (11:40 +0200)
committerThomas Monjalon <thomas@monjalon.net>
Thu, 26 Jul 2018 12:05:52 +0000 (14:05 +0200)
With mlx5, unlike normal flow rules implemented through Verbs for traffic
emitted and received by the application, those targeting different logical
ports of the device (VF representors for instance) are offloaded at the
switch level and must be configured through Netlink (TC interface).

This patch adds preliminary support to manage such flow rules through the
flow API (rte_flow).

Instead of rewriting tons of Netlink helpers and as previously suggested by
Stephen [1], this patch introduces a new dependency on libmnl [2]
(LGPL-2.1) when compiling mlx5.

[1] https://mails.dpdk.org/archives/dev/2018-March/092676.html
[2] https://netfilter.org/projects/libmnl/

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Yongseok Koh <yskoh@mellanox.com>
doc/guides/nics/mlx5.rst
drivers/net/mlx5/Makefile
drivers/net/mlx5/mlx5.c
drivers/net/mlx5/mlx5.h
drivers/net/mlx5/mlx5_nl_flow.c [new file with mode: 0644]
mk/rte.app.mk

index ebf2336..4d692fa 100644 (file)
@@ -448,6 +448,12 @@ DPDK and must be installed separately:
   This library basically implements send/receive calls to the hardware
   queues.
 
+- **libmnl**
+
+  Minimalistic Netlink library mainly relied on to manage E-Switch flow
+  rules (i.e. those with the "transfer" attribute and typically involving
+  port representors).
+
 - **Kernel modules**
 
   They provide the kernel-side Verbs API and low level device drivers that
@@ -526,6 +532,19 @@ required from that distribution.
    this DPDK release was developed and tested against is strongly
    recommended. Please check the `prerequisites`_.
 
+Libmnl
+^^^^^^
+
+Minimal version for libmnl is **1.0.3**.
+
+As a dependency of the **iproute2** suite, this library is often installed
+by default. It is otherwise readily available through standard system
+packages.
+
+Its development headers must be installed in order to compile this PMD.
+These packages are usually named **libmnl-dev** or **libmnl-devel**
+depending on the Linux distribution.
+
 Supported NICs
 --------------
 
index d86c6bb..0fb890c 100644 (file)
@@ -33,6 +33,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
 INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE)
@@ -56,6 +57,7 @@ LDLIBS += -ldl
 else
 LDLIBS += -libverbs -lmlx5
 endif
+LDLIBS += -lmnl
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
 LDLIBS += -lrte_bus_pci
index c62a52f..78a6922 100644 (file)
@@ -282,6 +282,8 @@ mlx5_dev_close(struct rte_eth_dev *dev)
                close(priv->nl_socket_route);
        if (priv->nl_socket_rdma >= 0)
                close(priv->nl_socket_rdma);
+       if (priv->mnl_socket)
+               mlx5_nl_flow_socket_destroy(priv->mnl_socket);
        ret = mlx5_hrxq_ibv_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -1122,6 +1124,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
        claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
        if (vf && config.vf_nl_en)
                mlx5_nl_mac_addr_sync(eth_dev);
+       priv->mnl_socket = mlx5_nl_flow_socket_create();
+       if (!priv->mnl_socket) {
+               err = -rte_errno;
+               DRV_LOG(WARNING,
+                       "flow rules relying on switch offloads will not be"
+                       " supported: cannot open libmnl socket: %s",
+                       strerror(rte_errno));
+       } else {
+               struct rte_flow_error error;
+               unsigned int ifindex = mlx5_ifindex(eth_dev);
+
+               if (!ifindex) {
+                       err = -rte_errno;
+                       error.message =
+                               "cannot retrieve network interface index";
+               } else {
+                       err = mlx5_nl_flow_init(priv->mnl_socket, ifindex,
+                                               &error);
+               }
+               if (err) {
+                       DRV_LOG(WARNING,
+                               "flow rules relying on switch offloads will"
+                               " not be supported: %s: %s",
+                               error.message, strerror(rte_errno));
+                       mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+                       priv->mnl_socket = NULL;
+               }
+       }
        TAILQ_INIT(&priv->flows);
        TAILQ_INIT(&priv->ctrl_flows);
        /* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1174,6 +1204,8 @@ error:
                        close(priv->nl_socket_route);
                if (priv->nl_socket_rdma >= 0)
                        close(priv->nl_socket_rdma);
+               if (priv->mnl_socket)
+                       mlx5_nl_flow_socket_destroy(priv->mnl_socket);
                if (own_domain_id)
                        claim_zero(rte_eth_switch_domain_free(priv->domain_id));
                rte_free(priv);
index 896158a..4786af8 100644 (file)
@@ -156,6 +156,8 @@ struct mlx5_drop {
        struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
 };
 
+struct mnl_socket;
+
 struct priv {
        LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
        struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
@@ -220,6 +222,7 @@ struct priv {
        rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
        /* UAR same-page access control required in 32bit implementations. */
 #endif
+       struct mnl_socket *mnl_socket; /* Libmnl socket. */
 };
 
 #define PORT_ID(priv) ((priv)->dev_data->port_id)
@@ -385,4 +388,11 @@ unsigned int mlx5_nl_ifindex(int nl, const char *name);
 int mlx5_nl_switch_info(int nl, unsigned int ifindex,
                        struct mlx5_switch_info *info);
 
+/* mlx5_nl_flow.c */
+
+int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+                     struct rte_flow_error *error);
+struct mnl_socket *mlx5_nl_flow_socket_create(void);
+void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
new file mode 100644 (file)
index 0000000..60a4493
--- /dev/null
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <errno.h>
+#include <libmnl/libmnl.h>
+#include <linux/netlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <stdalign.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+
+#include <rte_errno.h>
+#include <rte_flow.h>
+
+#include "mlx5.h"
+
+/* Normally found in linux/netlink.h. */
+#ifndef NETLINK_CAP_ACK
+#define NETLINK_CAP_ACK 10
+#endif
+
+/**
+ * Send Netlink message with acknowledgment.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param nlh
+ *   Message to send. This function always raises the NLM_F_ACK flag before
+ *   sending.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
+{
+       alignas(struct nlmsghdr)
+       uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
+                   nlh->nlmsg_len - sizeof(*nlh)];
+       uint32_t seq = random();
+       int ret;
+
+       nlh->nlmsg_flags |= NLM_F_ACK;
+       nlh->nlmsg_seq = seq;
+       ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
+       if (ret != -1)
+               ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
+       if (ret != -1)
+               ret = mnl_cb_run
+                       (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
+       if (!ret)
+               return 0;
+       rte_errno = errno;
+       return -rte_errno;
+}
+
+/**
+ * Initialize ingress qdisc of a given network interface.
+ *
+ * @param nl
+ *   Libmnl socket of the @p NETLINK_ROUTE kind.
+ * @param ifindex
+ *   Index of network interface to initialize.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
+                 struct rte_flow_error *error)
+{
+       struct nlmsghdr *nlh;
+       struct tcmsg *tcm;
+       alignas(struct nlmsghdr)
+       uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
+
+       /* Destroy existing ingress qdisc and everything attached to it. */
+       nlh = mnl_nlmsg_put_header(buf);
+       nlh->nlmsg_type = RTM_DELQDISC;
+       nlh->nlmsg_flags = NLM_F_REQUEST;
+       tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+       tcm->tcm_family = AF_UNSPEC;
+       tcm->tcm_ifindex = ifindex;
+       tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+       tcm->tcm_parent = TC_H_INGRESS;
+       /* Ignore errors when qdisc is already absent. */
+       if (mlx5_nl_flow_nl_ack(nl, nlh) &&
+           rte_errno != EINVAL && rte_errno != ENOENT)
+               return rte_flow_error_set
+                       (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+                        NULL, "netlink: failed to remove ingress qdisc");
+       /* Create fresh ingress qdisc. */
+       nlh = mnl_nlmsg_put_header(buf);
+       nlh->nlmsg_type = RTM_NEWQDISC;
+       nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+       tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+       tcm->tcm_family = AF_UNSPEC;
+       tcm->tcm_ifindex = ifindex;
+       tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+       tcm->tcm_parent = TC_H_INGRESS;
+       mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
+       if (mlx5_nl_flow_nl_ack(nl, nlh))
+               return rte_flow_error_set
+                       (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+                        NULL, "netlink: failed to create ingress qdisc");
+       return 0;
+}
+
+/**
+ * Create and configure a libmnl socket for Netlink flow rules.
+ *
+ * @return
+ *   A valid libmnl socket object pointer on success, NULL otherwise and
+ *   rte_errno is set.
+ */
+struct mnl_socket *
+mlx5_nl_flow_socket_create(void)
+{
+       struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
+
+       if (nl) {
+               mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
+                                     sizeof(int));
+               if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
+                       return nl;
+       }
+       rte_errno = errno;
+       if (nl)
+               mnl_socket_close(nl);
+       return NULL;
+}
+
+/**
+ * Destroy a libmnl socket.
+ */
+void
+mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
+{
+       mnl_socket_close(nl);
+}
index ea448ef..de33883 100644 (file)
@@ -149,9 +149,9 @@ else
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD)       += -lrte_pmd_mlx4 -libverbs -lmlx4
 endif
 ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y)
-_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -ldl
+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -ldl -lmnl
 else
-_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5
+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD)       += -lrte_pmd_mlx5 -libverbs -lmlx5 -lmnl
 endif
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MVPP2_PMD)      += -lrte_pmd_mvpp2 -L$(LIBMUSDK_PATH)/lib -lmusdk
 _LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD)        += -lrte_pmd_nfp