From 8f9059ccee455e891e583cbfa6bce9891e7489ad Mon Sep 17 00:00:00 2001 From: Adrien Mazarguil Date: Fri, 13 Jul 2018 11:40:39 +0200 Subject: [PATCH] net/mlx5: add framework for switch flow rules Because mlx5 switch flow rules are configured through Netlink (TC interface) and have little in common with Verbs, this patch adds a separate parser function to handle them. - mlx5_nl_flow_transpose() converts a rte_flow rule to its TC equivalent and stores the result in a buffer. - mlx5_nl_flow_brand() gives a unique handle to a flow rule buffer. - mlx5_nl_flow_create() instantiates a flow rule on the device based on such a buffer. - mlx5_nl_flow_destroy() performs the reverse operation. These functions are called by the existing implementation when encountering flow rules which must be offloaded to the switch (currently relying on the transfer attribute). Signed-off-by: Adrien Mazarguil Signed-off-by: Nelio Laranjeiro Acked-by: Yongseok Koh --- drivers/net/mlx5/Makefile | 10 + drivers/net/mlx5/mlx5.h | 18 ++ drivers/net/mlx5/mlx5_flow.c | 111 ++++++++++++ drivers/net/mlx5/mlx5_nl_flow.c | 311 ++++++++++++++++++++++++++++++++ 4 files changed, 450 insertions(+) diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index 0fb890ccb7..6aed41b486 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -199,6 +199,16 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh linux/if_link.h \ enum IFLA_PHYS_PORT_NAME \ $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_ACT \ + linux/pkt_cls.h \ + enum TCA_FLOWER_ACT \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TCA_FLOWER_FLAGS \ + linux/pkt_cls.h \ + enum TCA_FLOWER_FLAGS \ + $(AUTOCONF_OUTPUT) # Create mlx5_autoconf.h or update it in case it differs from the new one. diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 4786af82d0..db8a5fa018 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -156,6 +156,12 @@ struct mlx5_drop { struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */ }; +/** DPDK port to network interface index (ifindex) conversion. */ +struct mlx5_nl_flow_ptoi { + uint16_t port_id; /**< DPDK port ID. */ + unsigned int ifindex; /**< Network interface index. */ +}; + struct mnl_socket; struct priv { @@ -390,6 +396,18 @@ int mlx5_nl_switch_info(int nl, unsigned int ifindex, /* mlx5_nl_flow.c */ +int mlx5_nl_flow_transpose(void *buf, + size_t size, + const struct mlx5_nl_flow_ptoi *ptoi, + const struct rte_flow_attr *attr, + const struct rte_flow_item *pattern, + const struct rte_flow_action *actions, + struct rte_flow_error *error); +void mlx5_nl_flow_brand(void *buf, uint32_t handle); +int mlx5_nl_flow_create(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error); +int mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error); int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, struct rte_flow_error *error); struct mnl_socket *mlx5_nl_flow_socket_create(void); diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index 6fa4e30aef..32854198b9 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -280,6 +281,7 @@ struct rte_flow { struct rte_flow_action_rss rss;/**< RSS context. */ uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */ uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */ + void *nl_flow; /**< Netlink flow buffer if relevant. */ }; static const struct rte_flow_ops mlx5_flow_ops = { @@ -2365,6 +2367,103 @@ mlx5_flow_actions(struct rte_eth_dev *dev, return size; } +/** + * Validate flow rule and fill flow structure accordingly. + * + * @param dev + * Pointer to Ethernet device. + * @param[out] flow + * Pointer to flow structure. + * @param flow_size + * Size of allocated space for @p flow. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the size of the flow object in bytes + * regardless of @p flow_size on success, a negative errno value otherwise + * and rte_errno is set. + */ +static int +mlx5_flow_merge_switch(struct rte_eth_dev *dev, + struct rte_flow *flow, + size_t flow_size, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct rte_flow_error *error) +{ + unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0); + uint16_t port_id[!n + n]; + struct mlx5_nl_flow_ptoi ptoi[!n + n + 1]; + size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t)); + unsigned int i; + unsigned int own = 0; + int ret; + + /* At least one port is needed when no switch domain is present. */ + if (!n) { + n = 1; + port_id[0] = dev->data->port_id; + } else { + n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n); + } + for (i = 0; i != n; ++i) { + struct rte_eth_dev_info dev_info; + + rte_eth_dev_info_get(port_id[i], &dev_info); + if (port_id[i] == dev->data->port_id) + own = i; + ptoi[i].port_id = port_id[i]; + ptoi[i].ifindex = dev_info.if_index; + } + /* Ensure first entry of ptoi[] is the current device. */ + if (own) { + ptoi[n] = ptoi[0]; + ptoi[0] = ptoi[own]; + ptoi[own] = ptoi[n]; + } + /* An entry with zero ifindex terminates ptoi[]. */ + ptoi[n].port_id = 0; + ptoi[n].ifindex = 0; + if (flow_size < off) + flow_size = 0; + ret = mlx5_nl_flow_transpose((uint8_t *)flow + off, + flow_size ? flow_size - off : 0, + ptoi, attr, pattern, actions, error); + if (ret < 0) + return ret; + if (flow_size) { + *flow = (struct rte_flow){ + .attributes = *attr, + .nl_flow = (uint8_t *)flow + off, + }; + /* + * Generate a reasonably unique handle based on the address + * of the target buffer. + * + * This is straightforward on 32-bit systems where the flow + * pointer can be used directly. Otherwise, its least + * significant part is taken after shifting it by the + * previous power of two of the pointed buffer size. + */ + if (sizeof(flow) <= 4) + mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow); + else + mlx5_nl_flow_brand + (flow->nl_flow, + (uintptr_t)flow >> + rte_log2_u32(rte_align32prevpow2(flow_size))); + } + return off + ret; +} + /** * Convert the @p attributes, @p pattern, @p action, into an flow for the NIC * after ensuring the NIC will understand and process it correctly. @@ -2419,6 +2518,10 @@ mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow, int ret; uint32_t i; + if (attributes->transfer) + return mlx5_flow_merge_switch(dev, flow, flow_size, + attributes, pattern, + actions, error); if (size > flow_size) flow = &local_flow; ret = mlx5_flow_attributes(dev, attributes, flow, error); @@ -2709,8 +2812,11 @@ mlx5_flow_validate(struct rte_eth_dev *dev, static void mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow) { + struct priv *priv = dev->data->dev_private; struct mlx5_flow_verbs *verbs; + if (flow->nl_flow && priv->mnl_socket) + mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL); LIST_FOREACH(verbs, &flow->verbs, next) { if (verbs->flow) { claim_zero(mlx5_glue->destroy_flow(verbs->flow)); @@ -2747,6 +2853,7 @@ static int mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow, struct rte_flow_error *error) { + struct priv *priv = dev->data->dev_private; struct mlx5_flow_verbs *verbs; int err; @@ -2795,6 +2902,10 @@ mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow, goto error; } } + if (flow->nl_flow && + priv->mnl_socket && + mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error)) + goto error; return 0; error: err = rte_errno; /* Save rte_errno before cleanup. */ diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c index 60a4493e53..a9a5bac499 100644 --- a/drivers/net/mlx5/mlx5_nl_flow.c +++ b/drivers/net/mlx5/mlx5_nl_flow.c @@ -5,7 +5,9 @@ #include #include +#include #include +#include #include #include #include @@ -14,6 +16,7 @@ #include #include +#include #include #include @@ -24,6 +27,258 @@ #define NETLINK_CAP_ACK 10 #endif +/* Normally found in linux/pkt_sched.h. */ +#ifndef TC_H_MIN_INGRESS +#define TC_H_MIN_INGRESS 0xfff2u +#endif + +/* Normally found in linux/pkt_cls.h. */ +#ifndef TCA_CLS_FLAGS_SKIP_SW +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) +#endif +#ifndef HAVE_TCA_FLOWER_ACT +#define TCA_FLOWER_ACT 3 +#endif +#ifndef HAVE_TCA_FLOWER_FLAGS +#define TCA_FLOWER_FLAGS 22 +#endif + +/** Parser state definitions for mlx5_nl_flow_trans[]. */ +enum mlx5_nl_flow_trans { + INVALID, + BACK, + ATTR, + PATTERN, + ITEM_VOID, + ACTIONS, + ACTION_VOID, + END, +}; + +#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, } + +#define PATTERN_COMMON \ + ITEM_VOID, ACTIONS +#define ACTIONS_COMMON \ + ACTION_VOID, END + +/** Parser state transitions used by mlx5_nl_flow_transpose(). */ +static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = { + [INVALID] = NULL, + [BACK] = NULL, + [ATTR] = TRANS(PATTERN), + [PATTERN] = TRANS(PATTERN_COMMON), + [ITEM_VOID] = TRANS(BACK), + [ACTIONS] = TRANS(ACTIONS_COMMON), + [ACTION_VOID] = TRANS(BACK), + [END] = NULL, +}; + +/** + * Transpose flow rule description to rtnetlink message. + * + * This function transposes a flow rule description to a traffic control + * (TC) filter creation message ready to be sent over Netlink. + * + * Target interface is specified as the first entry of the @p ptoi table. + * Subsequent entries enable this function to resolve other DPDK port IDs + * found in the flow rule. + * + * @param[out] buf + * Output message buffer. May be NULL when @p size is 0. + * @param size + * Size of @p buf. Message may be truncated if not large enough. + * @param[in] ptoi + * DPDK port ID to network interface index translation table. This table + * is terminated by an entry with a zero ifindex value. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification. + * @param[in] actions + * Associated actions. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * A positive value representing the exact size of the message in bytes + * regardless of the @p size parameter on success, a negative errno value + * otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_transpose(void *buf, + size_t size, + const struct mlx5_nl_flow_ptoi *ptoi, + const struct rte_flow_attr *attr, + const struct rte_flow_item *pattern, + const struct rte_flow_action *actions, + struct rte_flow_error *error) +{ + alignas(struct nlmsghdr) + uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)]; + const struct rte_flow_item *item; + const struct rte_flow_action *action; + unsigned int n; + struct nlattr *na_flower; + struct nlattr *na_flower_act; + const enum mlx5_nl_flow_trans *trans; + const enum mlx5_nl_flow_trans *back; + + if (!size) + goto error_nobufs; +init: + item = pattern; + action = actions; + n = 0; + na_flower = NULL; + na_flower_act = NULL; + trans = TRANS(ATTR); + back = trans; +trans: + switch (trans[n++]) { + struct nlmsghdr *nlh; + struct tcmsg *tcm; + + case INVALID: + if (item->type) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + item, "unsupported pattern item combination"); + else if (action->type) + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + action, "unsupported action combination"); + return rte_flow_error_set + (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "flow rule lacks some kind of fate action"); + case BACK: + trans = back; + n = 0; + goto trans; + case ATTR: + /* + * Supported attributes: no groups, some priorities and + * ingress only. Don't care about transfer as it is the + * caller's problem. + */ + if (attr->group) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + attr, "groups are not supported"); + if (attr->priority > 0xfffe) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + attr, "lowest priority level is 0xfffe"); + if (!attr->ingress) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "only ingress is supported"); + if (attr->egress) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, + attr, "egress is not supported"); + if (size < mnl_nlmsg_size(sizeof(*tcm))) + goto error_nobufs; + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = 0; + nlh->nlmsg_flags = 0; + nlh->nlmsg_seq = 0; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = ptoi[0].ifindex; + /* + * Let kernel pick a handle by default. A predictable handle + * can be set by the caller on the resulting buffer through + * mlx5_nl_flow_brand(). + */ + tcm->tcm_handle = 0; + tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS); + /* + * Priority cannot be zero to prevent the kernel from + * picking one automatically. + */ + tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, + RTE_BE16(ETH_P_ALL)); + break; + case PATTERN: + if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower")) + goto error_nobufs; + na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS); + if (!na_flower) + goto error_nobufs; + if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS, + TCA_CLS_FLAGS_SKIP_SW)) + goto error_nobufs; + break; + case ITEM_VOID: + if (item->type != RTE_FLOW_ITEM_TYPE_VOID) + goto trans; + ++item; + break; + case ACTIONS: + if (item->type != RTE_FLOW_ITEM_TYPE_END) + goto trans; + assert(na_flower); + assert(!na_flower_act); + na_flower_act = + mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT); + if (!na_flower_act) + goto error_nobufs; + break; + case ACTION_VOID: + if (action->type != RTE_FLOW_ACTION_TYPE_VOID) + goto trans; + ++action; + break; + case END: + if (item->type != RTE_FLOW_ITEM_TYPE_END || + action->type != RTE_FLOW_ACTION_TYPE_END) + goto trans; + if (na_flower_act) + mnl_attr_nest_end(buf, na_flower_act); + if (na_flower) + mnl_attr_nest_end(buf, na_flower); + nlh = buf; + return nlh->nlmsg_len; + } + back = trans; + trans = mlx5_nl_flow_trans[trans[n - 1]]; + n = 0; + goto trans; +error_nobufs: + if (buf != buf_tmp) { + buf = buf_tmp; + size = sizeof(buf_tmp); + goto init; + } + return rte_flow_error_set + (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "generated TC message is too large"); +} + +/** + * Brand rtnetlink buffer with unique handle. + * + * This handle should be unique for a given network interface to avoid + * collisions. + * + * @param buf + * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). + * @param handle + * Unique 32-bit handle to use. + */ +void +mlx5_nl_flow_brand(void *buf, uint32_t handle) +{ + struct tcmsg *tcm = mnl_nlmsg_get_payload(buf); + + tcm->tcm_handle = handle; +} + /** * Send Netlink message with acknowledgment. * @@ -59,6 +314,62 @@ mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh) return -rte_errno; } +/** + * Create a Netlink flow rule. + * + * @param nl + * Libmnl socket to use. + * @param buf + * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_create(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh = buf; + + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + if (!mlx5_nl_flow_nl_ack(nl, nlh)) + return 0; + return rte_flow_error_set + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create TC flow rule"); +} + +/** + * Destroy a Netlink flow rule. + * + * @param nl + * Libmnl socket to use. + * @param buf + * Flow rule buffer previously initialized by mlx5_nl_flow_transpose(). + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +int +mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf, + struct rte_flow_error *error) +{ + struct nlmsghdr *nlh = buf; + + nlh->nlmsg_type = RTM_DELTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + if (!mlx5_nl_flow_nl_ack(nl, nlh)) + return 0; + return rte_flow_error_set + (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to destroy TC flow rule"); +} + /** * Initialize ingress qdisc of a given network interface. * -- 2.20.1