*/
#include <sys/queue.h>
+#include <stdalign.h>
#include <stdint.h>
#include <string.h>
struct rte_flow_action_rss rss;/**< RSS context. */
uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */
uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */
+ void *nl_flow; /**< Netlink flow buffer if relevant. */
};
static const struct rte_flow_ops mlx5_flow_ops = {
return size;
}
+/**
+ * Validate flow rule and fill flow structure accordingly.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] flow
+ * Pointer to flow structure.
+ * @param flow_size
+ * Size of allocated space for @p flow.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] pattern
+ * Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ * Associated actions (list terminated by the END action).
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A positive value representing the size of the flow object in bytes
+ * regardless of @p flow_size on success, a negative errno value otherwise
+ * and rte_errno is set.
+ */
+static int
+mlx5_flow_merge_switch(struct rte_eth_dev *dev,
+ struct rte_flow *flow,
+ size_t flow_size,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item pattern[],
+ const struct rte_flow_action actions[],
+ struct rte_flow_error *error)
+{
+ unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
+ uint16_t port_id[!n + n];
+ struct mlx5_nl_flow_ptoi ptoi[!n + n + 1];
+ size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t));
+ unsigned int i;
+ unsigned int own = 0;
+ int ret;
+
+ /* At least one port is needed when no switch domain is present. */
+ if (!n) {
+ n = 1;
+ port_id[0] = dev->data->port_id;
+ } else {
+ n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
+ }
+ for (i = 0; i != n; ++i) {
+ struct rte_eth_dev_info dev_info;
+
+ rte_eth_dev_info_get(port_id[i], &dev_info);
+ if (port_id[i] == dev->data->port_id)
+ own = i;
+ ptoi[i].port_id = port_id[i];
+ ptoi[i].ifindex = dev_info.if_index;
+ }
+ /* Ensure first entry of ptoi[] is the current device. */
+ if (own) {
+ ptoi[n] = ptoi[0];
+ ptoi[0] = ptoi[own];
+ ptoi[own] = ptoi[n];
+ }
+ /* An entry with zero ifindex terminates ptoi[]. */
+ ptoi[n].port_id = 0;
+ ptoi[n].ifindex = 0;
+ if (flow_size < off)
+ flow_size = 0;
+ ret = mlx5_nl_flow_transpose((uint8_t *)flow + off,
+ flow_size ? flow_size - off : 0,
+ ptoi, attr, pattern, actions, error);
+ if (ret < 0)
+ return ret;
+ if (flow_size) {
+ *flow = (struct rte_flow){
+ .attributes = *attr,
+ .nl_flow = (uint8_t *)flow + off,
+ };
+ /*
+ * Generate a reasonably unique handle based on the address
+ * of the target buffer.
+ *
+ * This is straightforward on 32-bit systems where the flow
+ * pointer can be used directly. Otherwise, its least
+ * significant part is taken after shifting it by the
+ * previous power of two of the pointed buffer size.
+ */
+ if (sizeof(flow) <= 4)
+ mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow);
+ else
+ mlx5_nl_flow_brand
+ (flow->nl_flow,
+ (uintptr_t)flow >>
+ rte_log2_u32(rte_align32prevpow2(flow_size)));
+ }
+ return off + ret;
+}
+
/**
* Convert the @p attributes, @p pattern, @p action, into an flow for the NIC
* after ensuring the NIC will understand and process it correctly.
int ret;
uint32_t i;
+ if (attributes->transfer)
+ return mlx5_flow_merge_switch(dev, flow, flow_size,
+ attributes, pattern,
+ actions, error);
if (size > flow_size)
flow = &local_flow;
ret = mlx5_flow_attributes(dev, attributes, flow, error);
static void
mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
{
+ struct priv *priv = dev->data->dev_private;
struct mlx5_flow_verbs *verbs;
+ if (flow->nl_flow && priv->mnl_socket)
+ mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL);
LIST_FOREACH(verbs, &flow->verbs, next) {
if (verbs->flow) {
claim_zero(mlx5_glue->destroy_flow(verbs->flow));
mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
struct rte_flow_error *error)
{
+ struct priv *priv = dev->data->dev_private;
struct mlx5_flow_verbs *verbs;
int err;
goto error;
}
}
+ if (flow->nl_flow &&
+ priv->mnl_socket &&
+ mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error))
+ goto error;
return 0;
error:
err = rte_errno; /* Save rte_errno before cleanup. */
#include <errno.h>
#include <libmnl/libmnl.h>
+#include <linux/if_ether.h>
#include <linux/netlink.h>
+#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <stdalign.h>
#include <stdlib.h>
#include <sys/socket.h>
+#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_flow.h>
#define NETLINK_CAP_ACK 10
#endif
+/* Normally found in linux/pkt_sched.h. */
+#ifndef TC_H_MIN_INGRESS
+#define TC_H_MIN_INGRESS 0xfff2u
+#endif
+
+/* Normally found in linux/pkt_cls.h. */
+#ifndef TCA_CLS_FLAGS_SKIP_SW
+#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
+#endif
+#ifndef HAVE_TCA_FLOWER_ACT
+#define TCA_FLOWER_ACT 3
+#endif
+#ifndef HAVE_TCA_FLOWER_FLAGS
+#define TCA_FLOWER_FLAGS 22
+#endif
+
+/** Parser state definitions for mlx5_nl_flow_trans[]. */
+enum mlx5_nl_flow_trans {
+ INVALID,
+ BACK,
+ ATTR,
+ PATTERN,
+ ITEM_VOID,
+ ACTIONS,
+ ACTION_VOID,
+ END,
+};
+
+#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }
+
+#define PATTERN_COMMON \
+ ITEM_VOID, ACTIONS
+#define ACTIONS_COMMON \
+ ACTION_VOID, END
+
+/** Parser state transitions used by mlx5_nl_flow_transpose(). */
+static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
+ [INVALID] = NULL,
+ [BACK] = NULL,
+ [ATTR] = TRANS(PATTERN),
+ [PATTERN] = TRANS(PATTERN_COMMON),
+ [ITEM_VOID] = TRANS(BACK),
+ [ACTIONS] = TRANS(ACTIONS_COMMON),
+ [ACTION_VOID] = TRANS(BACK),
+ [END] = NULL,
+};
+
+/**
+ * Transpose flow rule description to rtnetlink message.
+ *
+ * This function transposes a flow rule description to a traffic control
+ * (TC) filter creation message ready to be sent over Netlink.
+ *
+ * Target interface is specified as the first entry of the @p ptoi table.
+ * Subsequent entries enable this function to resolve other DPDK port IDs
+ * found in the flow rule.
+ *
+ * @param[out] buf
+ * Output message buffer. May be NULL when @p size is 0.
+ * @param size
+ * Size of @p buf. Message may be truncated if not large enough.
+ * @param[in] ptoi
+ * DPDK port ID to network interface index translation table. This table
+ * is terminated by an entry with a zero ifindex value.
+ * @param[in] attr
+ * Flow rule attributes.
+ * @param[in] pattern
+ * Pattern specification.
+ * @param[in] actions
+ * Associated actions.
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * A positive value representing the exact size of the message in bytes
+ * regardless of the @p size parameter on success, a negative errno value
+ * otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_transpose(void *buf,
+ size_t size,
+ const struct mlx5_nl_flow_ptoi *ptoi,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item *pattern,
+ const struct rte_flow_action *actions,
+ struct rte_flow_error *error)
+{
+ alignas(struct nlmsghdr)
+ uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
+ const struct rte_flow_item *item;
+ const struct rte_flow_action *action;
+ unsigned int n;
+ struct nlattr *na_flower;
+ struct nlattr *na_flower_act;
+ const enum mlx5_nl_flow_trans *trans;
+ const enum mlx5_nl_flow_trans *back;
+
+ if (!size)
+ goto error_nobufs;
+init:
+ item = pattern;
+ action = actions;
+ n = 0;
+ na_flower = NULL;
+ na_flower_act = NULL;
+ trans = TRANS(ATTR);
+ back = trans;
+trans:
+ switch (trans[n++]) {
+ struct nlmsghdr *nlh;
+ struct tcmsg *tcm;
+
+ case INVALID:
+ if (item->type)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+ item, "unsupported pattern item combination");
+ else if (action->type)
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+ action, "unsupported action combination");
+ return rte_flow_error_set
+ (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "flow rule lacks some kind of fate action");
+ case BACK:
+ trans = back;
+ n = 0;
+ goto trans;
+ case ATTR:
+ /*
+ * Supported attributes: no groups, some priorities and
+ * ingress only. Don't care about transfer as it is the
+ * caller's problem.
+ */
+ if (attr->group)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+ attr, "groups are not supported");
+ if (attr->priority > 0xfffe)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+ attr, "lowest priority level is 0xfffe");
+ if (!attr->ingress)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+ attr, "only ingress is supported");
+ if (attr->egress)
+ return rte_flow_error_set
+ (error, ENOTSUP,
+ RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+ attr, "egress is not supported");
+ if (size < mnl_nlmsg_size(sizeof(*tcm)))
+ goto error_nobufs;
+ nlh = mnl_nlmsg_put_header(buf);
+ nlh->nlmsg_type = 0;
+ nlh->nlmsg_flags = 0;
+ nlh->nlmsg_seq = 0;
+ tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm_ifindex = ptoi[0].ifindex;
+ /*
+ * Let kernel pick a handle by default. A predictable handle
+ * can be set by the caller on the resulting buffer through
+ * mlx5_nl_flow_brand().
+ */
+ tcm->tcm_handle = 0;
+ tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
+ /*
+ * Priority cannot be zero to prevent the kernel from
+ * picking one automatically.
+ */
+ tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
+ RTE_BE16(ETH_P_ALL));
+ break;
+ case PATTERN:
+ if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
+ goto error_nobufs;
+ na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
+ if (!na_flower)
+ goto error_nobufs;
+ if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
+ TCA_CLS_FLAGS_SKIP_SW))
+ goto error_nobufs;
+ break;
+ case ITEM_VOID:
+ if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
+ goto trans;
+ ++item;
+ break;
+ case ACTIONS:
+ if (item->type != RTE_FLOW_ITEM_TYPE_END)
+ goto trans;
+ assert(na_flower);
+ assert(!na_flower_act);
+ na_flower_act =
+ mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
+ if (!na_flower_act)
+ goto error_nobufs;
+ break;
+ case ACTION_VOID:
+ if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
+ goto trans;
+ ++action;
+ break;
+ case END:
+ if (item->type != RTE_FLOW_ITEM_TYPE_END ||
+ action->type != RTE_FLOW_ACTION_TYPE_END)
+ goto trans;
+ if (na_flower_act)
+ mnl_attr_nest_end(buf, na_flower_act);
+ if (na_flower)
+ mnl_attr_nest_end(buf, na_flower);
+ nlh = buf;
+ return nlh->nlmsg_len;
+ }
+ back = trans;
+ trans = mlx5_nl_flow_trans[trans[n - 1]];
+ n = 0;
+ goto trans;
+error_nobufs:
+ if (buf != buf_tmp) {
+ buf = buf_tmp;
+ size = sizeof(buf_tmp);
+ goto init;
+ }
+ return rte_flow_error_set
+ (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "generated TC message is too large");
+}
+
+/**
+ * Brand rtnetlink buffer with unique handle.
+ *
+ * This handle should be unique for a given network interface to avoid
+ * collisions.
+ *
+ * @param buf
+ * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param handle
+ * Unique 32-bit handle to use.
+ */
+void
+mlx5_nl_flow_brand(void *buf, uint32_t handle)
+{
+ struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
+
+ tcm->tcm_handle = handle;
+}
+
/**
* Send Netlink message with acknowledgment.
*
return -rte_errno;
}
+/**
+ * Create a Netlink flow rule.
+ *
+ * @param nl
+ * Libmnl socket to use.
+ * @param buf
+ * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
+ struct rte_flow_error *error)
+{
+ struct nlmsghdr *nlh = buf;
+
+ nlh->nlmsg_type = RTM_NEWTFILTER;
+ nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+ if (!mlx5_nl_flow_nl_ack(nl, nlh))
+ return 0;
+ return rte_flow_error_set
+ (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "netlink: failed to create TC flow rule");
+}
+
+/**
+ * Destroy a Netlink flow rule.
+ *
+ * @param nl
+ * Libmnl socket to use.
+ * @param buf
+ * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param[out] error
+ * Perform verbose error reporting if not NULL.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
+ struct rte_flow_error *error)
+{
+ struct nlmsghdr *nlh = buf;
+
+ nlh->nlmsg_type = RTM_DELTFILTER;
+ nlh->nlmsg_flags = NLM_F_REQUEST;
+ if (!mlx5_nl_flow_nl_ack(nl, nlh))
+ return 0;
+ return rte_flow_error_set
+ (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "netlink: failed to destroy TC flow rule");
+}
+
/**
* Initialize ingress qdisc of a given network interface.
*