From: Ophir Munk Date: Sat, 20 Jan 2018 21:11:36 +0000 (+0000) Subject: net/tap: implement RSS using eBPF X-Git-Tag: spdx-start~31 X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=036d721a8229425687d9841f58e62bc7ebd06616;p=dpdk.git net/tap: implement RSS using eBPF TAP PMD is required to support RSS queue mapping based on rte_flow API. An example usage for this requirement is failsafe transparent switching from a PCI device to TAP device while keep redirecting packets to the same RSS queues on both devices. TAP RSS implementation is based on eBPF programs sent to Linux kernel through BPF system calls and using netlink messages to reference the programs as part of traffic control commands. TC uses eBPF programs as classifiers and actions. eBPF classification: packets marked with an RSS queue will be directed to this queue using TC with "skbedit" action. BPF classifiers are downloaded to the kernel once on TAP creation for each TAP Rx queue. eBPF action: calculate the Toeplitz RSS hash based on L3 addresses and L4 ports. Mark the packet with the RSS queue according the resulting RSS hash, then reclassify the packet. BPF actions are downloaded to the kernel for each new RSS rule. TAP eBPF requires Linux version 4.9 configured with BPF. TAP PMD will successfully compile on systems with old or non-BPF configured kernels but RSS rules creation on TAP devices will not be successful Signed-off-by: Ophir Munk Acked-by: Pascal Mazon --- diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile index fad8a94ceb..e23c3a2064 100644 --- a/drivers/net/tap/Makefile +++ b/drivers/net/tap/Makefile @@ -62,6 +62,26 @@ tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh linux/pkt_cls.h \ enum TCA_FLOWER_KEY_VLAN_PRIO \ $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_BPF \ + linux/pkt_cls.h \ + enum TCA_BPF_UNSPEC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_BPF_FD \ + linux/pkt_cls.h \ + enum TCA_BPF_FD \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_ACT_BPF \ + linux/tc_act/tc_bpf.h \ + enum TCA_ACT_BPF_UNSPEC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_ACT_BPF_FD \ + linux/tc_act/tc_bpf.h \ + enum TCA_ACT_BPF_FD \ + $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ HAVE_BPF_PROG_LOAD \ linux/bpf.h \ diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h index e42323f2a6..73c201b938 100644 --- a/drivers/net/tap/rte_eth_tap.h +++ b/drivers/net/tap/rte_eth_tap.h @@ -93,6 +93,11 @@ struct pmd_internals { int flow_isolate; /* 1 if flow isolation is enabled */ int flower_support; /* 1 if kernel supports, else 0 */ int flower_vlan_support; /* 1 if kernel supports, else 0 */ + int rss_enabled; /* 1 if RSS is enabled, else 0 */ + /* implicit rules set when RSS is enabled */ + int map_fd; /* BPF RSS map fd */ + int bpf_fd[RTE_PMD_TAP_MAX_QUEUES];/* List of bpf fds per queue */ + LIST_HEAD(tap_rss_flows, rte_flow) rss_flows; LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */ /* implicit rte_flow rules set when a remote device is active */ LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows; diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c index d2a69a7ab0..6aa53a7140 100644 --- a/drivers/net/tap/tap_flow.c +++ b/drivers/net/tap/tap_flow.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #ifndef HAVE_TC_FLOWER /* @@ -82,12 +84,79 @@ enum { TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ }; #endif +/* + * For kernels < 4.2 BPF related enums may not be defined. + * Runtime checks will be carried out to gracefully report on TC messages that + * are rejected by the kernel. Rejection reasons may be due to: + * 1. enum is not defined + * 2. enum is defined but kernel is not configured to support BPF system calls, + * BPF classifications or BPF actions. + */ +#ifndef HAVE_TC_BPF +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, +}; +#endif +#ifndef HAVE_TC_BPF_FD +enum { + TCA_BPF_FD = TCA_BPF_OPS + 1, + TCA_BPF_NAME, +}; +#endif +#ifndef HAVE_TC_ACT_BPF +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +struct tc_act_bpf { + tc_gen; +}; + +enum { + TCA_ACT_BPF_UNSPEC, + TCA_ACT_BPF_TM, + TCA_ACT_BPF_PARMS, + TCA_ACT_BPF_OPS_LEN, + TCA_ACT_BPF_OPS, +}; + +#endif +#ifndef HAVE_TC_ACT_BPF_FD +enum { + TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1, + TCA_ACT_BPF_NAME, +}; +#endif + +/* RSS key management */ +enum bpf_rss_key_e { + KEY_CMD_GET = 1, + KEY_CMD_RELEASE, + KEY_CMD_INIT, + KEY_CMD_DEINIT, +}; + +enum key_status_e { + KEY_STAT_UNSPEC, + KEY_STAT_USED, + KEY_STAT_AVAILABLE, +}; #define ISOLATE_HANDLE 1 struct rte_flow { LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */ struct rte_flow *remote_flow; /* associated remote flow */ + int bpf_fd[SEC_MAX]; /* list of bfs fds per ELF section */ + uint32_t key_idx; /* RSS rule key index into BPF map */ struct nlmsg msg; }; @@ -115,6 +184,11 @@ struct action_data { struct tc_skbedit skbedit; uint16_t queue; } skbedit; + struct bpf { + struct tc_act_bpf bpf; + int bpf_fd; + const char *annotation; + } bpf; }; }; @@ -148,6 +222,14 @@ tap_flow_isolate(struct rte_eth_dev *dev, int set, struct rte_flow_error *error); +static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx); +static int rss_enable(struct pmd_internals *pmd, + const struct rte_flow_attr *attr, + struct rte_flow_error *error); +static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd, + const struct rte_flow_action_rss *rss, + struct rte_flow_error *error); + static const struct rte_flow_ops tap_flow_ops = { .validate = tap_flow_validate, .create = tap_flow_create, @@ -877,6 +959,14 @@ add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata) &adata->skbedit.skbedit); tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, adata->skbedit.queue); + } else if (strcmp("bpf", adata->id) == 0) { + tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd); + tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME, + strlen(adata->bpf.annotation) + 1, + adata->bpf.annotation); + tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS, + sizeof(adata->bpf.bpf), + &adata->bpf.bpf); } else { return -1; } @@ -979,7 +1069,8 @@ priv_flow_process(struct pmd_internals *pmd, return -rte_errno; } else if (flow) { uint16_t group = attr->group << GROUP_SHIFT; - uint16_t prio = group | (attr->priority + PRIORITY_OFFSET); + uint16_t prio = group | (attr->priority + + RSS_PRIORITY_OFFSET + PRIORITY_OFFSET); flow->msg.t.tcm_info = TC_H_MAKE(prio << 16, flow->msg.t.tcm_info); } @@ -1134,7 +1225,6 @@ priv_flow_process(struct pmd_internals *pmd, TCA_FLOWER_ACT); } } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { - /* Fake RSS support. */ const struct rte_flow_action_rss *rss = (const struct rte_flow_action_rss *) actions->conf; @@ -1142,23 +1232,13 @@ priv_flow_process(struct pmd_internals *pmd, if (action++) goto exit_action_not_supported; - if (!rss || rss->num < 1 || - (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1)) - goto exit_action_not_supported; - if (flow) { - struct action_data adata = { - .id = "skbedit", - .skbedit = { - .skbedit = { - .action = TC_ACT_PIPE, - }, - .queue = rss->queue[0], - }, - }; - - err = add_actions(flow, 1, &adata, - TCA_FLOWER_ACT); + if (!pmd->rss_enabled) { + err = rss_enable(pmd, attr, error); + if (err) + goto exit_action_not_supported; } + if (flow && rss) + err = rss_add_actions(flow, pmd, rss, error); } else { goto exit_action_not_supported; } @@ -1370,6 +1450,7 @@ tap_flow_destroy_pmd(struct pmd_internals *pmd, struct rte_flow_error *error) { struct rte_flow *remote_flow = flow->remote_flow; + int i; int ret = 0; LIST_REMOVE(flow, next); @@ -1395,6 +1476,23 @@ tap_flow_destroy_pmd(struct pmd_internals *pmd, "couldn't receive kernel ack to our request"); goto end; } + /* Close opened BPF file descriptors of this flow */ + for (i = 0; i < SEC_MAX; i++) + if (flow->bpf_fd[i] != 0) { + close(flow->bpf_fd[i]); + flow->bpf_fd[i] = 0; + } + + /* Release map key for this RSS rule */ + ret = bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx); + if (ret < 0) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "Failed to release BPF RSS key"); + + goto end; + } + if (remote_flow) { remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER; @@ -1679,6 +1777,322 @@ tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error) return 0; } +#define MAX_RSS_KEYS 256 +#define SEC_NAME_CLS_Q "cls_q" + +const char *sec_name[SEC_MAX] = { + [SEC_L3_L4] = "l3_l4", +}; + +/** + * Enable RSS on tap: create TC rules for queuing. + * + * @param[in, out] pmd + * Pointer to private structure. + * + * @param[in] attr + * Pointer to rte_flow to get flow group + * + * @param[out] error + * Pointer to error reporting if not NULL. + * + * @return 0 on success, negative value on failure. + */ +static int rss_enable(struct pmd_internals *pmd, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) +{ + struct rte_flow *rss_flow = NULL; + struct nlmsg *msg = NULL; + /* 4096 is the maximum number of instructions for a BPF program */ + char annotation[64]; + int i; + int err = 0; + + /* unlimit locked memory */ + struct rlimit memlock_limit = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + setrlimit(RLIMIT_MEMLOCK, &memlock_limit); + + /* Get a new map key for a new RSS rule */ + err = bpf_rss_key(KEY_CMD_INIT, NULL); + if (err < 0) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "Failed to initialize BPF RSS keys"); + + return -1; + } + + /* + * Create BPF RSS MAP + */ + pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */ + sizeof(struct rss_key), + MAX_RSS_KEYS); + if (pmd->map_fd < 0) { + RTE_LOG(ERR, PMD, + "Failed to create BPF map (%d): %s\n", + errno, strerror(errno)); + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "Kernel too old or not configured " + "to support BPF maps"); + + return -ENOTSUP; + } + + /* + * Add a rule per queue to match reclassified packets and direct them to + * the correct queue. + */ + for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) { + pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i); + if (pmd->bpf_fd[i] < 0) { + RTE_LOG(ERR, PMD, + "Failed to load BPF section %s for queue %d", + SEC_NAME_CLS_Q, i); + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, + "Kernel too old or not configured " + "to support BPF programs loading"); + + return -ENOTSUP; + } + + rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); + if (!rss_flow) { + RTE_LOG(ERR, PMD, + "Cannot allocate memory for rte_flow"); + return -1; + } + msg = &rss_flow->msg; + tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST | + NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(rss_flow); + uint16_t group = attr->group << GROUP_SHIFT; + uint16_t prio = group | (i + PRIORITY_OFFSET); + msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info); + msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); + + tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf"); + if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0) + return -1; + tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]); + snprintf(annotation, sizeof(annotation), "[%s%d]", + SEC_NAME_CLS_Q, i); + tap_nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1, + annotation); + /* Actions */ + { + struct action_data adata = { + .id = "skbedit", + .skbedit = { + .skbedit = { + .action = TC_ACT_PIPE, + }, + .queue = i, + }, + }; + if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0) + return -1; + } + tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */ + + /* Netlink message is now ready to be sent */ + if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0) + return -1; + err = tap_nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule creation (%d): %s\n", + errno, strerror(errno)); + return err; + } + LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next); + } + + pmd->rss_enabled = 1; + return err; +} + +/** + * Manage bpf RSS keys repository with operations: init, get, release + * + * @param[in] cmd + * Command on RSS keys: init, get, release + * + * @param[in, out] key_idx + * Pointer to RSS Key index (out for get command, in for release command) + * + * @return -1 if couldn't get, release or init the RSS keys, 0 otherwise. + */ +static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx) +{ + __u32 i; + int err = -1; + static __u32 num_used_keys; + static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC}; + static __u32 rss_keys_initialized; + + switch (cmd) { + case KEY_CMD_GET: + if (!rss_keys_initialized) + break; + + if (num_used_keys == RTE_DIM(rss_keys)) + break; + + *key_idx = num_used_keys % RTE_DIM(rss_keys); + while (rss_keys[*key_idx] == KEY_STAT_USED) + *key_idx = (*key_idx + 1) % RTE_DIM(rss_keys); + + rss_keys[*key_idx] = KEY_STAT_USED; + num_used_keys++; + err = 0; + break; + + case KEY_CMD_RELEASE: + if (!rss_keys_initialized) { + err = 0; + break; + } + + if (rss_keys[*key_idx] == KEY_STAT_USED) { + rss_keys[*key_idx] = KEY_STAT_AVAILABLE; + num_used_keys--; + err = 0; + } + break; + + case KEY_CMD_INIT: + for (i = 0; i < RTE_DIM(rss_keys); i++) + rss_keys[i] = KEY_STAT_AVAILABLE; + + rss_keys_initialized = 1; + num_used_keys = 0; + err = 0; + break; + + case KEY_CMD_DEINIT: + for (i = 0; i < RTE_DIM(rss_keys); i++) + rss_keys[i] = KEY_STAT_UNSPEC; + + rss_keys_initialized = 0; + num_used_keys = 0; + err = 0; + break; + + default: + break; + } + + return err; +} + +/** + * Add RSS hash calculations and queue selection + * + * @param[in, out] pmd + * Pointer to internal structure. Used to set/get RSS map fd + * + * @param[in] rss + * Pointer to RSS flow actions + * + * @param[out] error + * Pointer to error reporting if not NULL. + * + * @return 0 on success, negative value on failure + */ +static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd, + const struct rte_flow_action_rss *rss, + struct rte_flow_error *error) +{ + /* 4096 is the maximum number of instructions for a BPF program */ + int i; + int err; + struct rss_key rss_entry = { .hash_fields = 0, + .key_size = 0 }; + + /* Get a new map key for a new RSS rule */ + err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx); + if (err < 0) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "Failed to get BPF RSS key"); + + return -1; + } + + /* Update RSS map entry with queues */ + rss_entry.nb_queues = rss->num; + for (i = 0; i < rss->num; i++) + rss_entry.queues[i] = rss->queue[i]; + rss_entry.hash_fields = + (1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4); + + /* Add this RSS entry to map */ + err = tap_flow_bpf_update_rss_elem(pmd->map_fd, + &flow->key_idx, &rss_entry); + + if (err) { + RTE_LOG(ERR, PMD, + "Failed to update BPF map entry #%u (%d): %s\n", + flow->key_idx, errno, strerror(errno)); + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "Kernel too old or not configured " + "to support BPF maps updates"); + + return -ENOTSUP; + } + + + /* + * Load bpf rules to calculate hash for this key_idx + */ + + flow->bpf_fd[SEC_L3_L4] = + tap_flow_bpf_calc_l3_l4_hash(flow->key_idx, pmd->map_fd); + if (flow->bpf_fd[SEC_L3_L4] < 0) { + RTE_LOG(ERR, PMD, + "Failed to load BPF section %s (%d): %s\n", + sec_name[SEC_L3_L4], errno, strerror(errno)); + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "Kernel too old or not configured " + "to support BPF program loading"); + + return -ENOTSUP; + } + + /* Actions */ + { + struct action_data adata[] = { + { + .id = "bpf", + .bpf = { + .bpf_fd = flow->bpf_fd[SEC_L3_L4], + .annotation = sec_name[SEC_L3_L4], + .bpf = { + .action = TC_ACT_PIPE, + }, + }, + }, + }; + + if (add_actions(flow, RTE_DIM(adata), adata, + TCA_FLOWER_ACT) < 0) + return -1; + } + + return 0; +} + /** * Manage filter operations. * diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h index 894c9596cb..daaf00b8b1 100644 --- a/drivers/net/tap/tap_flow.h +++ b/drivers/net/tap/tap_flow.h @@ -37,6 +37,7 @@ #include #include #include +#include /** * In TC, priority 0 means we require the kernel to allocate one for us. @@ -49,6 +50,7 @@ #define GROUP_MASK (0xf) #define GROUP_SHIFT 12 #define MAX_GROUP GROUP_MASK +#define RSS_PRIORITY_OFFSET RTE_PMD_TAP_MAX_QUEUES /** * These index are actually in reversed order: their priority is processed @@ -67,6 +69,11 @@ enum implicit_rule_index { TAP_REMOTE_MAX_IDX, }; +enum bpf_fd_idx { + SEC_L3_L4, + SEC_MAX, +}; + int tap_dev_filter_ctrl(struct rte_eth_dev *dev, enum rte_filter_type filter_type, enum rte_filter_op filter_op, diff --git a/drivers/net/tap/tap_rss.h b/drivers/net/tap/tap_rss.h new file mode 100644 index 0000000000..3bb0d140e0 --- /dev/null +++ b/drivers/net/tap/tap_rss.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 Mellanox Technologies, Ltd. + */ + +#ifndef _TAP_RSS_H_ +#define _TAP_RSS_H_ + +#ifndef TAP_MAX_QUEUES +#define TAP_MAX_QUEUES 16 +#endif + +/* hashed fields for RSS */ +enum hash_field { + HASH_FIELD_IPV4_L3, /* IPv4 src/dst addr */ + HASH_FIELD_IPV4_L3_L4, /* IPv4 src/dst addr + L4 src/dst ports */ + HASH_FIELD_IPV6_L3, /* IPv6 src/dst addr */ + HASH_FIELD_IPV6_L3_L4, /* IPv6 src/dst addr + L4 src/dst ports */ + HASH_FIELD_L2_SRC, /* Ethernet src addr */ + HASH_FIELD_L2_DST, /* Ethernet dst addr */ + HASH_FIELD_L3_SRC, /* L3 src addr */ + HASH_FIELD_L3_DST, /* L3 dst addr */ + HASH_FIELD_L4_SRC, /* TCP/UDP src ports */ + HASH_FIELD_L4_DST, /* TCP/UDP dst ports */ +}; + +struct rss_key { + __u8 key[128]; + __u32 hash_fields; + __u32 key_size; + __u32 queues[TAP_MAX_QUEUES]; + __u32 nb_queues; +} __attribute__((packed)); + +#endif /* _TAP_RSS_H_ */ diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h index 789595771d..07c5074ae5 100644 --- a/drivers/net/tap/tap_tcmsgs.h +++ b/drivers/net/tap/tap_tcmsgs.h @@ -34,6 +34,7 @@ #ifndef _TAP_TCMSGS_H_ #define _TAP_TCMSGS_H_ +#include #include #include #include @@ -41,6 +42,9 @@ #include #include #include +#ifdef HAVE_TC_ACT_BPF +#include +#endif #include #include