From: Pascal Mazon Date: Thu, 23 Mar 2017 08:33:57 +0000 (+0100) Subject: net/tap: add basic flow API patterns and actions X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=de96fe68ae95990f5ce2cd4c8d9547ab62d2a810;p=dpdk.git net/tap: add basic flow API patterns and actions Supported flow rules are now mapped to TC rules on the tap netdevice. The netlink message used for creating the TC rule is stored in struct rte_flow. That way, by simply changing a metadata in it, we can require for the rule deletion without further parsing. Supported items: - eth: src and dst (with variable masks), and eth_type (0xffff mask). - vlan: vid, pcp, tpid, but not eid. - ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask). - udp/tcp: src and dst port (0xffff) mask. Supported actions: - DROP - QUEUE - PASSTHRU It is generally not possible to provide a "last" item. However, if the "last" item, once masked, is identical to the masked spec, then it is supported. Only IPv4/6 and MAC addresses can use a variable mask. All other items need a full mask (exact match). Support for VLAN requires kernel headers >= 4.9, checked using auto-config.sh. Signed-off-by: Pascal Mazon Acked-by: Olga Shern Acked-by: Keith Wiles --- diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst index c4f207be3b..4986e47e9f 100644 --- a/doc/guides/nics/tap.rst +++ b/doc/guides/nics/tap.rst @@ -82,6 +82,51 @@ can utilize that stack to handle the network protocols. Plus you would be able to address the interface using an IP address assigned to the internal interface. +Flow API support +---------------- + +The tap PMD supports major flow API pattern items and actions, when running on +linux kernels above 4.2 ("Flower" classifier required). Supported items: + +- eth: src and dst (with variable masks), and eth_type (0xffff mask). +- vlan: vid, pcp, tpid, but not eid. (requires kernel 4.9) +- ipv4/6: src and dst (with variable masks), and ip_proto (0xffff mask). +- udp/tcp: src and dst port (0xffff) mask. + +Supported actions: + +- DROP +- QUEUE +- PASSTHRU + +It is generally not possible to provide a "last" item. However, if the "last" +item, once masked, is identical to the masked spec, then it is supported. + +Only IPv4/6 and MAC addresses can use a variable mask. All other items need a +full mask (exact match). + +As rules are translated to TC, it is possible to show them with something like:: + + tc -s filter show dev tap1 parent 1: + +Examples of testpmd flow rules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Drop packets for destination IP 192.168.0.1:: + + testpmd> flow create 0 priority 1 ingress pattern eth / ipv4 dst is 1.1.1.1 \ + / end actions drop / end + +Ensure packets from a given MAC address are received on a queue 2:: + + testpmd> flow create 0 priority 2 ingress pattern eth src is 06:05:04:03:02:01 \ + / end actions queue index 2 / end + +Drop UDP packets in vlan 3:: + + testpmd> flow create 0 priority 3 ingress pattern eth / vlan vid is 3 / \ + ipv4 proto is 17 / end actions drop / end + Example ------- diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile index 3a33b560d3..c42a680555 100644 --- a/drivers/net/tap/Makefile +++ b/drivers/net/tap/Makefile @@ -41,6 +41,7 @@ LIBABIVER := 1 CFLAGS += -O3 CFLAGS += -I$(SRCDIR) +CFLAGS += -I. CFLAGS += $(WERROR_FLAGS) # @@ -58,5 +59,44 @@ DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_mempool DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_ether DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_kvargs DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_net +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_hash include $(RTE_SDK)/mk/rte.lib.mk + +# Generate and clean-up tap_autoconf.h. + +export CC CFLAGS CPPFLAGS EXTRA_CFLAGS EXTRA_CPPFLAGS +export AUTO_CONFIG_CFLAGS = -Wno-error + +ifndef V +AUTOCONF_OUTPUT := >/dev/null +endif + +tap_autoconf.h.new: FORCE + +tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh + $Q $(RM) -f -- '$@' + $Q sh -- '$<' '$@' \ + HAVE_TC_FLOWER \ + linux/pkt_cls.h \ + enum TCA_FLOWER_UNSPEC \ + $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_TC_VLAN_ID \ + linux/pkt_cls.h \ + enum TCA_FLOWER_KEY_VLAN_PRIO \ + $(AUTOCONF_OUTPUT) + +# Create tap_autoconf.h or update it in case it differs from the new one. + +tap_autoconf.h: tap_autoconf.h.new + $Q [ -f '$@' ] && \ + cmp '$<' '$@' $(AUTOCONF_OUTPUT) || \ + mv '$<' '$@' + +$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h + +clean_tap: FORCE + $Q rm -f -- tap_autoconf.h tap_autoconf.h.new + +clean: clean_tap diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c index 9127c739a2..c711b36c32 100644 --- a/drivers/net/tap/rte_eth_tap.c +++ b/drivers/net/tap/rte_eth_tap.c @@ -44,19 +44,22 @@ #include #include #include +#include #include #include #include #include #include #include -#include +#include #include #include +#include #include #include #include +#include /* Linux based path to the TUN device */ #define TUN_TAP_DEV_PATH "/dev/net/tun" @@ -71,6 +74,9 @@ #define RTE_PMD_TAP_MAX_QUEUES 1 #endif +#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0) +#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0) + static struct rte_vdev_driver pmd_tap_drv; static const char *valid_arguments[] = { @@ -209,6 +215,28 @@ tun_alloc(struct pmd_internals *pmd, uint16_t qid) goto error; rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); + + pmd->if_index = if_nametoindex(pmd->name); + if (!pmd->if_index) { + RTE_LOG(ERR, PMD, + "Could not find ifindex for %s: rte_flow won't be usable.\n", + pmd->name); + return fd; + } + if (!pmd->flower_support) + return fd; + if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) { + RTE_LOG(ERR, PMD, + "Could not create multiq qdisc for %s: rte_flow won't be usable.\n", + pmd->name); + return fd; + } + if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) { + RTE_LOG(ERR, PMD, + "Could not create multiq qdisc for %s: rte_flow won't be usable.\n", + pmd->name); + return fd; + } } return fd; @@ -811,6 +839,24 @@ static const struct eth_dev_ops ops = { .filter_ctrl = tap_dev_filter_ctrl, }; +static int +tap_kernel_support(struct pmd_internals *pmd) +{ + struct utsname utsname; + int ver[3]; + + if (uname(&utsname) == -1 || + sscanf(utsname.release, "%d.%d.%d", + &ver[0], &ver[1], &ver[2]) != 3) + return 0; + if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION) + pmd->flower_support = 1; + if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= + FLOWER_VLAN_KERNEL_VERSION) + pmd->flower_vlan_support = 1; + return 1; +} + static int eth_dev_tap_create(const char *name, char *tap_name) { @@ -880,7 +926,15 @@ eth_dev_tap_create(const char *name, char *tap_name) pmd->txq[i].fd = -1; } + tap_kernel_support(pmd); + if (!pmd->flower_support) + return 0; LIST_INIT(&pmd->flows); + /* + * If no netlink socket can be created, then it will fail when + * creating/destroying flow rules. + */ + pmd->nlsk_fd = nl_init(); return 0; @@ -995,7 +1049,10 @@ rte_pmd_tap_remove(const char *name) return 0; internals = eth_dev->data->dev_private; - tap_flow_flush(eth_dev, NULL); + if (internals->flower_support && internals->nlsk_fd) { + tap_flow_flush(eth_dev, NULL); + nl_final(internals->nlsk_fd); + } for (i = 0; i < internals->nb_queues; i++) if (internals->rxq[i].fd != -1) close(internals->rxq[i].fd); diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h index bf82267366..741ec53508 100644 --- a/drivers/net/tap/rte_eth_tap.h +++ b/drivers/net/tap/rte_eth_tap.h @@ -69,6 +69,9 @@ struct pmd_internals { struct ether_addr eth_addr; /* Mac address of the device port */ int if_index; /* IF_INDEX for the port */ int ioctl_sock; /* socket for ioctl calls */ + int nlsk_fd; /* Netlink socket fd */ + int flower_support; /* 1 if kernel supports, else 0 */ + int flower_vlan_support; /* 1 if kernel supports, else 0 */ LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */ struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */ struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */ diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c index c32ed382d7..6adacdc22d 100644 --- a/drivers/net/tap/tap_flow.c +++ b/drivers/net/tap/tap_flow.c @@ -33,14 +33,71 @@ #include +#include +#include #include #include #include +#include +#include + +#ifndef HAVE_TC_FLOWER +/* + * For kernels < 4.2, this enum is not defined. Runtime checks will be made to + * avoid sending TC messages the kernel cannot understand. + */ +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ +}; +#endif +#ifndef HAVE_TC_VLAN_ID +enum { + /* TCA_FLOWER_FLAGS, */ + TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ +}; +#endif struct rte_flow { LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */ + struct nlmsg msg; +}; + +struct convert_data { + uint16_t eth_type; + uint16_t ip_proto; + uint8_t vlan; + struct rte_flow *flow; }; +static int tap_flow_create_eth(const struct rte_flow_item *item, void *data); +static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data); +static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data); +static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data); +static int tap_flow_create_udp(const struct rte_flow_item *item, void *data); +static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data); static int tap_flow_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, @@ -67,6 +124,752 @@ static const struct rte_flow_ops tap_flow_ops = { .flush = tap_flow_flush, }; +/* Static initializer for items. */ +#define ITEMS(...) \ + (const enum rte_flow_item_type []){ \ + __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ + } + +/* Structure to generate a simple graph of layers supported by the NIC. */ +struct tap_flow_items { + /* Bit-mask corresponding to what is supported for this item. */ + const void *mask; + const unsigned int mask_sz; /* Bit-mask size in bytes. */ + /* + * Bit-mask corresponding to the default mask, if none is provided + * along with the item. + */ + const void *default_mask; + /** + * Conversion function from rte_flow to netlink attributes. + * + * @param item + * rte_flow item to convert. + * @param data + * Internal structure to store the conversion. + * + * @return + * 0 on success, negative value otherwise. + */ + int (*convert)(const struct rte_flow_item *item, void *data); + /** List of possible following items. */ + const enum rte_flow_item_type *const items; +}; + +/* Graph of supported items and associated actions. */ +static const struct tap_flow_items tap_flow_items[] = { + [RTE_FLOW_ITEM_TYPE_END] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), + }, + [RTE_FLOW_ITEM_TYPE_ETH] = { + .items = ITEMS( + RTE_FLOW_ITEM_TYPE_VLAN, + RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6), + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .src.addr_bytes = "\xff\xff\xff\xff\xff\xff", + .type = -1, + }, + .mask_sz = sizeof(struct rte_flow_item_eth), + .default_mask = &rte_flow_item_eth_mask, + .convert = tap_flow_create_eth, + }, + [RTE_FLOW_ITEM_TYPE_VLAN] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, + RTE_FLOW_ITEM_TYPE_IPV6), + .mask = &(const struct rte_flow_item_vlan){ + .tpid = -1, + /* DEI matching is not supported */ +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN + .tci = 0xffef, +#else + .tci = 0xefff, +#endif + }, + .mask_sz = sizeof(struct rte_flow_item_vlan), + .default_mask = &rte_flow_item_vlan_mask, + .convert = tap_flow_create_vlan, + }, + [RTE_FLOW_ITEM_TYPE_IPV4] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .mask = &(const struct rte_flow_item_ipv4){ + .hdr = { + .src_addr = -1, + .dst_addr = -1, + .next_proto_id = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_ipv4), + .default_mask = &rte_flow_item_ipv4_mask, + .convert = tap_flow_create_ipv4, + }, + [RTE_FLOW_ITEM_TYPE_IPV6] = { + .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, + RTE_FLOW_ITEM_TYPE_TCP), + .mask = &(const struct rte_flow_item_ipv6){ + .hdr = { + .src_addr = { + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, + .dst_addr = { + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + }, + .proto = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_ipv6), + .default_mask = &rte_flow_item_ipv6_mask, + .convert = tap_flow_create_ipv6, + }, + [RTE_FLOW_ITEM_TYPE_UDP] = { + .mask = &(const struct rte_flow_item_udp){ + .hdr = { + .src_port = -1, + .dst_port = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_udp), + .default_mask = &rte_flow_item_udp_mask, + .convert = tap_flow_create_udp, + }, + [RTE_FLOW_ITEM_TYPE_TCP] = { + .mask = &(const struct rte_flow_item_tcp){ + .hdr = { + .src_port = -1, + .dst_port = -1, + }, + }, + .mask_sz = sizeof(struct rte_flow_item_tcp), + .default_mask = &rte_flow_item_tcp_mask, + .convert = tap_flow_create_tcp, + }, +}; + +/** + * Make as much checks as possible on an Ethernet item, and if a flow is + * provided, fill it appropriately with Ethernet info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_eth(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_eth *spec = item->spec; + const struct rte_flow_item_eth *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask; + /* TC does not support eth_type masking. Only accept if exact match. */ + if (mask->type && mask->type != 0xffff) + return -1; + if (!spec) + return 0; + /* store eth_type for consistency if ipv4/6 pattern item comes next */ + if (spec->type & mask->type) + info->eth_type = spec->type; + if (!flow) + return 0; + msg = &flow->msg; + if (spec->type & mask->type) + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, + (spec->type & mask->type)); + if (!is_zero_ether_addr(&spec->dst)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN, + &spec->dst.addr_bytes); + nlattr_add(&msg->nh, + TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN, + &mask->dst.addr_bytes); + } + if (!is_zero_ether_addr(&mask->src)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN, + &spec->src.addr_bytes); + nlattr_add(&msg->nh, + TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN, + &mask->src.addr_bytes); + } + return 0; +} + +/** + * Make as much checks as possible on a VLAN item, and if a flow is provided, + * fill it appropriately with VLAN info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_vlan(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_vlan *spec = item->spec; + const struct rte_flow_item_vlan *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask; + /* TC does not support tpid masking. Only accept if exact match. */ + if (mask->tpid && mask->tpid != 0xffff) + return -1; + /* Double-tagging not supported. */ + if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q)) + return -1; + info->vlan = 1; + if (!flow) + return 0; + msg = &flow->msg; + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q)); +#define VLAN_PRIO(tci) ((tci) >> 13) +#define VLAN_ID(tci) ((tci) & 0xfff) + if (!spec) + return 0; + if (spec->tci) { + uint16_t tci = ntohs(spec->tci) & mask->tci; + uint16_t prio = VLAN_PRIO(tci); + uint8_t vid = VLAN_ID(tci); + + if (prio) + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio); + if (vid) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid); + } + return 0; +} + +/** + * Make as much checks as possible on an IPv4 item, and if a flow is provided, + * fill it appropriately with IPv4 info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_ipv4(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_ipv4 *spec = item->spec; + const struct rte_flow_item_ipv4 *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask; + /* check that previous eth type is compatible with ipv4 */ + if (info->eth_type && info->eth_type != htons(ETH_P_IP)) + return -1; + /* store ip_proto for consistency if udp/tcp pattern item comes next */ + if (spec) + info->ip_proto = spec->hdr.next_proto_id; + if (!flow) + return 0; + msg = &flow->msg; + if (!info->eth_type) + info->eth_type = htons(ETH_P_IP); + if (!info->vlan) + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP)); + if (!spec) + return 0; + if (spec->hdr.dst_addr) { + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST, + spec->hdr.dst_addr); + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK, + mask->hdr.dst_addr); + } + if (spec->hdr.src_addr) { + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC, + spec->hdr.src_addr); + nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK, + mask->hdr.src_addr); + } + if (spec->hdr.next_proto_id) + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, + spec->hdr.next_proto_id); + return 0; +} + +/** + * Make as much checks as possible on an IPv6 item, and if a flow is provided, + * fill it appropriately with IPv6 info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_ipv6(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_ipv6 *spec = item->spec; + const struct rte_flow_item_ipv6 *mask = item->mask; + struct rte_flow *flow = info->flow; + uint8_t empty_addr[16] = { 0 }; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask; + /* check that previous eth type is compatible with ipv6 */ + if (info->eth_type && info->eth_type != htons(ETH_P_IPV6)) + return -1; + /* store ip_proto for consistency if udp/tcp pattern item comes next */ + if (spec) + info->ip_proto = spec->hdr.proto; + if (!flow) + return 0; + msg = &flow->msg; + if (!info->eth_type) + info->eth_type = htons(ETH_P_IPV6); + if (!info->vlan) + msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6)); + if (!spec) + return 0; + if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST, + sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr); + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr); + } + if (memcmp(spec->hdr.src_addr, empty_addr, 16)) { + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC, + sizeof(spec->hdr.src_addr), &spec->hdr.src_addr); + nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(mask->hdr.src_addr), &mask->hdr.src_addr); + } + if (spec->hdr.proto) + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto); + return 0; +} + +/** + * Make as much checks as possible on a UDP item, and if a flow is provided, + * fill it appropriately with UDP info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_udp(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_udp *spec = item->spec; + const struct rte_flow_item_udp *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask; + /* check that previous ip_proto is compatible with udp */ + if (info->ip_proto && info->ip_proto != IPPROTO_UDP) + return -1; + if (!flow) + return 0; + msg = &flow->msg; + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP); + if (!spec) + return 0; + if (spec->hdr.dst_port && + (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST, + spec->hdr.dst_port); + if (spec->hdr.src_port && + (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC, + spec->hdr.src_port); + return 0; +} + +/** + * Make as much checks as possible on a TCP item, and if a flow is provided, + * fill it appropriately with TCP info. + * + * @param[in] item + * Item specification. + * @param[in, out] data + * Additional data structure to tell next layers we've been here. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +tap_flow_create_tcp(const struct rte_flow_item *item, void *data) +{ + struct convert_data *info = (struct convert_data *)data; + const struct rte_flow_item_tcp *spec = item->spec; + const struct rte_flow_item_tcp *mask = item->mask; + struct rte_flow *flow = info->flow; + struct nlmsg *msg; + + /* use default mask if none provided */ + if (!mask) + mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask; + /* check that previous ip_proto is compatible with tcp */ + if (info->ip_proto && info->ip_proto != IPPROTO_TCP) + return -1; + if (!flow) + return 0; + msg = &flow->msg; + nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP); + if (!spec) + return 0; + if (spec->hdr.dst_port && + (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST, + spec->hdr.dst_port); + if (spec->hdr.src_port && + (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port) + nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC, + spec->hdr.src_port); + return 0; +} + +/** + * Check support for a given item. + * + * @param[in] item + * Item specification. + * @param size + * Bit-Mask size in bytes. + * @param[in] supported_mask + * Bit-mask covering supported fields to compare with spec, last and mask in + * \item. + * @param[in] default_mask + * Bit-mask default mask if none is provided in \item. + * + * @return + * 0 on success. + */ +static int +tap_flow_item_validate(const struct rte_flow_item *item, + unsigned int size, + const uint8_t *supported_mask, + const uint8_t *default_mask) +{ + int ret = 0; + + /* An empty layer is allowed, as long as all fields are NULL */ + if (!item->spec && (item->mask || item->last)) + return -1; + /* Is the item spec compatible with what the NIC supports? */ + if (item->spec && !item->mask) { + unsigned int i; + const uint8_t *spec = item->spec; + + for (i = 0; i < size; ++i) + if ((spec[i] | supported_mask[i]) != supported_mask[i]) + return -1; + /* Is the default mask compatible with what the NIC supports? */ + for (i = 0; i < size; i++) + if ((default_mask[i] | supported_mask[i]) != + supported_mask[i]) + return -1; + } + /* Is the item last compatible with what the NIC supports? */ + if (item->last && !item->mask) { + unsigned int i; + const uint8_t *spec = item->last; + + for (i = 0; i < size; ++i) + if ((spec[i] | supported_mask[i]) != supported_mask[i]) + return -1; + } + /* Is the item mask compatible with what the NIC supports? */ + if (item->mask) { + unsigned int i; + const uint8_t *spec = item->mask; + + for (i = 0; i < size; ++i) + if ((spec[i] | supported_mask[i]) != supported_mask[i]) + return -1; + } + /** + * Once masked, Are item spec and item last equal? + * TC does not support range so anything else is invalid. + */ + if (item->spec && item->last) { + uint8_t spec[size]; + uint8_t last[size]; + const uint8_t *apply = default_mask; + unsigned int i; + + if (item->mask) + apply = item->mask; + for (i = 0; i < size; ++i) { + spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; + last[i] = ((const uint8_t *)item->last)[i] & apply[i]; + } + ret = memcmp(spec, last, size); + } + return ret; +} + +/** + * Transform a DROP/PASSTHRU action item in the provided flow for TC. + * + * @param[in, out] flow + * Flow to be filled. + * @param[in] action + * Appropriate action to be set in the TCA_GACT_PARMS structure. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +add_action_gact(struct rte_flow *flow, int action) +{ + struct nlmsg *msg = &flow->msg; + size_t act_index = 1; + struct tc_gact p = { + .action = action + }; + + if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) + return -1; + if (nlattr_nested_start(msg, act_index++) < 0) + return -1; + nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact"); + if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) + return -1; + nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p); + nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ + nlattr_nested_finish(msg); /* nested act_index */ + nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ + return 0; +} + +/** + * Transform a QUEUE action item in the provided flow for TC. + * + * @param[in, out] flow + * Flow to be filled. + * @param[in] queue + * Queue id to use. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +add_action_skbedit(struct rte_flow *flow, uint16_t queue) +{ + struct nlmsg *msg = &flow->msg; + size_t act_index = 1; + struct tc_skbedit p = { + .action = TC_ACT_PIPE + }; + + if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) + return -1; + if (nlattr_nested_start(msg, act_index++) < 0) + return -1; + nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit"); + if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) + return -1; + nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p); + nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue); + nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ + nlattr_nested_finish(msg); /* nested act_index */ + nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ + return 0; +} + +/** + * Validate a flow supported by TC. + * If flow param is not NULL, then also fill the netlink message inside. + * + * @param pmd + * Pointer to private structure. + * @param[in] attr + * Flow rule attributes. + * @param[in] pattern + * Pattern specification (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] error + * Perform verbose error reporting if not NULL. + * @param[in, out] flow + * Flow structure to update. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +priv_flow_process(struct pmd_internals *pmd, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + struct rte_flow_error *error, + struct rte_flow *flow) +{ + const struct tap_flow_items *cur_item = tap_flow_items; + struct convert_data data = { + .eth_type = 0, + .ip_proto = 0, + .flow = flow, + }; + int action = 0; /* Only one action authorized for now */ + + if (attr->group > MAX_GROUP) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP, + NULL, "group value too big: cannot exceed 15"); + return -rte_errno; + } + if (attr->priority > MAX_PRIORITY) { + rte_flow_error_set( + error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority value too big"); + return -rte_errno; + } else if (flow) { + uint16_t group = attr->group << GROUP_SHIFT; + uint16_t prio = group | (attr->priority + PRIORITY_OFFSET); + flow->msg.t.tcm_info = TC_H_MAKE(prio << 16, + flow->msg.t.tcm_info); + } + if (!attr->ingress) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR, + NULL, "direction should be ingress"); + return -rte_errno; + } + /* rte_flow ingress is actually egress as seen in the kernel */ + if (attr->ingress && flow) + flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); + if (flow) { + /* use flower filter type */ + nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower"); + if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0) + goto exit_item_not_supported; + } + for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) { + const struct tap_flow_items *token = NULL; + unsigned int i; + int err = 0; + + if (items->type == RTE_FLOW_ITEM_TYPE_VOID) + continue; + for (i = 0; + cur_item->items && + cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END; + ++i) { + if (cur_item->items[i] == items->type) { + token = &tap_flow_items[items->type]; + break; + } + } + if (!token) + goto exit_item_not_supported; + cur_item = token; + err = tap_flow_item_validate( + items, cur_item->mask_sz, + (const uint8_t *)cur_item->mask, + (const uint8_t *)cur_item->default_mask); + if (err) + goto exit_item_not_supported; + if (flow && cur_item->convert) { + if (!pmd->flower_vlan_support && + cur_item->convert == tap_flow_create_vlan) + goto exit_item_not_supported; + err = cur_item->convert(items, &data); + if (err) + goto exit_item_not_supported; + } + } + if (flow) { + if (pmd->flower_vlan_support && data.vlan) { + nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE, + htons(ETH_P_8021Q)); + nlattr_add16(&flow->msg.nh, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + data.eth_type ? + data.eth_type : htons(ETH_P_ALL)); + } else if (data.eth_type) { + nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE, + data.eth_type); + } + } + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { + int err = 0; + + if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { + continue; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { + if (action) + goto exit_action_not_supported; + action = 1; + if (flow) + err = add_action_gact(flow, TC_ACT_SHOT); + } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) { + if (action) + goto exit_action_not_supported; + action = 1; + if (flow) + err = add_action_gact(flow, TC_ACT_UNSPEC); + } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { + const struct rte_flow_action_queue *queue = + (const struct rte_flow_action_queue *) + actions->conf; + if (action) + goto exit_action_not_supported; + action = 1; + if (!queue || (queue->index >= pmd->nb_queues)) + goto exit_action_not_supported; + if (flow) + err = add_action_skbedit(flow, queue->index); + } else { + goto exit_action_not_supported; + } + if (err) + goto exit_action_not_supported; + } + if (flow) + nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */ + return 0; +exit_item_not_supported: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, + items, "item not supported"); + return -rte_errno; +exit_action_not_supported: + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, + actions, "action not supported"); + return -rte_errno; +} + + + /** * Validate a flow. * @@ -74,15 +877,46 @@ static const struct rte_flow_ops tap_flow_ops = { * @see rte_flow_ops */ static int -tap_flow_validate(struct rte_eth_dev *dev __rte_unused, - const struct rte_flow_attr *attr __rte_unused, - const struct rte_flow_item items[] __rte_unused, - const struct rte_flow_action actions[] __rte_unused, +tap_flow_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], struct rte_flow_error *error) { - return -rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, - NULL, "not implemented yet"); + struct pmd_internals *pmd = dev->data->dev_private; + + return priv_flow_process(pmd, attr, items, actions, error, NULL); +} + +/** + * Set a unique handle in a flow. + * + * The kernel supports TC rules with equal priority, as long as they use the + * same matching fields (e.g.: dst mac and ipv4) with different values (and + * full mask to ensure no collision is possible). + * In those rules, the handle (uint32_t) is the part that would identify + * specifically each rule. + * + * On 32-bit architectures, the handle can simply be the flow's pointer address. + * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently) + * unique handle. + * + * @param[in, out] flow + * The flow that needs its handle set. + */ +static void +tap_flow_set_handle(struct rte_flow *flow) +{ + uint32_t handle = 0; + + if (sizeof(flow) > 4) + handle = rte_jhash(&flow, sizeof(flow), 1); + else + handle = (uintptr_t)flow; + /* must be at least 1 to avoid letting the kernel choose one for us */ + if (!handle) + handle = 1; + flow->msg.t.tcm_handle = handle; } /** @@ -100,17 +934,46 @@ tap_flow_create(struct rte_eth_dev *dev, { struct pmd_internals *pmd = dev->data->dev_private; struct rte_flow *flow = NULL; + struct nlmsg *msg = NULL; + int err; - if (tap_flow_validate(dev, attr, items, actions, error)) - return NULL; + if (!pmd->if_index) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, + "can't create rule, ifindex not found"); + goto fail; + } flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); if (!flow) { rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, "cannot allocate memory for rte_flow"); - return NULL; + goto fail; + } + msg = &flow->msg; + tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(flow); + if (priv_flow_process(pmd, attr, items, actions, error, flow)) + goto fail; + err = nl_send(pmd->nlsk_fd, &msg->nh); + if (err < 0) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "couldn't send request to kernel"); + goto fail; + } + err = nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "overlapping rules"); + goto fail; } LIST_INSERT_HEAD(&pmd->flows, flow, next); return flow; +fail: + if (flow) + rte_free(flow); + return NULL; } /** @@ -120,13 +983,31 @@ tap_flow_create(struct rte_eth_dev *dev, * @see rte_flow_ops */ static int -tap_flow_destroy(struct rte_eth_dev *dev __rte_unused, +tap_flow_destroy(struct rte_eth_dev *dev, struct rte_flow *flow, - struct rte_flow_error *error __rte_unused) + struct rte_flow_error *error) { + struct pmd_internals *pmd = dev->data->dev_private; + int ret = 0; + LIST_REMOVE(flow, next); + flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + flow->msg.nh.nlmsg_type = RTM_DELTFILTER; + + ret = nl_send(pmd->nlsk_fd, &flow->msg.nh); + if (ret < 0) { + rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "couldn't send request to kernel"); + goto end; + } + ret = nl_recv_ack(pmd->nlsk_fd); + if (ret < 0) + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "couldn't receive kernel ack to our request"); +end: rte_free(flow); - return 0; + return ret; } /** @@ -170,6 +1051,10 @@ tap_dev_filter_ctrl(struct rte_eth_dev *dev, enum rte_filter_op filter_op, void *arg) { + struct pmd_internals *pmd = dev->data->dev_private; + + if (!pmd->flower_support) + return -ENOTSUP; switch (filter_type) { case RTE_ETH_FILTER_GENERIC: if (filter_op != RTE_ETH_FILTER_GET) diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h index 377a9f7b75..a05e945df5 100644 --- a/drivers/net/tap/tap_flow.h +++ b/drivers/net/tap/tap_flow.h @@ -37,6 +37,18 @@ #include #include +/** + * In TC, priority 0 means we require the kernel to allocate one for us. + * In rte_flow, however, we want the priority 0 to be the most important one. + * Use an offset to have the most important priority being 1 in TC. + */ +#define PRIORITY_OFFSET 1 +#define PRIORITY_MASK (0xfff) +#define MAX_PRIORITY (PRIORITY_MASK - PRIORITY_OFFSET) +#define GROUP_MASK (0xf) +#define GROUP_SHIFT 12 +#define MAX_GROUP GROUP_MASK + int tap_dev_filter_ctrl(struct rte_eth_dev *dev, enum rte_filter_type filter_type, enum rte_filter_op filter_op,