4 * Copyright 2017 6WIND S.A.
5 * Copyright 2017 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <sys/queue.h>
36 #include <rte_byteorder.h>
37 #include <rte_jhash.h>
38 #include <rte_malloc.h>
39 #include <rte_eth_tap.h>
41 #include <tap_autoconf.h>
42 #include <tap_tcmsgs.h>
/*
 * Compatibility definitions for the TC "flower" classifier netlink
 * attributes, used when the build-time kernel headers lack them.
 * NOTE(review): extraction garbled this region — the enclosing
 * `enum { ... };` opener/closer and the matching #endif lines are missing,
 * and every line carries a fused original line number; restore against the
 * pristine source before compiling.
 */
44 #ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
54 TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */
55 TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */
56 TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */
57 TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */
58 TCA_FLOWER_KEY_ETH_TYPE, /* be16 */
59 TCA_FLOWER_KEY_IP_PROTO, /* u8 */
60 TCA_FLOWER_KEY_IPV4_SRC, /* be32 */
61 TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */
62 TCA_FLOWER_KEY_IPV4_DST, /* be32 */
63 TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */
64 TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */
65 TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */
66 TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */
67 TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */
68 TCA_FLOWER_KEY_TCP_SRC, /* be16 */
69 TCA_FLOWER_KEY_TCP_DST, /* be16 */
70 TCA_FLOWER_KEY_UDP_SRC, /* be16 */
71 TCA_FLOWER_KEY_UDP_DST, /* be16 */
/* VLAN flower keys, defined locally when the kernel headers lack them
 * (presumably kernels without HAVE_TC_VLAN_ID — TODO confirm version). */
74 #ifndef HAVE_TC_VLAN_ID
76 /* TCA_FLOWER_FLAGS, */
77 TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
78 TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */
79 TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */
/*
 * Fragments of internal structure definitions; the struct openers/closers
 * were lost in extraction.  NOTE(review): restore from the pristine source.
 */
/* Members that appear to belong to struct rte_flow: list linkage plus the
 * mirror rule applied on the remote netdevice, if any. */
84 LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
85 struct rte_flow *remote_flow; /* associated remote flow */
/* Presumably part of the conversion-state struct handed to the create_*
 * callbacks (it is cast to `struct convert_data *` there) — TODO confirm. */
93 struct rte_flow *flow;
/* Presumably part of struct remote_rule: attributes and a two-item pattern
 * describing one implicit flow — TODO confirm against pristine source. */
97 struct rte_flow_attr attr;
98 struct rte_flow_item items[2];
/*
 * Forward declarations: per-item conversion callbacks (rte_flow pattern item
 * -> TC flower netlink attributes) and the rte_flow entry points referenced
 * by the tap_flow_ops table below.
 */
102 static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
103 static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
104 static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
105 static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
106 static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
107 static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
/* NOTE(review): the `static int` return-type line of this prototype was
 * dropped during extraction. */
109 tap_flow_validate(struct rte_eth_dev *dev,
110 const struct rte_flow_attr *attr,
111 const struct rte_flow_item items[],
112 const struct rte_flow_action actions[],
113 struct rte_flow_error *error);
115 static struct rte_flow *
116 tap_flow_create(struct rte_eth_dev *dev,
117 const struct rte_flow_attr *attr,
118 const struct rte_flow_item items[],
119 const struct rte_flow_action actions[],
120 struct rte_flow_error *error);
/* NOTE(review): return-type line missing here as well. */
123 tap_flow_destroy(struct rte_eth_dev *dev,
124 struct rte_flow *flow,
125 struct rte_flow_error *error);
/* rte_flow callback table for this PMD. */
127 static const struct rte_flow_ops tap_flow_ops = {
128 .validate = tap_flow_validate,
129 .create = tap_flow_create,
130 .destroy = tap_flow_destroy,
131 .flush = tap_flow_flush,
/* Static initializer for a NULL-terminated list of pattern item types.
 * NOTE(review): the `#define ITEMS(...)` line itself was lost in extraction;
 * only the macro body remains. */
136 (const enum rte_flow_item_type []){ \
137 __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
/* Structure to generate a simple graph of layers supported by the NIC. */
141 struct tap_flow_items {
/* Bit-mask corresponding to what is supported for this item.
 * NOTE(review): the `.mask` member declaration line appears to be missing
 * from this extract (the array below initializes a `.mask` field). */
144 const unsigned int mask_sz; /* Bit-mask size in bytes. */
/*
 * Bit-mask corresponding to the default mask, if none is provided
 * along with the item.
 */
149 const void *default_mask;
/*
 * Conversion function from an rte_flow item to netlink attributes.
 * Takes the item to convert and the internal conversion-state structure;
 * returns 0 on success, a negative value otherwise.
 */
161 int (*convert)(const struct rte_flow_item *item, void *data);
/** List of possible following items. */
163 const enum rte_flow_item_type *const items;
/*
 * Graph of supported items and associated actions: for each pattern item
 * type, the items allowed to follow it, the supported match mask, and the
 * conversion callback.  NOTE(review): several initializer lines (closing
 * braces, some field values such as the VLAN tci mask and IPv4/UDP/TCP
 * header masks) were lost in extraction.
 */
167 static const struct tap_flow_items tap_flow_items[] = {
168 [RTE_FLOW_ITEM_TYPE_END] = {
169 .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
171 [RTE_FLOW_ITEM_TYPE_ETH] = {
173 RTE_FLOW_ITEM_TYPE_VLAN,
174 RTE_FLOW_ITEM_TYPE_IPV4,
175 RTE_FLOW_ITEM_TYPE_IPV6),
176 .mask = &(const struct rte_flow_item_eth){
177 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
178 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
181 .mask_sz = sizeof(struct rte_flow_item_eth),
182 .default_mask = &rte_flow_item_eth_mask,
183 .convert = tap_flow_create_eth,
185 [RTE_FLOW_ITEM_TYPE_VLAN] = {
186 .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
187 RTE_FLOW_ITEM_TYPE_IPV6),
188 .mask = &(const struct rte_flow_item_vlan){
190 /* DEI matching is not supported */
191 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
197 .mask_sz = sizeof(struct rte_flow_item_vlan),
198 .default_mask = &rte_flow_item_vlan_mask,
199 .convert = tap_flow_create_vlan,
201 [RTE_FLOW_ITEM_TYPE_IPV4] = {
202 .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
203 RTE_FLOW_ITEM_TYPE_TCP),
204 .mask = &(const struct rte_flow_item_ipv4){
211 .mask_sz = sizeof(struct rte_flow_item_ipv4),
212 .default_mask = &rte_flow_item_ipv4_mask,
213 .convert = tap_flow_create_ipv4,
215 [RTE_FLOW_ITEM_TYPE_IPV6] = {
216 .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
217 RTE_FLOW_ITEM_TYPE_TCP),
218 .mask = &(const struct rte_flow_item_ipv6){
221 "\xff\xff\xff\xff\xff\xff\xff\xff"
222 "\xff\xff\xff\xff\xff\xff\xff\xff",
225 "\xff\xff\xff\xff\xff\xff\xff\xff"
226 "\xff\xff\xff\xff\xff\xff\xff\xff",
231 .mask_sz = sizeof(struct rte_flow_item_ipv6),
232 .default_mask = &rte_flow_item_ipv6_mask,
233 .convert = tap_flow_create_ipv6,
235 [RTE_FLOW_ITEM_TYPE_UDP] = {
236 .mask = &(const struct rte_flow_item_udp){
242 .mask_sz = sizeof(struct rte_flow_item_udp),
243 .default_mask = &rte_flow_item_udp_mask,
244 .convert = tap_flow_create_udp,
246 [RTE_FLOW_ITEM_TYPE_TCP] = {
247 .mask = &(const struct rte_flow_item_tcp){
253 .mask_sz = sizeof(struct rte_flow_item_tcp),
254 .default_mask = &rte_flow_item_tcp_mask,
255 .convert = tap_flow_create_tcp,
/*
 * Implicit rules installed when a remote netdevice is configured: each entry
 * describes one TC rule (priority, match pattern, mirred action) redirecting
 * or mirroring traffic between the remote device and the tap.
 * NOTE(review): many initializer lines (attr/items field openers, closing
 * braces, the TAP_REMOTE_TX index label) were lost in extraction.
 */
259 static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
260 [TAP_REMOTE_LOCAL_MAC] = {
263 .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
267 .type = RTE_FLOW_ITEM_TYPE_ETH,
268 .mask = &(const struct rte_flow_item_eth){
269 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
273 .type = RTE_FLOW_ITEM_TYPE_END,
275 .mirred = TCA_EGRESS_REDIR,
277 [TAP_REMOTE_BROADCAST] = {
280 .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
284 .type = RTE_FLOW_ITEM_TYPE_ETH,
285 .mask = &(const struct rte_flow_item_eth){
286 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
288 .spec = &(const struct rte_flow_item_eth){
289 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
293 .type = RTE_FLOW_ITEM_TYPE_END,
295 .mirred = TCA_EGRESS_MIRROR,
/* 33:33:xx:xx:xx:xx destination prefix — IPv6 multicast/broadcast. */
297 [TAP_REMOTE_BROADCASTV6] = {
300 .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
304 .type = RTE_FLOW_ITEM_TYPE_ETH,
305 .mask = &(const struct rte_flow_item_eth){
306 .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
308 .spec = &(const struct rte_flow_item_eth){
309 .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
313 .type = RTE_FLOW_ITEM_TYPE_END,
315 .mirred = TCA_EGRESS_MIRROR,
/* Promiscuous mode: match everything (VOID item only). */
317 [TAP_REMOTE_PROMISC] = {
320 .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
324 .type = RTE_FLOW_ITEM_TYPE_VOID,
327 .type = RTE_FLOW_ITEM_TYPE_END,
329 .mirred = TCA_EGRESS_MIRROR,
/* All-multicast: match the group (multicast) bit of the destination MAC. */
331 [TAP_REMOTE_ALLMULTI] = {
334 .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
338 .type = RTE_FLOW_ITEM_TYPE_ETH,
339 .mask = &(const struct rte_flow_item_eth){
340 .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
342 .spec = &(const struct rte_flow_item_eth){
343 .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
347 .type = RTE_FLOW_ITEM_TYPE_END,
349 .mirred = TCA_EGRESS_MIRROR,
/* NOTE(review): the index label for this last entry (presumably
 * TAP_REMOTE_TX) was lost in extraction. */
354 .priority = TAP_REMOTE_TX,
358 .type = RTE_FLOW_ITEM_TYPE_VOID,
361 .type = RTE_FLOW_ITEM_TYPE_END,
363 .mirred = TCA_EGRESS_MIRROR,
/**
 * Make as many checks as possible on an Ethernet item and, if a flow is
 * provided, fill it with the corresponding TC flower attributes.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): structural lines (function braces, `return` statements,
 * early-exit bodies) were lost in extraction.
 */
380 tap_flow_create_eth(const struct rte_flow_item *item, void *data)
382 struct convert_data *info = (struct convert_data *)data;
383 const struct rte_flow_item_eth *spec = item->spec;
384 const struct rte_flow_item_eth *mask = item->mask;
385 struct rte_flow *flow = info->flow;
388 /* use default mask if none provided */
390 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
391 /* TC does not support eth_type masking. Only accept if exact match. */
392 if (mask->type && mask->type != 0xffff)
396 /* store eth_type for consistency if ipv4/6 pattern item comes next */
397 if (spec->type & mask->type)
398 info->eth_type = spec->type;
/* From here on the flower netlink message is filled in. */
402 if (spec->type & mask->type)
403 msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
404 (spec->type & mask->type))
405 if (!is_zero_ether_addr(&spec->dst)) {
406 nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
407 &spec->dst.addr_bytes);
409 TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
410 &mask->dst.addr_bytes);
/* NOTE(review): the dst branch above keys off spec->dst while this one keys
 * off mask->src — confirm the asymmetry is intentional. */
412 if (!is_zero_ether_addr(&mask->src)) {
413 nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
414 &spec->src.addr_bytes);
416 TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
417 &mask->src.addr_bytes);
/**
 * Make as many checks as possible on a VLAN item and, if a flow is provided,
 * fill it with the corresponding TC flower VLAN attributes.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, `return` statements and early-exit bodies
 * were lost in extraction.
 */
435 tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
437 struct convert_data *info = (struct convert_data *)data;
438 const struct rte_flow_item_vlan *spec = item->spec;
439 const struct rte_flow_item_vlan *mask = item->mask;
440 struct rte_flow *flow = info->flow;
443 /* use default mask if none provided */
445 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
446 /* TC does not support tpid masking. Only accept if exact match. */
447 if (mask->tpid && mask->tpid != 0xffff)
449 /* Double-tagging not supported. */
450 if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
/* Outer ethertype is forced to 802.1Q for the flower rule. */
456 msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
/* TCI layout: PCP (3 bits) | DEI (1 bit) | VID (12 bits). */
457 #define VLAN_PRIO(tci) ((tci) >> 13)
458 #define VLAN_ID(tci) ((tci) & 0xfff)
462 uint16_t tci = ntohs(spec->tci) & mask->tci;
463 uint16_t prio = VLAN_PRIO(tci);
464 uint8_t vid = VLAN_ID(tci);
467 nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
469 nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
/**
 * Make as many checks as possible on an IPv4 item and, if a flow is
 * provided, fill it with the corresponding TC flower IPv4 attributes.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, `return` statements and the mask-value
 * argument lines of the nlattr_add32() calls were lost in extraction.
 */
487 tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
489 struct convert_data *info = (struct convert_data *)data;
490 const struct rte_flow_item_ipv4 *spec = item->spec;
491 const struct rte_flow_item_ipv4 *mask = item->mask;
492 struct rte_flow *flow = info->flow;
495 /* use default mask if none provided */
497 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
498 /* check that previous eth type is compatible with ipv4 */
499 if (info->eth_type && info->eth_type != htons(ETH_P_IP))
501 /* store ip_proto for consistency if udp/tcp pattern item comes next */
503 info->ip_proto = spec->hdr.next_proto_id;
508 info->eth_type = htons(ETH_P_IP);
510 msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
513 if (spec->hdr.dst_addr) {
514 nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
516 nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
519 if (spec->hdr.src_addr) {
520 nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
522 nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
525 if (spec->hdr.next_proto_id)
526 nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
527 spec->hdr.next_proto_id);
/**
 * Make as many checks as possible on an IPv6 item and, if a flow is
 * provided, fill it with the corresponding TC flower IPv6 attributes.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, `return` statements and early-exit bodies
 * were lost in extraction.
 */
544 tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
546 struct convert_data *info = (struct convert_data *)data;
547 const struct rte_flow_item_ipv6 *spec = item->spec;
548 const struct rte_flow_item_ipv6 *mask = item->mask;
549 struct rte_flow *flow = info->flow;
/* All-zero reference used to skip unspecified addresses below. */
550 uint8_t empty_addr[16] = { 0 };
553 /* use default mask if none provided */
555 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
556 /* check that previous eth type is compatible with ipv6 */
557 if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
559 /* store ip_proto for consistency if udp/tcp pattern item comes next */
561 info->ip_proto = spec->hdr.proto;
566 info->eth_type = htons(ETH_P_IPV6);
568 msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
571 if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
572 nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
573 sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
574 nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
575 sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
577 if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
578 nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
579 sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
580 nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
581 sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
584 nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
/**
 * Make as many checks as possible on a UDP item and, if a flow is provided,
 * fill it with the corresponding TC flower UDP attributes.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, `return` statements and the port-value
 * argument lines of the nlattr_add16() calls were lost in extraction.
 */
601 tap_flow_create_udp(const struct rte_flow_item *item, void *data)
603 struct convert_data *info = (struct convert_data *)data;
604 const struct rte_flow_item_udp *spec = item->spec;
605 const struct rte_flow_item_udp *mask = item->mask;
606 struct rte_flow *flow = info->flow;
609 /* use default mask if none provided */
611 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
612 /* check that previous ip_proto is compatible with udp */
613 if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
615 /* TC does not support UDP port masking. Only accept if exact match. */
616 if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
617 (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
622 nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
625 if (spec->hdr.dst_port & mask->hdr.dst_port)
626 nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
628 if (spec->hdr.src_port & mask->hdr.src_port)
629 nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
/**
 * Make as many checks as possible on a TCP item and, if a flow is provided,
 * fill it with the corresponding TC flower TCP attributes.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, `return` statements and the port-value
 * argument lines of the nlattr_add16() calls were lost in extraction.
 */
647 tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
649 struct convert_data *info = (struct convert_data *)data;
650 const struct rte_flow_item_tcp *spec = item->spec;
651 const struct rte_flow_item_tcp *mask = item->mask;
652 struct rte_flow *flow = info->flow;
655 /* use default mask if none provided */
657 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
658 /* check that previous ip_proto is compatible with tcp */
659 if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
661 /* TC does not support TCP port masking. Only accept if exact match. */
662 if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
663 (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
668 nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
671 if (spec->hdr.dst_port & mask->hdr.dst_port)
672 nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
674 if (spec->hdr.src_port & mask->hdr.src_port)
675 nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
/**
 * Check support for a given pattern item: spec, last and mask must all fit
 * within the supported bit-mask, and any spec/last range must collapse to a
 * single value once masked (TC does not support ranges).
 *
 * @param[in] item
 *   Item specification.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask.
 * @param[in] default_mask
 *   Bit-mask to apply when the item provides none.
 *
 * NOTE(review): the `size` parameter line, function braces, `return`
 * statements and several local declarations were lost in extraction.
 */
697 tap_flow_item_validate(const struct rte_flow_item *item,
699 const uint8_t *supported_mask,
700 const uint8_t *default_mask)
704 /* An empty layer is allowed, as long as all fields are NULL */
705 if (!item->spec && (item->mask || item->last))
707 /* Is the item spec compatible with what the NIC supports? */
708 if (item->spec && !item->mask) {
710 const uint8_t *spec = item->spec;
712 for (i = 0; i < size; ++i)
713 if ((spec[i] | supported_mask[i]) != supported_mask[i])
715 /* Is the default mask compatible with what the NIC supports? */
716 for (i = 0; i < size; i++)
717 if ((default_mask[i] | supported_mask[i]) !=
721 /* Is the item last compatible with what the NIC supports? */
722 if (item->last && !item->mask) {
724 const uint8_t *spec = item->last;
726 for (i = 0; i < size; ++i)
727 if ((spec[i] | supported_mask[i]) != supported_mask[i])
730 /* Is the item mask compatible with what the NIC supports? */
733 const uint8_t *spec = item->mask;
735 for (i = 0; i < size; ++i)
736 if ((spec[i] | supported_mask[i]) != supported_mask[i])
/*
 * Once masked, are item spec and item last equal?
 * TC does not support ranges, so anything else is invalid.
 */
743 if (item->spec && item->last) {
746 const uint8_t *apply = default_mask;
751 for (i = 0; i < size; ++i) {
752 spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
753 last[i] = ((const uint8_t *)item->last)[i] & apply[i];
755 ret = memcmp(spec, last, size);
/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC:
 * appends a nested "gact" action (TCA_FLOWER_ACT / act_index /
 * TCA_ACT_OPTIONS) carrying the given TCA_GACT_PARMS action code.
 *
 * @param[in, out] flow
 *   Flow whose netlink message is extended.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, the `struct tc_gact p` initializer and
 * `return` statements were lost in extraction.
 */
772 add_action_gact(struct rte_flow *flow, int action)
774 struct nlmsg *msg = &flow->msg;
775 size_t act_index = 1;
780 if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
782 if (nlattr_nested_start(msg, act_index++) < 0)
784 nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
785 if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
787 nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
788 nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
789 nlattr_nested_finish(msg); /* nested act_index */
790 nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
/**
 * Transform a MIRRED action item in the provided flow for TC: appends a
 * nested "mirred" action redirecting (TC_ACT_STOLEN) or mirroring
 * (TC_ACT_PIPE) matched packets to the given netdevice.
 *
 * @param[in, out] flow
 *   Flow whose netlink message is extended.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces, the `.ifindex` initializer line, `else`
 * and `return` statements were lost in extraction.
 */
808 add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
810 struct nlmsg *msg = &flow->msg;
811 size_t act_index = 1;
812 struct tc_mirred p = {
813 .eaction = action_type,
817 if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
819 if (nlattr_nested_start(msg, act_index++) < 0)
821 nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
822 if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
/* Mirror keeps the packet flowing (PIPE); redirect consumes it (STOLEN). */
824 if (action_type == TCA_EGRESS_MIRROR)
825 p.action = TC_ACT_PIPE;
827 p.action = TC_ACT_STOLEN;
828 nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
829 nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
830 nlattr_nested_finish(msg); /* nested act_index */
831 nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
/**
 * Transform a QUEUE action item in the provided flow for TC: appends a
 * nested "skbedit" action setting the queue mapping of matched packets.
 *
 * @param[in, out] flow
 *   Flow whose netlink message is extended.
 * @param[in] queue
 *   Queue index to assign via TCA_SKBEDIT_QUEUE_MAPPING.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 *
 * NOTE(review): function braces and `return` statements were lost in
 * extraction.
 */
847 add_action_skbedit(struct rte_flow *flow, uint16_t queue)
849 struct nlmsg *msg = &flow->msg;
850 size_t act_index = 1;
851 struct tc_skbedit p = {
852 .action = TC_ACT_PIPE
855 if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
857 if (nlattr_nested_start(msg, act_index++) < 0)
859 nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
860 if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
862 nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
863 nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
864 nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
865 nlattr_nested_finish(msg); /* nested act_index */
866 nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
/**
 * Validate a flow supported by TC.
 * If the flow param is not NULL, also fill the netlink message inside it.
 *
 * @param[in] pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] items
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions
 *   for the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 *
 * NOTE(review): many structural lines (function braces, `if (flow)` guards,
 * loop headers, `return` statements) were lost in extraction.
 */
900 priv_flow_process(struct pmd_internals *pmd,
901 const struct rte_flow_attr *attr,
902 const struct rte_flow_item items[],
903 const struct rte_flow_action actions[],
904 struct rte_flow_error *error,
905 struct rte_flow *flow,
908 const struct tap_flow_items *cur_item = tap_flow_items;
909 struct convert_data data = {
914 int action = 0; /* Only one action authorized for now */
/* Validate flow attributes before touching the netlink message. */
916 if (attr->group > MAX_GROUP) {
918 error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
919 NULL, "group value too big: cannot exceed 15");
922 if (attr->priority > MAX_PRIORITY) {
924 error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
925 NULL, "priority value too big");
/* Encode group and priority into the high 16 bits of tcm_info. */
928 uint16_t group = attr->group << GROUP_SHIFT;
929 uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
930 flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
931 flow->msg.t.tcm_info);
/*
 * If attr->ingress, the rule applies on remote ingress to match incoming
 * packets.  If attr->egress, the rule applies on tap ingress (as seen from
 * the kernel) to deal with packets going out.
 */
942 flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
944 /* Standard rule on tap egress (kernel standpoint). */
945 flow->msg.t.tcm_parent =
946 TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
948 /* use flower filter type */
949 nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
950 if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
951 goto exit_item_not_supported;
/* Walk the pattern, following the supported-items graph. */
953 for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
954 const struct tap_flow_items *token = NULL;
958 if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
962 cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
964 if (cur_item->items[i] == items->type) {
965 token = &tap_flow_items[items->type];
970 goto exit_item_not_supported;
972 err = tap_flow_item_validate(
973 items, cur_item->mask_sz,
974 (const uint8_t *)cur_item->mask,
975 (const uint8_t *)cur_item->default_mask);
977 goto exit_item_not_supported;
978 if (flow && cur_item->convert) {
/* VLAN items require flower VLAN support in the running kernel. */
979 if (!pmd->flower_vlan_support &&
980 cur_item->convert == tap_flow_create_vlan)
981 goto exit_item_not_supported;
982 err = cur_item->convert(items, &data);
984 goto exit_item_not_supported;
/* Emit the ethertype keys collected during item conversion. */
988 if (pmd->flower_vlan_support && data.vlan) {
989 nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
991 nlattr_add16(&flow->msg.nh,
992 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
994 data.eth_type : htons(ETH_P_ALL));
995 } else if (data.eth_type) {
996 nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
/* Implicit remote rule: replace provided actions with a mirred one. */
1000 if (mirred && flow) {
1001 uint16_t if_index = pmd->if_index;
/*
 * If attr->egress && mirred, this is a special case where the rule must
 * be applied on the tap, to redirect packets coming from the DPDK App,
 * out through the remote netdevice.
 */
1010 if_index = pmd->remote_if_index;
1011 if (add_action_mirred(flow, if_index, mirred) < 0)
1012 goto exit_action_not_supported;
/* Standard case: translate the (single) provided action. */
1016 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
1019 if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
1021 } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
1023 goto exit_action_not_supported;
1026 err = add_action_gact(flow, TC_ACT_SHOT);
1027 } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
1029 goto exit_action_not_supported;
1032 err = add_action_gact(flow, TC_ACT_UNSPEC);
1033 } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
1034 const struct rte_flow_action_queue *queue =
1035 (const struct rte_flow_action_queue *)
1038 goto exit_action_not_supported;
1040 if (!queue || (queue->index >= pmd->nb_queues))
1041 goto exit_action_not_supported;
1043 err = add_action_skbedit(flow, queue->index);
1045 goto exit_action_not_supported;
1048 goto exit_action_not_supported;
1052 nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
1054 exit_item_not_supported:
1055 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
1056 items, "item not supported");
1058 exit_action_not_supported:
1059 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
1060 actions, "action not supported");
/**
 * Validate a flow (rte_flow_ops .validate callback): runs the full
 * processing pass with a NULL flow so nothing is actually built.
 *
 * @see rte_flow_validate()
 *
 * NOTE(review): the `static int` return-type line and function braces were
 * lost in extraction.
 */
1073 tap_flow_validate(struct rte_eth_dev *dev,
1074 const struct rte_flow_attr *attr,
1075 const struct rte_flow_item items[],
1076 const struct rte_flow_action actions[],
1077 struct rte_flow_error *error)
1079 struct pmd_internals *pmd = dev->data->dev_private;
1081 return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address.  On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique 32-bit value.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 *
 * NOTE(review): function braces, the `else` line and the zero-handle bump
 * were lost in extraction.
 */
1101 tap_flow_set_handle(struct rte_flow *flow)
1103 uint32_t handle = 0;
/* sizeof(flow) is the pointer size: > 4 means a 64-bit architecture. */
1105 if (sizeof(flow) > 4)
1106 handle = rte_jhash(&flow, sizeof(flow), 1);
1108 handle = (uintptr_t)flow;
1109 /* must be at least 1 to avoid letting the kernel choose one for us */
1112 flow->msg.t.tcm_handle = handle;
/**
 * Create a flow (rte_flow_ops .create callback): build the flower netlink
 * message, send it via RTM_NEWTFILTER, and, if a remote netdevice is
 * configured, install the matching redirect rule on it.
 *
 * @see rte_flow_create()
 *
 * NOTE(review): many structural lines (function braces, NULL checks,
 * `goto fail` statements, cleanup labels) were lost in extraction.
 */
1121 static struct rte_flow *
1122 tap_flow_create(struct rte_eth_dev *dev,
1123 const struct rte_flow_attr *attr,
1124 const struct rte_flow_item items[],
1125 const struct rte_flow_action actions[],
1126 struct rte_flow_error *error)
1128 struct pmd_internals *pmd = dev->data->dev_private;
1129 struct rte_flow *remote_flow = NULL;
1130 struct rte_flow *flow = NULL;
1131 struct nlmsg *msg = NULL;
1134 if (!pmd->if_index) {
1135 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1137 "can't create rule, ifindex not found");
/*
 * No rules configured through standard rte_flow should be set on the
 * priorities used by implicit rules.
 */
1144 if ((attr->group == MAX_GROUP) &&
1145 attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
1147 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1148 NULL, "priority value too big");
1151 flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1153 rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1154 NULL, "cannot allocate memory for rte_flow");
/* Build and send the RTM_NEWTFILTER request for the tap netdevice. */
1158 tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
1159 NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1160 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1161 tap_flow_set_handle(flow);
1162 if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
1164 err = nl_send(pmd->nlsk_fd, &msg->nh);
1166 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1167 NULL, "couldn't send request to kernel");
1170 err = nl_recv_ack(pmd->nlsk_fd);
1172 rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
1173 NULL, "overlapping rules");
1176 LIST_INSERT_HEAD(&pmd->flows, flow, next);
/*
 * If a remote device is configured, a TC rule with identical items for
 * matching must be set on that device, with a single action: redirect
 * to the local pmd->if_index.
 */
1182 if (pmd->remote_if_index) {
1183 remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1186 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1187 "cannot allocate memory for rte_flow");
1190 msg = &remote_flow->msg;
1191 /* set the rule if_index for the remote netdevice */
1193 msg, pmd->remote_if_index, RTM_NEWTFILTER,
1194 NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1195 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1196 tap_flow_set_handle(remote_flow);
/* actions == NULL: priv_flow_process substitutes the redirect action. */
1197 if (priv_flow_process(pmd, attr, items, NULL,
1198 error, remote_flow, TCA_EGRESS_REDIR)) {
1200 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1201 NULL, "rte flow rule validation failed");
1204 err = nl_send(pmd->nlsk_fd, &msg->nh);
1207 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1208 NULL, "Failure sending nl request");
1211 err = nl_recv_ack(pmd->nlsk_fd);
1214 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1215 NULL, "overlapping rules");
1218 flow->remote_flow = remote_flow;
/* Failure path: release the partially-built remote flow. */
1223 rte_free(remote_flow);
/**
 * Destroy a flow using a pointer to pmd_internals: unlink it, send an
 * RTM_DELTFILTER request for it (and for its associated remote flow, if
 * any), and free the remote flow.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler.
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 *
 * NOTE(review): function braces, `if (ret < 0)` guards, `goto end` cleanup
 * jumps and the `free(flow)` tail were lost in extraction.
 */
1242 tap_flow_destroy_pmd(struct pmd_internals *pmd,
1243 struct rte_flow *flow,
1244 struct rte_flow_error *error)
1246 struct rte_flow *remote_flow = flow->remote_flow;
1249 LIST_REMOVE(flow, next);
/* Reuse the creation message, switching it to a delete request. */
1250 flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1251 flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1253 ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
1255 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1256 NULL, "couldn't send request to kernel");
1259 ret = nl_recv_ack(pmd->nlsk_fd);
1262 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1263 "couldn't receive kernel ack to our request");
/* Also delete the mirrored rule on the remote netdevice, if any. */
1267 remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1268 remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1270 ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
1273 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1274 NULL, "Failure sending nl request");
1277 ret = nl_recv_ack(pmd->nlsk_fd);
1280 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1281 NULL, "Failure trying to receive nl ack");
1287 rte_free(remote_flow);
1295 * @see rte_flow_destroy()
1299 tap_flow_destroy(struct rte_eth_dev *dev,
1300 struct rte_flow *flow,
1301 struct rte_flow_error *error)
1303 struct pmd_internals *pmd = dev->data->dev_private;
/* Thin rte_flow_ops wrapper: resolve the private data from the ethdev
 * and delegate all the work to tap_flow_destroy_pmd(). */
1305 return tap_flow_destroy_pmd(pmd, flow, error);
1309 * Destroy all flows.
1311 * @see rte_flow_flush()
1315 tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
1317 struct pmd_internals *pmd = dev->data->dev_private;
1318 struct rte_flow *flow;
/* Pop flows one at a time: tap_flow_destroy() removes each entry from
 * pmd->flows (via LIST_REMOVE), so LIST_FIRST() always yields a fresh
 * head. Stop at the first destruction failure. */
1320 while (!LIST_EMPTY(&pmd->flows)) {
1321 flow = LIST_FIRST(&pmd->flows);
1322 if (tap_flow_destroy(dev, flow, error) < 0)
1329 * Add an implicit flow rule on the remote device to make sure traffic gets to
1330 * the tap netdevice from there.
1333 * Pointer to private structure.
1335 * The idx in the implicit_rte_flows array specifying which rule to apply.
1337 * @return -1 if the rule couldn't be applied, 0 otherwise.
1339 int tap_flow_implicit_create(struct pmd_internals *pmd,
1340 enum implicit_rule_index idx)
/* Pattern items and attributes come from the file-scope template table,
 * selected by the implicit rule index. */
1342 struct rte_flow_item *items = implicit_rte_flows[idx].items;
1343 struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
1344 struct rte_flow_item_eth eth_local = { .type = 0 };
/* Rules target the remote netdevice by default; TAP_REMOTE_TX is the
 * exception and is installed on the tap netdevice itself (see below). */
1345 uint16_t if_index = pmd->remote_if_index;
1346 struct rte_flow *remote_flow = NULL;
1347 struct nlmsg *msg = NULL;
/* Writable copy of the two-entry pattern so the eth spec can be pointed
 * at eth_local and patched with the runtime MAC address. */
1349 struct rte_flow_item items_local[2] = {
1351 .type = items[0].type,
1353 .mask = items[0].mask,
1356 .type = items[1].type,
1360 remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1362 RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow");
1365 msg = &remote_flow->msg;
1366 if (idx == TAP_REMOTE_TX) {
/* TX redirection rule lives on the tap netdevice, not the remote. */
1367 if_index = pmd->if_index;
1368 } else if (idx == TAP_REMOTE_LOCAL_MAC) {
1370 * eth addr couldn't be set in implicit_rte_flows[] as it is not
1371 * known at compile time.
1373 memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
1374 items = items_local;
/* Build an RTM_NEWTFILTER request; NLM_F_EXCL makes an already-existing
 * identical filter fail rather than be silently replaced. */
1376 tc_init_msg(msg, if_index, RTM_NEWTFILTER,
1377 NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1378 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1379 tap_flow_set_handle(remote_flow);
/* Translate the rte_flow pattern into TC flower attributes, with the
 * mirred (redirect) action configured for this template entry. */
1380 if (priv_flow_process(pmd, attr, items, NULL, NULL,
1381 remote_flow, implicit_rte_flows[idx].mirred)) {
1382 RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
1385 err = nl_send(pmd->nlsk_fd, &msg->nh);
1387 RTE_LOG(ERR, PMD, "Failure sending nl request");
/* Only consider the rule installed once the kernel has acked it. */
1390 err = nl_recv_ack(pmd->nlsk_fd);
1393 "Kernel refused TC filter rule creation");
/* Implicit rules are tracked on their own list so they survive a
 * regular rte_flow flush of pmd->flows. */
1396 LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
1400 rte_free(remote_flow);
1405 * Remove specific implicit flow rule on the remote device.
1407 * @param[in, out] pmd
1408 * Pointer to private structure.
1410 * The idx in the implicit_rte_flows array specifying which rule to remove.
1412 * @return -1 if the implicit rule couldn't be removed, 0 otherwise.
1414 int tap_flow_implicit_destroy(struct pmd_internals *pmd,
1415 enum implicit_rule_index idx)
1417 struct rte_flow *remote_flow;
/* Each implicit rule is identified by its TC priority: the template's
 * attr.priority shifted by PRIORITY_OFFSET. */
1419 int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
/* Walk the implicit-flow list looking for the entry whose encoded
 * priority matches idx_prio. */
1421 for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
1423 remote_flow = LIST_NEXT(remote_flow, next)) {
/* tcm_info packs the priority in its upper 16 bits. */
1424 cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
1425 if (cur_prio != idx_prio)
/* Match found: delete it. No rte_flow_error reporting for implicit
 * rules, hence the NULL error argument. */
1427 return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
1433 * Destroy all implicit flows.
1435 * @see rte_flow_flush()
1438 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
1440 struct rte_flow *remote_flow;
/* Same pop-the-head pattern as tap_flow_flush(), but over the
 * implicit-flow list: tap_flow_destroy_pmd() unlinks each entry, so
 * LIST_FIRST() always returns a fresh one. Stop on first failure. */
1442 while (!LIST_EMPTY(&pmd->implicit_flows)) {
1443 remote_flow = LIST_FIRST(&pmd->implicit_flows);
1444 if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
1451 * Manage filter operations.
1454 * Pointer to Ethernet device structure.
1455 * @param filter_type
1458 * Operation to perform.
1460 * Pointer to operation-specific structure.
1463 * 0 on success, negative errno value on failure.
1466 tap_dev_filter_ctrl(struct rte_eth_dev *dev,
1467 enum rte_filter_type filter_type,
1468 enum rte_filter_op filter_op,
1471 struct pmd_internals *pmd = dev->data->dev_private;
1473 if (!pmd->flower_support)
1475 switch (filter_type) {
1476 case RTE_ETH_FILTER_GENERIC:
1477 if (filter_op != RTE_ETH_FILTER_GET)
1479 *(const void **)arg = &tap_flow_ops;
1482 RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported",
1483 (void *)dev, filter_type);