/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <errno.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2,	/* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8 */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
};
#endif
struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure. */
	struct rte_flow *remote_flow; /* Associated remote flow. */
	struct nlmsg msg; /* Netlink message describing the TC rule. */
};

struct convert_data {
	uint16_t eth_type; /* Ethertype seen so far, for consistency checks. */
	uint16_t ip_proto; /* L4 protocol seen so far, for consistency checks. */
	uint8_t vlan; /* Set when a VLAN item has been processed. */
	struct rte_flow *flow; /* Flow being filled, NULL when only validating. */
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	int mirred;
};
static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
};
/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}
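/*
 * For instance, ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, RTE_FLOW_ITEM_TYPE_IPV6)
 * expands to the following anonymous, END-terminated array:
 * (const enum rte_flow_item_type []){
 *	RTE_FLOW_ITEM_TYPE_IPV4, RTE_FLOW_ITEM_TYPE_IPV6,
 *	RTE_FLOW_ITEM_TYPE_END,
 * }
 */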
/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param[in] item
	 *   rte_flow item to convert.
	 * @param[in, out] data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items. */
	const enum rte_flow_item_type *const items;
};
/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_VLAN,
			       RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			.tpid = -1,
			/* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.tci = 0xffef,
#else
			.tci = 0xefff,
#endif
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr =
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				.dst_addr =
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};
static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			/* Match the IPv6 multicast MAC prefix 33:33. */
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			/* Match the multicast/broadcast bit of the MAC. */
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
};
/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept if exact match. */
	if (mask->type && mask->type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->type & mask->type)
		info->eth_type = spec->type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (spec->type & mask->type)
		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
					    (spec->type & mask->type));
	/* A MAC address is matched only when its mask is non-zero. */
	if (!is_zero_ether_addr(&mask->dst)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
			   &spec->dst.addr_bytes);
		nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
			   &mask->dst.addr_bytes);
	}
	if (!is_zero_ether_addr(&mask->src)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
			   &spec->src.addr_bytes);
		nlattr_add(&msg->nh,
			   TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
			   &mask->src.addr_bytes);
	}
	return 0;
}
/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* TC does not support tpid masking. Only accept if exact match. */
	if (mask->tpid && mask->tpid != 0xffff)
		return -1;
	/* Double-tagging not supported. */
	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
		return -1;
	info->vlan = 1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
	if (!spec)
		return 0;
	if (spec->tci) {
		uint16_t tci = ntohs(spec->tci) & mask->tci;
		uint16_t prio = VLAN_PRIO(tci);
		uint8_t vid = VLAN_ID(tci);

		if (prio)
			nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}
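/*
 * Worked example for VLAN_PRIO()/VLAN_ID() above (illustrative values):
 * a host-order TCI of 0xa123 yields VLAN_PRIO(0xa123) = 0xa123 >> 13 = 5
 * (the PCP field) and VLAN_ID(0xa123) = 0xa123 & 0xfff = 0x123 (the VID).
 * The remaining bit 12 is the DEI/CFI bit, which TC flower cannot match.
 */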
/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!info->vlan)
		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
	if (!spec)
		return 0;
	if (spec->hdr.dst_addr) {
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
			     spec->hdr.dst_addr);
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
			     mask->hdr.dst_addr);
	}
	if (spec->hdr.src_addr) {
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
			     spec->hdr.src_addr);
		nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
			     mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
			    spec->hdr.next_proto_id);
	return 0;
}
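/*
 * For reference, the netlink message built above for a rule matching, e.g.,
 * dst_addr 10.0.0.1 and next_proto_id 17 is roughly what the tc(8) CLI would
 * produce for "flower dst_ip 10.0.0.1 ip_proto udp" (illustrative command
 * only; this driver always speaks netlink directly).
 */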
/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!info->vlan)
		msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
	if (!spec)
		return 0;
	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			   sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			   sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			   sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			   sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}
/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	/* A port is matched only when the mask covers every bit of the spec. */
	if (spec->hdr.dst_port &&
	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port &&
	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
			     spec->hdr.src_port);
	return 0;
}
/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	/* A port is matched only when the mask covers every bit of the spec. */
	if (spec->hdr.dst_port &&
	    (spec->hdr.dst_port & mask->hdr.dst_port) == spec->hdr.dst_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
			     spec->hdr.dst_port);
	if (spec->hdr.src_port &&
	    (spec->hdr.src_port & mask->hdr.src_port) == spec->hdr.src_port)
		nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
			     spec->hdr.src_port);
	return 0;
}
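/*
 * Note: L4 ports in rte_flow items are already in network byte order, which
 * matches the be16 layout flower expects for the *_SRC/*_DST keys (see the
 * enum at the top of this file), so no conversion is needed here.
 */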
/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success, -1 otherwise.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as all fields are NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; i++)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/*
	 * Once masked, are item spec and item last equal?
	 * TC does not support ranges, so anything else is invalid.
	 */
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}
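/*
 * The checks above rely on (spec[i] | supported_mask[i]) != supported_mask[i]
 * being true whenever spec sets a bit outside the supported mask. Example:
 * supported_mask 0xf0 with spec 0x8c gives 0x8c | 0xf0 = 0xfc != 0xf0, so the
 * item is rejected because bits 0x0c cannot be matched by TC.
 */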
/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_gact(struct rte_flow *flow, int action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_gact p = {
		.action = action
	};

	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	nlattr_nested_finish(msg); /* nested act_index */
	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}
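/*
 * The helper above produces the attribute nesting the kernel act API expects
 * for a single action:
 *
 * TCA_FLOWER_ACT
 * `-- 1 (act_index)
 *     |-- TCA_ACT_KIND ("gact")
 *     `-- TCA_ACT_OPTIONS
 *         `-- TCA_GACT_PARMS (struct tc_gact)
 */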
/**
 * Transform a MIRRED action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_mirred p = {
		.eaction = action_type,
		.ifindex = ifindex,
	};

	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (action_type == TCA_EGRESS_MIRROR)
		p.action = TC_ACT_PIPE;
	else /* REDIRECT */
		p.action = TC_ACT_STOLEN;
	nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	nlattr_nested_finish(msg); /* nested act_index */
	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}
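/*
 * Design note: TC_ACT_PIPE lets the packet continue through the action chain
 * after being mirrored, so the original copy still reaches its destination,
 * whereas TC_ACT_STOLEN tells the kernel the redirected packet has been
 * consumed and must not be processed any further.
 */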
/**
 * Transform a QUEUE action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] queue
 *   Queue index to assign packets to.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_skbedit(struct rte_flow *flow, uint16_t queue)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_skbedit p = {
		.action = TC_ACT_PIPE
	};

	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
	nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	nlattr_nested_finish(msg); /* nested act_index */
	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}
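/*
 * Design note: the QUEUE action is implemented through skbedit's
 * queue_mapping field. The kernel steers matching packets to the requested
 * queue of the multiqueue tap netdevice, whose file descriptor backs the
 * PMD's corresponding Rx queue.
 */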
/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param[in] pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] items
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);

		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets.
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
			goto exit_item_not_supported;
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			if (!pmd->flower_vlan_support &&
			    cur_item->convert == tap_flow_create_vlan)
				goto exit_item_not_supported;
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (pmd->flower_vlan_support && data.vlan) {
			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     htons(ETH_P_8021Q));
			nlattr_add16(&flow->msg.nh,
				     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
				     data.eth_type ?
				     data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
				     data.eth_type);
		}
	}
	if (mirred && flow) {
		uint16_t if_index = pmd->if_index;

		/*
		 * If attr->egress && mirred, then this is a special case where
		 * the rule must be applied on the tap, to redirect packets
		 * coming from the DPDK App, out through the remote netdevice.
		 */
		if (attr->egress)
			if_index = pmd->remote_if_index;
		if (add_action_mirred(flow, if_index, mirred) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_SHOT);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_UNSPEC);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!queue || (queue->index >= pmd->nb_queues))
				goto exit_action_not_supported;
			if (flow)
				err = add_action_skbedit(flow, queue->index);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_action_not_supported;
	}
end:
	if (flow)
		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
	return -rte_errno;
}
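/*
 * Layout of tcm_info as built by priv_flow_process() (illustrative): the
 * upper 16 bits carry the TC priority, composed of the 4-bit group and the
 * offset priority, while the lower 16 bits carry the protocol. E.g. group 2
 * with rte_flow priority 0 gives prio = (2 << GROUP_SHIFT) | PRIORITY_OFFSET
 * and tcm_info = TC_H_MAKE(prio << 16, htons(ETH_P_ALL)).
 */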
/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}
/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique 32-bit handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	uint32_t handle = 0;

	if (sizeof(flow) > 4)
		handle = rte_jhash(&flow, sizeof(flow), 1);
	else
		handle = (uintptr_t)flow;
	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}
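/*
 * Note: on 64-bit systems two distinct flow pointers may in principle hash
 * to the same 32-bit handle. The kernel then rejects the second rule, which
 * tap_flow_create() reports back as EEXIST ("overlapping rules").
 */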
/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "cannot allocate memory for rte_flow");
		goto fail;
	}
	/* No remote flow yet; avoid a stale pointer in destroy. */
	flow->remote_flow = NULL;
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "overlapping rules");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	/*
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto fail;
		}
		err = nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "overlapping rules");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	return flow;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	if (flow)
		rte_free(flow);
	return NULL;
}
/**
 * Destroy a flow using pointer to pmd_internal.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler.
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = nl_recv_ack(pmd->nlsk_fd);
	if (ret < 0) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto end;
		}
		ret = nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}
/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}
/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}
/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
			     enum implicit_rule_index idx)
{
	struct rte_flow_item *items = implicit_rte_flows[idx].items;
	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
	struct rte_flow_item_eth eth_local = { .type = 0 };
	uint16_t if_index = pmd->remote_if_index;
	struct rte_flow *remote_flow = NULL;
	struct nlmsg *msg = NULL;
	int err = 0;
	struct rte_flow_item items_local[2] = {
		[0] = {
			.type = items[0].type,
			.spec = &eth_local,
			.mask = items[0].mask,
		},
		[1] = {
			.type = items[1].type,
		}
	};

	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!remote_flow) {
		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
		goto fail;
	}
	msg = &remote_flow->msg;
	if (idx == TAP_REMOTE_TX) {
		if_index = pmd->if_index;
	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
		/*
		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
		 * known at compile time.
		 */
		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
		items = items_local;
	}
	tc_init_msg(msg, if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(remote_flow);
	if (priv_flow_process(pmd, attr, items, NULL, NULL,
			      remote_flow, implicit_rte_flows[idx].mirred)) {
		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
		goto fail;
	}
	err = nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
		goto fail;
	}
	err = nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation\n");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
	return 0;
fail:
	rte_free(remote_flow);
	return -1;
}
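/*
 * Usage example: tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC)
 * installs a TC rule on the remote netdevice redirecting every packet
 * addressed to the port's own MAC towards the tap netdevice, so that the
 * DPDK application receives it.
 */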
/**
 * Remove specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if the implicit rule couldn't be removed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
			      enum implicit_rule_index idx)
{
	struct rte_flow *remote_flow;
	int cur_prio = -1;
	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
	     remote_flow;
	     remote_flow = LIST_NEXT(remote_flow, next)) {
		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
		if (cur_prio != idx_prio)
			continue;
		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
	}
	return 0;
}
/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
	struct rte_flow *remote_flow;

	while (!LIST_EMPTY(&pmd->implicit_flows)) {
		remote_flow = LIST_FIRST(&pmd->implicit_flows);
		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
			return -1;
	}
	return 0;
}
/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
		    enum rte_filter_type filter_type,
		    enum rte_filter_op filter_op,
		    void *arg)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (!pmd->flower_support)
		return -ENOTSUP;
	switch (filter_type) {
	case RTE_ETH_FILTER_GENERIC:
		if (filter_op != RTE_ETH_FILTER_GET)
			return -EINVAL;
		*(const void **)arg = &tap_flow_ops;
		return 0;
	default:
		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
			(void *)dev, filter_type);
	}
	return -EINVAL;
}