net/tap: remove unsupported UDP/TCP port mask in flow
drivers/net/tap/tap_flow.c
/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/queue.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
        TCA_FLOWER_UNSPEC,
        TCA_FLOWER_CLASSID,
        TCA_FLOWER_INDEV,
        TCA_FLOWER_ACT,
        TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
        TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
        TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
        TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
        TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
        TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
        TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
        TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
        TCA_FLOWER_KEY_TCP_DST,         /* be16 */
        TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
        TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
        /* TCA_FLOWER_FLAGS, */
        TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
        TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
        TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif

struct rte_flow {
        LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
        struct rte_flow *remote_flow; /* associated remote flow */
        struct nlmsg msg;
};

struct convert_data {
        uint16_t eth_type;
        uint16_t ip_proto;
        uint8_t vlan;
        struct rte_flow *flow;
};

struct remote_rule {
        struct rte_flow_attr attr;
        struct rte_flow_item items[2];
        int mirred;
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
                const struct rte_flow_attr *attr,
                const struct rte_flow_item items[],
                const struct rte_flow_action actions[],
                struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
                 struct rte_flow *flow,
                 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
        .validate = tap_flow_validate,
        .create = tap_flow_create,
        .destroy = tap_flow_destroy,
        .flush = tap_flow_flush,
};

/* Static initializer for items. */
#define ITEMS(...) \
        (const enum rte_flow_item_type []){ \
                __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
        }

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
        /* Bit-mask corresponding to what is supported for this item. */
        const void *mask;
        const unsigned int mask_sz; /* Bit-mask size in bytes. */
        /*
         * Bit-mask corresponding to the default mask, if none is provided
         * along with the item.
         */
        const void *default_mask;
        /**
         * Conversion function from rte_flow to netlink attributes.
         *
         * @param item
         *   rte_flow item to convert.
         * @param data
         *   Internal structure to store the conversion.
         *
         * @return
         *   0 on success, negative value otherwise.
         */
        int (*convert)(const struct rte_flow_item *item, void *data);
        /** List of possible following items.  */
        const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
        [RTE_FLOW_ITEM_TYPE_END] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
        },
        [RTE_FLOW_ITEM_TYPE_ETH] = {
                .items = ITEMS(
                        RTE_FLOW_ITEM_TYPE_VLAN,
                        RTE_FLOW_ITEM_TYPE_IPV4,
                        RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_eth){
                        .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .type = -1,
                },
                .mask_sz = sizeof(struct rte_flow_item_eth),
                .default_mask = &rte_flow_item_eth_mask,
                .convert = tap_flow_create_eth,
        },
        [RTE_FLOW_ITEM_TYPE_VLAN] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
                               RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_vlan){
                        .tpid = -1,
                        /* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
                        .tci = 0xffef,
#else
                        .tci = 0xefff,
#endif
                },
                .mask_sz = sizeof(struct rte_flow_item_vlan),
                .default_mask = &rte_flow_item_vlan_mask,
                .convert = tap_flow_create_vlan,
        },
        [RTE_FLOW_ITEM_TYPE_IPV4] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv4){
                        .hdr = {
                                .src_addr = -1,
                                .dst_addr = -1,
                                .next_proto_id = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv4),
                .default_mask = &rte_flow_item_ipv4_mask,
                .convert = tap_flow_create_ipv4,
        },
        [RTE_FLOW_ITEM_TYPE_IPV6] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv6){
                        .hdr = {
                                .src_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .dst_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .proto = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv6),
                .default_mask = &rte_flow_item_ipv6_mask,
                .convert = tap_flow_create_ipv6,
        },
        [RTE_FLOW_ITEM_TYPE_UDP] = {
                .mask = &(const struct rte_flow_item_udp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_udp),
                .default_mask = &rte_flow_item_udp_mask,
                .convert = tap_flow_create_udp,
        },
        [RTE_FLOW_ITEM_TYPE_TCP] = {
                .mask = &(const struct rte_flow_item_tcp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_tcp),
                .default_mask = &rte_flow_item_tcp_mask,
                .convert = tap_flow_create_tcp,
        },
};
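
/*
 * Illustrative sketch (not part of the driver): priv_flow_process() walks
 * this graph starting from the RTE_FLOW_ITEM_TYPE_END entry, so a pattern
 * is accepted only if each item is listed in its predecessor's .items.
 * For instance, the following hypothetical pattern (ETH -> IPV4 -> UDP)
 * follows a valid path, while swapping the IPV4 and UDP items would not:
 *
 *      const struct rte_flow_item pattern[] = {
 *              { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *              { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
 *              { .type = RTE_FLOW_ITEM_TYPE_UDP },
 *              { .type = RTE_FLOW_ITEM_TYPE_END },
 *      };
 */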

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
        [TAP_REMOTE_LOCAL_MAC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_REDIR,
        },
        [TAP_REMOTE_BROADCAST] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_BROADCASTV6] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_PROMISC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_ALLMULTI] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_TX] = {
                .attr = {
                        .group = 0,
                        .priority = TAP_REMOTE_TX,
                        .egress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
};
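
/*
 * For intuition only: TAP_REMOTE_LOCAL_MAC above is roughly what the
 * following tc(8) command would install on the remote netdevice (interface
 * names are placeholders; the driver builds the equivalent netlink messages
 * itself):
 *
 *      tc filter add dev <remote> parent ffff: flower \
 *              dst_mac <MAC of the tap port> \
 *              action mirred egress redirect dev <tap>
 */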

/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
        /* TC does not support eth_type masking. Only accept if exact match. */
        if (mask->type && mask->type != 0xffff)
                return -1;
        if (!spec)
                return 0;
        /* store eth_type for consistency if ipv4/6 pattern item comes next */
        if (spec->type & mask->type)
                info->eth_type = spec->type;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (spec->type & mask->type)
                msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info,
                                            (spec->type & mask->type));
        if (!is_zero_ether_addr(&mask->dst)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
                           &spec->dst.addr_bytes);
                nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
                           &mask->dst.addr_bytes);
        }
        if (!is_zero_ether_addr(&mask->src)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
                           &spec->src.addr_bytes);
                nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
                           &mask->src.addr_bytes);
        }
        return 0;
}

/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_vlan *spec = item->spec;
        const struct rte_flow_item_vlan *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
        /* TC does not support tpid masking. Only accept if exact match. */
        if (mask->tpid && mask->tpid != 0xffff)
                return -1;
        /* Double-tagging not supported. */
        if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
                return -1;
        info->vlan = 1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
        if (!spec)
                return 0;
        if (spec->tci) {
                uint16_t tci = ntohs(spec->tci) & mask->tci;
                uint16_t prio = VLAN_PRIO(tci);
                uint16_t vid = VLAN_ID(tci);

                if (prio)
                        nlattr_add8(&msg->nh, TCA_FLOWER_KEY_VLAN_PRIO, prio);
                if (vid)
                        nlattr_add16(&msg->nh, TCA_FLOWER_KEY_VLAN_ID, vid);
        }
        return 0;
}
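
/*
 * Worked example for the TCI split above: a spec with tci = htons(0x2005)
 * and a full mask gives tci = 0x2005 in host order, hence
 * prio = VLAN_PRIO(0x2005) = 0x2005 >> 13 = 1 and
 * vid = VLAN_ID(0x2005) = 0x2005 & 0xfff = 5.
 */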

/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
        /* check that previous eth type is compatible with ipv4 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IP))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.next_proto_id;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IP);
        if (!info->vlan)
                msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IP));
        if (!spec)
                return 0;
        if (spec->hdr.dst_addr) {
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
                             spec->hdr.dst_addr);
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
                             mask->hdr.dst_addr);
        }
        if (spec->hdr.src_addr) {
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
                             spec->hdr.src_addr);
                nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
                             mask->hdr.src_addr);
        }
        if (spec->hdr.next_proto_id)
                nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
                            spec->hdr.next_proto_id);
        return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        uint8_t empty_addr[16] = { 0 };
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
        /* check that previous eth type is compatible with ipv6 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.proto;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IPV6);
        if (!info->vlan)
                msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_IPV6));
        if (!spec)
                return 0;
        if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
                           sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
                           sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
        }
        if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
                           sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
                nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
                           sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
        }
        if (spec->hdr.proto)
                nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
        return 0;
}

/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
        /* check that previous ip_proto is compatible with udp */
        if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
                return -1;
        /* TC does not support UDP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_port & mask->hdr.dst_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
                             spec->hdr.dst_port);
        if (spec->hdr.src_port & mask->hdr.src_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
                             spec->hdr.src_port);
        return 0;
}
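
/*
 * Example of the exact-match constraint above, with hypothetical masks: a
 * UDP item whose mask->hdr.dst_port is 0xffff (match one port) or 0x0000
 * (ignore the port) is accepted, while a partial mask such as 0xff00 (a
 * port range) is rejected, as TC flower offers no UDP/TCP port mask
 * attribute here.
 */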

/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_tcp *spec = item->spec;
        const struct rte_flow_item_tcp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
        /* check that previous ip_proto is compatible with tcp */
        if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
                return -1;
        /* TC does not support TCP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_port & mask->hdr.dst_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
                             spec->hdr.dst_port);
        if (spec->hdr.src_port & mask->hdr.src_port)
                nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success, nonzero otherwise.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
                       unsigned int size,
                       const uint8_t *supported_mask,
                       const uint8_t *default_mask)
{
        int ret = 0;

        /* An empty layer is allowed, as long as all fields are NULL */
        if (!item->spec && (item->mask || item->last))
                return -1;
        /* Is the item spec compatible with what the NIC supports? */
        if (item->spec && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->spec;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
                /* Is the default mask compatible with what the NIC supports? */
                for (i = 0; i < size; i++)
                        if ((default_mask[i] | supported_mask[i]) !=
                            supported_mask[i])
                                return -1;
        }
        /* Is the item last compatible with what the NIC supports? */
        if (item->last && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->last;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /* Is the item mask compatible with what the NIC supports? */
        if (item->mask) {
                unsigned int i;
                const uint8_t *spec = item->mask;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /**
         * Once masked, are item spec and item last equal?
         * TC does not support ranges, so anything else is invalid.
         */
        if (item->spec && item->last) {
                uint8_t spec[size];
                uint8_t last[size];
                const uint8_t *apply = default_mask;
                unsigned int i;

                if (item->mask)
                        apply = item->mask;
                for (i = 0; i < size; ++i) {
                        spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
                        last[i] = ((const uint8_t *)item->last)[i] & apply[i];
                }
                ret = memcmp(spec, last, size);
        }
        return ret;
}
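
/*
 * Byte-wise intuition for the checks above: a byte passes iff
 * (byte | supported_mask[i]) == supported_mask[i], i.e. it sets no bit
 * outside the supported mask. With a supported mask byte of 0xff any value
 * passes; with 0x00, only 0x00 does.
 */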

/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_gact(struct rte_flow *flow, int action)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        struct tc_gact p = {
                .action = action
        };

        if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
                return -1;
        if (nlattr_nested_start(msg, act_index++) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
        if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
        nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        nlattr_nested_finish(msg); /* nested act_index */
        nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}
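
/*
 * For reference, the nested attribute layout produced above (act_index
 * is 1, as only one action is filled in):
 *
 *      TCA_FLOWER_ACT
 *      `- 1 (act_index)
 *         |- TCA_ACT_KIND = "gact"
 *         `- TCA_ACT_OPTIONS
 *            `- TCA_GACT_PARMS = struct tc_gact { .action }
 */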

/**
 * Transform a MIRRED action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        struct tc_mirred p = {
                .eaction = action_type,
                .ifindex = ifindex,
        };

        if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
                return -1;
        if (nlattr_nested_start(msg, act_index++) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
        if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        if (action_type == TCA_EGRESS_MIRROR)
                p.action = TC_ACT_PIPE;
        else /* REDIRECT */
                p.action = TC_ACT_STOLEN;
        nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
        nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        nlattr_nested_finish(msg); /* nested act_index */
        nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}

/**
 * Transform a QUEUE action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] queue
 *   Queue id to use.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_skbedit(struct rte_flow *flow, uint16_t queue)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        struct tc_skbedit p = {
                .action = TC_ACT_PIPE
        };

        if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
                return -1;
        if (nlattr_nested_start(msg, act_index++) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
        if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
        nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
        nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        nlattr_nested_finish(msg); /* nested act_index */
        nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] items
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error,
                  struct rte_flow *flow,
                  int mirred)
{
        const struct tap_flow_items *cur_item = tap_flow_items;
        struct convert_data data = {
                .eth_type = 0,
                .ip_proto = 0,
                .flow = flow,
        };
        int action = 0; /* Only one action authorized for now */

        if (attr->group > MAX_GROUP) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
                        NULL, "group value too big: cannot exceed 15");
                return -rte_errno;
        }
        if (attr->priority > MAX_PRIORITY) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                        NULL, "priority value too big");
                return -rte_errno;
        } else if (flow) {
                uint16_t group = attr->group << GROUP_SHIFT;
                uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);

                flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
                                                 flow->msg.t.tcm_info);
        }
        if (flow) {
                if (mirred) {
                        /*
                         * If attr->ingress, the rule applies on remote ingress
                         * to match incoming packets.
                         * If attr->egress, the rule applies on tap ingress (as
                         * seen from the kernel) to deal with packets going out
                         * from the DPDK app.
                         */
                        flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
                } else {
                        /* Standard rule on tap egress (kernel standpoint). */
                        flow->msg.t.tcm_parent =
                                TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
                }
                /* use flower filter type */
                nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
                if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
                        goto exit_item_not_supported;
        }
        for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
                const struct tap_flow_items *token = NULL;
                unsigned int i;
                int err = 0;

                if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
                        continue;
                for (i = 0;
                     cur_item->items &&
                     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
                     ++i) {
                        if (cur_item->items[i] == items->type) {
                                token = &tap_flow_items[items->type];
                                break;
                        }
                }
                if (!token)
                        goto exit_item_not_supported;
                cur_item = token;
                err = tap_flow_item_validate(
                        items, cur_item->mask_sz,
                        (const uint8_t *)cur_item->mask,
                        (const uint8_t *)cur_item->default_mask);
                if (err)
                        goto exit_item_not_supported;
                if (flow && cur_item->convert) {
                        if (!pmd->flower_vlan_support &&
                            cur_item->convert == tap_flow_create_vlan)
                                goto exit_item_not_supported;
                        err = cur_item->convert(items, &data);
                        if (err)
                                goto exit_item_not_supported;
                }
        }
        if (flow) {
                if (pmd->flower_vlan_support && data.vlan) {
                        nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     htons(ETH_P_8021Q));
                        nlattr_add16(&flow->msg.nh,
                                     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
                                     data.eth_type ?
                                     data.eth_type : htons(ETH_P_ALL));
                } else if (data.eth_type) {
                        nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     data.eth_type);
                }
        }
        if (mirred && flow) {
                uint16_t if_index = pmd->if_index;

                /*
                 * If attr->egress && mirred, then this is a special
                 * case where the rule must be applied on the tap, to
                 * redirect packets coming from the DPDK App, out
                 * through the remote netdevice.
                 */
                if (attr->egress)
                        if_index = pmd->remote_if_index;
                if (add_action_mirred(flow, if_index, mirred) < 0)
                        goto exit_action_not_supported;
                else
                        goto end;
        }
        for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
                int err = 0;

                if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
                        continue;
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow)
                                err = add_action_gact(flow, TC_ACT_SHOT);
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow)
                                err = add_action_gact(flow, TC_ACT_UNSPEC);
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
                        const struct rte_flow_action_queue *queue =
                                (const struct rte_flow_action_queue *)
                                actions->conf;
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (!queue || (queue->index >= pmd->nb_queues))
                                goto exit_action_not_supported;
                        if (flow)
                                err = add_action_skbedit(flow, queue->index);
                } else {
                        goto exit_action_not_supported;
                }
                if (err)
                        goto exit_action_not_supported;
        }
end:
        if (flow)
                nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
        return 0;
exit_item_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
                           items, "item not supported");
        return -rte_errno;
exit_action_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
                           actions, "action not supported");
        return -rte_errno;
}
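
/*
 * Summary of the encoding above: the 16-bit TC priority is built as
 * (attr->group << GROUP_SHIFT) | (attr->priority + PRIORITY_OFFSET) and put
 * in the upper half of tcm_info by TC_H_MAKE(prio << 16, tcm_info), the
 * lower half keeping the protocol (e.g. ETH_P_ALL). PRIORITY_OFFSET ensures
 * attr->priority == 0 does not map to TC priority 0, which the kernel
 * interprets as "pick one for me".
 */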

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer address.
 * On 64-bit architectures, we rely on jhash(flow) to find a (sufficiently)
 * unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
        uint32_t handle = 0;

        if (sizeof(flow) > 4)
                handle = rte_jhash(&flow, sizeof(flow), 1);
        else
                handle = (uintptr_t)flow;
        /* must be at least 1 to avoid letting the kernel choose one for us */
        if (!handle)
                handle = 1;
        flow->msg.t.tcm_handle = handle;
}

/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
                const struct rte_flow_attr *attr,
                const struct rte_flow_item items[],
                const struct rte_flow_action actions[],
                struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct rte_flow *remote_flow = NULL;
        struct rte_flow *flow = NULL;
        struct nlmsg *msg = NULL;
        int err;

        if (!pmd->if_index) {
                rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL,
                                   "can't create rule, ifindex not found");
                goto fail;
        }
        /*
         * No rules configured through standard rte_flow should be set on the
         * priorities used by implicit rules.
         */
        if ((attr->group == MAX_GROUP) &&
            attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
                rte_flow_error_set(
                        error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                        NULL, "priority value too big");
                goto fail;
        }
        flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
        if (!flow) {
                rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "cannot allocate memory for rte_flow");
                goto fail;
        }
        msg = &flow->msg;
        tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
                    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
        msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
        tap_flow_set_handle(flow);
        if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
                goto fail;
        err = nl_send(pmd->nlsk_fd, &msg->nh);
        if (err < 0) {
                rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "couldn't send request to kernel");
                goto fail;
        }
        err = nl_recv_ack(pmd->nlsk_fd);
        if (err < 0) {
                rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "overlapping rules");
                goto fail;
        }
        LIST_INSERT_HEAD(&pmd->flows, flow, next);
        /**
         * If a remote device is configured, a TC rule with identical items for
         * matching must be set on that device, with a single action: redirect
         * to the local pmd->if_index.
         */
        if (pmd->remote_if_index) {
                remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
                if (!remote_flow) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
                                "cannot allocate memory for rte_flow");
                        goto fail;
                }
                msg = &remote_flow->msg;
                /* set the rule if_index for the remote netdevice */
                tc_init_msg(
                        msg, pmd->remote_if_index, RTM_NEWTFILTER,
                        NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
                msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
                tap_flow_set_handle(remote_flow);
                if (priv_flow_process(pmd, attr, items, NULL,
                                      error, remote_flow, TCA_EGRESS_REDIR)) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "rte flow rule validation failed");
                        goto fail;
                }
                err = nl_send(pmd->nlsk_fd, &msg->nh);
                if (err < 0) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "Failure sending nl request");
                        goto fail;
                }
                err = nl_recv_ack(pmd->nlsk_fd);
                if (err < 0) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "overlapping rules");
                        goto fail;
                }
                flow->remote_flow = remote_flow;
        }
        return flow;
fail:
        if (remote_flow)
                rte_free(remote_flow);
        if (flow)
                rte_free(flow);
        return NULL;
}
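
/*
 * Typical use from an application, as a sketch (port_id is assumed to
 * designate a tap port with at least one Rx queue):
 *
 *      struct rte_flow_attr attr = { .ingress = 1 };
 *      struct rte_flow_item pattern[] = {
 *              { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *              { .type = RTE_FLOW_ITEM_TYPE_END },
 *      };
 *      struct rte_flow_action_queue queue = { .index = 0 };
 *      struct rte_flow_action actions[] = {
 *              { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
 *              { .type = RTE_FLOW_ACTION_TYPE_END },
 *      };
 *      struct rte_flow_error err;
 *      struct rte_flow *f = rte_flow_create(port_id, &attr, pattern,
 *                                           actions, &err);
 */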

/**
 * Destroy a flow using pointer to pmd_internals.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler.
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
                     struct rte_flow *flow,
                     struct rte_flow_error *error)
{
        struct rte_flow *remote_flow = flow->remote_flow;
        int ret = 0;

        LIST_REMOVE(flow, next);
        flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
        flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

        ret = nl_send(pmd->nlsk_fd, &flow->msg.nh);
        if (ret < 0) {
                rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "couldn't send request to kernel");
                goto end;
        }
        ret = nl_recv_ack(pmd->nlsk_fd);
        if (ret < 0) {
                rte_flow_error_set(
                        error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
                        "couldn't receive kernel ack to our request");
                goto end;
        }
        if (remote_flow) {
                remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
                remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

                ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
                if (ret < 0) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "Failure sending nl request");
                        goto end;
                }
                ret = nl_recv_ack(pmd->nlsk_fd);
                if (ret < 0) {
                        rte_flow_error_set(
                                error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                NULL, "Failure trying to receive nl ack");
                        goto end;
                }
        }
end:
        if (remote_flow)
                rte_free(remote_flow);
        rte_free(flow);
        return ret;
}

/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
                 struct rte_flow *flow,
                 struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        return tap_flow_destroy_pmd(pmd, flow, error);
}

/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct rte_flow *flow;

        while (!LIST_EMPTY(&pmd->flows)) {
                flow = LIST_FIRST(&pmd->flows);
                if (tap_flow_destroy(dev, flow, error) < 0)
                        return -1;
        }
        return 0;
}

/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
                             enum implicit_rule_index idx)
{
        struct rte_flow_item *items = implicit_rte_flows[idx].items;
        struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
        struct rte_flow_item_eth eth_local = { .type = 0 };
        uint16_t if_index = pmd->remote_if_index;
        struct rte_flow *remote_flow = NULL;
        struct nlmsg *msg = NULL;
        int err = 0;
        struct rte_flow_item items_local[2] = {
                [0] = {
                        .type = items[0].type,
                        .spec = &eth_local,
                        .mask = items[0].mask,
                },
                [1] = {
                        .type = items[1].type,
                }
        };

        remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
        if (!remote_flow) {
                RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
                goto fail;
        }
        msg = &remote_flow->msg;
        if (idx == TAP_REMOTE_TX) {
                if_index = pmd->if_index;
        } else if (idx == TAP_REMOTE_LOCAL_MAC) {
                /*
                 * eth addr couldn't be set in implicit_rte_flows[] as it is not
                 * known at compile time.
                 */
                memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
                items = items_local;
        }
        tc_init_msg(msg, if_index, RTM_NEWTFILTER,
                    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
        msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
        tap_flow_set_handle(remote_flow);
        if (priv_flow_process(pmd, attr, items, NULL, NULL,
                              remote_flow, implicit_rte_flows[idx].mirred)) {
                RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
                goto fail;
        }
        err = nl_send(pmd->nlsk_fd, &msg->nh);
        if (err < 0) {
                RTE_LOG(ERR, PMD, "Failure sending nl request\n");
                goto fail;
        }
        err = nl_recv_ack(pmd->nlsk_fd);
        if (err < 0) {
                RTE_LOG(ERR, PMD,
                        "Kernel refused TC filter rule creation\n");
                goto fail;
        }
        LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
        return 0;
fail:
        if (remote_flow)
                rte_free(remote_flow);
        return -1;
}

/**
 * Remove specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if one of the implicit rules couldn't be removed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
                              enum implicit_rule_index idx)
{
        struct rte_flow *remote_flow;
        int cur_prio = -1;
        int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

        for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
             remote_flow;
             remote_flow = LIST_NEXT(remote_flow, next)) {
                cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
                if (cur_prio != idx_prio)
                        continue;
                return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
        }
        return 0;
}

/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
        struct rte_flow *remote_flow;

        while (!LIST_EMPTY(&pmd->implicit_flows)) {
                remote_flow = LIST_FIRST(&pmd->implicit_flows);
                if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
                        return -1;
        }
        return 0;
}

/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
                    enum rte_filter_type filter_type,
                    enum rte_filter_op filter_op,
                    void *arg)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        if (!pmd->flower_support)
                return -ENOTSUP;
        switch (filter_type) {
        case RTE_ETH_FILTER_GENERIC:
                if (filter_op != RTE_ETH_FILTER_GET)
                        return -EINVAL;
                *(const void **)arg = &tap_flow_ops;
                return 0;
        default:
                RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
                        (void *)dev, filter_type);
        }
        return -EINVAL;
}
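/*
 * This is how the generic flow API reaches the ops above: rte_flow_validate()
 * and friends query the driver through filter_ctrl with
 * RTE_ETH_FILTER_GENERIC / RTE_ETH_FILTER_GET, roughly as sketched here:
 *
 *      const struct rte_flow_ops *ops;
 *
 *      if (tap_dev_filter_ctrl(dev, RTE_ETH_FILTER_GENERIC,
 *                              RTE_ETH_FILTER_GET, &ops) == 0)
 *              ops->validate(dev, attr, pattern, actions, error);
 */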