net/tap: add eBPF API
[dpdk.git] drivers/net/tap/tap_flow.c
/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/queue.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
        TCA_FLOWER_UNSPEC,
        TCA_FLOWER_CLASSID,
        TCA_FLOWER_INDEV,
        TCA_FLOWER_ACT,
        TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
        TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
        TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
        TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
        TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
        TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
        TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
        TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
        TCA_FLOWER_KEY_TCP_DST,         /* be16 */
        TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
        TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
        /* TCA_FLOWER_FLAGS, */
        TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
        TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
        TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif

#define ISOLATE_HANDLE 1

struct rte_flow {
        LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
        struct rte_flow *remote_flow; /* associated remote flow */
        struct nlmsg msg;
};

struct convert_data {
        uint16_t eth_type;
        uint16_t ip_proto;
        uint8_t vlan;
        struct rte_flow *flow;
};

struct remote_rule {
        struct rte_flow_attr attr;
        struct rte_flow_item items[2];
        struct rte_flow_action actions[2];
        int mirred;
};

struct action_data {
        char id[16];

        union {
                struct tc_gact gact;
                struct tc_mirred mirred;
                struct skbedit {
                        struct tc_skbedit skbedit;
                        uint16_t queue;
                } skbedit;
        };
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
                const struct rte_flow_attr *attr,
                const struct rte_flow_item items[],
                const struct rte_flow_action actions[],
                struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
                 struct rte_flow *flow,
                 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
                 int set,
                 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
        .validate = tap_flow_validate,
        .create = tap_flow_create,
        .destroy = tap_flow_destroy,
        .flush = tap_flow_flush,
        .isolate = tap_flow_isolate,
};
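
/*
 * These callbacks are not invoked directly: applications reach them through
 * the generic rte_flow API. An illustrative sketch (values are hypothetical,
 * not part of this driver):
 *
 *   struct rte_flow_error err;
 *   struct rte_flow_attr attr = { .ingress = 1 };
 *   struct rte_flow_item pattern[] = {
 *           { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *           { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *   struct rte_flow_action actions[] = {
 *           { .type = RTE_FLOW_ACTION_TYPE_QUEUE,
 *             .conf = &(struct rte_flow_action_queue){ .index = 0 } },
 *           { .type = RTE_FLOW_ACTION_TYPE_END },
 *   };
 *   struct rte_flow *f = rte_flow_create(port_id, &attr, pattern,
 *                                        actions, &err);
 */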

/* Static initializer for items. */
#define ITEMS(...) \
        (const enum rte_flow_item_type []){ \
                __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
        }
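/*
 * For instance, ITEMS(RTE_FLOW_ITEM_TYPE_VLAN) expands to the anonymous
 * array { RTE_FLOW_ITEM_TYPE_VLAN, RTE_FLOW_ITEM_TYPE_END }, i.e. a list
 * of allowed follow-up items terminated by END.
 */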

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
        /* Bit-mask corresponding to what is supported for this item. */
        const void *mask;
        const unsigned int mask_sz; /* Bit-mask size in bytes. */
        /*
         * Bit-mask corresponding to the default mask, if none is provided
         * along with the item.
         */
        const void *default_mask;
        /**
         * Conversion function from rte_flow to netlink attributes.
         *
         * @param item
         *   rte_flow item to convert.
         * @param data
         *   Internal structure to store the conversion.
         *
         * @return
         *   0 on success, negative value otherwise.
         */
        int (*convert)(const struct rte_flow_item *item, void *data);
        /** List of possible following items. */
        const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
        [RTE_FLOW_ITEM_TYPE_END] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
        },
        [RTE_FLOW_ITEM_TYPE_ETH] = {
                .items = ITEMS(
                        RTE_FLOW_ITEM_TYPE_VLAN,
                        RTE_FLOW_ITEM_TYPE_IPV4,
                        RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_eth){
                        .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .type = -1,
                },
                .mask_sz = sizeof(struct rte_flow_item_eth),
                .default_mask = &rte_flow_item_eth_mask,
                .convert = tap_flow_create_eth,
        },
        [RTE_FLOW_ITEM_TYPE_VLAN] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
                               RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_vlan){
                        .tpid = -1,
                        /* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
                        .tci = 0xffef,
#else
                        .tci = 0xefff,
#endif
                },
                .mask_sz = sizeof(struct rte_flow_item_vlan),
                .default_mask = &rte_flow_item_vlan_mask,
                .convert = tap_flow_create_vlan,
        },
        [RTE_FLOW_ITEM_TYPE_IPV4] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv4){
                        .hdr = {
                                .src_addr = -1,
                                .dst_addr = -1,
                                .next_proto_id = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv4),
                .default_mask = &rte_flow_item_ipv4_mask,
                .convert = tap_flow_create_ipv4,
        },
        [RTE_FLOW_ITEM_TYPE_IPV6] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv6){
                        .hdr = {
                                .src_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .dst_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .proto = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv6),
                .default_mask = &rte_flow_item_ipv6_mask,
                .convert = tap_flow_create_ipv6,
        },
        [RTE_FLOW_ITEM_TYPE_UDP] = {
                .mask = &(const struct rte_flow_item_udp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_udp),
                .default_mask = &rte_flow_item_udp_mask,
                .convert = tap_flow_create_udp,
        },
        [RTE_FLOW_ITEM_TYPE_TCP] = {
                .mask = &(const struct rte_flow_item_tcp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_tcp),
                .default_mask = &rte_flow_item_tcp_mask,
                .convert = tap_flow_create_tcp,
        },
};

/*
 *                TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, the last two entries of
 * which are mandatorily the REMOTE_TX and ISOLATE rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the
 * list head and removing it, as long as the rule applies on the remote
 * netdevice. The implicit rule for TX redirection is not removed, as isolate
 * concerns only incoming traffic.
 */

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
        [TAP_REMOTE_LOCAL_MAC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_REDIR,
        },
        [TAP_REMOTE_BROADCAST] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_BROADCASTV6] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_PROMISC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_ALLMULTI] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_TX] = {
                .attr = {
                        .group = 0,
                        .priority = TAP_REMOTE_TX,
                        .egress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_ISOLATE] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_ISOLATE,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
        },
};

/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
        /* TC does not support eth_type masking. Only accept if exact match. */
        if (mask->type && mask->type != 0xffff)
                return -1;
        if (!spec)
                return 0;
        /* store eth_type for consistency if ipv4/6 pattern item comes next */
        if (spec->type & mask->type)
                info->eth_type = spec->type;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!is_zero_ether_addr(&spec->dst)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
                           &spec->dst.addr_bytes);
                tap_nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
                           &mask->dst.addr_bytes);
        }
        if (!is_zero_ether_addr(&mask->src)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
                           &spec->src.addr_bytes);
                tap_nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
                           &mask->src.addr_bytes);
        }
        return 0;
}

/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_vlan *spec = item->spec;
        const struct rte_flow_item_vlan *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
        /* TC does not support tpid masking. Only accept if exact match. */
        if (mask->tpid && mask->tpid != 0xffff)
                return -1;
        /* Double-tagging not supported. */
        if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
                return -1;
        info->vlan = 1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
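        /*
         * 802.1Q TCI layout (host byte order here, after ntohs): bits 15-13
         * carry the PCP (priority), bit 12 the DEI, bits 11-0 the VLAN ID.
         */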
        if (!spec)
                return 0;
        if (spec->tci) {
                uint16_t tci = ntohs(spec->tci) & mask->tci;
                uint16_t prio = VLAN_PRIO(tci);
                /* The VLAN ID is 12 bits wide: uint8_t would truncate it. */
                uint16_t vid = VLAN_ID(tci);

                if (prio)
                        tap_nlattr_add8(&msg->nh,
                                        TCA_FLOWER_KEY_VLAN_PRIO, prio);
                if (vid)
                        tap_nlattr_add16(&msg->nh,
                                         TCA_FLOWER_KEY_VLAN_ID, vid);
        }
        return 0;
}

/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
        /* check that previous eth type is compatible with ipv4 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IP))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.next_proto_id;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_addr) {
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
                             spec->hdr.dst_addr);
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
                             mask->hdr.dst_addr);
        }
        if (spec->hdr.src_addr) {
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
                             spec->hdr.src_addr);
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
                             mask->hdr.src_addr);
        }
        if (spec->hdr.next_proto_id)
                tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
                            spec->hdr.next_proto_id);
        return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        uint8_t empty_addr[16] = { 0 };
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
        /* check that previous eth type is compatible with ipv6 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.proto;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IPV6);
        if (!spec)
                return 0;
        if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
                           sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
                           sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
        }
        if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
                           sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
                           sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
        }
        if (spec->hdr.proto)
                tap_nlattr_add8(&msg->nh,
                                TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
        return 0;
}

/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
        /* check that previous ip_proto is compatible with udp */
        if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
                return -1;
        /* TC does not support UDP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_port & mask->hdr.dst_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
                             spec->hdr.dst_port);
        if (spec->hdr.src_port & mask->hdr.src_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_tcp *spec = item->spec;
        const struct rte_flow_item_tcp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
        /* check that previous ip_proto is compatible with tcp */
        if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
                return -1;
        /* TC does not support TCP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_port & mask->hdr.dst_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
                             spec->hdr.dst_port);
        if (spec->hdr.src_port & mask->hdr.src_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \p item.
 * @param[in] default_mask
 *   Bit-mask to use by default if none is provided in \p item.
 *
 * @return
 *   0 on success, nonzero otherwise.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
                       unsigned int size,
                       const uint8_t *supported_mask,
                       const uint8_t *default_mask)
{
        int ret = 0;

        /* An empty layer is allowed, as long as all fields are NULL */
        if (!item->spec && (item->mask || item->last))
                return -1;
        /* Is the item spec compatible with what the NIC supports? */
        if (item->spec && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->spec;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
                /* Is the default mask compatible with what the NIC supports? */
                for (i = 0; i < size; i++)
                        if ((default_mask[i] | supported_mask[i]) !=
                            supported_mask[i])
                                return -1;
        }
        /* Is the item last compatible with what the NIC supports? */
        if (item->last && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->last;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /* Is the item mask compatible with what the NIC supports? */
        if (item->mask) {
                unsigned int i;
                const uint8_t *spec = item->mask;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /**
         * Once masked, are item spec and item last equal?
         * TC does not support ranges, so anything else is invalid.
         */
        if (item->spec && item->last) {
                uint8_t spec[size];
                uint8_t last[size];
                const uint8_t *apply = default_mask;
                unsigned int i;

                if (item->mask)
                        apply = item->mask;
                for (i = 0; i < size; ++i) {
                        spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
                        last[i] = ((const uint8_t *)item->last)[i] & apply[i];
                }
                ret = memcmp(spec, last, size);
        }
        return ret;
}
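
/*
 * For example, a VLAN item whose tci mask sets the DEI bit (0x1000) fails
 * the checks above: the supported VLAN mask in tap_flow_items leaves that
 * bit cleared, so (mask | supported_mask) != supported_mask.
 */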

/**
 * Configure the kernel with a TC action and its parameters
 * Handled actions: "gact", "mirred", "skbedit"
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in, out] act_index
 *   Pointer to action sequence number in the TC command
 *
 * @param[in] adata
 *   Pointer to struct holding the action parameters
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
{
        struct nlmsg *msg = &flow->msg;

        if (tap_nlattr_nested_start(msg, (*act_index)++) < 0)
                return -1;

        tap_nlattr_add(&msg->nh, TCA_ACT_KIND,
                                strlen(adata->id) + 1, adata->id);
        if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        if (strcmp("gact", adata->id) == 0) {
                tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
                           &adata->gact);
        } else if (strcmp("mirred", adata->id) == 0) {
                if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
                        adata->mirred.action = TC_ACT_PIPE;
                else /* REDIRECT */
                        adata->mirred.action = TC_ACT_STOLEN;
                tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS,
                           sizeof(adata->mirred),
                           &adata->mirred);
        } else if (strcmp("skbedit", adata->id) == 0) {
                tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
                           sizeof(adata->skbedit.skbedit),
                           &adata->skbedit.skbedit);
                tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
                             adata->skbedit.queue);
        } else {
                return -1;
        }
        tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        tap_nlattr_nested_finish(msg); /* nested act_index */
        return 0;
}

/**
 * Helper function to send a series of TC actions to the kernel
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in] nb_actions
 *   Number of actions in an array of action structs
 *
 * @param[in] data
 *   Pointer to an array of action structs
 *
 * @param[in] classifier_action
 *   The classifier on behalf of which the actions are configured
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
            int classifier_action)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        int i;

        if (tap_nlattr_nested_start(msg, classifier_action) < 0)
                return -1;
        for (i = 0; i < nb_actions; i++)
                if (add_action(flow, &act_index, data + i) < 0)
                        return -1;
        tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error,
                  struct rte_flow *flow,
                  int mirred)
{
        const struct tap_flow_items *cur_item = tap_flow_items;
        struct convert_data data = {
                .eth_type = 0,
                .ip_proto = 0,
                .flow = flow,
        };
        int action = 0; /* Only one action authorized for now */

        if (attr->group > MAX_GROUP) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
                        NULL, "group value too big: cannot exceed 15");
                return -rte_errno;
        }
        if (attr->priority > MAX_PRIORITY) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                        NULL, "priority value too big");
                return -rte_errno;
        } else if (flow) {
                uint16_t group = attr->group << GROUP_SHIFT;
                uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
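                /*
                 * tcm_info: the upper 16 bits carry the TC priority (group in
                 * the high bits via GROUP_SHIFT); the lower 16 bits keep the
                 * protocol (ETH_P_ALL) set when the message was initialized.
                 */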
                flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
                                                 flow->msg.t.tcm_info);
        }
        if (flow) {
                if (mirred) {
                        /*
                         * If attr->ingress, the rule applies on remote ingress
                         * to match incoming packets
                         * If attr->egress, the rule applies on tap ingress (as
                         * seen from the kernel) to deal with packets going out
                         * from the DPDK app.
                         */
                        flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
                } else {
                        /* Standard rule on tap egress (kernel standpoint). */
                        flow->msg.t.tcm_parent =
                                TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
                }
                /* use flower filter type */
                tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
                if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
                        goto exit_item_not_supported;
        }
        for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
                const struct tap_flow_items *token = NULL;
                unsigned int i;
                int err = 0;

                if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
                        continue;
                for (i = 0;
                     cur_item->items &&
                     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
                     ++i) {
                        if (cur_item->items[i] == items->type) {
                                token = &tap_flow_items[items->type];
                                break;
                        }
                }
                if (!token)
                        goto exit_item_not_supported;
                cur_item = token;
                err = tap_flow_item_validate(
                        items, cur_item->mask_sz,
                        (const uint8_t *)cur_item->mask,
                        (const uint8_t *)cur_item->default_mask);
                if (err)
                        goto exit_item_not_supported;
                if (flow && cur_item->convert) {
                        err = cur_item->convert(items, &data);
                        if (err)
                                goto exit_item_not_supported;
                }
        }
        if (flow) {
                if (data.vlan) {
                        tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     htons(ETH_P_8021Q));
                        tap_nlattr_add16(&flow->msg.nh,
                                     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
                                     data.eth_type ?
                                     data.eth_type : htons(ETH_P_ALL));
                } else if (data.eth_type) {
                        tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     data.eth_type);
                }
        }
        if (mirred && flow) {
                struct action_data adata = {
                        .id = "mirred",
                        .mirred = {
                                .eaction = mirred,
                        },
                };

                /*
                 * If attr->egress && mirred, then this is a special
                 * case where the rule must be applied on the tap, to
                 * redirect packets coming from the DPDK App, out
                 * through the remote netdevice.
                 */
                adata.mirred.ifindex = attr->ingress ? pmd->if_index :
                        pmd->remote_if_index;
                if (mirred == TCA_EGRESS_MIRROR)
                        adata.mirred.action = TC_ACT_PIPE;
                else
                        adata.mirred.action = TC_ACT_STOLEN;
                if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
                        goto exit_action_not_supported;
                else
                        goto end;
        }
        for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
                int err = 0;

                if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
                        continue;
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "gact",
                                        .gact = {
                                                .action = TC_ACT_SHOT,
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                                  TCA_FLOWER_ACT);
                        }
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "gact",
                                        .gact = {
                                                /* continue */
                                                .action = TC_ACT_UNSPEC,
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                                  TCA_FLOWER_ACT);
                        }
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
                        const struct rte_flow_action_queue *queue =
                                (const struct rte_flow_action_queue *)
                                actions->conf;

                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (!queue ||
                            (queue->index > pmd->dev->data->nb_rx_queues - 1))
                                goto exit_action_not_supported;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "skbedit",
                                        .skbedit = {
                                                .skbedit = {
                                                        .action = TC_ACT_PIPE,
                                                },
                                                .queue = queue->index,
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                        TCA_FLOWER_ACT);
                        }
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
                        /* Fake RSS support. */
                        const struct rte_flow_action_rss *rss =
                                (const struct rte_flow_action_rss *)
                                actions->conf;

                        if (action++)
                                goto exit_action_not_supported;

                        if (!rss || rss->num < 1 ||
                            (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1))
                                goto exit_action_not_supported;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "skbedit",
                                        .skbedit = {
                                                .skbedit = {
                                                        .action = TC_ACT_PIPE,
                                                },
                                                .queue = rss->queue[0],
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                        TCA_FLOWER_ACT);
                        }
                } else {
                        goto exit_action_not_supported;
                }
                if (err)
                        goto exit_action_not_supported;
        }
end:
        if (flow)
                tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
        return 0;
exit_item_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
                           items, "item not supported");
        return -rte_errno;
exit_action_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
                           actions, "action not supported");
        return -rte_errno;
}

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that identifies each
 * rule specifically.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
        uint32_t handle = 0;

        if (sizeof(flow) > 4)
                handle = rte_jhash(&flow, sizeof(flow), 1);
        else
                handle = (uintptr_t)flow;
        /* must be at least 1 to avoid letting the kernel choose one for us */
        if (!handle)
                handle = 1;
        flow->msg.t.tcm_handle = handle;
}
1232
1233 /**
1234  * Create a flow.
1235  *
1236  * @see rte_flow_create()
1237  * @see rte_flow_ops
1238  */
1239 static struct rte_flow *
1240 tap_flow_create(struct rte_eth_dev *dev,
1241                 const struct rte_flow_attr *attr,
1242                 const struct rte_flow_item items[],
1243                 const struct rte_flow_action actions[],
1244                 struct rte_flow_error *error)
1245 {
1246         struct pmd_internals *pmd = dev->data->dev_private;
1247         struct rte_flow *remote_flow = NULL;
1248         struct rte_flow *flow = NULL;
1249         struct nlmsg *msg = NULL;
1250         int err;
1251
1252         if (!pmd->if_index) {
1253                 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1254                                    NULL,
1255                                    "can't create rule, ifindex not found");
1256                 goto fail;
1257         }
1258         /*
1259          * No rules configured through standard rte_flow should be set on the
1260          * priorities used by implicit rules.
1261          */
1262         if ((attr->group == MAX_GROUP) &&
1263             attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
1264                 rte_flow_error_set(
1265                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1266                         NULL, "priority value too big");
1267                 goto fail;
1268         }
1269         flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1270         if (!flow) {
1271                 rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1272                                    NULL, "cannot allocate memory for rte_flow");
1273                 goto fail;
1274         }
1275         msg = &flow->msg;
1276         tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
1277                     NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1278         msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1279         tap_flow_set_handle(flow);
1280         if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
1281                 goto fail;
1282         err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1283         if (err < 0) {
1284                 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1285                                    NULL, "couldn't send request to kernel");
1286                 goto fail;
1287         }
1288         err = tap_nl_recv_ack(pmd->nlsk_fd);
1289         if (err < 0) {
1290                 RTE_LOG(ERR, PMD,
1291                         "Kernel refused TC filter rule creation (%d): %s\n",
1292                         errno, strerror(errno));
1293                 rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
1294                                    NULL,
1295                                    "overlapping rules or Kernel too old for flower support");
1296                 goto fail;
1297         }
1298         LIST_INSERT_HEAD(&pmd->flows, flow, next);
1299         /**
1300          * If a remote device is configured, a TC rule with identical items for
1301          * matching must be set on that device, with a single action: redirect
1302          * to the local pmd->if_index.
1303          */
1304         if (pmd->remote_if_index) {
1305                 remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1306                 if (!remote_flow) {
1307                         rte_flow_error_set(
1308                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1309                                 "cannot allocate memory for rte_flow");
1310                         goto fail;
1311                 }
1312                 msg = &remote_flow->msg;
1313                 /* set the rule if_index for the remote netdevice */
1314                 tc_init_msg(
1315                         msg, pmd->remote_if_index, RTM_NEWTFILTER,
1316                         NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1317                 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1318                 tap_flow_set_handle(remote_flow);
1319                 if (priv_flow_process(pmd, attr, items, NULL,
1320                                       error, remote_flow, TCA_EGRESS_REDIR)) {
1321                         rte_flow_error_set(
1322                                 error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE,
1323                                 NULL, "rte flow rule validation failed");
1324                         goto fail;
1325                 }
1326                 err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1327                 if (err < 0) {
1328                         rte_flow_error_set(
1329                                 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1330                                 NULL, "couldn't send request to kernel");
1331                         goto fail;
1332                 }
1333                 err = tap_nl_recv_ack(pmd->nlsk_fd);
1334                 if (err < 0) {
1335                         RTE_LOG(ERR, PMD,
1336                                 "Kernel refused TC filter rule creation (%d): %s\n",
1337                                 errno, strerror(errno));
1338                         rte_flow_error_set(
1339                                 error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
1340                                 NULL,
1341                                 "overlapping rules or kernel too old for flower support");
1342                         goto fail;
1343                 }
1344                 flow->remote_flow = remote_flow;
1345         }
1346         return flow;
1347 fail:
1348         if (remote_flow)
1349                 rte_free(remote_flow);
1350         if (flow)
1351                 rte_free(flow);
1352         return NULL;
1353 }
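/*
 * Illustrative usage sketch, not part of the driver: how an application
 * typically reaches the callback above through the generic rte_flow API.
 * The port id (0), pattern and actions are assumptions for the example.
 *
 *      struct rte_flow_attr attr = { .ingress = 1 };
 *      struct rte_flow_item pattern[] = {
 *              { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *              { .type = RTE_FLOW_ITEM_TYPE_END },
 *      };
 *      struct rte_flow_action actions[] = {
 *              { .type = RTE_FLOW_ACTION_TYPE_DROP },
 *              { .type = RTE_FLOW_ACTION_TYPE_END },
 *      };
 *      struct rte_flow_error flow_error;
 *      struct rte_flow *f;
 *
 *      f = rte_flow_create(0, &attr, pattern, actions, &flow_error);
 *      if (!f)
 *              printf("flow creation failed: %s\n", flow_error.message ?
 *                     flow_error.message : "(no message)");
 */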
1354
1355 /**
1356  * Destroy a flow using a pointer to pmd_internals.
1357  *
1358  * @param[in, out] pmd
1359  *   Pointer to private structure.
1360  * @param[in] flow
1361  *   Pointer to the flow to destroy.
1362  * @param[in, out] error
1363  *   Pointer to the flow error handler.
1364  *
1365  * @return 0 if the flow could be destroyed, -1 otherwise.
1366  */
1367 static int
1368 tap_flow_destroy_pmd(struct pmd_internals *pmd,
1369                      struct rte_flow *flow,
1370                      struct rte_flow_error *error)
1371 {
1372         struct rte_flow *remote_flow = flow->remote_flow;
1373         int ret = 0;
1374
1375         LIST_REMOVE(flow, next);
1376         flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1377         flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1378
1379         ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
1380         if (ret < 0) {
1381                 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1382                                    NULL, "couldn't send request to kernel");
1383                 goto end;
1384         }
1385         ret = tap_nl_recv_ack(pmd->nlsk_fd);
1386         /* If errno is ENOENT, the rule is already gone from the kernel. */
1387         if (ret < 0 && errno == ENOENT)
1388                 ret = 0;
1389         if (ret < 0) {
1390                 RTE_LOG(ERR, PMD,
1391                         "Kernel refused TC filter rule deletion (%d): %s\n",
1392                         errno, strerror(errno));
1393                 rte_flow_error_set(
1394                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1395                         "couldn't receive kernel ack to our request");
1396                 goto end;
1397         }
1398         if (remote_flow) {
1399                 remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1400                 remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1401
1402                 ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
1403                 if (ret < 0) {
1404                         rte_flow_error_set(
1405                                 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1406                                 NULL, "couldn't send request to kernel");
1407                         goto end;
1408                 }
1409                 ret = tap_nl_recv_ack(pmd->nlsk_fd);
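                /* As above, a rule already gone from the kernel (ENOENT) is not an error. */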
1410                 if (ret < 0 && errno == ENOENT)
1411                         ret = 0;
1412                 if (ret < 0) {
1413                         RTE_LOG(ERR, PMD,
1414                                 "Kernel refused TC filter rule deletion (%d): %s\n",
1415                                 errno, strerror(errno));
1416                         rte_flow_error_set(
1417                                 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1418                                 NULL, "couldn't receive kernel ack to our request");
1419                         goto end;
1420                 }
1421         }
1422 end:
1423         if (remote_flow)
1424                 rte_free(remote_flow);
1425         rte_free(flow);
1426         return ret;
1427 }
1428
1429 /**
1430  * Destroy a flow.
1431  *
1432  * @see rte_flow_destroy()
1433  * @see rte_flow_ops
1434  */
1435 static int
1436 tap_flow_destroy(struct rte_eth_dev *dev,
1437                  struct rte_flow *flow,
1438                  struct rte_flow_error *error)
1439 {
1440         struct pmd_internals *pmd = dev->data->dev_private;
1441
1442         return tap_flow_destroy_pmd(pmd, flow, error);
1443 }
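/*
 * Illustrative usage sketch, assuming port 0 and a handle f previously
 * returned by rte_flow_create(): destroying a single rule goes through the
 * generic API, which dispatches to the callback above.
 *
 *      struct rte_flow_error flow_error;
 *
 *      if (rte_flow_destroy(0, f, &flow_error) < 0)
 *              printf("flow destruction failed: %s\n", flow_error.message ?
 *                     flow_error.message : "(no message)");
 */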
1444
1445 /**
1446  * Enable/disable flow isolation.
1447  *
1448  * @see rte_flow_isolate()
1449  * @see rte_flow_ops
1450  */
1451 static int
1452 tap_flow_isolate(struct rte_eth_dev *dev,
1453                  int set,
1454                  struct rte_flow_error *error __rte_unused)
1455 {
1456         struct pmd_internals *pmd = dev->data->dev_private;
1457
1458         if (set)
1459                 pmd->flow_isolate = 1;
1460         else
1461                 pmd->flow_isolate = 0;
1462         /*
1463          * If the netdevice is already up, set up the appropriate flow rules
1464          * immediately. Otherwise they will be set up when the netdevice is
1465          * brought up (tun_alloc).
1466          */
1466         if (!pmd->rxq[0].fd)
1467                 return 0;
1468         if (set) {
1469                 struct rte_flow *flow;
1470
1471                 while (1) {
1472                         flow = LIST_FIRST(&pmd->implicit_flows);
1473                         if (!flow)
1474                                 break;
1475                         /*
1476                          * Remove all implicit rules on the remote.
1477                          * Keep the local rule that redirects packets on TX,
1478                          * and also keep the last implicit local rule: ISOLATE.
1479                          */
1480                         if (flow->msg.t.tcm_ifindex == pmd->if_index)
1481                                 break;
1482                         if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
1483                                 goto error;
1484                 }
1485                 /* Update the ISOLATE rule's action according to pmd->flow_isolate. */
1486                 if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1487                         goto error;
1488         } else {
1489                 /* Update the ISOLATE rule's action according to pmd->flow_isolate. */
1490                 if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1491                         goto error;
1492                 if (!pmd->remote_if_index)
1493                         return 0;
1494                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
1495                         goto error;
1496                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
1497                         goto error;
1498                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
1499                         goto error;
1500                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
1501                         goto error;
1502                 if (dev->data->promiscuous &&
1503                     tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
1504                         goto error;
1505                 if (dev->data->all_multicast &&
1506                     tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
1507                         goto error;
1508         }
1509         return 0;
1510 error:
1511         pmd->flow_isolate = 0;
1512         return rte_flow_error_set(
1513                 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1514                 "TC rule creation failed");
1515 }
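/*
 * Illustrative usage sketch (the port id is an assumption): applications
 * usually request isolated mode before configuring the device, so that only
 * flows explicitly created through rte_flow receive traffic.
 *
 *      struct rte_flow_error flow_error;
 *
 *      if (rte_flow_isolate(0, 1, &flow_error) < 0)
 *              printf("isolation failed: %s\n", flow_error.message ?
 *                     flow_error.message : "(no message)");
 */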
1516
1517 /**
1518  * Destroy all flows.
1519  *
1520  * @see rte_flow_flush()
1521  * @see rte_flow_ops
1522  */
1523 int
1524 tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
1525 {
1526         struct pmd_internals *pmd = dev->data->dev_private;
1527         struct rte_flow *flow;
1528
1529         while (!LIST_EMPTY(&pmd->flows)) {
1530                 flow = LIST_FIRST(&pmd->flows);
1531                 if (tap_flow_destroy(dev, flow, error) < 0)
1532                         return -1;
1533         }
1534         return 0;
1535 }
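/*
 * Illustrative usage sketch (the port id is an assumption): applications
 * typically flush all remaining rules before closing the port.
 *
 *      struct rte_flow_error flow_error;
 *
 *      if (rte_flow_flush(0, &flow_error) < 0)
 *              printf("flow flush failed: %s\n", flow_error.message ?
 *                     flow_error.message : "(no message)");
 */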
1536
1537 /**
1538  * Add an implicit flow rule on the remote device to make sure traffic gets to
1539  * the tap netdevice from there.
1540  *
1541  * @param pmd
1542  *   Pointer to private structure.
1543  * @param[in] idx
1544  *   The idx in the implicit_rte_flows array specifying which rule to apply.
1545  *
1546  * @return -1 if the rule couldn't be applied, 0 otherwise.
1547  */
1548 int tap_flow_implicit_create(struct pmd_internals *pmd,
1549                              enum implicit_rule_index idx)
1550 {
1551         uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
1552         struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
1553         struct rte_flow_action isolate_actions[2] = {
1554                 [1] = {
1555                         .type = RTE_FLOW_ACTION_TYPE_END,
1556                 },
1557         };
1558         struct rte_flow_item *items = implicit_rte_flows[idx].items;
1559         struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
1560         struct rte_flow_item_eth eth_local = { .type = 0 };
1561         uint16_t if_index = pmd->remote_if_index;
1562         struct rte_flow *remote_flow = NULL;
1563         struct nlmsg *msg = NULL;
1564         int err = 0;
1565         struct rte_flow_item items_local[2] = {
1566                 [0] = {
1567                         .type = items[0].type,
1568                         .spec = &eth_local,
1569                         .mask = items[0].mask,
1570                 },
1571                 [1] = {
1572                         .type = items[1].type,
1573                 }
1574         };
1575
1576         remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1577         if (!remote_flow) {
1578                 RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
1579                 goto fail;
1580         }
1581         msg = &remote_flow->msg;
1582         if (idx == TAP_REMOTE_TX) {
1583                 if_index = pmd->if_index;
1584         } else if (idx == TAP_ISOLATE) {
1585                 if_index = pmd->if_index;
1586                 /* Don't be exclusive for this rule; it is re-created to toggle the action. */
1587                 flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
1588                 isolate_actions[0].type = pmd->flow_isolate ?
1589                         RTE_FLOW_ACTION_TYPE_DROP :
1590                         RTE_FLOW_ACTION_TYPE_PASSTHRU;
1591                 actions = isolate_actions;
1592         } else if (idx == TAP_REMOTE_LOCAL_MAC) {
1593                 /*
1594                  * The Ethernet address cannot be set in implicit_rte_flows[]
1595                  * as it is not known at compile time.
1596                  */
1597                 memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
1598                 items = items_local;
1599         }
1600         tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
1601         msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1602         /*
1603          * The ISOLATE rule is always present and must keep a static handle, as
1604          * its action is changed depending on whether the feature is enabled
1605          * (DROP) or disabled (PASSTHRU).
1606          */
1607         if (idx == TAP_ISOLATE)
1608                 remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
1609         else
1610                 tap_flow_set_handle(remote_flow);
1611         if (priv_flow_process(pmd, attr, items, actions, NULL,
1612                               remote_flow, implicit_rte_flows[idx].mirred)) {
1613                 RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
1614                 goto fail;
1615         }
1616         err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1617         if (err < 0) {
1618                 RTE_LOG(ERR, PMD, "Failure sending nl request\n");
1619                 goto fail;
1620         }
1621         err = tap_nl_recv_ack(pmd->nlsk_fd);
1622         if (err < 0) {
1623                 RTE_LOG(ERR, PMD,
1624                         "Kernel refused TC filter rule creation (%d): %s\n",
1625                         errno, strerror(errno));
1626                 goto fail;
1627         }
1628         LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
1629         return 0;
1630 fail:
1631         if (remote_flow)
1632                 rte_free(remote_flow);
1633         return -1;
1634 }
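/*
 * Note: this helper is driver-internal; typical call sites elsewhere in this
 * driver are device start and the promiscuous/allmulticast toggles, e.g.
 * tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) when promiscuous mode is
 * enabled on a device configured with a remote netdevice.
 */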
1635
1636 /**
1637  * Remove a specific implicit flow rule on the remote device.
1638  *
1639  * @param[in, out] pmd
1640  *   Pointer to private structure.
1641  * @param[in] idx
1642  *   The idx in the implicit_rte_flows array specifying which rule to remove.
1643  *
1644  * @return -1 if the implicit rule couldn't be destroyed, 0 otherwise.
1645  */
1646 int tap_flow_implicit_destroy(struct pmd_internals *pmd,
1647                               enum implicit_rule_index idx)
1648 {
1649         struct rte_flow *remote_flow;
1650         int cur_prio = -1;
1651         int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
1652
1653         for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
1654              remote_flow;
1655              remote_flow = LIST_NEXT(remote_flow, next)) {
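                /* tcm_info encodes the filter priority in its upper 16 bits. */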
1656                 cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
1657                 if (cur_prio != idx_prio)
1658                         continue;
1659                 return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
1660         }
1661         return 0;
1662 }
1663
1664 /**
1665  * Destroy all implicit flows.
1666  *
1667  * @see rte_flow_flush()
1668  */
1669 int
1670 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
1671 {
1672         struct rte_flow *remote_flow;
1673
1674         while (!LIST_EMPTY(&pmd->implicit_flows)) {
1675                 remote_flow = LIST_FIRST(&pmd->implicit_flows);
1676                 if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
1677                         return -1;
1678         }
1679         return 0;
1680 }
1681
1682 /**
1683  * Manage filter operations.
1684  *
1685  * @param dev
1686  *   Pointer to Ethernet device structure.
1687  * @param filter_type
1688  *   Filter type.
1689  * @param filter_op
1690  *   Operation to perform.
1691  * @param arg
1692  *   Pointer to operation-specific structure.
1693  *
1694  * @return
1695  *   0 on success, negative errno value on failure.
1696  */
1697 int
1698 tap_dev_filter_ctrl(struct rte_eth_dev *dev,
1699                     enum rte_filter_type filter_type,
1700                     enum rte_filter_op filter_op,
1701                     void *arg)
1702 {
1703         switch (filter_type) {
1704         case RTE_ETH_FILTER_GENERIC:
1705                 if (filter_op != RTE_ETH_FILTER_GET)
1706                         return -EINVAL;
1707                 *(const void **)arg = &tap_flow_ops;
1708                 return 0;
1709         default:
1710                 RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
1711                         (void *)dev, filter_type);
1712         }
1713         return -EINVAL;
1714 }
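/*
 * Illustrative sketch (the port id is an assumption): this is how the generic
 * flow library retrieves the rte_flow_ops table exposed above.
 *
 *      const struct rte_flow_ops *ops = NULL;
 *      int ret;
 *
 *      ret = rte_eth_dev_filter_ctrl(0, RTE_ETH_FILTER_GENERIC,
 *                                    RTE_ETH_FILTER_GET, &ops);
 *
 * On success (ret == 0), ops points to tap_flow_ops.
 */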
1715