/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/queue.h>
#include <sys/resource.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
#include <tap_rss.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
        TCA_FLOWER_UNSPEC,
        TCA_FLOWER_CLASSID,
        TCA_FLOWER_INDEV,
        TCA_FLOWER_ACT,
        TCA_FLOWER_KEY_ETH_DST,         /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_DST_MASK,    /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_SRC,         /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_SRC_MASK,    /* ETH_ALEN */
        TCA_FLOWER_KEY_ETH_TYPE,        /* be16 */
        TCA_FLOWER_KEY_IP_PROTO,        /* u8 */
        TCA_FLOWER_KEY_IPV4_SRC,        /* be32 */
        TCA_FLOWER_KEY_IPV4_SRC_MASK,   /* be32 */
        TCA_FLOWER_KEY_IPV4_DST,        /* be32 */
        TCA_FLOWER_KEY_IPV4_DST_MASK,   /* be32 */
        TCA_FLOWER_KEY_IPV6_SRC,        /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_SRC_MASK,   /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_DST,        /* struct in6_addr */
        TCA_FLOWER_KEY_IPV6_DST_MASK,   /* struct in6_addr */
        TCA_FLOWER_KEY_TCP_SRC,         /* be16 */
        TCA_FLOWER_KEY_TCP_DST,         /* be16 */
        TCA_FLOWER_KEY_UDP_SRC,         /* be16 */
        TCA_FLOWER_KEY_UDP_DST,         /* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
        /* TCA_FLOWER_FLAGS, */
        TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2, /* be16 */
        TCA_FLOWER_KEY_VLAN_PRIO,       /* u8   */
        TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
};
#endif
/*
 * For kernels < 4.2, BPF-related enums may not be defined.
 * Runtime checks will be carried out to gracefully report on TC messages that
 * are rejected by the kernel. Rejection reasons may be due to:
 * 1. enum is not defined
 * 2. enum is defined but kernel is not configured to support BPF system calls,
 *    BPF classifications or BPF actions.
 */
#ifndef HAVE_TC_BPF
enum {
        TCA_BPF_UNSPEC,
        TCA_BPF_ACT,
        TCA_BPF_POLICE,
        TCA_BPF_CLASSID,
        TCA_BPF_OPS_LEN,
        TCA_BPF_OPS,
};
#endif
#ifndef HAVE_TC_BPF_FD
enum {
        TCA_BPF_FD = TCA_BPF_OPS + 1,
        TCA_BPF_NAME,
};
#endif
#ifndef HAVE_TC_ACT_BPF
#define tc_gen \
        __u32                 index; \
        __u32                 capab; \
        int                   action; \
        int                   refcnt; \
        int                   bindcnt

struct tc_act_bpf {
        tc_gen;
};

enum {
        TCA_ACT_BPF_UNSPEC,
        TCA_ACT_BPF_TM,
        TCA_ACT_BPF_PARMS,
        TCA_ACT_BPF_OPS_LEN,
        TCA_ACT_BPF_OPS,
};

#endif
#ifndef HAVE_TC_ACT_BPF_FD
enum {
        TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1,
        TCA_ACT_BPF_NAME,
};
#endif

/* RSS key management */
enum bpf_rss_key_e {
        KEY_CMD_GET = 1,
        KEY_CMD_RELEASE,
        KEY_CMD_INIT,
        KEY_CMD_DEINIT,
};

enum key_status_e {
        KEY_STAT_UNSPEC,
        KEY_STAT_USED,
        KEY_STAT_AVAILABLE,
};

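/*
 * Fixed TC handle reserved for the implicit rule implementing isolated mode
 * (see tap_flow_isolate()).
 */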
#define ISOLATE_HANDLE 1

struct rte_flow {
        LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
        struct rte_flow *remote_flow; /* associated remote flow */
        int bpf_fd[SEC_MAX]; /* list of BPF fds, one per ELF section */
        uint32_t key_idx; /* RSS rule key index into BPF map */
        struct nlmsg msg;
};

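/*
 * Context shared by the tap_flow_create_*() conversion callbacks while
 * walking a pattern: previous layers record their eth_type/ip_proto here so
 * that following layers can check consistency.
 */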
struct convert_data {
        uint16_t eth_type;
        uint16_t ip_proto;
        uint8_t vlan;
        struct rte_flow *flow;
};

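/*
 * Static description of an implicit flow rule: flow attributes, a single
 * pattern item (plus END), and the mirred (redirect/mirror) action to apply.
 */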
struct remote_rule {
        struct rte_flow_attr attr;
        struct rte_flow_item items[2];
        struct rte_flow_action actions[2];
        int mirred;
};

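/*
 * Parameters for a single TC action: the action kind ("gact", "mirred",
 * "skbedit" or "bpf") and the matching per-kind configuration.
 */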
struct action_data {
        char id[16];

        union {
                struct tc_gact gact;
                struct tc_mirred mirred;
                struct skbedit {
                        struct tc_skbedit skbedit;
                        uint16_t queue;
                } skbedit;
                struct bpf {
                        struct tc_act_bpf bpf;
                        int bpf_fd;
                        const char *annotation;
                } bpf;
        };
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
                const struct rte_flow_attr *attr,
                const struct rte_flow_item items[],
                const struct rte_flow_action actions[],
                struct rte_flow_error *error);

static void
tap_flow_free(struct pmd_internals *pmd,
        struct rte_flow *flow);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
                 struct rte_flow *flow,
                 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
                 int set,
                 struct rte_flow_error *error);

static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx);
static int rss_enable(struct pmd_internals *pmd,
                        const struct rte_flow_attr *attr,
                        struct rte_flow_error *error);
static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
                        const struct rte_flow_action_rss *rss,
                        struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
        .validate = tap_flow_validate,
        .create = tap_flow_create,
        .destroy = tap_flow_destroy,
        .flush = tap_flow_flush,
        .isolate = tap_flow_isolate,
};
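
/*
 * These callbacks are not called directly; applications reach them through
 * the generic rte_flow API. As an illustration only (not part of this
 * driver's logic), redirecting all TCP traffic to Rx queue 1 would look
 * like:
 *
 *      struct rte_flow_attr attr = { .ingress = 1 };
 *      struct rte_flow_item pattern[] = {
 *              { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *              { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
 *              { .type = RTE_FLOW_ITEM_TYPE_TCP },
 *              { .type = RTE_FLOW_ITEM_TYPE_END },
 *      };
 *      struct rte_flow_action_queue queue = { .index = 1 };
 *      struct rte_flow_action actions[] = {
 *              { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
 *              { .type = RTE_FLOW_ACTION_TYPE_END },
 *      };
 *      struct rte_flow_error err;
 *      struct rte_flow *f = rte_flow_create(port_id, &attr, pattern,
 *                                           actions, &err);
 */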

/* Static initializer for items. */
#define ITEMS(...) \
        (const enum rte_flow_item_type []){ \
                __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
        }

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
        /* Bit-mask corresponding to what is supported for this item. */
        const void *mask;
        const unsigned int mask_sz; /* Bit-mask size in bytes. */
        /*
         * Bit-mask corresponding to the default mask, if none is provided
         * along with the item.
         */
        const void *default_mask;
        /**
         * Conversion function from rte_flow to netlink attributes.
         *
         * @param item
         *   rte_flow item to convert.
         * @param data
         *   Internal structure to store the conversion.
         *
         * @return
         *   0 on success, negative value otherwise.
         */
        int (*convert)(const struct rte_flow_item *item, void *data);
        /** List of possible following items.  */
        const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
        [RTE_FLOW_ITEM_TYPE_END] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
        },
        [RTE_FLOW_ITEM_TYPE_ETH] = {
                .items = ITEMS(
                        RTE_FLOW_ITEM_TYPE_VLAN,
                        RTE_FLOW_ITEM_TYPE_IPV4,
                        RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_eth){
                        .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        .type = -1,
                },
                .mask_sz = sizeof(struct rte_flow_item_eth),
                .default_mask = &rte_flow_item_eth_mask,
                .convert = tap_flow_create_eth,
        },
        [RTE_FLOW_ITEM_TYPE_VLAN] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
                               RTE_FLOW_ITEM_TYPE_IPV6),
                .mask = &(const struct rte_flow_item_vlan){
                        .tpid = -1,
                        /* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
                        .tci = 0xffef,
#else
                        .tci = 0xefff,
#endif
                },
                .mask_sz = sizeof(struct rte_flow_item_vlan),
                .default_mask = &rte_flow_item_vlan_mask,
                .convert = tap_flow_create_vlan,
        },
        [RTE_FLOW_ITEM_TYPE_IPV4] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv4){
                        .hdr = {
                                .src_addr = -1,
                                .dst_addr = -1,
                                .next_proto_id = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv4),
                .default_mask = &rte_flow_item_ipv4_mask,
                .convert = tap_flow_create_ipv4,
        },
        [RTE_FLOW_ITEM_TYPE_IPV6] = {
                .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
                               RTE_FLOW_ITEM_TYPE_TCP),
                .mask = &(const struct rte_flow_item_ipv6){
                        .hdr = {
                                .src_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .dst_addr = {
                                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                                },
                                .proto = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_ipv6),
                .default_mask = &rte_flow_item_ipv6_mask,
                .convert = tap_flow_create_ipv6,
        },
        [RTE_FLOW_ITEM_TYPE_UDP] = {
                .mask = &(const struct rte_flow_item_udp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_udp),
                .default_mask = &rte_flow_item_udp_mask,
                .convert = tap_flow_create_udp,
        },
        [RTE_FLOW_ITEM_TYPE_TCP] = {
                .mask = &(const struct rte_flow_item_tcp){
                        .hdr = {
                                .src_port = -1,
                                .dst_port = -1,
                        },
                },
                .mask_sz = sizeof(struct rte_flow_item_tcp),
                .default_mask = &rte_flow_item_tcp_mask,
                .convert = tap_flow_create_tcp,
        },
};

/*
 *                TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, with the last two
 * mandatorily being the REMOTE_TX and ISOLATE rules. e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the
 * list head and removing it, as long as it applies on the remote netdevice.
 * The implicit rule for TX redirection is not removed, as isolate concerns
 * only incoming traffic.
 */

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
        [TAP_REMOTE_LOCAL_MAC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_REDIR,
        },
        [TAP_REMOTE_BROADCAST] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_BROADCASTV6] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_PROMISC] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_ALLMULTI] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_ETH,
                        .mask = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                        .spec = &(const struct rte_flow_item_eth){
                                .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
                        },
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_REMOTE_TX] = {
                .attr = {
                        .group = 0,
                        .priority = TAP_REMOTE_TX,
                        .egress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
                .mirred = TCA_EGRESS_MIRROR,
        },
        [TAP_ISOLATE] = {
                .attr = {
                        .group = MAX_GROUP,
                        .priority = PRIORITY_MASK - TAP_ISOLATE,
                        .ingress = 1,
                },
                .items[0] = {
                        .type = RTE_FLOW_ITEM_TYPE_VOID,
                },
                .items[1] = {
                        .type = RTE_FLOW_ITEM_TYPE_END,
                },
        },
};

/**
 * Perform as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
        /* TC does not support eth_type masking. Only accept if exact match. */
        if (mask->type && mask->type != 0xffff)
                return -1;
        if (!spec)
                return 0;
        /* store eth_type for consistency if ipv4/6 pattern item comes next */
        if (spec->type & mask->type)
                info->eth_type = spec->type;
        if (!flow)
                return 0;
        msg = &flow->msg;
        /* key off the mask, as for the src address below */
        if (!is_zero_ether_addr(&mask->dst)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
                           &spec->dst.addr_bytes);
                tap_nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
                           &mask->dst.addr_bytes);
        }
        if (!is_zero_ether_addr(&mask->src)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
                           &spec->src.addr_bytes);
                tap_nlattr_add(&msg->nh,
                           TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
                           &mask->src.addr_bytes);
        }
        return 0;
}

/**
 * Perform as many checks as possible on a VLAN item, and if a flow is
 * provided, fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_vlan *spec = item->spec;
        const struct rte_flow_item_vlan *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
        /* TC does not support tpid masking. Only accept if exact match. */
        if (mask->tpid && mask->tpid != 0xffff)
                return -1;
        /* Double-tagging not supported. */
        if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
                return -1;
        info->vlan = 1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
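/* TCI layout: 3-bit priority (PCP), 1-bit DEI (not supported), 12-bit ID. */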
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
        if (!spec)
                return 0;
        if (spec->tci) {
                uint16_t tci = ntohs(spec->tci) & mask->tci;
                uint16_t prio = VLAN_PRIO(tci);
                /* a VLAN ID needs 12 bits; uint8_t would truncate it */
                uint16_t vid = VLAN_ID(tci);

                if (prio)
                        tap_nlattr_add8(&msg->nh,
                                        TCA_FLOWER_KEY_VLAN_PRIO, prio);
                if (vid)
                        tap_nlattr_add16(&msg->nh,
                                         TCA_FLOWER_KEY_VLAN_ID, vid);
        }
        return 0;
}

/**
 * Perform as many checks as possible on an IPv4 item, and if a flow is
 * provided, fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
        /* check that previous eth type is compatible with ipv4 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IP))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.next_proto_id;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_addr) {
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
                             spec->hdr.dst_addr);
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
                             mask->hdr.dst_addr);
        }
        if (spec->hdr.src_addr) {
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
                             spec->hdr.src_addr);
                tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
                             mask->hdr.src_addr);
        }
        if (spec->hdr.next_proto_id)
                tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
                            spec->hdr.next_proto_id);
        return 0;
}

/**
 * Perform as many checks as possible on an IPv6 item, and if a flow is
 * provided, fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;
        struct rte_flow *flow = info->flow;
        uint8_t empty_addr[16] = { 0 };
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
        /* check that previous eth type is compatible with ipv6 */
        if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
                return -1;
        /* store ip_proto for consistency if udp/tcp pattern item comes next */
        if (spec)
                info->ip_proto = spec->hdr.proto;
        if (!flow)
                return 0;
        msg = &flow->msg;
        if (!info->eth_type)
                info->eth_type = htons(ETH_P_IPV6);
        if (!spec)
                return 0;
        if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
                           sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
                           sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
        }
        if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
                           sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
                tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
                           sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
        }
        if (spec->hdr.proto)
                tap_nlattr_add8(&msg->nh,
                                TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
        return 0;
}

/**
 * Perform as many checks as possible on a UDP item, and if a flow is
 * provided, fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
        /* check that previous ip_proto is compatible with udp */
        if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
                return -1;
        /* TC does not support UDP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_port & mask->hdr.dst_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
                             spec->hdr.dst_port);
        if (spec->hdr.src_port & mask->hdr.src_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Perform as many checks as possible on a TCP item, and if a flow is
 * provided, fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
        struct convert_data *info = (struct convert_data *)data;
        const struct rte_flow_item_tcp *spec = item->spec;
        const struct rte_flow_item_tcp *mask = item->mask;
        struct rte_flow *flow = info->flow;
        struct nlmsg *msg;

        /* use default mask if none provided */
        if (!mask)
                mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
        /* check that previous ip_proto is compatible with tcp */
        if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
                return -1;
        /* TC does not support TCP port masking. Only accept if exact match. */
        if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
            (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
                return -1;
        if (!flow)
                return 0;
        msg = &flow->msg;
        tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
        if (!spec)
                return 0;
        if (spec->hdr.dst_port & mask->hdr.dst_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
                             spec->hdr.dst_port);
        if (spec->hdr.src_port & mask->hdr.src_port)
                tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
                             spec->hdr.src_port);
        return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask to use by default, if none is provided in \item.
 *
 * @return
 *   0 on success.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
                       unsigned int size,
                       const uint8_t *supported_mask,
                       const uint8_t *default_mask)
{
        int ret = 0;

        /* An empty layer is allowed, as long as all fields are NULL */
        if (!item->spec && (item->mask || item->last))
                return -1;
        /* Is the item spec compatible with what the NIC supports? */
        if (item->spec && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->spec;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
                /* Is the default mask compatible with what the NIC supports? */
                for (i = 0; i < size; i++)
                        if ((default_mask[i] | supported_mask[i]) !=
                            supported_mask[i])
                                return -1;
        }
        /* Is the item last compatible with what the NIC supports? */
        if (item->last && !item->mask) {
                unsigned int i;
                const uint8_t *spec = item->last;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /* Is the item mask compatible with what the NIC supports? */
        if (item->mask) {
                unsigned int i;
                const uint8_t *spec = item->mask;

                for (i = 0; i < size; ++i)
                        if ((spec[i] | supported_mask[i]) != supported_mask[i])
                                return -1;
        }
        /*
         * Once masked, are item spec and item last equal?
         * TC does not support ranges, so anything else is invalid.
         */
        if (item->spec && item->last) {
                uint8_t spec[size];
                uint8_t last[size];
                const uint8_t *apply = default_mask;
                unsigned int i;

                if (item->mask)
                        apply = item->mask;
                for (i = 0; i < size; ++i) {
                        spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
                        last[i] = ((const uint8_t *)item->last)[i] & apply[i];
                }
                ret = memcmp(spec, last, size);
        }
        return ret;
}

/**
 * Configure the kernel with a TC action and its configured parameters.
 * Handled actions: "gact", "mirred", "skbedit", "bpf".
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message.
 *
 * @param[in, out] act_index
 *   Pointer to action sequence number in the TC command.
 *
 * @param[in] adata
 *   Pointer to struct holding the action parameters.
 *
 * @return
 *   -1 on failure, 0 on success.
 */
static int
add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
{
        struct nlmsg *msg = &flow->msg;

        if (tap_nlattr_nested_start(msg, (*act_index)++) < 0)
                return -1;

        tap_nlattr_add(&msg->nh, TCA_ACT_KIND,
                                strlen(adata->id) + 1, adata->id);
        if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
                return -1;
        if (strcmp("gact", adata->id) == 0) {
                tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
                           &adata->gact);
        } else if (strcmp("mirred", adata->id) == 0) {
                if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
                        adata->mirred.action = TC_ACT_PIPE;
                else /* REDIRECT */
                        adata->mirred.action = TC_ACT_STOLEN;
                tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS,
                           sizeof(adata->mirred),
                           &adata->mirred);
        } else if (strcmp("skbedit", adata->id) == 0) {
                tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
                           sizeof(adata->skbedit.skbedit),
                           &adata->skbedit.skbedit);
                tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
                             adata->skbedit.queue);
        } else if (strcmp("bpf", adata->id) == 0) {
                tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
                tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
                           strlen(adata->bpf.annotation) + 1,
                           adata->bpf.annotation);
                tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS,
                           sizeof(adata->bpf.bpf),
                           &adata->bpf.bpf);
        } else {
                return -1;
        }
        tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
        tap_nlattr_nested_finish(msg); /* nested act_index */
        return 0;
}

/**
 * Helper function to send a series of TC actions to the kernel.
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message.
 *
 * @param[in] nb_actions
 *   Number of actions in an array of action structs.
 *
 * @param[in] data
 *   Pointer to an array of action structs.
 *
 * @param[in] classifier_action
 *   The classifier on behalf of which the actions are configured.
 *
 * @return
 *   -1 on failure, 0 on success.
 */
static int
add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
            int classifier_action)
{
        struct nlmsg *msg = &flow->msg;
        size_t act_index = 1;
        int i;

        if (tap_nlattr_nested_start(msg, classifier_action) < 0)
                return -1;
        for (i = 0; i < nb_actions; i++)
                if (add_action(flow, &act_index, data + i) < 0)
                        return -1;
        tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
        return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error,
                  struct rte_flow *flow,
                  int mirred)
{
        const struct tap_flow_items *cur_item = tap_flow_items;
        struct convert_data data = {
                .eth_type = 0,
                .ip_proto = 0,
                .flow = flow,
        };
        int action = 0; /* Only one action authorized for now */

        if (attr->group > MAX_GROUP) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
                        NULL, "group value too big: cannot exceed 15");
                return -rte_errno;
        }
        if (attr->priority > MAX_PRIORITY) {
                rte_flow_error_set(
                        error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                        NULL, "priority value too big");
                return -rte_errno;
        } else if (flow) {
                uint16_t group = attr->group << GROUP_SHIFT;
                uint16_t prio = group | (attr->priority +
                                RSS_PRIORITY_OFFSET + PRIORITY_OFFSET);
                flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
                                                 flow->msg.t.tcm_info);
        }
        if (flow) {
                if (mirred) {
                        /*
                         * If attr->ingress, the rule applies on remote ingress
                         * to match incoming packets.
                         * If attr->egress, the rule applies on tap ingress (as
                         * seen from the kernel) to deal with packets going out
                         * from the DPDK app.
                         */
                        flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
                } else {
                        /* Standard rule on tap egress (kernel standpoint). */
                        flow->msg.t.tcm_parent =
                                TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
                }
                /* use flower filter type */
                tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
                if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
                        goto exit_item_not_supported;
        }
        for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
                const struct tap_flow_items *token = NULL;
                unsigned int i;
                int err = 0;

                if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
                        continue;
                for (i = 0;
                     cur_item->items &&
                     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
                     ++i) {
                        if (cur_item->items[i] == items->type) {
                                token = &tap_flow_items[items->type];
                                break;
                        }
                }
                if (!token)
                        goto exit_item_not_supported;
                cur_item = token;
                err = tap_flow_item_validate(
                        items, cur_item->mask_sz,
                        (const uint8_t *)cur_item->mask,
                        (const uint8_t *)cur_item->default_mask);
                if (err)
                        goto exit_item_not_supported;
                if (flow && cur_item->convert) {
                        err = cur_item->convert(items, &data);
                        if (err)
                                goto exit_item_not_supported;
                }
        }
        if (flow) {
                if (data.vlan) {
                        tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     htons(ETH_P_8021Q));
                        tap_nlattr_add16(&flow->msg.nh,
                                     TCA_FLOWER_KEY_VLAN_ETH_TYPE,
                                     data.eth_type ?
                                     data.eth_type : htons(ETH_P_ALL));
                } else if (data.eth_type) {
                        tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
                                     data.eth_type);
                }
        }
        if (mirred && flow) {
                struct action_data adata = {
                        .id = "mirred",
                        .mirred = {
                                .eaction = mirred,
                        },
                };

                /*
                 * If attr->egress && mirred, then this is a special
                 * case where the rule must be applied on the tap, to
                 * redirect packets coming from the DPDK App, out
                 * through the remote netdevice.
                 */
                adata.mirred.ifindex = attr->ingress ? pmd->if_index :
                        pmd->remote_if_index;
                if (mirred == TCA_EGRESS_MIRROR)
                        adata.mirred.action = TC_ACT_PIPE;
                else
                        adata.mirred.action = TC_ACT_STOLEN;
                if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
                        goto exit_action_not_supported;
                else
                        goto end;
        }
        for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
                int err = 0;

                if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
                        continue;
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "gact",
                                        .gact = {
                                                .action = TC_ACT_SHOT,
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                                  TCA_FLOWER_ACT);
                        }
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "gact",
                                        .gact = {
                                                /* continue */
                                                .action = TC_ACT_UNSPEC,
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                                  TCA_FLOWER_ACT);
                        }
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
                        const struct rte_flow_action_queue *queue =
                                (const struct rte_flow_action_queue *)
                                actions->conf;

                        if (action)
                                goto exit_action_not_supported;
                        action = 1;
                        if (!queue ||
                            (queue->index > pmd->dev->data->nb_rx_queues - 1))
                                goto exit_action_not_supported;
                        if (flow) {
                                struct action_data adata = {
                                        .id = "skbedit",
                                        .skbedit = {
                                                .skbedit = {
                                                        .action = TC_ACT_PIPE,
                                                },
                                                .queue = queue->index,
                                        },
                                };

                                err = add_actions(flow, 1, &adata,
                                        TCA_FLOWER_ACT);
                        }
                } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
                        const struct rte_flow_action_rss *rss =
                                (const struct rte_flow_action_rss *)
                                actions->conf;

                        if (action++)
                                goto exit_action_not_supported;

                        if (!pmd->rss_enabled) {
                                err = rss_enable(pmd, attr, error);
                                if (err)
                                        goto exit_action_not_supported;
                        }
                        if (flow && rss)
                                err = rss_add_actions(flow, pmd, rss, error);
                } else {
                        goto exit_action_not_supported;
                }
                if (err)
                        goto exit_action_not_supported;
        }
end:
        if (flow)
                tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
        return 0;
exit_item_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
                           items, "item not supported");
        return -rte_errno;
exit_action_not_supported:
        rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
                           actions, "action not supported");
        return -rte_errno;
}

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
                  const struct rte_flow_attr *attr,
                  const struct rte_flow_item items[],
                  const struct rte_flow_action actions[],
                  struct rte_flow_error *error)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
        uint32_t handle = 0;

        if (sizeof(flow) > 4)
                handle = rte_jhash(&flow, sizeof(flow), 1);
        else
                handle = (uintptr_t)flow;
        /* must be at least 1 to avoid letting the kernel choose one for us */
        if (!handle)
                handle = 1;
        flow->msg.t.tcm_handle = handle;
}

/**
 * Free the flow's opened file descriptors and allocated memory.
 *
 * @param[in] pmd
 *   Pointer to the PMD private structure.
 * @param[in] flow
 *   Pointer to the flow to free.
 */
static void
tap_flow_free(struct pmd_internals *pmd, struct rte_flow *flow)
{
        int i;

        if (!flow)
                return;

        if (pmd->rss_enabled) {
                /* Close flow BPF file descriptors */
                for (i = 0; i < SEC_MAX; i++)
                        if (flow->bpf_fd[i] != 0) {
                                close(flow->bpf_fd[i]);
                                flow->bpf_fd[i] = 0;
                        }

                /* Release the map key for this RSS rule */
                bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx);
                flow->key_idx = 0;
        }

        /* Free flow allocated memory */
        rte_free(flow);
}

1349 /**
1350  * Create a flow.
1351  *
1352  * @see rte_flow_create()
1353  * @see rte_flow_ops
1354  */
1355 static struct rte_flow *
1356 tap_flow_create(struct rte_eth_dev *dev,
1357                 const struct rte_flow_attr *attr,
1358                 const struct rte_flow_item items[],
1359                 const struct rte_flow_action actions[],
1360                 struct rte_flow_error *error)
1361 {
1362         struct pmd_internals *pmd = dev->data->dev_private;
1363         struct rte_flow *remote_flow = NULL;
1364         struct rte_flow *flow = NULL;
1365         struct nlmsg *msg = NULL;
1366         int err;
1367
1368         if (!pmd->if_index) {
1369                 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1370                                    NULL,
1371                                    "can't create rule, ifindex not found");
1372                 goto fail;
1373         }
1374         /*
1375          * No rules configured through standard rte_flow should be set on the
1376          * priorities used by implicit rules.
1377          */
1378         if ((attr->group == MAX_GROUP) &&
1379             attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
1380                 rte_flow_error_set(
1381                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1382                         NULL, "priority value too big");
1383                 goto fail;
1384         }
1385         flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1386         if (!flow) {
1387                 rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1388                                    NULL, "cannot allocate memory for rte_flow");
1389                 goto fail;
1390         }
1391         msg = &flow->msg;
1392         tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
1393                     NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1394         msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1395         tap_flow_set_handle(flow);
1396         if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
1397                 goto fail;
1398         err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1399         if (err < 0) {
1400                 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1401                                    NULL, "couldn't send request to kernel");
1402                 goto fail;
1403         }
1404         err = tap_nl_recv_ack(pmd->nlsk_fd);
1405         if (err < 0) {
1406                 RTE_LOG(ERR, PMD,
1407                         "Kernel refused TC filter rule creation (%d): %s\n",
1408                         errno, strerror(errno));
1409                 rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
1410                                    NULL,
1411                                    "overlapping rules or Kernel too old for flower support");
1412                 goto fail;
1413         }
1414         LIST_INSERT_HEAD(&pmd->flows, flow, next);
1415         /*
1416          * If a remote device is configured, a TC rule with identical items for
1417          * matching must be set on that device, with a single action: redirect
1418          * to the local pmd->if_index.
1419          */
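        /*
         * Roughly the tc(8) equivalent of the rule built below (illustration
         * only; the device names are hypothetical):
         *
         *   tc filter add dev remote0 ... flower <same matches> \
         *           action mirred egress redirect dev dtap0
         */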
1420         if (pmd->remote_if_index) {
1421                 remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1422                 if (!remote_flow) {
1423                         rte_flow_error_set(
1424                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1425                                 "cannot allocate memory for rte_flow");
1426                         goto fail;
1427                 }
1428                 msg = &remote_flow->msg;
1429                 /* set the rule if_index for the remote netdevice */
1430                 tc_init_msg(
1431                         msg, pmd->remote_if_index, RTM_NEWTFILTER,
1432                         NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1433                 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1434                 tap_flow_set_handle(remote_flow);
1435                 if (priv_flow_process(pmd, attr, items, NULL,
1436                                       error, remote_flow, TCA_EGRESS_REDIR)) {
1437                         rte_flow_error_set(
1438                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1439                                 NULL, "rte flow rule validation failed");
1440                         goto fail;
1441                 }
1442                 err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1443                 if (err < 0) {
1444                         rte_flow_error_set(
1445                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1446                                 NULL, "Failure sending nl request");
1447                         goto fail;
1448                 }
1449                 err = tap_nl_recv_ack(pmd->nlsk_fd);
1450                 if (err < 0) {
1451                         RTE_LOG(ERR, PMD,
1452                                 "Kernel refused TC filter rule creation (%d): %s\n",
1453                                 errno, strerror(errno));
1454                         rte_flow_error_set(
1455                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1456                                 NULL,
1457                                 "overlapping rules or Kernel too old for flower support");
1458                         goto fail;
1459                 }
1460                 flow->remote_flow = remote_flow;
1461         }
1462         return flow;
1463 fail:
1464         if (remote_flow)
1465                 rte_free(remote_flow);
1466         if (flow)
1467                 tap_flow_free(pmd, flow);
1468         return NULL;
1469 }
1470
1471 /**
1472  * Destroy a flow using pointer to pmd_internal.
1473  *
1474  * @param[in, out] pmd
1475  *   Pointer to private structure.
1476  * @param[in] flow
1477  *   Pointer to the flow to destroy.
1478  * @param[in, out] error
1479  *   Pointer to the flow error handler
1480  *
1481  * @return 0 if the flow could be destroyed, -1 otherwise.
1482  */
1483 static int
1484 tap_flow_destroy_pmd(struct pmd_internals *pmd,
1485                      struct rte_flow *flow,
1486                      struct rte_flow_error *error)
1487 {
1488         struct rte_flow *remote_flow = flow->remote_flow;
1489         int ret = 0;
1490
1491         LIST_REMOVE(flow, next);
1492         flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1493         flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1494
1495         ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
1496         if (ret < 0) {
1497                 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1498                                    NULL, "couldn't send request to kernel");
1499                 goto end;
1500         }
1501         ret = tap_nl_recv_ack(pmd->nlsk_fd);
1502         /* If errno is ENOENT, the rule is already no longer in the kernel. */
1503         if (ret < 0 && errno == ENOENT)
1504                 ret = 0;
1505         if (ret < 0) {
1506                 RTE_LOG(ERR, PMD,
1507                         "Kernel refused TC filter rule deletion (%d): %s\n",
1508                         errno, strerror(errno));
1509                 rte_flow_error_set(
1510                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1511                         "couldn't receive kernel ack to our request");
1512                 goto end;
1513         }
1514
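        /* Also delete the mirrored rule set on the remote netdevice, if any. */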
1515         if (remote_flow) {
1516                 remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1517                 remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
1518
1519                 ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
1520                 if (ret < 0) {
1521                         rte_flow_error_set(
1522                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1523                                 NULL, "Failure sending nl request");
1524                         goto end;
1525                 }
1526                 ret = tap_nl_recv_ack(pmd->nlsk_fd);
1527                 if (ret < 0 && errno == ENOENT)
1528                         ret = 0;
1529                 if (ret < 0) {
1530                         RTE_LOG(ERR, PMD,
1531                                 "Kernel refused TC filter rule deletion (%d): %s\n",
1532                                 errno, strerror(errno));
1533                         rte_flow_error_set(
1534                                 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
1535                                 NULL, "Failure trying to receive nl ack");
1536                         goto end;
1537                 }
1538         }
1539 end:
1540         if (remote_flow)
1541                 rte_free(remote_flow);
1542         tap_flow_free(pmd, flow);
1543         return ret;
1544 }
1545
1546 /**
1547  * Destroy a flow.
1548  *
1549  * @see rte_flow_destroy()
1550  * @see rte_flow_ops
1551  */
1552 static int
1553 tap_flow_destroy(struct rte_eth_dev *dev,
1554                  struct rte_flow *flow,
1555                  struct rte_flow_error *error)
1556 {
1557         struct pmd_internals *pmd = dev->data->dev_private;
1558
1559         return tap_flow_destroy_pmd(pmd, flow, error);
1560 }
1561
1562 /**
1563  * Enable/disable flow isolation.
1564  *
1565  * @see rte_flow_isolate()
1566  * @see rte_flow_ops
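 *
 * Illustrative application call (a sketch; port_id is hypothetical):
 * @code
 *      struct rte_flow_error err;
 *
 *      if (rte_flow_isolate(port_id, 1, &err))
 *              printf("isolate mode rejected: %s\n", err.message);
 * @endcode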
1567  */
1568 static int
1569 tap_flow_isolate(struct rte_eth_dev *dev,
1570                  int set,
1571                  struct rte_flow_error *error __rte_unused)
1572 {
1573         struct pmd_internals *pmd = dev->data->dev_private;
1574
1575         if (set)
1576                 pmd->flow_isolate = 1;
1577         else
1578                 pmd->flow_isolate = 0;
1579         /*
1580          * If the netdevice is present, set up the flow rules immediately.
1581          * Otherwise they will be set when bringing up the netdevice (tun_alloc).
1582          */
1583         if (!pmd->rxq[0].fd)
1584                 return 0;
1585         if (set) {
1586                 struct rte_flow *flow;
1587
1588                 while (1) {
1589                         flow = LIST_FIRST(&pmd->implicit_flows);
1590                         if (!flow)
1591                                 break;
1592                         /*
1593                          * Remove all implicit rules on the remote.
1594                          * Keep the local rule to redirect packets on TX.
1595                          * Also keep the last implicit local rule: ISOLATE.
1596                          */
1597                         if (flow->msg.t.tcm_ifindex == pmd->if_index)
1598                                 break;
1599                         if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
1600                                 goto error;
1601                 }
1602                 /* Switch the TC rule according to pmd->flow_isolate */
1603                 if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1604                         goto error;
1605         } else {
1606                 /* Switch the TC rule according to pmd->flow_isolate */
1607                 if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
1608                         goto error;
1609                 if (!pmd->remote_if_index)
1610                         return 0;
1611                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
1612                         goto error;
1613                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
1614                         goto error;
1615                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
1616                         goto error;
1617                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
1618                         goto error;
1619                 if (dev->data->promiscuous &&
1620                     tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
1621                         goto error;
1622                 if (dev->data->all_multicast &&
1623                     tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
1624                         goto error;
1625         }
1626         return 0;
1627 error:
1628         pmd->flow_isolate = 0;
1629         return rte_flow_error_set(
1630                 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1631                 "TC rule creation failed");
1632 }
1633
1634 /**
1635  * Destroy all flows.
1636  *
1637  * @see rte_flow_flush()
1638  * @see rte_flow_ops
1639  */
1640 int
1641 tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
1642 {
1643         struct pmd_internals *pmd = dev->data->dev_private;
1644         struct rte_flow *flow;
1645
1646         while (!LIST_EMPTY(&pmd->flows)) {
1647                 flow = LIST_FIRST(&pmd->flows);
1648                 if (tap_flow_destroy(dev, flow, error) < 0)
1649                         return -1;
1650         }
1651         return 0;
1652 }
1653
1654 /**
1655  * Add an implicit flow rule on the remote device to make sure traffic gets to
1656  * the tap netdevice from there.
1657  *
1658  * @param pmd
1659  *   Pointer to private structure.
1660  * @param[in] idx
1661  *   The idx in the implicit_rte_flows array specifying which rule to apply.
1662  *
1663  * @return -1 if the rule couldn't be applied, 0 otherwise.
1664  */
1665 int tap_flow_implicit_create(struct pmd_internals *pmd,
1666                              enum implicit_rule_index idx)
1667 {
1668         uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
1669         struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
1670         struct rte_flow_action isolate_actions[2] = {
1671                 [1] = {
1672                         .type = RTE_FLOW_ACTION_TYPE_END,
1673                 },
1674         };
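        /* [0] is set below to DROP or PASSTHRU for the TAP_ISOLATE rule. */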
1675         struct rte_flow_item *items = implicit_rte_flows[idx].items;
1676         struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
1677         struct rte_flow_item_eth eth_local = { .type = 0 };
1678         uint16_t if_index = pmd->remote_if_index;
1679         struct rte_flow *remote_flow = NULL;
1680         struct nlmsg *msg = NULL;
1681         int err = 0;
1682         struct rte_flow_item items_local[2] = {
1683                 [0] = {
1684                         .type = items[0].type,
1685                         .spec = &eth_local,
1686                         .mask = items[0].mask,
1687                 },
1688                 [1] = {
1689                         .type = items[1].type,
1690                 }
1691         };
1692
1693         remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1694         if (!remote_flow) {
1695                 RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
1696                 goto fail;
1697         }
1698         msg = &remote_flow->msg;
1699         if (idx == TAP_REMOTE_TX) {
1700                 if_index = pmd->if_index;
1701         } else if (idx == TAP_ISOLATE) {
1702                 if_index = pmd->if_index;
1703                 /* Don't be exclusive for this rule; it can be changed later. */
1704                 flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
1705                 isolate_actions[0].type = pmd->flow_isolate ?
1706                         RTE_FLOW_ACTION_TYPE_DROP :
1707                         RTE_FLOW_ACTION_TYPE_PASSTHRU;
1708                 actions = isolate_actions;
1709         } else if (idx == TAP_REMOTE_LOCAL_MAC) {
1710                 /*
1711                  * eth addr couldn't be set in implicit_rte_flows[] as it is not
1712                  * known at compile time.
1713                  */
1714                 memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
1715                 items = items_local;
1716         }
1717         tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
1718         msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1719         /*
1720          * The ISOLATE rule is always present and must have a static handle, as
1721          * its action is changed depending on whether the feature is enabled
1722          * (DROP) or disabled (PASSTHRU).
1723          */
1724         if (idx == TAP_ISOLATE)
1725                 remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
1726         else
1727                 tap_flow_set_handle(remote_flow);
1728         if (priv_flow_process(pmd, attr, items, actions, NULL,
1729                               remote_flow, implicit_rte_flows[idx].mirred)) {
1730                 RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
1731                 goto fail;
1732         }
1733         err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
1734         if (err < 0) {
1735                 RTE_LOG(ERR, PMD, "Failure sending nl request\n");
1736                 goto fail;
1737         }
1738         err = tap_nl_recv_ack(pmd->nlsk_fd);
1739         if (err < 0) {
1740                 RTE_LOG(ERR, PMD,
1741                         "Kernel refused TC filter rule creation (%d): %s\n",
1742                         errno, strerror(errno));
1743                 goto fail;
1744         }
1745         LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
1746         return 0;
1747 fail:
1748         if (remote_flow)
1749                 rte_free(remote_flow);
1750         return -1;
1751 }
1752
1753 /**
1754  * Remove a specific implicit flow rule on the remote device.
1755  *
1756  * @param[in, out] pmd
1757  *   Pointer to private structure.
1758  * @param[in] idx
1759  *   The idx in the implicit_rte_flows array specifying which rule to remove.
1760  *
1761  * @return -1 if the implicit rule couldn't be destroyed, 0 otherwise.
1762  */
1763 int tap_flow_implicit_destroy(struct pmd_internals *pmd,
1764                               enum implicit_rule_index idx)
1765 {
1766         struct rte_flow *remote_flow;
1767         int cur_prio = -1;
1768         int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
1769
1770         for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
1771              remote_flow;
1772              remote_flow = LIST_NEXT(remote_flow, next)) {
1773                 cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
1774                 if (cur_prio != idx_prio)
1775                         continue;
1776                 return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
1777         }
1778         return 0;
1779 }
1780
1781 /**
1782  * Destroy all implicit flows.
1783  *
1784  * @see rte_flow_flush()
1785  */
1786 int
1787 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
1788 {
1789         struct rte_flow *remote_flow;
1790
1791         while (!LIST_EMPTY(&pmd->implicit_flows)) {
1792                 remote_flow = LIST_FIRST(&pmd->implicit_flows);
1793                 if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
1794                         return -1;
1795         }
1796         return 0;
1797 }
1798
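/*
 * RSS key indices handed to flows are offset by KEY_IDX_OFFSET
 * (3 * MAX_RSS_KEYS) so that the zero-initialized key_idx of a non-RSS
 * flow can never alias a live RSS map entry; see bpf_rss_key().
 */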
1799 #define MAX_RSS_KEYS 256
1800 #define KEY_IDX_OFFSET (3 * MAX_RSS_KEYS)
1801 #define SEC_NAME_CLS_Q "cls_q"
1802
1803 const char *sec_name[SEC_MAX] = {
1804         [SEC_L3_L4] = "l3_l4",
1805 };
1806
1807 /**
1808  * Enable RSS on tap: create TC rules for queuing.
1809  *
1810  * @param[in, out] pmd
1811  *   Pointer to private structure.
1812  *
1813  * @param[in] attr
1814  *   Pointer to the flow attributes, used to get the flow group.
1815  *
1816  * @param[out] error
1817  *   Pointer to error reporting if not NULL.
1818  *
1819  * @return 0 on success, negative value on failure.
1820  */
1821 static int rss_enable(struct pmd_internals *pmd,
1822                         const struct rte_flow_attr *attr,
1823                         struct rte_flow_error *error)
1824 {
1825         struct rte_flow *rss_flow = NULL;
1826         struct nlmsg *msg = NULL;
1827         /* 4096 is the maximum number of instructions for a BPF program */
1828         char annotation[64];
1829         int i;
1830         int err = 0;
1831
1832         /* Remove the locked memory limit, needed to create BPF maps */
1833         struct rlimit memlock_limit = {
1834                 .rlim_cur = RLIM_INFINITY,
1835                 .rlim_max = RLIM_INFINITY,
1836         };
1837         setrlimit(RLIMIT_MEMLOCK, &memlock_limit);
1838
1839         /* Initialize the BPF RSS key repository */
1840         err = bpf_rss_key(KEY_CMD_INIT, NULL);
1841         if (err < 0) {
1842                 rte_flow_error_set(
1843                         error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1844                         "Failed to initialize BPF RSS keys");
1845
1846                 return -1;
1847         }
1848
1849         /* Create the BPF RSS map */
1852         pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */
1853                                 sizeof(struct rss_key),
1854                                 MAX_RSS_KEYS);
1855         if (pmd->map_fd < 0) {
1856                 RTE_LOG(ERR, PMD,
1857                         "Failed to create BPF map (%d): %s\n",
1858                                 errno, strerror(errno));
1859                 rte_flow_error_set(
1860                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
1861                         "Kernel too old or not configured "
1862                         "to support BPF maps");
1863
1864                 return -ENOTSUP;
1865         }
1866
1867         /*
1868          * Add a rule per queue to match reclassified packets and direct them to
1869          * the correct queue.
1870          */
1871         for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
1872                 pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i);
1873                 if (pmd->bpf_fd[i] < 0) {
1874                         RTE_LOG(ERR, PMD,
1875                                 "Failed to load BPF section %s for queue %d\n",
1876                                 SEC_NAME_CLS_Q, i);
1877                         rte_flow_error_set(
1878                                 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
1879                                 NULL,
1880                                 "Kernel too old or not configured "
1881                                 "to support BPF programs loading");
1882
1883                         return -ENOTSUP;
1884                 }
1885
1886                 rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
1887                 if (!rss_flow) {
1888                         RTE_LOG(ERR, PMD,
1889                                 "Cannot allocate memory for rte_flow\n");
1890                         return -1;
1891                 }
1892                 msg = &rss_flow->msg;
1893                 tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
1894                             NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
1895                 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
1896                 tap_flow_set_handle(rss_flow);
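                /*
                 * tcm_info packs the rule priority in its upper 16 bits and
                 * the protocol (ETH_P_ALL) in its lower 16 bits: the flow
                 * group provides the high priority bits and each queue i
                 * gets its own priority slot below it.
                 */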
1897                 uint16_t group = attr->group << GROUP_SHIFT;
1898                 uint16_t prio = group | (i + PRIORITY_OFFSET);
1899                 msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info);
1900                 msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
1901
1902                 tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
1903                 if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0)
1904                         return -1;
1905                 tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]);
1906                 snprintf(annotation, sizeof(annotation), "[%s%d]",
1907                         SEC_NAME_CLS_Q, i);
1908                 tap_nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1,
1909                            annotation);
1910                 /* Actions */
1911                 {
1912                         struct action_data adata = {
1913                                 .id = "skbedit",
1914                                 .skbedit = {
1915                                         .skbedit = {
1916                                                 .action = TC_ACT_PIPE,
1917                                         },
1918                                         .queue = i,
1919                                 },
1920                         };
1921                         if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
1922                                 return -1;
1923                 }
1924                 tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */
1925
1926                 /* Netlink message is now ready to be sent */
1927                 if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0)
1928                         return -1;
1929                 err = tap_nl_recv_ack(pmd->nlsk_fd);
1930                 if (err < 0) {
1931                         RTE_LOG(ERR, PMD,
1932                                 "Kernel refused TC filter rule creation (%d): %s\n",
1933                                 errno, strerror(errno));
1934                         return err;
1935                 }
1936                 LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
1937         }
1938
1939         pmd->rss_enabled = 1;
1940         return err;
1941 }
1942
1943 /**
1944  * Manage the BPF RSS key repository with operations: init, get, release, deinit
1945  *
1946  * @param[in] cmd
1947  *   Command on RSS keys: init, get, release or deinit
1948  *
1949  * @param[in, out] key_idx
1950  *   Pointer to RSS Key index (out for get command, in for release command)
1951  *
1952  * @return -1 if the RSS keys couldn't be initialized, obtained or released, 0 otherwise.
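 *
 * Typical call sequence in this driver (see rss_enable(), rss_add_actions()
 * and tap_flow_free()):
 * @code
 *      bpf_rss_key(KEY_CMD_INIT, NULL);              // once, when enabling RSS
 *      bpf_rss_key(KEY_CMD_GET, &flow->key_idx);     // for each RSS flow created
 *      bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx); // for each RSS flow freed
 * @endcode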
1953  */
1954 static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx)
1955 {
1956         __u32 i;
1957         int err = 0;
1958         static __u32 num_used_keys;
1959         static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC};
1960         static __u32 rss_keys_initialized;
1961
1962         switch (cmd) {
1963         case KEY_CMD_GET:
1964                 if (!rss_keys_initialized) {
1965                         err = -1;
1966                         break;
1967                 }
1968
1969                 if (num_used_keys == RTE_DIM(rss_keys)) {
1970                         err = -1;
1971                         break;
1972                 }
1973
1974                 *key_idx = num_used_keys % RTE_DIM(rss_keys);
1975                 while (rss_keys[*key_idx] == KEY_STAT_USED)
1976                         *key_idx = (*key_idx + 1) % RTE_DIM(rss_keys);
1977
1978                 rss_keys[*key_idx] = KEY_STAT_USED;
1979
1980                 /*
1981                  * Add an offset to key_idx to handle the case where RSS
1982                  * and non-RSS flows are mixed.
1983                  * If a non-RSS flow is destroyed, it has an eBPF map
1984                  * index 0 (initialized on flow creation) and might
1985                  * unintentionally remove RSS entry 0 from the eBPF map.
1986                  * To avoid this issue, add an offset to the real index
1987                  * during a KEY_CMD_GET operation and subtract this offset
1988                  * during a KEY_CMD_RELEASE operation in order to restore
1989                  * the real index.
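                 * For example, with MAX_RSS_KEYS = 256 the offset is 768:
                 * real index 5 is handed out as 773, while a stray release
                 * of 0 computes 0 - 768, which wraps to a huge __u32 and is
                 * rejected by the range check in KEY_CMD_RELEASE.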
1990                  */
1991                 *key_idx += KEY_IDX_OFFSET;
1992                 num_used_keys++;
1993         break;
1994
1995         case KEY_CMD_RELEASE:
1996                 if (!rss_keys_initialized)
1997                         break;
1998
1999                 /*
2000                  * Subtract the offset to restore the real key index.
2001                  * If a non-RSS flow mistakenly tries to release map
2002                  * entry 0, the offset subtraction will calculate the real
2003                  * map index as an out-of-range value and the release operation
2004                  * will be silently ignored.
2005                  */
2006                 __u32 key = *key_idx - KEY_IDX_OFFSET;
2007                 if (key >= RTE_DIM(rss_keys))
2008                         break;
2009
2010                 if (rss_keys[key] == KEY_STAT_USED) {
2011                         rss_keys[key] = KEY_STAT_AVAILABLE;
2012                         num_used_keys--;
2013                 }
2014         break;
2015
2016         case KEY_CMD_INIT:
2017                 for (i = 0; i < RTE_DIM(rss_keys); i++)
2018                         rss_keys[i] = KEY_STAT_AVAILABLE;
2019
2020                 rss_keys_initialized = 1;
2021                 num_used_keys = 0;
2022         break;
2023
2024         case KEY_CMD_DEINIT:
2025                 for (i = 0; i < RTE_DIM(rss_keys); i++)
2026                         rss_keys[i] = KEY_STAT_UNSPEC;
2027
2028                 rss_keys_initialized = 0;
2029                 num_used_keys = 0;
2030         break;
2031
2032         default:
2033                 break;
2034         }
2035
2036         return err;
2037 }
2038
2039 /**
2040  * Add RSS hash calculations and queue selection
2041  *
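 * @param[in, out] flow
 *   Pointer to the flow to which the RSS BPF actions are attached.
 *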
2042  * @param[in, out] pmd
2043  *   Pointer to internal structure. Used to set/get RSS map fd
2044  *
2045  * @param[in] rss
2046  *   Pointer to RSS flow actions
2047  *
2048  * @param[out] error
2049  *   Pointer to error reporting if not NULL.
2050  *
2051  * @return 0 on success, negative value on failure
2052  */
2053 static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
2054                            const struct rte_flow_action_rss *rss,
2055                            struct rte_flow_error *error)
2056 {
2057         /* 4096 is the maximum number of instructions for a BPF program */
2058         int i;
2059         int err;
2060         struct rss_key rss_entry = { .hash_fields = 0,
2061                                      .key_size = 0 };
2062
2063         /* Get a new map key for a new RSS rule */
2064         err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
2065         if (err < 0) {
2066                 rte_flow_error_set(
2067                         error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2068                         "Failed to get BPF RSS key");
2069
2070                 return -1;
2071         }
2072
2073         /* Update RSS map entry with queues */
2074         rss_entry.nb_queues = rss->num;
2075         for (i = 0; i < rss->num; i++)
2076                 rss_entry.queues[i] = rss->queue[i];
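        /* Hash both IPv4 and IPv6 traffic on their L3/L4 header fields */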
2077         rss_entry.hash_fields =
2078                 (1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
2079
2080         /* Add this RSS entry to map */
2081         err = tap_flow_bpf_update_rss_elem(pmd->map_fd,
2082                                 &flow->key_idx, &rss_entry);
2083
2084         if (err) {
2085                 RTE_LOG(ERR, PMD,
2086                         "Failed to update BPF map entry #%u (%d): %s\n",
2087                         flow->key_idx, errno, strerror(errno));
2088                 rte_flow_error_set(
2089                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2090                         "Kernel too old or not configured "
2091                         "to support BPF maps updates");
2092
2093                 return -ENOTSUP;
2094         }
2095
2097         /*
2098          * Load the BPF program that computes the L3/L4 hash for this key_idx
2099          */
2101         flow->bpf_fd[SEC_L3_L4] =
2102                 tap_flow_bpf_calc_l3_l4_hash(flow->key_idx, pmd->map_fd);
2103         if (flow->bpf_fd[SEC_L3_L4] < 0) {
2104                 RTE_LOG(ERR, PMD,
2105                         "Failed to load BPF section %s (%d): %s\n",
2106                                 sec_name[SEC_L3_L4], errno, strerror(errno));
2107                 rte_flow_error_set(
2108                         error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
2109                         "Kernel too old or not configured "
2110                         "to support BPF program loading");
2111
2112                 return -ENOTSUP;
2113         }
2114
2115         /* Actions */
2116         {
2117                 struct action_data adata[] = {
2118                         {
2119                                 .id = "bpf",
2120                                 .bpf = {
2121                                         .bpf_fd = flow->bpf_fd[SEC_L3_L4],
2122                                         .annotation = sec_name[SEC_L3_L4],
2123                                         .bpf = {
2124                                                 .action = TC_ACT_PIPE,
2125                                         },
2126                                 },
2127                         },
2128                 };
2129
2130                 if (add_actions(flow, RTE_DIM(adata), adata,
2131                         TCA_FLOWER_ACT) < 0)
2132                         return -1;
2133         }
2134
2135         return 0;
2136 }
2137
2138 /**
2139  * Manage filter operations.
2140  *
2141  * @param dev
2142  *   Pointer to Ethernet device structure.
2143  * @param filter_type
2144  *   Filter type.
2145  * @param filter_op
2146  *   Operation to perform.
2147  * @param arg
2148  *   Pointer to operation-specific structure.
2149  *
2150  * @return
2151  *   0 on success, negative errno value on failure.
2152  */
2153 int
2154 tap_dev_filter_ctrl(struct rte_eth_dev *dev,
2155                     enum rte_filter_type filter_type,
2156                     enum rte_filter_op filter_op,
2157                     void *arg)
2158 {
2159         switch (filter_type) {
2160         case RTE_ETH_FILTER_GENERIC:
2161                 if (filter_op != RTE_ETH_FILTER_GET)
2162                         return -EINVAL;
2163                 *(const void **)arg = &tap_flow_ops;
2164                 return 0;
2165         default:
2166                 RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
2167                         (void *)dev, filter_type);
2168         }
2169         return -EINVAL;
2170 }
2171