net/mlx5: add TOS and TTL flower match and tunnel keys
[dpdk.git] / drivers / net / mlx5 / mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
18 #include <stdalign.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <sys/socket.h>
24
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
28 #include <rte_flow.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31
32 #include "mlx5.h"
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
35
36 #ifdef HAVE_TC_ACT_VLAN
37
38 #include <linux/tc_act/tc_vlan.h>
39
40 #else /* HAVE_TC_ACT_VLAN */
41
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
50
51 struct tc_vlan {
52         tc_gen;
53         int v_action;
54 };
55
56 #endif /* HAVE_TC_ACT_VLAN */
57
58 #ifdef HAVE_TC_ACT_PEDIT
59
60 #include <linux/tc_act/tc_pedit.h>
61
62 #else /* HAVE_TC_ACT_VLAN */
63
64 enum {
65         TCA_PEDIT_UNSPEC,
66         TCA_PEDIT_TM,
67         TCA_PEDIT_PARMS,
68         TCA_PEDIT_PAD,
69         TCA_PEDIT_PARMS_EX,
70         TCA_PEDIT_KEYS_EX,
71         TCA_PEDIT_KEY_EX,
72         __TCA_PEDIT_MAX
73 };
74
75 enum {
76         TCA_PEDIT_KEY_EX_HTYPE = 1,
77         TCA_PEDIT_KEY_EX_CMD = 2,
78         __TCA_PEDIT_KEY_EX_MAX
79 };
80
81 enum pedit_header_type {
82         TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83         TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84         TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85         TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86         TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87         TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
88         __PEDIT_HDR_TYPE_MAX,
89 };
90
91 enum pedit_cmd {
92         TCA_PEDIT_KEY_EX_CMD_SET = 0,
93         TCA_PEDIT_KEY_EX_CMD_ADD = 1,
94         __PEDIT_CMD_MAX,
95 };
96
97 struct tc_pedit_key {
98         __u32 mask; /* AND */
99         __u32 val; /*XOR */
100         __u32 off; /*offset */
101         __u32 at;
102         __u32 offmask;
103         __u32 shift;
104 };
105
106 __extension__
107 struct tc_pedit_sel {
108         tc_gen;
109         unsigned char nkeys;
110         unsigned char flags;
111         struct tc_pedit_key keys[0];
112 };
113
114 #endif /* HAVE_TC_ACT_VLAN */
115
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
117
118 #include <linux/tc_act/tc_tunnel_key.h>
119
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
122 #endif
123
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
126 #endif
127
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
130 #endif
131
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
134 #endif
135
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
137
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
151
152 struct tc_tunnel_key {
153         tc_gen;
154         int t_action;
155 };
156
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
158
/* Fallback for a constant normally found in linux/netlink.h. */
#if !defined(NETLINK_CAP_ACK)
#define NETLINK_CAP_ACK 10
#endif

/* Fallback for a constant normally found in linux/pkt_sched.h. */
#if !defined(TC_H_MIN_INGRESS)
#define TC_H_MIN_INGRESS 0xfff2u
#endif
168
/*
 * Fallbacks for constants normally found in linux/pkt_cls.h.
 * Each one is only defined when the matching guard macro is absent.
 * The flower attributes are listed in ascending attribute number.
 */

/* Classifier flags. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif

/* Generic tc attribute and action opcode. */
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

/* Flower attributes 3..9: actions, Ethernet, IP protocol. */
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif

/* Flower attributes 10..17: IPv4/IPv6 addresses. */
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif

/* Flower attributes 18..22: TCP/UDP ports and classifier flags. */
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif

/* Flower attributes 23..25: VLAN. */
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif

/* Flower attributes 26..34: tunnel (ENC) key ID and addresses. */
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif

/* Flower attributes 35..38: TCP/UDP port masks. */
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif

/* Flower attributes 43..46: tunnel (ENC) UDP ports. */
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif

/* Flower attributes 71..76: TCP flags, IP TOS and TTL. */
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
#define TCA_FLOWER_KEY_IP_TOS 73
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
#define TCA_FLOWER_KEY_IP_TOS_MASK 74
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
#define TCA_FLOWER_KEY_IP_TTL 75
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
#define TCA_FLOWER_KEY_IP_TTL_MASK 76
#endif

/* Flower attributes 80..83: tunnel (ENC) IP TOS and TTL. */
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
#define TCA_FLOWER_KEY_ENC_IP_TOS 80
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
#define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
#define TCA_FLOWER_KEY_ENC_IP_TTL 82
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
#define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
#endif
333
/* Field lengths in bytes, defined only when not provided elsewhere. */
#if !defined(IPV6_ADDR_LEN)
#define IPV6_ADDR_LEN 16
#endif

#if !defined(IPV4_ADDR_LEN)
#define IPV4_ADDR_LEN 4
#endif

#if !defined(TP_PORT_LEN)
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#if !defined(TTL_LEN)
#define TTL_LEN 1
#endif

#if !defined(TCA_ACT_MAX_PRIO)
#define TCA_ACT_MAX_PRIO 32
#endif
353
/** Default VNI of the VXLAN devices created by the driver. */
#define MLX5_VXLAN_DEFAULT_VNI  1
/** Name prefix of the VXLAN devices created by the driver. */
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"

/**
 * Tunnel action type, stored in the @p type field of
 * struct flow_tcf_tunnel_hdr.
 */
enum flow_tcf_tunact_type {
	FLOW_TCF_TUNACT_VXLAN_DECAP = 0,
	FLOW_TCF_TUNACT_VXLAN_ENCAP = 1,
};
363
/**
 * Bit flags for the @p mask field of the tunnel action encap
 * descriptors; one bit per encapsulation header field.
 */
#define FLOW_TCF_ENCAP_ETH_SRC   0x001u
#define FLOW_TCF_ENCAP_ETH_DST   0x002u
#define FLOW_TCF_ENCAP_IPV4_SRC  0x004u
#define FLOW_TCF_ENCAP_IPV4_DST  0x008u
#define FLOW_TCF_ENCAP_IPV6_SRC  0x010u
#define FLOW_TCF_ENCAP_IPV6_DST  0x020u
#define FLOW_TCF_ENCAP_UDP_SRC   0x040u
#define FLOW_TCF_ENCAP_UDP_DST   0x080u
#define FLOW_TCF_ENCAP_VXLAN_VNI 0x100u
374
/**
 * Netlink context.
 *
 * The message buffer is sized to MNL_SOCKET_BUFFER_SIZE; using this (8KB)
 * size ensures that netlink messages will never be truncated.
 */
struct mlx5_flow_tcf_context {
	/** NETLINK_ROUTE libmnl socket. */
	struct mnl_socket *nl;
	/** Message sequence number. */
	uint32_t seq;
	/** Message buffer size. */
	uint32_t buf_size;
	/** Message buffer. */
	uint8_t *buf;
};
387
388 /**
389  * Neigh rule structure. The neigh rule is applied via Netlink to
390  * outer tunnel iface in order to provide destination MAC address
391  * for the VXLAN encapsultion. The neigh rule is implicitly related
392  * to the Flow itself and can be shared by multiple Flows.
393  */
394 struct tcf_neigh_rule {
395         LIST_ENTRY(tcf_neigh_rule) next;
396         uint32_t refcnt;
397         struct ether_addr eth;
398         uint16_t mask;
399         union {
400                 struct {
401                         rte_be32_t dst;
402                 } ipv4;
403                 struct {
404                         uint8_t dst[IPV6_ADDR_LEN];
405                 } ipv6;
406         };
407 };
408
409 /**
410  * Local rule structure. The local rule is applied via Netlink to
411  * outer tunnel iface in order to provide local and peer IP addresses
412  * of the VXLAN tunnel for encapsulation. The local rule is implicitly
413  * related to the Flow itself and can be shared by multiple Flows.
414  */
415 struct tcf_local_rule {
416         LIST_ENTRY(tcf_local_rule) next;
417         uint32_t refcnt;
418         uint16_t mask;
419         union {
420                 struct {
421                         rte_be32_t dst;
422                         rte_be32_t src;
423                 } ipv4;
424                 struct {
425                         uint8_t dst[IPV6_ADDR_LEN];
426                         uint8_t src[IPV6_ADDR_LEN];
427                 } ipv6;
428         };
429 };
430
431 /** Outer interface VXLAN encapsulation rules container. */
432 struct tcf_irule {
433         LIST_ENTRY(tcf_irule) next;
434         LIST_HEAD(, tcf_neigh_rule) neigh;
435         LIST_HEAD(, tcf_local_rule) local;
436         uint32_t refcnt;
437         unsigned int ifouter; /**< Own interface index. */
438 };
439
440 /** VXLAN virtual netdev. */
441 struct tcf_vtep {
442         LIST_ENTRY(tcf_vtep) next;
443         uint32_t refcnt;
444         unsigned int ifindex; /**< Own interface index. */
445         uint16_t port;
446         uint8_t created;
447 };
448
/**
 * Tunnel descriptor header, placed first in every tunnel-type specific
 * descriptor.
 */
struct flow_tcf_tunnel_hdr {
	/** Tunnel action type. */
	uint32_t type;
	/** Virtual tunnel endpoint device. */
	struct tcf_vtep *vtep;
	/** Original dst/src interface. */
	unsigned int ifindex_org;
	/** Interface ptr in message. */
	unsigned int *ifindex_ptr;
};
456
457 struct flow_tcf_vxlan_decap {
458         struct flow_tcf_tunnel_hdr hdr;
459         uint16_t udp_port;
460 };
461
462 struct flow_tcf_vxlan_encap {
463         struct flow_tcf_tunnel_hdr hdr;
464         struct tcf_irule *iface;
465         uint32_t mask;
466         struct {
467                 struct ether_addr dst;
468                 struct ether_addr src;
469         } eth;
470         union {
471                 struct {
472                         rte_be32_t dst;
473                         rte_be32_t src;
474                 } ipv4;
475                 struct {
476                         uint8_t dst[IPV6_ADDR_LEN];
477                         uint8_t src[IPV6_ADDR_LEN];
478                 } ipv6;
479         };
480         struct {
481                 rte_be16_t src;
482                 rte_be16_t dst;
483         } udp;
484         struct {
485                 uint8_t vni[3];
486         } vxlan;
487 };
488
489 /** Structure used when extracting the values of a flow counters
490  * from a netlink message.
491  */
492 struct flow_tcf_stats_basic {
493         bool valid;
494         struct gnet_stats_basic counters;
495 };
496
497 /** Empty masks for known item types. */
498 static const union {
499         struct rte_flow_item_port_id port_id;
500         struct rte_flow_item_eth eth;
501         struct rte_flow_item_vlan vlan;
502         struct rte_flow_item_ipv4 ipv4;
503         struct rte_flow_item_ipv6 ipv6;
504         struct rte_flow_item_tcp tcp;
505         struct rte_flow_item_udp udp;
506         struct rte_flow_item_vxlan vxlan;
507 } flow_tcf_mask_empty = {
508         {0},
509 };
510
511 /** Supported masks for known item types. */
512 static const struct {
513         struct rte_flow_item_port_id port_id;
514         struct rte_flow_item_eth eth;
515         struct rte_flow_item_vlan vlan;
516         struct rte_flow_item_ipv4 ipv4;
517         struct rte_flow_item_ipv6 ipv6;
518         struct rte_flow_item_tcp tcp;
519         struct rte_flow_item_udp udp;
520         struct rte_flow_item_vxlan vxlan;
521 } flow_tcf_mask_supported = {
522         .port_id = {
523                 .id = 0xffffffff,
524         },
525         .eth = {
526                 .type = RTE_BE16(0xffff),
527                 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
528                 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
529         },
530         .vlan = {
531                 /* PCP and VID only, no DEI. */
532                 .tci = RTE_BE16(0xefff),
533                 .inner_type = RTE_BE16(0xffff),
534         },
535         .ipv4.hdr = {
536                 .next_proto_id = 0xff,
537                 .src_addr = RTE_BE32(0xffffffff),
538                 .dst_addr = RTE_BE32(0xffffffff),
539         },
540         .ipv6.hdr = {
541                 .proto = 0xff,
542                 .src_addr =
543                         "\xff\xff\xff\xff\xff\xff\xff\xff"
544                         "\xff\xff\xff\xff\xff\xff\xff\xff",
545                 .dst_addr =
546                         "\xff\xff\xff\xff\xff\xff\xff\xff"
547                         "\xff\xff\xff\xff\xff\xff\xff\xff",
548         },
549         .tcp.hdr = {
550                 .src_port = RTE_BE16(0xffff),
551                 .dst_port = RTE_BE16(0xffff),
552                 .tcp_flags = 0xff,
553         },
554         .udp.hdr = {
555                 .src_port = RTE_BE16(0xffff),
556                 .dst_port = RTE_BE16(0xffff),
557         },
558         .vxlan = {
559                .vni = "\xff\xff\xff",
560         },
561 };
562
/* Netlink attribute size helpers; every result is MNL_ALIGN()ed. */

/** Size of a bare attribute header. */
#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
/** Size of a nest attribute: a header with no payload of its own. */
#define SZ_NLATTR_NEST MNL_ALIGN(sizeof(struct nlattr))
/** Size of an attribute carrying @p len payload bytes. */
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
/** Size of an attribute carrying one object of type @p typ. */
#define SZ_NLATTR_TYPE_OF(typ) MNL_ALIGN(SZ_NLATTR_HDR + (sizeof(typ)))
/** Size of an attribute carrying string @p str and its terminating NUL. */
#define SZ_NLATTR_STRZ_OF(str) MNL_ALIGN(SZ_NLATTR_HDR + (strlen(str) + 1))

/*
 * Value returned by mlx5_dev_to_port_id() for a NULL/zero-length list,
 * plus two.
 * NOTE(review): presumably the port count plus spare table entries -
 * confirm against the users of struct flow_tcf_ptoi tables.
 */
#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
570
/** One entry of a DPDK port to network interface index (ifindex) map. */
struct flow_tcf_ptoi {
	/** DPDK port ID. */
	uint16_t port_id;
	/** Network interface index. */
	unsigned int ifindex;
};
576
/* Upper bound for group IDs, due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Upper bound for rte_flow priorities.
 *
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in
 * kernel. Priority in the rte_flow attribute starts from 0 and is
 * increased by 1 in translation. This is subject to change: once the
 * restriction is lifted or the range is extended, the max priority could
 * be determined by trial-and-error like the Verbs driver does.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

/* Fate actions: drop, port redirection, jump. */
#define MLX5_TCF_FATE_ACTIONS \
	(MLX5_FLOW_ACTION_DROP | \
	 MLX5_FLOW_ACTION_PORT_ID | \
	 MLX5_FLOW_ACTION_JUMP)

/* VLAN pop/push/set actions. */
#define MLX5_TCF_VLAN_ACTIONS \
	(MLX5_FLOW_ACTION_OF_POP_VLAN | \
	 MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

/* VXLAN encapsulation/decapsulation actions. */
#define MLX5_TCF_VXLAN_ACTIONS \
	(MLX5_FLOW_ACTION_VXLAN_ENCAP | \
	 MLX5_FLOW_ACTION_VXLAN_DECAP)

/* Header rewrite actions, all carried by a tc-pedit action. */
#define MLX5_TCF_PEDIT_ACTIONS \
	(MLX5_FLOW_ACTION_SET_IPV4_SRC | \
	 MLX5_FLOW_ACTION_SET_IPV4_DST | \
	 MLX5_FLOW_ACTION_SET_IPV6_SRC | \
	 MLX5_FLOW_ACTION_SET_IPV6_DST | \
	 MLX5_FLOW_ACTION_SET_TP_SRC | \
	 MLX5_FLOW_ACTION_SET_TP_DST | \
	 MLX5_FLOW_ACTION_SET_TTL | \
	 MLX5_FLOW_ACTION_DEC_TTL | \
	 MLX5_FLOW_ACTION_SET_MAC_SRC | \
	 MLX5_FLOW_ACTION_SET_MAC_DST)

/*
 * NOTE(review): presumably the actions that require a configuration
 * structure - confirm. All pedit actions except DEC_TTL are included.
 */
#define MLX5_TCF_CONFIG_ACTIONS \
	(MLX5_FLOW_ACTION_PORT_ID | \
	 MLX5_FLOW_ACTION_JUMP | \
	 MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
	 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
612
/** Capacity of the key arrays in struct pedit_parser. */
#define MAX_PEDIT_KEYS 128
/** Number of bytes covered by one pedit key (its 32-bit value). */
#define SZ_PEDIT_KEY_VAL 4

/**
 * Number of pedit keys needed to cover a field of @p sz bytes, i.e.
 * @p sz divided by SZ_PEDIT_KEY_VAL and rounded up.
 *
 * @p sz is evaluated exactly once, so an argument with side effects is
 * safe. It must be non-negative.
 */
#define NUM_OF_PEDIT_KEYS(sz) \
	(((sz) + SZ_PEDIT_KEY_VAL - 1) / SZ_PEDIT_KEY_VAL)
618
619 struct pedit_key_ex {
620         enum pedit_header_type htype;
621         enum pedit_cmd cmd;
622 };
623
624 struct pedit_parser {
625         struct tc_pedit_sel sel;
626         struct tc_pedit_key keys[MAX_PEDIT_KEYS];
627         struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
628 };
629
630 /**
631  * Create space for using the implicitly created TC flow counter.
632  *
633  * @param[in] dev
634  *   Pointer to the Ethernet device structure.
635  *
636  * @return
637  *   A pointer to the counter data structure, NULL otherwise and
638  *   rte_errno is set.
639  */
640 static struct mlx5_flow_counter *
641 flow_tcf_counter_new(void)
642 {
643         struct mlx5_flow_counter *cnt;
644
645         /*
646          * eswitch counter cannot be shared and its id is unknown.
647          * currently returning all with id 0.
648          * in the future maybe better to switch to unique numbers.
649          */
650         struct mlx5_flow_counter tmpl = {
651                 .ref_cnt = 1,
652         };
653         cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
654         if (!cnt) {
655                 rte_errno = ENOMEM;
656                 return NULL;
657         }
658         *cnt = tmpl;
659         /* Implicit counter, do not add to list. */
660         return cnt;
661 }
662
663 /**
664  * Set pedit key of MAC address
665  *
666  * @param[in] actions
667  *   pointer to action specification
668  * @param[in,out] p_parser
669  *   pointer to pedit_parser
670  */
671 static void
672 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
673                            struct pedit_parser *p_parser)
674 {
675         int idx = p_parser->sel.nkeys;
676         uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
677                                         offsetof(struct ether_hdr, s_addr) :
678                                         offsetof(struct ether_hdr, d_addr);
679         const struct rte_flow_action_set_mac *conf =
680                 (const struct rte_flow_action_set_mac *)actions->conf;
681
682         p_parser->keys[idx].off = off;
683         p_parser->keys[idx].mask = ~UINT32_MAX;
684         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
685         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
686         memcpy(&p_parser->keys[idx].val,
687                 conf->mac_addr, SZ_PEDIT_KEY_VAL);
688         idx++;
689         p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
690         p_parser->keys[idx].mask = 0xFFFF0000;
691         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
692         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
693         memcpy(&p_parser->keys[idx].val,
694                 conf->mac_addr + SZ_PEDIT_KEY_VAL,
695                 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
696         p_parser->sel.nkeys = (++idx);
697 }
698
699 /**
700  * Set pedit key of decrease/set ttl
701  *
702  * @param[in] actions
703  *   pointer to action specification
704  * @param[in,out] p_parser
705  *   pointer to pedit_parser
706  * @param[in] item_flags
707  *   flags of all items presented
708  */
709 static void
710 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
711                                 struct pedit_parser *p_parser,
712                                 uint64_t item_flags)
713 {
714         int idx = p_parser->sel.nkeys;
715
716         p_parser->keys[idx].mask = 0xFFFFFF00;
717         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
718                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
719                 p_parser->keys[idx].off =
720                         offsetof(struct ipv4_hdr, time_to_live);
721         }
722         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
723                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
724                 p_parser->keys[idx].off =
725                         offsetof(struct ipv6_hdr, hop_limits);
726         }
727         if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
728                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
729                 p_parser->keys[idx].val = 0x000000FF;
730         } else {
731                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
732                 p_parser->keys[idx].val =
733                         (__u32)((const struct rte_flow_action_set_ttl *)
734                          actions->conf)->ttl_value;
735         }
736         p_parser->sel.nkeys = (++idx);
737 }
738
739 /**
740  * Set pedit key of transport (TCP/UDP) port value
741  *
742  * @param[in] actions
743  *   pointer to action specification
744  * @param[in,out] p_parser
745  *   pointer to pedit_parser
746  * @param[in] item_flags
747  *   flags of all items presented
748  */
749 static void
750 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
751                                 struct pedit_parser *p_parser,
752                                 uint64_t item_flags)
753 {
754         int idx = p_parser->sel.nkeys;
755
756         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
757                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
758         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
759                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
760         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
761         /* offset of src/dst port is same for TCP and UDP */
762         p_parser->keys[idx].off =
763                 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
764                 offsetof(struct tcp_hdr, src_port) :
765                 offsetof(struct tcp_hdr, dst_port);
766         p_parser->keys[idx].mask = 0xFFFF0000;
767         p_parser->keys[idx].val =
768                 (__u32)((const struct rte_flow_action_set_tp *)
769                                 actions->conf)->port;
770         p_parser->sel.nkeys = (++idx);
771 }
772
773 /**
774  * Set pedit key of ipv6 address
775  *
776  * @param[in] actions
777  *   pointer to action specification
778  * @param[in,out] p_parser
779  *   pointer to pedit_parser
780  */
781 static void
782 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
783                                  struct pedit_parser *p_parser)
784 {
785         int idx = p_parser->sel.nkeys;
786         int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
787         int off_base =
788                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
789                 offsetof(struct ipv6_hdr, src_addr) :
790                 offsetof(struct ipv6_hdr, dst_addr);
791         const struct rte_flow_action_set_ipv6 *conf =
792                 (const struct rte_flow_action_set_ipv6 *)actions->conf;
793
794         for (int i = 0; i < keys; i++, idx++) {
795                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
796                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
797                 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
798                 p_parser->keys[idx].mask = ~UINT32_MAX;
799                 memcpy(&p_parser->keys[idx].val,
800                         conf->ipv6_addr + i *  SZ_PEDIT_KEY_VAL,
801                         SZ_PEDIT_KEY_VAL);
802         }
803         p_parser->sel.nkeys += keys;
804 }
805
806 /**
807  * Set pedit key of ipv4 address
808  *
809  * @param[in] actions
810  *   pointer to action specification
811  * @param[in,out] p_parser
812  *   pointer to pedit_parser
813  */
814 static void
815 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
816                                  struct pedit_parser *p_parser)
817 {
818         int idx = p_parser->sel.nkeys;
819
820         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
821         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
822         p_parser->keys[idx].off =
823                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
824                 offsetof(struct ipv4_hdr, src_addr) :
825                 offsetof(struct ipv4_hdr, dst_addr);
826         p_parser->keys[idx].mask = ~UINT32_MAX;
827         p_parser->keys[idx].val =
828                 ((const struct rte_flow_action_set_ipv4 *)
829                  actions->conf)->ipv4_addr;
830         p_parser->sel.nkeys = (++idx);
831 }
832
833 /**
834  * Create the pedit's na attribute in netlink message
835  * on pre-allocate message buffer
836  *
837  * @param[in,out] nl
838  *   pointer to pre-allocated netlink message buffer
839  * @param[in,out] actions
840  *   pointer to pointer of actions specification.
841  * @param[in,out] action_flags
842  *   pointer to actions flags
843  * @param[in] item_flags
844  *   flags of all item presented
845  */
846 static void
847 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
848                               const struct rte_flow_action **actions,
849                               uint64_t item_flags)
850 {
851         struct pedit_parser p_parser;
852         struct nlattr *na_act_options;
853         struct nlattr *na_pedit_keys;
854
855         memset(&p_parser, 0, sizeof(p_parser));
856         mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
857         na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
858         /* all modify header actions should be in one tc-pedit action */
859         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
860                 switch ((*actions)->type) {
861                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
862                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
863                         flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
864                         break;
865                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
866                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
867                         flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
868                         break;
869                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
870                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
871                         flow_tcf_pedit_key_set_tp_port(*actions,
872                                                         &p_parser, item_flags);
873                         break;
874                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
875                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
876                         flow_tcf_pedit_key_set_dec_ttl(*actions,
877                                                         &p_parser, item_flags);
878                         break;
879                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
880                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
881                         flow_tcf_pedit_key_set_mac(*actions, &p_parser);
882                         break;
883                 default:
884                         goto pedit_mnl_msg_done;
885                 }
886         }
887 pedit_mnl_msg_done:
888         p_parser.sel.action = TC_ACT_PIPE;
889         mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
890                      sizeof(p_parser.sel) +
891                      p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
892                      &p_parser);
893         na_pedit_keys =
894                 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
895         for (int i = 0; i < p_parser.sel.nkeys; i++) {
896                 struct nlattr *na_pedit_key =
897                         mnl_attr_nest_start(nl,
898                                             TCA_PEDIT_KEY_EX | NLA_F_NESTED);
899                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
900                                  p_parser.keys_ex[i].htype);
901                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
902                                  p_parser.keys_ex[i].cmd);
903                 mnl_attr_nest_end(nl, na_pedit_key);
904         }
905         mnl_attr_nest_end(nl, na_pedit_keys);
906         mnl_attr_nest_end(nl, na_act_options);
907         (*actions)--;
908 }
909
910 /**
911  * Calculate max memory size of one TC-pedit actions.
912  * One TC-pedit action can contain set of keys each defining
913  * a rewrite element (rte_flow action)
914  *
915  * @param[in,out] actions
916  *   actions specification.
917  * @param[in,out] action_flags
918  *   actions flags
919  * @param[in,out] size
920  *   accumulated size
921  * @return
922  *   Max memory size of one TC-pedit action
923  */
924 static int
925 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
926                                 uint64_t *action_flags)
927 {
928         int pedit_size = 0;
929         int keys = 0;
930         uint64_t flags = 0;
931
932         pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
933                       SZ_NLATTR_STRZ_OF("pedit") +
934                       SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
935         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
936                 switch ((*actions)->type) {
937                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
938                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
939                         flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
940                         break;
941                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
942                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
943                         flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
944                         break;
945                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
946                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
947                         flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
948                         break;
949                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
950                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
951                         flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
952                         break;
953                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
954                         /* TCP is as same as UDP */
955                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
956                         flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
957                         break;
958                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
959                         /* TCP is as same as UDP */
960                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
961                         flags |= MLX5_FLOW_ACTION_SET_TP_DST;
962                         break;
963                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
964                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
965                         flags |= MLX5_FLOW_ACTION_SET_TTL;
966                         break;
967                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
968                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
969                         flags |= MLX5_FLOW_ACTION_DEC_TTL;
970                         break;
971                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
972                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
973                         flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
974                         break;
975                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
976                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
977                         flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
978                         break;
979                 default:
980                         goto get_pedit_action_size_done;
981                 }
982         }
983 get_pedit_action_size_done:
984         /* TCA_PEDIT_PARAMS_EX */
985         pedit_size +=
986                 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
987                                   keys * sizeof(struct tc_pedit_key));
988         pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
989         pedit_size += keys *
990                       /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
991                       (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
992                        SZ_NLATTR_DATA_OF(2));
993         (*action_flags) |= flags;
994         (*actions)--;
995         return pedit_size;
996 }
997
998 /**
999  * Retrieve mask for pattern item.
1000  *
1001  * This function does basic sanity checks on a pattern item in order to
1002  * return the most appropriate mask for it.
1003  *
1004  * @param[in] item
1005  *   Item specification.
1006  * @param[in] mask_default
1007  *   Default mask for pattern item as specified by the flow API.
1008  * @param[in] mask_supported
1009  *   Mask fields supported by the implementation.
1010  * @param[in] mask_empty
1011  *   Empty mask to return when there is no specification.
1012  * @param[out] error
1013  *   Perform verbose error reporting if not NULL.
1014  *
1015  * @return
1016  *   Either @p item->mask or one of the mask parameters on success, NULL
1017  *   otherwise and rte_errno is set.
1018  */
1019 static const void *
1020 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1021                    const void *mask_supported, const void *mask_empty,
1022                    size_t mask_size, struct rte_flow_error *error)
1023 {
1024         const uint8_t *mask;
1025         size_t i;
1026
1027         /* item->last and item->mask cannot exist without item->spec. */
1028         if (!item->spec && (item->mask || item->last)) {
1029                 rte_flow_error_set(error, EINVAL,
1030                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
1031                                    "\"mask\" or \"last\" field provided without"
1032                                    " a corresponding \"spec\"");
1033                 return NULL;
1034         }
1035         /* No spec, no mask, no problem. */
1036         if (!item->spec)
1037                 return mask_empty;
1038         mask = item->mask ? item->mask : mask_default;
1039         assert(mask);
1040         /*
1041          * Single-pass check to make sure that:
1042          * - Mask is supported, no bits are set outside mask_supported.
1043          * - Both item->spec and item->last are included in mask.
1044          */
1045         for (i = 0; i != mask_size; ++i) {
1046                 if (!mask[i])
1047                         continue;
1048                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1049                     ((const uint8_t *)mask_supported)[i]) {
1050                         rte_flow_error_set(error, ENOTSUP,
1051                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1052                                            "unsupported field found"
1053                                            " in \"mask\"");
1054                         return NULL;
1055                 }
1056                 if (item->last &&
1057                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
1058                     (((const uint8_t *)item->last)[i] & mask[i])) {
1059                         rte_flow_error_set(error, EINVAL,
1060                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1061                                            item->last,
1062                                            "range between \"spec\" and \"last\""
1063                                            " not comprised in \"mask\"");
1064                         return NULL;
1065                 }
1066         }
1067         return mask;
1068 }
1069
1070 /**
1071  * Build a conversion table between port ID and ifindex.
1072  *
1073  * @param[in] dev
1074  *   Pointer to Ethernet device.
1075  * @param[out] ptoi
1076  *   Pointer to ptoi table.
1077  * @param[in] len
1078  *   Size of ptoi table provided.
1079  *
1080  * @return
1081  *   Size of ptoi table filled.
1082  */
1083 static unsigned int
1084 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1085                           unsigned int len)
1086 {
1087         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1088         uint16_t port_id[n + 1];
1089         unsigned int i;
1090         unsigned int own = 0;
1091
1092         /* At least one port is needed when no switch domain is present. */
1093         if (!n) {
1094                 n = 1;
1095                 port_id[0] = dev->data->port_id;
1096         } else {
1097                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1098         }
1099         if (n > len)
1100                 return 0;
1101         for (i = 0; i != n; ++i) {
1102                 struct rte_eth_dev_info dev_info;
1103
1104                 rte_eth_dev_info_get(port_id[i], &dev_info);
1105                 if (port_id[i] == dev->data->port_id)
1106                         own = i;
1107                 ptoi[i].port_id = port_id[i];
1108                 ptoi[i].ifindex = dev_info.if_index;
1109         }
1110         /* Ensure first entry of ptoi[] is the current device. */
1111         if (own) {
1112                 ptoi[n] = ptoi[0];
1113                 ptoi[0] = ptoi[own];
1114                 ptoi[own] = ptoi[n];
1115         }
1116         /* An entry with zero ifindex terminates ptoi[]. */
1117         ptoi[n].port_id = 0;
1118         ptoi[n].ifindex = 0;
1119         return n;
1120 }
1121
1122 /**
1123  * Verify the @p attr will be correctly understood by the E-switch.
1124  *
1125  * @param[in] attr
1126  *   Pointer to flow attributes
1127  * @param[out] error
1128  *   Pointer to error structure.
1129  *
1130  * @return
1131  *   0 on success, a negative errno value otherwise and rte_errno is set.
1132  */
1133 static int
1134 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1135                              struct rte_flow_error *error)
1136 {
1137         /*
1138          * Supported attributes: groups, some priorities and ingress only.
1139          * group is supported only if kernel supports chain. Don't care about
1140          * transfer as it is the caller's problem.
1141          */
1142         if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1143                 return rte_flow_error_set(error, ENOTSUP,
1144                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1145                                           "group ID larger than "
1146                                           RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1147                                           " isn't supported");
1148         else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1149                 return rte_flow_error_set(error, ENOTSUP,
1150                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1151                                           attr,
1152                                           "priority more than "
1153                                           RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1154                                           " is not supported");
1155         if (!attr->ingress)
1156                 return rte_flow_error_set(error, EINVAL,
1157                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1158                                           attr, "only ingress is supported");
1159         if (attr->egress)
1160                 return rte_flow_error_set(error, ENOTSUP,
1161                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1162                                           attr, "egress is not supported");
1163         return 0;
1164 }
1165
1166 /**
1167  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1168  * The routine checks the L2 fields to be used in encapsulation header.
1169  *
1170  * @param[in] item
1171  *   Pointer to the item structure.
1172  * @param[out] error
1173  *   Pointer to the error structure.
1174  *
1175  * @return
1176  *   0 on success, a negative errno value otherwise and rte_errno is set.
1177  **/
1178 static int
1179 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1180                                   struct rte_flow_error *error)
1181 {
1182         const struct rte_flow_item_eth *spec = item->spec;
1183         const struct rte_flow_item_eth *mask = item->mask;
1184
1185         if (!spec) {
1186                 /*
1187                  * Specification for L2 addresses can be empty
1188                  * because these ones are optional and not
1189                  * required directly by tc rule. Kernel tries
1190                  * to resolve these ones on its own
1191                  */
1192                 return 0;
1193         }
1194         if (!mask) {
1195                 /* If mask is not specified use the default one. */
1196                 mask = &rte_flow_item_eth_mask;
1197         }
1198         if (memcmp(&mask->dst,
1199                    &flow_tcf_mask_empty.eth.dst,
1200                    sizeof(flow_tcf_mask_empty.eth.dst))) {
1201                 if (memcmp(&mask->dst,
1202                            &rte_flow_item_eth_mask.dst,
1203                            sizeof(rte_flow_item_eth_mask.dst)))
1204                         return rte_flow_error_set
1205                                 (error, ENOTSUP,
1206                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1207                                  "no support for partial mask on"
1208                                  " \"eth.dst\" field");
1209         }
1210         if (memcmp(&mask->src,
1211                    &flow_tcf_mask_empty.eth.src,
1212                    sizeof(flow_tcf_mask_empty.eth.src))) {
1213                 if (memcmp(&mask->src,
1214                            &rte_flow_item_eth_mask.src,
1215                            sizeof(rte_flow_item_eth_mask.src)))
1216                         return rte_flow_error_set
1217                                 (error, ENOTSUP,
1218                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1219                                  "no support for partial mask on"
1220                                  " \"eth.src\" field");
1221         }
1222         if (mask->type != RTE_BE16(0x0000)) {
1223                 if (mask->type != RTE_BE16(0xffff))
1224                         return rte_flow_error_set
1225                                 (error, ENOTSUP,
1226                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1227                                  "no support for partial mask on"
1228                                  " \"eth.type\" field");
1229                 DRV_LOG(WARNING,
1230                         "outer ethernet type field"
1231                         " cannot be forced for vxlan"
1232                         " encapsulation, parameter ignored");
1233         }
1234         return 0;
1235 }
1236
1237 /**
1238  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1239  * The routine checks the IPv4 fields to be used in encapsulation header.
1240  *
1241  * @param[in] item
1242  *   Pointer to the item structure.
1243  * @param[out] error
1244  *   Pointer to the error structure.
1245  *
1246  * @return
1247  *   0 on success, a negative errno value otherwise and rte_errno is set.
1248  **/
1249 static int
1250 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1251                                    struct rte_flow_error *error)
1252 {
1253         const struct rte_flow_item_ipv4 *spec = item->spec;
1254         const struct rte_flow_item_ipv4 *mask = item->mask;
1255
1256         if (!spec) {
1257                 /*
1258                  * Specification for IP addresses cannot be empty
1259                  * because it is required by tunnel_key parameter.
1260                  */
1261                 return rte_flow_error_set(error, EINVAL,
1262                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1263                                           "NULL outer ipv4 address"
1264                                           " specification for vxlan"
1265                                           " encapsulation");
1266         }
1267         if (!mask)
1268                 mask = &rte_flow_item_ipv4_mask;
1269         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1270                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1271                         return rte_flow_error_set
1272                                 (error, ENOTSUP,
1273                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1274                                  "no support for partial mask on"
1275                                  " \"ipv4.hdr.dst_addr\" field"
1276                                  " for vxlan encapsulation");
1277                 /* More IPv4 address validations can be put here. */
1278         } else {
1279                 /*
1280                  * Kernel uses the destination IP address to determine
1281                  * the routing path and obtain the MAC destination
1282                  * address, so IP destination address must be
1283                  * specified in the tc rule.
1284                  */
1285                 return rte_flow_error_set(error, EINVAL,
1286                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1287                                           "outer ipv4 destination address"
1288                                           " must be specified for"
1289                                           " vxlan encapsulation");
1290         }
1291         if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1292                 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1293                         return rte_flow_error_set
1294                                 (error, ENOTSUP,
1295                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1296                                  "no support for partial mask on"
1297                                  " \"ipv4.hdr.src_addr\" field"
1298                                  " for vxlan encapsulation");
1299                 /* More IPv4 address validations can be put here. */
1300         } else {
1301                 /*
1302                  * Kernel uses the source IP address to select the
1303                  * interface for egress encapsulated traffic, so
1304                  * it must be specified in the tc rule.
1305                  */
1306                 return rte_flow_error_set(error, EINVAL,
1307                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1308                                           "outer ipv4 source address"
1309                                           " must be specified for"
1310                                           " vxlan encapsulation");
1311         }
1312         return 0;
1313 }
1314
1315 /**
1316  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1317  * The routine checks the IPv6 fields to be used in encapsulation header.
1318  *
1319  * @param[in] item
1320  *   Pointer to the item structure.
1321  * @param[out] error
1322  *   Pointer to the error structure.
1323  *
1324  * @return
1325  *   0 on success, a negative errno value otherwise and rte_errno is set.
1326  **/
1327 static int
1328 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1329                                    struct rte_flow_error *error)
1330 {
1331         const struct rte_flow_item_ipv6 *spec = item->spec;
1332         const struct rte_flow_item_ipv6 *mask = item->mask;
1333
1334         if (!spec) {
1335                 /*
1336                  * Specification for IP addresses cannot be empty
1337                  * because it is required by tunnel_key parameter.
1338                  */
1339                 return rte_flow_error_set(error, EINVAL,
1340                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1341                                           "NULL outer ipv6 address"
1342                                           " specification for"
1343                                           " vxlan encapsulation");
1344         }
1345         if (!mask)
1346                 mask = &rte_flow_item_ipv6_mask;
1347         if (memcmp(&mask->hdr.dst_addr,
1348                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1349                    IPV6_ADDR_LEN)) {
1350                 if (memcmp(&mask->hdr.dst_addr,
1351                            &rte_flow_item_ipv6_mask.hdr.dst_addr,
1352                            IPV6_ADDR_LEN))
1353                         return rte_flow_error_set
1354                                         (error, ENOTSUP,
1355                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1356                                          "no support for partial mask on"
1357                                          " \"ipv6.hdr.dst_addr\" field"
1358                                          " for vxlan encapsulation");
1359                 /* More IPv6 address validations can be put here. */
1360         } else {
1361                 /*
1362                  * Kernel uses the destination IP address to determine
1363                  * the routing path and obtain the MAC destination
1364                  * address (heigh or gate), so IP destination address
1365                  * must be specified within the tc rule.
1366                  */
1367                 return rte_flow_error_set(error, EINVAL,
1368                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1369                                           "outer ipv6 destination address"
1370                                           " must be specified for"
1371                                           " vxlan encapsulation");
1372         }
1373         if (memcmp(&mask->hdr.src_addr,
1374                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1375                    IPV6_ADDR_LEN)) {
1376                 if (memcmp(&mask->hdr.src_addr,
1377                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1378                            IPV6_ADDR_LEN))
1379                         return rte_flow_error_set
1380                                         (error, ENOTSUP,
1381                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1382                                          "no support for partial mask on"
1383                                          " \"ipv6.hdr.src_addr\" field"
1384                                          " for vxlan encapsulation");
1385                 /* More L3 address validation can be put here. */
1386         } else {
1387                 /*
1388                  * Kernel uses the source IP address to select the
1389                  * interface for egress encapsulated traffic, so
1390                  * it must be specified in the tc rule.
1391                  */
1392                 return rte_flow_error_set(error, EINVAL,
1393                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1394                                           "outer L3 source address"
1395                                           " must be specified for"
1396                                           " vxlan encapsulation");
1397         }
1398         return 0;
1399 }
1400
1401 /**
1402  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1403  * The routine checks the UDP fields to be used in encapsulation header.
1404  *
1405  * @param[in] item
1406  *   Pointer to the item structure.
1407  * @param[out] error
1408  *   Pointer to the error structure.
1409  *
1410  * @return
1411  *   0 on success, a negative errno value otherwise and rte_errno is set.
1412  **/
1413 static int
1414 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1415                                   struct rte_flow_error *error)
1416 {
1417         const struct rte_flow_item_udp *spec = item->spec;
1418         const struct rte_flow_item_udp *mask = item->mask;
1419
1420         if (!spec) {
1421                 /*
1422                  * Specification for UDP ports cannot be empty
1423                  * because it is required by tunnel_key parameter.
1424                  */
1425                 return rte_flow_error_set(error, EINVAL,
1426                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1427                                           "NULL UDP port specification "
1428                                           " for vxlan encapsulation");
1429         }
1430         if (!mask)
1431                 mask = &rte_flow_item_udp_mask;
1432         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1433                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1434                         return rte_flow_error_set
1435                                         (error, ENOTSUP,
1436                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1437                                          "no support for partial mask on"
1438                                          " \"udp.hdr.dst_port\" field"
1439                                          " for vxlan encapsulation");
1440                 if (!spec->hdr.dst_port)
1441                         return rte_flow_error_set
1442                                         (error, EINVAL,
1443                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1444                                          "outer UDP remote port cannot be"
1445                                          " 0 for vxlan encapsulation");
1446         } else {
1447                 return rte_flow_error_set(error, EINVAL,
1448                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1449                                           "outer UDP remote port"
1450                                           " must be specified for"
1451                                           " vxlan encapsulation");
1452         }
1453         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1454                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1455                         return rte_flow_error_set
1456                                         (error, ENOTSUP,
1457                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1458                                          "no support for partial mask on"
1459                                          " \"udp.hdr.src_port\" field"
1460                                          " for vxlan encapsulation");
1461                 DRV_LOG(WARNING,
1462                         "outer UDP source port cannot be"
1463                         " forced for vxlan encapsulation,"
1464                         " parameter ignored");
1465         }
1466         return 0;
1467 }
1468
1469 /**
1470  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1471  * The routine checks the VNIP fields to be used in encapsulation header.
1472  *
1473  * @param[in] item
1474  *   Pointer to the item structure.
1475  * @param[out] error
1476  *   Pointer to the error structure.
1477  *
1478  * @return
1479  *   0 on success, a negative errno value otherwise and rte_errno is set.
1480  **/
1481 static int
1482 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1483                                   struct rte_flow_error *error)
1484 {
1485         const struct rte_flow_item_vxlan *spec = item->spec;
1486         const struct rte_flow_item_vxlan *mask = item->mask;
1487
1488         if (!spec) {
1489                 /* Outer VNI is required by tunnel_key parameter. */
1490                 return rte_flow_error_set(error, EINVAL,
1491                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1492                                           "NULL VNI specification"
1493                                           " for vxlan encapsulation");
1494         }
1495         if (!mask)
1496                 mask = &rte_flow_item_vxlan_mask;
1497         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1498                 return rte_flow_error_set(error, EINVAL,
1499                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1500                                           "outer VNI must be specified "
1501                                           "for vxlan encapsulation");
1502         if (mask->vni[0] != 0xff ||
1503             mask->vni[1] != 0xff ||
1504             mask->vni[2] != 0xff)
1505                 return rte_flow_error_set(error, ENOTSUP,
1506                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1507                                           "no support for partial mask on"
1508                                           " \"vxlan.vni\" field");
1509
1510         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1511                 return rte_flow_error_set(error, EINVAL,
1512                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1513                                           "vxlan vni cannot be 0");
1514         return 0;
1515 }
1516
1517 /**
1518  * Validate VXLAN_ENCAP action item list for E-Switch.
1519  * The routine checks items to be used in encapsulation header.
1520  *
1521  * @param[in] action
1522  *   Pointer to the VXLAN_ENCAP action structure.
1523  * @param[out] error
1524  *   Pointer to the error structure.
1525  *
1526  * @return
1527  *   0 on success, a negative errno value otherwise and rte_errno is set.
1528  **/
1529 static int
1530 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1531                               struct rte_flow_error *error)
1532 {
1533         const struct rte_flow_item *items;
1534         int ret;
1535         uint32_t item_flags = 0;
1536
1537         if (!action->conf)
1538                 return rte_flow_error_set(error, EINVAL,
1539                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1540                                           "Missing vxlan tunnel"
1541                                           " action configuration");
1542         items = ((const struct rte_flow_action_vxlan_encap *)
1543                                         action->conf)->definition;
1544         if (!items)
1545                 return rte_flow_error_set(error, EINVAL,
1546                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1547                                           "Missing vxlan tunnel"
1548                                           " encapsulation parameters");
1549         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1550                 switch (items->type) {
1551                 case RTE_FLOW_ITEM_TYPE_VOID:
1552                         break;
1553                 case RTE_FLOW_ITEM_TYPE_ETH:
1554                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1555                                                           error);
1556                         if (ret < 0)
1557                                 return ret;
1558                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1559                         if (ret < 0)
1560                                 return ret;
1561                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1562                         break;
1563                 break;
1564                 case RTE_FLOW_ITEM_TYPE_IPV4:
1565                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1566                                                            error);
1567                         if (ret < 0)
1568                                 return ret;
1569                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1570                         if (ret < 0)
1571                                 return ret;
1572                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1573                         break;
1574                 case RTE_FLOW_ITEM_TYPE_IPV6:
1575                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1576                                                            error);
1577                         if (ret < 0)
1578                                 return ret;
1579                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1580                         if (ret < 0)
1581                                 return ret;
1582                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1583                         break;
1584                 case RTE_FLOW_ITEM_TYPE_UDP:
1585                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1586                                                            0xFF, error);
1587                         if (ret < 0)
1588                                 return ret;
1589                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1590                         if (ret < 0)
1591                                 return ret;
1592                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1593                         break;
1594                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1595                         ret = mlx5_flow_validate_item_vxlan(items,
1596                                                             item_flags, error);
1597                         if (ret < 0)
1598                                 return ret;
1599                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1600                         if (ret < 0)
1601                                 return ret;
1602                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1603                         break;
1604                 default:
1605                         return rte_flow_error_set
1606                                         (error, ENOTSUP,
1607                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1608                                          "vxlan encap item not supported");
1609                 }
1610         }
1611         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1612                 return rte_flow_error_set(error, EINVAL,
1613                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1614                                           "no outer IP layer found"
1615                                           " for vxlan encapsulation");
1616         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1617                 return rte_flow_error_set(error, EINVAL,
1618                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1619                                           "no outer UDP layer found"
1620                                           " for vxlan encapsulation");
1621         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1622                 return rte_flow_error_set(error, EINVAL,
1623                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1624                                           "no VXLAN VNI found"
1625                                           " for vxlan encapsulation");
1626         return 0;
1627 }
1628
1629 /**
1630  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1631  * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1632  *
1633  * @param[in] udp
1634  *   Outer UDP layer item (if any, NULL otherwise).
1635  * @param[out] error
1636  *   Pointer to the error structure.
1637  *
1638  * @return
1639  *   0 on success, a negative errno value otherwise and rte_errno is set.
1640  **/
1641 static int
1642 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1643                                   struct rte_flow_error *error)
1644 {
1645         const struct rte_flow_item_udp *spec = udp->spec;
1646         const struct rte_flow_item_udp *mask = udp->mask;
1647
1648         if (!spec)
1649                 /*
1650                  * Specification for UDP ports cannot be empty
1651                  * because it is required as decap parameter.
1652                  */
1653                 return rte_flow_error_set(error, EINVAL,
1654                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1655                                           "NULL UDP port specification"
1656                                           " for VXLAN decapsulation");
1657         if (!mask)
1658                 mask = &rte_flow_item_udp_mask;
1659         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1660                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1661                         return rte_flow_error_set
1662                                         (error, ENOTSUP,
1663                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1664                                          "no support for partial mask on"
1665                                          " \"udp.hdr.dst_port\" field");
1666                 if (!spec->hdr.dst_port)
1667                         return rte_flow_error_set
1668                                         (error, EINVAL,
1669                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1670                                          "zero decap local UDP port");
1671         } else {
1672                 return rte_flow_error_set(error, EINVAL,
1673                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1674                                           "outer UDP destination port must be "
1675                                           "specified for vxlan decapsulation");
1676         }
1677         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1678                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1679                         return rte_flow_error_set
1680                                         (error, ENOTSUP,
1681                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1682                                          "no support for partial mask on"
1683                                          " \"udp.hdr.src_port\" field");
1684                 DRV_LOG(WARNING,
1685                         "outer UDP local port cannot be "
1686                         "forced for VXLAN encapsulation, "
1687                         "parameter ignored");
1688         }
1689         return 0;
1690 }
1691
1692 /**
1693  * Validate flow for E-Switch.
1694  *
1695  * @param[in] dev
1696  *   Pointer to the Ethernet device structure.
1697  * @param[in] attr
1698  *   Pointer to the flow attributes.
1699  * @param[in] items
1700  *   Pointer to the list of items.
1701  * @param[in] actions
1702  *   Pointer to the list of actions.
1703  * @param[out] error
1704  *   Pointer to the error structure.
1705  *
1706  * @return
1707  *   0 on success, a negative errno value otherwise and rte_errno is set.
1708  */
1709 static int
1710 flow_tcf_validate(struct rte_eth_dev *dev,
1711                   const struct rte_flow_attr *attr,
1712                   const struct rte_flow_item items[],
1713                   const struct rte_flow_action actions[],
1714                   struct rte_flow_error *error)
1715 {
1716         union {
1717                 const struct rte_flow_item_port_id *port_id;
1718                 const struct rte_flow_item_eth *eth;
1719                 const struct rte_flow_item_vlan *vlan;
1720                 const struct rte_flow_item_ipv4 *ipv4;
1721                 const struct rte_flow_item_ipv6 *ipv6;
1722                 const struct rte_flow_item_tcp *tcp;
1723                 const struct rte_flow_item_udp *udp;
1724                 const struct rte_flow_item_vxlan *vxlan;
1725         } spec, mask;
1726         union {
1727                 const struct rte_flow_action_port_id *port_id;
1728                 const struct rte_flow_action_jump *jump;
1729                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1730                 const struct rte_flow_action_of_set_vlan_vid *
1731                         of_set_vlan_vid;
1732                 const struct rte_flow_action_of_set_vlan_pcp *
1733                         of_set_vlan_pcp;
1734                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1735                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1736                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1737         } conf;
1738         const struct rte_flow_item *outer_udp = NULL;
1739         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1740         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1741         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1742         uint64_t item_flags = 0;
1743         uint64_t action_flags = 0;
1744         uint8_t next_protocol = 0xff;
1745         unsigned int tcm_ifindex = 0;
1746         uint8_t pedit_validated = 0;
1747         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1748         struct rte_eth_dev *port_id_dev = NULL;
1749         bool in_port_id_set;
1750         int ret;
1751
1752         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1753                                                 PTOI_TABLE_SZ_MAX(dev)));
1754         ret = flow_tcf_validate_attributes(attr, error);
1755         if (ret < 0)
1756                 return ret;
1757         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1758                 unsigned int i;
1759                 uint64_t current_action_flag = 0;
1760
1761                 switch (actions->type) {
1762                 case RTE_FLOW_ACTION_TYPE_VOID:
1763                         break;
1764                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1765                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1766                         if (!actions->conf)
1767                                 break;
1768                         conf.port_id = actions->conf;
1769                         if (conf.port_id->original)
1770                                 i = 0;
1771                         else
1772                                 for (i = 0; ptoi[i].ifindex; ++i)
1773                                         if (ptoi[i].port_id == conf.port_id->id)
1774                                                 break;
1775                         if (!ptoi[i].ifindex)
1776                                 return rte_flow_error_set
1777                                         (error, ENODEV,
1778                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1779                                          conf.port_id,
1780                                          "missing data to convert port ID to"
1781                                          " ifindex");
1782                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1783                         break;
1784                 case RTE_FLOW_ACTION_TYPE_JUMP:
1785                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1786                         if (!actions->conf)
1787                                 break;
1788                         conf.jump = actions->conf;
1789                         if (attr->group >= conf.jump->group)
1790                                 return rte_flow_error_set
1791                                         (error, ENOTSUP,
1792                                          RTE_FLOW_ERROR_TYPE_ACTION,
1793                                          actions,
1794                                          "can jump only to a group forward");
1795                         break;
1796                 case RTE_FLOW_ACTION_TYPE_DROP:
1797                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1798                         break;
1799                 case RTE_FLOW_ACTION_TYPE_COUNT:
1800                         break;
1801                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1802                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1803                         break;
1804                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1805                         rte_be16_t ethertype;
1806
1807                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1808                         if (!actions->conf)
1809                                 break;
1810                         conf.of_push_vlan = actions->conf;
1811                         ethertype = conf.of_push_vlan->ethertype;
1812                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1813                             ethertype != RTE_BE16(ETH_P_8021AD))
1814                                 return rte_flow_error_set
1815                                         (error, EINVAL,
1816                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1817                                          "vlan push TPID must be "
1818                                          "802.1Q or 802.1AD");
1819                         break;
1820                 }
1821                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1822                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1823                                 return rte_flow_error_set
1824                                         (error, ENOTSUP,
1825                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1826                                          "vlan modify is not supported,"
1827                                          " set action must follow push action");
1828                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1829                         break;
1830                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1831                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1832                                 return rte_flow_error_set
1833                                         (error, ENOTSUP,
1834                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1835                                          "vlan modify is not supported,"
1836                                          " set action must follow push action");
1837                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1838                         break;
1839                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1840                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1841                         break;
1842                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1843                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1844                         if (ret < 0)
1845                                 return ret;
1846                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1847                         break;
1848                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1849                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1850                         break;
1851                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1852                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1853                         break;
1854                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1855                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1856                         break;
1857                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1858                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1859                         break;
1860                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1861                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1862                         break;
1863                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1864                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1865                         break;
1866                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1867                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1868                         break;
1869                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1870                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1871                         break;
1872                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1873                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1874                         break;
1875                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1876                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1877                         break;
1878                 default:
1879                         return rte_flow_error_set(error, ENOTSUP,
1880                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1881                                                   actions,
1882                                                   "action not supported");
1883                 }
1884                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1885                         if (!actions->conf)
1886                                 return rte_flow_error_set
1887                                         (error, EINVAL,
1888                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1889                                          actions,
1890                                          "action configuration not set");
1891                 }
1892                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1893                     pedit_validated)
1894                         return rte_flow_error_set(error, ENOTSUP,
1895                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1896                                                   actions,
1897                                                   "set actions should be "
1898                                                   "listed successively");
1899                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1900                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1901                         pedit_validated = 1;
1902                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1903                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1904                         return rte_flow_error_set(error, EINVAL,
1905                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1906                                                   actions,
1907                                                   "can't have multiple fate"
1908                                                   " actions");
1909                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1910                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1911                         return rte_flow_error_set(error, EINVAL,
1912                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1913                                                   actions,
1914                                                   "can't have multiple vxlan"
1915                                                   " actions");
1916                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1917                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1918                         return rte_flow_error_set(error, ENOTSUP,
1919                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1920                                                   actions,
1921                                                   "can't have vxlan and vlan"
1922                                                   " actions in the same rule");
1923                 action_flags |= current_action_flag;
1924         }
1925         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1926                 unsigned int i;
1927
1928                 switch (items->type) {
1929                 case RTE_FLOW_ITEM_TYPE_VOID:
1930                         break;
1931                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1932                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1933                                 return rte_flow_error_set
1934                                         (error, ENOTSUP,
1935                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1936                                          "inner tunnel port id"
1937                                          " item is not supported");
1938                         mask.port_id = flow_tcf_item_mask
1939                                 (items, &rte_flow_item_port_id_mask,
1940                                  &flow_tcf_mask_supported.port_id,
1941                                  &flow_tcf_mask_empty.port_id,
1942                                  sizeof(flow_tcf_mask_supported.port_id),
1943                                  error);
1944                         if (!mask.port_id)
1945                                 return -rte_errno;
1946                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1947                                 in_port_id_set = 1;
1948                                 break;
1949                         }
1950                         spec.port_id = items->spec;
1951                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1952                                 return rte_flow_error_set
1953                                         (error, ENOTSUP,
1954                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1955                                          mask.port_id,
1956                                          "no support for partial mask on"
1957                                          " \"id\" field");
1958                         if (!mask.port_id->id)
1959                                 i = 0;
1960                         else
1961                                 for (i = 0; ptoi[i].ifindex; ++i)
1962                                         if (ptoi[i].port_id == spec.port_id->id)
1963                                                 break;
1964                         if (!ptoi[i].ifindex)
1965                                 return rte_flow_error_set
1966                                         (error, ENODEV,
1967                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1968                                          spec.port_id,
1969                                          "missing data to convert port ID to"
1970                                          " ifindex");
1971                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1972                                 return rte_flow_error_set
1973                                         (error, ENOTSUP,
1974                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1975                                          spec.port_id,
1976                                          "cannot match traffic for"
1977                                          " several port IDs through"
1978                                          " a single flow rule");
1979                         tcm_ifindex = ptoi[i].ifindex;
1980                         in_port_id_set = 1;
1981                         break;
1982                 case RTE_FLOW_ITEM_TYPE_ETH:
1983                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1984                                                           error);
1985                         if (ret < 0)
1986                                 return ret;
1987                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1988                                       MLX5_FLOW_LAYER_INNER_L2 :
1989                                       MLX5_FLOW_LAYER_OUTER_L2;
1990                         /* TODO:
1991                          * Redundant check due to different supported mask.
1992                          * Same for the rest of items.
1993                          */
1994                         mask.eth = flow_tcf_item_mask
1995                                 (items, &rte_flow_item_eth_mask,
1996                                  &flow_tcf_mask_supported.eth,
1997                                  &flow_tcf_mask_empty.eth,
1998                                  sizeof(flow_tcf_mask_supported.eth),
1999                                  error);
2000                         if (!mask.eth)
2001                                 return -rte_errno;
2002                         if (mask.eth->type && mask.eth->type !=
2003                             RTE_BE16(0xffff))
2004                                 return rte_flow_error_set
2005                                         (error, ENOTSUP,
2006                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2007                                          mask.eth,
2008                                          "no support for partial mask on"
2009                                          " \"type\" field");
2010                         assert(items->spec);
2011                         spec.eth = items->spec;
2012                         if (mask.eth->type &&
2013                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2014                             inner_etype != RTE_BE16(ETH_P_ALL) &&
2015                             inner_etype != spec.eth->type)
2016                                 return rte_flow_error_set
2017                                         (error, EINVAL,
2018                                          RTE_FLOW_ERROR_TYPE_ITEM,
2019                                          items,
2020                                          "inner eth_type conflict");
2021                         if (mask.eth->type &&
2022                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2023                             outer_etype != RTE_BE16(ETH_P_ALL) &&
2024                             outer_etype != spec.eth->type)
2025                                 return rte_flow_error_set
2026                                         (error, EINVAL,
2027                                          RTE_FLOW_ERROR_TYPE_ITEM,
2028                                          items,
2029                                          "outer eth_type conflict");
2030                         if (mask.eth->type) {
2031                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2032                                         inner_etype = spec.eth->type;
2033                                 else
2034                                         outer_etype = spec.eth->type;
2035                         }
2036                         break;
2037                 case RTE_FLOW_ITEM_TYPE_VLAN:
2038                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2039                                 return rte_flow_error_set
2040                                         (error, ENOTSUP,
2041                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2042                                          "inner tunnel VLAN"
2043                                          " is not supported");
2044                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2045                                                            error);
2046                         if (ret < 0)
2047                                 return ret;
2048                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2049                         mask.vlan = flow_tcf_item_mask
2050                                 (items, &rte_flow_item_vlan_mask,
2051                                  &flow_tcf_mask_supported.vlan,
2052                                  &flow_tcf_mask_empty.vlan,
2053                                  sizeof(flow_tcf_mask_supported.vlan),
2054                                  error);
2055                         if (!mask.vlan)
2056                                 return -rte_errno;
2057                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2058                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2059                               RTE_BE16(0xe000)) ||
2060                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2061                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2062                               RTE_BE16(0x0fff)) ||
2063                             (mask.vlan->inner_type &&
2064                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2065                                 return rte_flow_error_set
2066                                         (error, ENOTSUP,
2067                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2068                                          mask.vlan,
2069                                          "no support for partial masks on"
2070                                          " \"tci\" (PCP and VID parts) and"
2071                                          " \"inner_type\" fields");
2072                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2073                             outer_etype != RTE_BE16(ETH_P_8021Q))
2074                                 return rte_flow_error_set
2075                                         (error, EINVAL,
2076                                          RTE_FLOW_ERROR_TYPE_ITEM,
2077                                          items,
2078                                          "outer eth_type conflict,"
2079                                          " must be 802.1Q");
2080                         outer_etype = RTE_BE16(ETH_P_8021Q);
2081                         assert(items->spec);
2082                         spec.vlan = items->spec;
2083                         if (mask.vlan->inner_type &&
2084                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2085                             vlan_etype != spec.vlan->inner_type)
2086                                 return rte_flow_error_set
2087                                         (error, EINVAL,
2088                                          RTE_FLOW_ERROR_TYPE_ITEM,
2089                                          items,
2090                                          "vlan eth_type conflict");
2091                         if (mask.vlan->inner_type)
2092                                 vlan_etype = spec.vlan->inner_type;
2093                         break;
2094                 case RTE_FLOW_ITEM_TYPE_IPV4:
2095                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2096                                                            error);
2097                         if (ret < 0)
2098                                 return ret;
2099                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2100                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2101                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2102                         mask.ipv4 = flow_tcf_item_mask
2103                                 (items, &rte_flow_item_ipv4_mask,
2104                                  &flow_tcf_mask_supported.ipv4,
2105                                  &flow_tcf_mask_empty.ipv4,
2106                                  sizeof(flow_tcf_mask_supported.ipv4),
2107                                  error);
2108                         if (!mask.ipv4)
2109                                 return -rte_errno;
2110                         if (mask.ipv4->hdr.next_proto_id &&
2111                             mask.ipv4->hdr.next_proto_id != 0xff)
2112                                 return rte_flow_error_set
2113                                         (error, ENOTSUP,
2114                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2115                                          mask.ipv4,
2116                                          "no support for partial mask on"
2117                                          " \"hdr.next_proto_id\" field");
2118                         else if (mask.ipv4->hdr.next_proto_id)
2119                                 next_protocol =
2120                                         ((const struct rte_flow_item_ipv4 *)
2121                                          (items->spec))->hdr.next_proto_id;
2122                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2123                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2124                                     inner_etype != RTE_BE16(ETH_P_IP))
2125                                         return rte_flow_error_set
2126                                                 (error, EINVAL,
2127                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2128                                                  items,
2129                                                  "inner eth_type conflict,"
2130                                                  " IPv4 is required");
2131                                 inner_etype = RTE_BE16(ETH_P_IP);
2132                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2133                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2134                                     vlan_etype != RTE_BE16(ETH_P_IP))
2135                                         return rte_flow_error_set
2136                                                 (error, EINVAL,
2137                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2138                                                  items,
2139                                                  "vlan eth_type conflict,"
2140                                                  " IPv4 is required");
2141                                 vlan_etype = RTE_BE16(ETH_P_IP);
2142                         } else {
2143                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2144                                     outer_etype != RTE_BE16(ETH_P_IP))
2145                                         return rte_flow_error_set
2146                                                 (error, EINVAL,
2147                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2148                                                  items,
2149                                                  "eth_type conflict,"
2150                                                  " IPv4 is required");
2151                                 outer_etype = RTE_BE16(ETH_P_IP);
2152                         }
2153                         break;
2154                 case RTE_FLOW_ITEM_TYPE_IPV6:
2155                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2156                                                            error);
2157                         if (ret < 0)
2158                                 return ret;
2159                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2160                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2161                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2162                         mask.ipv6 = flow_tcf_item_mask
2163                                 (items, &rte_flow_item_ipv6_mask,
2164                                  &flow_tcf_mask_supported.ipv6,
2165                                  &flow_tcf_mask_empty.ipv6,
2166                                  sizeof(flow_tcf_mask_supported.ipv6),
2167                                  error);
2168                         if (!mask.ipv6)
2169                                 return -rte_errno;
2170                         if (mask.ipv6->hdr.proto &&
2171                             mask.ipv6->hdr.proto != 0xff)
2172                                 return rte_flow_error_set
2173                                         (error, ENOTSUP,
2174                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2175                                          mask.ipv6,
2176                                          "no support for partial mask on"
2177                                          " \"hdr.proto\" field");
2178                         else if (mask.ipv6->hdr.proto)
2179                                 next_protocol =
2180                                         ((const struct rte_flow_item_ipv6 *)
2181                                          (items->spec))->hdr.proto;
2182                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2183                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2184                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2185                                         return rte_flow_error_set
2186                                                 (error, EINVAL,
2187                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2188                                                  items,
2189                                                  "inner eth_type conflict,"
2190                                                  " IPv6 is required");
2191                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2192                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2193                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2194                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2195                                         return rte_flow_error_set
2196                                                 (error, EINVAL,
2197                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2198                                                  items,
2199                                                  "vlan eth_type conflict,"
2200                                                  " IPv6 is required");
2201                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2202                         } else {
2203                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2204                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2205                                         return rte_flow_error_set
2206                                                 (error, EINVAL,
2207                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2208                                                  items,
2209                                                  "eth_type conflict,"
2210                                                  " IPv6 is required");
2211                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2212                         }
2213                         break;
2214                 case RTE_FLOW_ITEM_TYPE_UDP:
2215                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2216                                                           next_protocol, error);
2217                         if (ret < 0)
2218                                 return ret;
2219                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2220                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2221                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2222                         mask.udp = flow_tcf_item_mask
2223                                 (items, &rte_flow_item_udp_mask,
2224                                  &flow_tcf_mask_supported.udp,
2225                                  &flow_tcf_mask_empty.udp,
2226                                  sizeof(flow_tcf_mask_supported.udp),
2227                                  error);
2228                         if (!mask.udp)
2229                                 return -rte_errno;
2230                         /*
2231                          * Save the presumed outer UDP item for extra check
2232                          * if the tunnel item will be found later in the list.
2233                          */
2234                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2235                                 outer_udp = items;
2236                         break;
2237                 case RTE_FLOW_ITEM_TYPE_TCP:
2238                         ret = mlx5_flow_validate_item_tcp
2239                                              (items, item_flags,
2240                                               next_protocol,
2241                                               &flow_tcf_mask_supported.tcp,
2242                                               error);
2243                         if (ret < 0)
2244                                 return ret;
2245                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2246                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2247                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2248                         mask.tcp = flow_tcf_item_mask
2249                                 (items, &rte_flow_item_tcp_mask,
2250                                  &flow_tcf_mask_supported.tcp,
2251                                  &flow_tcf_mask_empty.tcp,
2252                                  sizeof(flow_tcf_mask_supported.tcp),
2253                                  error);
2254                         if (!mask.tcp)
2255                                 return -rte_errno;
2256                         break;
2257                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2258                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2259                                 return rte_flow_error_set
2260                                         (error, ENOTSUP,
2261                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2262                                          "vxlan tunnel over vlan"
2263                                          " is not supported");
2264                         ret = mlx5_flow_validate_item_vxlan(items,
2265                                                             item_flags, error);
2266                         if (ret < 0)
2267                                 return ret;
2268                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2269                         mask.vxlan = flow_tcf_item_mask
2270                                 (items, &rte_flow_item_vxlan_mask,
2271                                  &flow_tcf_mask_supported.vxlan,
2272                                  &flow_tcf_mask_empty.vxlan,
2273                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2274                         if (!mask.vxlan)
2275                                 return -rte_errno;
2276                         if (mask.vxlan->vni[0] != 0xff ||
2277                             mask.vxlan->vni[1] != 0xff ||
2278                             mask.vxlan->vni[2] != 0xff)
2279                                 return rte_flow_error_set
2280                                         (error, ENOTSUP,
2281                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2282                                          mask.vxlan,
2283                                          "no support for partial or "
2284                                          "empty mask on \"vxlan.vni\" field");
2285                         /*
2286                          * The VNI item assumes the VXLAN tunnel, it requires
2287                          * at least the outer destination UDP port must be
2288                          * specified without wildcards to allow kernel select
2289                          * the virtual VXLAN device by port. Also outer IPv4
2290                          * or IPv6 item must be specified (wildcards or even
2291                          * zero mask are allowed) to let driver know the tunnel
2292                          * IP version and process UDP traffic correctly.
2293                          */
2294                         if (!(item_flags &
2295                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2296                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2297                                 return rte_flow_error_set
2298                                                  (error, EINVAL,
2299                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2300                                                   NULL,
2301                                                   "no outer IP pattern found"
2302                                                   " for vxlan tunnel");
2303                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2304                                 return rte_flow_error_set
2305                                                  (error, EINVAL,
2306                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2307                                                   NULL,
2308                                                   "no outer UDP pattern found"
2309                                                   " for vxlan tunnel");
2310                         /*
2311                          * All items preceding the tunnel item become outer
2312                          * ones and we should do extra validation for them
2313                          * due to tc limitations for tunnel outer parameters.
2314                          * Currently only outer UDP item requires extra check,
2315                          * use the saved pointer instead of item list rescan.
2316                          */
2317                         assert(outer_udp);
2318                         ret = flow_tcf_validate_vxlan_decap_udp
2319                                                 (outer_udp, error);
2320                         if (ret < 0)
2321                                 return ret;
2322                         /* Reset L4 protocol for inner parameters. */
2323                         next_protocol = 0xff;
2324                         break;
2325                 default:
2326                         return rte_flow_error_set(error, ENOTSUP,
2327                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2328                                                   items, "item not supported");
2329                 }
2330         }
2331         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2332             (action_flags & MLX5_FLOW_ACTION_DROP))
2333                 return rte_flow_error_set(error, ENOTSUP,
2334                                           RTE_FLOW_ERROR_TYPE_ACTION,
2335                                           actions,
2336                                           "set action is not compatible with "
2337                                           "drop action");
2338         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2339             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2340                 return rte_flow_error_set(error, ENOTSUP,
2341                                           RTE_FLOW_ERROR_TYPE_ACTION,
2342                                           actions,
2343                                           "set action must be followed by "
2344                                           "port_id action");
2345         if (action_flags &
2346            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2347                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2348                         return rte_flow_error_set(error, EINVAL,
2349                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2350                                                   actions,
2351                                                   "no ipv4 item found in"
2352                                                   " pattern");
2353         }
2354         if (action_flags &
2355            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2356                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2357                         return rte_flow_error_set(error, EINVAL,
2358                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2359                                                   actions,
2360                                                   "no ipv6 item found in"
2361                                                   " pattern");
2362         }
2363         if (action_flags &
2364            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2365                 if (!(item_flags &
2366                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2367                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2368                         return rte_flow_error_set(error, EINVAL,
2369                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2370                                                   actions,
2371                                                   "no TCP/UDP item found in"
2372                                                   " pattern");
2373         }
2374         /*
2375          * FW syndrome (0xA9C090):
2376          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2377          *     forward to the uplink.
2378          */
2379         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2380             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2381             ((struct priv *)port_id_dev->data->dev_private)->representor)
2382                 return rte_flow_error_set(error, ENOTSUP,
2383                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2384                                           "vlan push can only be applied"
2385                                           " when forwarding to uplink port");
2386         /*
2387          * FW syndrome (0x294609):
2388          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2389          *     are supported only while forwarding to vport.
2390          */
2391         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2392             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2393                 return rte_flow_error_set(error, ENOTSUP,
2394                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2395                                           "vlan actions are supported"
2396                                           " only with port_id action");
2397         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2398             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2399                 return rte_flow_error_set(error, ENOTSUP,
2400                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2401                                           "vxlan actions are supported"
2402                                           " only with port_id action");
2403         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2404                 return rte_flow_error_set(error, EINVAL,
2405                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2406                                           "no fate action is found");
2407         if (action_flags &
2408            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2409                 if (!(item_flags &
2410                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2411                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2412                         return rte_flow_error_set(error, EINVAL,
2413                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2414                                                   actions,
2415                                                   "no IP found in pattern");
2416         }
2417         if (action_flags &
2418             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2419                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2420                         return rte_flow_error_set(error, ENOTSUP,
2421                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2422                                                   actions,
2423                                                   "no ethernet found in"
2424                                                   " pattern");
2425         }
2426         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2427             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2428                 return rte_flow_error_set(error, EINVAL,
2429                                           RTE_FLOW_ERROR_TYPE_ACTION,
2430                                           NULL,
2431                                           "no VNI pattern found"
2432                                           " for vxlan decap action");
2433         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2434             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2435                 return rte_flow_error_set(error, EINVAL,
2436                                           RTE_FLOW_ERROR_TYPE_ACTION,
2437                                           NULL,
2438                                           "vxlan encap not supported"
2439                                           " for tunneled traffic");
2440         return 0;
2441 }
2442
2443 /**
2444  * Calculate maximum size of memory for flow items of Linux TC flower.
2445  *
2446  * @param[in] attr
2447  *   Pointer to the flow attributes.
2448  * @param[in] items
2449  *   Pointer to the list of items.
2450  * @param[out] action_flags
2451  *   Pointer to the detected actions.
2452  *
2453  * @return
2454  *   Maximum size of memory for items.
2455  */
2456 static int
2457 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2458                         const struct rte_flow_item items[],
2459                         uint64_t *action_flags)
2460 {
2461         int size = 0;
2462
2463         size += SZ_NLATTR_STRZ_OF("flower") +
2464                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2465                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2466                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2467         if (attr->group > 0)
2468                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2469         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2470                 switch (items->type) {
2471                 case RTE_FLOW_ITEM_TYPE_VOID:
2472                         break;
2473                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2474                         break;
2475                 case RTE_FLOW_ITEM_TYPE_ETH:
2476                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2477                                 /* dst/src MAC addr and mask. */
2478                         break;
2479                 case RTE_FLOW_ITEM_TYPE_VLAN:
2480                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2481                                 /* VLAN Ether type. */
2482                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2483                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2484                         break;
2485                 case RTE_FLOW_ITEM_TYPE_IPV4:
2486                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2487                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2488                                 /* dst/src IP addr and mask. */
2489                         break;
2490                 case RTE_FLOW_ITEM_TYPE_IPV6:
2491                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2492                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2493                                 /* dst/src IP addr and mask. */
2494                         break;
2495                 case RTE_FLOW_ITEM_TYPE_UDP:
2496                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2497                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2498                                 /* dst/src port and mask. */
2499                         break;
2500                 case RTE_FLOW_ITEM_TYPE_TCP:
2501                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2502                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2503                                 /* dst/src port and mask. */
2504                         break;
2505                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2506                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2507                         /*
2508                          * There might be no VXLAN decap action in the action
2509                          * list, nonetheless the VXLAN tunnel flow requires
2510                          * the decap structure to be correctly applied to
2511                          * VXLAN device, set the flag to create the structure.
2512                          * Translation routine will not put the decap action
2513                          * in tne Netlink message if there is no actual action
2514                          * in the list.
2515                          */
2516                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2517                         break;
2518                 default:
2519                         DRV_LOG(WARNING,
2520                                 "unsupported item %p type %d,"
2521                                 " items must be validated before flow creation",
2522                                 (const void *)items, items->type);
2523                         break;
2524                 }
2525         }
2526         return size;
2527 }
2528
2529 /**
2530  * Calculate size of memory to store the VXLAN encapsultion
2531  * related items in the Netlink message buffer. Items list
2532  * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2533  * The item list should be validated.
2534  *
2535  * @param[in] action
2536  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2537  *   List of pattern items to scan data from.
2538  *
2539  * @return
2540  *   The size the part of Netlink message buffer to store the
2541  *   VXLAN encapsulation item attributes.
2542  */
2543 static int
2544 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2545 {
2546         const struct rte_flow_item *items;
2547         int size = 0;
2548
2549         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2550         assert(action->conf);
2551
2552         items = ((const struct rte_flow_action_vxlan_encap *)
2553                                         action->conf)->definition;
2554         assert(items);
2555         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2556                 switch (items->type) {
2557                 case RTE_FLOW_ITEM_TYPE_VOID:
2558                         break;
2559                 case RTE_FLOW_ITEM_TYPE_ETH:
2560                         /* This item does not require message buffer. */
2561                         break;
2562                 case RTE_FLOW_ITEM_TYPE_IPV4:
2563                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2564                         break;
2565                 case RTE_FLOW_ITEM_TYPE_IPV6:
2566                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2567                         break;
2568                 case RTE_FLOW_ITEM_TYPE_UDP: {
2569                         const struct rte_flow_item_udp *udp = items->mask;
2570
2571                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2572                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2573                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2574                         break;
2575                 }
2576                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2577                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2578                         break;
2579                 default:
2580                         assert(false);
2581                         DRV_LOG(WARNING,
2582                                 "unsupported item %p type %d,"
2583                                 " items must be validated"
2584                                 " before flow creation",
2585                                 (const void *)items, items->type);
2586                         return 0;
2587                 }
2588         }
2589         return size;
2590 }
2591
2592 /**
2593  * Calculate maximum size of memory for flow actions of Linux TC flower and
2594  * extract specified actions.
2595  *
2596  * @param[in] actions
2597  *   Pointer to the list of actions.
2598  * @param[out] action_flags
2599  *   Pointer to the detected actions.
2600  *
2601  * @return
2602  *   Maximum size of memory for actions.
2603  */
2604 static int
2605 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2606                               uint64_t *action_flags)
2607 {
2608         int size = 0;
2609         uint64_t flags = 0;
2610
2611         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2612         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2613                 switch (actions->type) {
2614                 case RTE_FLOW_ACTION_TYPE_VOID:
2615                         break;
2616                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2617                         size += SZ_NLATTR_NEST + /* na_act_index. */
2618                                 SZ_NLATTR_STRZ_OF("mirred") +
2619                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2620                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2621                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2622                         break;
2623                 case RTE_FLOW_ACTION_TYPE_JUMP:
2624                         size += SZ_NLATTR_NEST + /* na_act_index. */
2625                                 SZ_NLATTR_STRZ_OF("gact") +
2626                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2627                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2628                         flags |= MLX5_FLOW_ACTION_JUMP;
2629                         break;
2630                 case RTE_FLOW_ACTION_TYPE_DROP:
2631                         size += SZ_NLATTR_NEST + /* na_act_index. */
2632                                 SZ_NLATTR_STRZ_OF("gact") +
2633                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2634                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2635                         flags |= MLX5_FLOW_ACTION_DROP;
2636                         break;
2637                 case RTE_FLOW_ACTION_TYPE_COUNT:
2638                         break;
2639                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2640                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2641                         goto action_of_vlan;
2642                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2643                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2644                         goto action_of_vlan;
2645                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2646                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2647                         goto action_of_vlan;
2648                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2649                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2650                         goto action_of_vlan;
2651 action_of_vlan:
2652                         size += SZ_NLATTR_NEST + /* na_act_index. */
2653                                 SZ_NLATTR_STRZ_OF("vlan") +
2654                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2655                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2656                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2657                                 /* VLAN protocol. */
2658                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2659                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2660                         break;
2661                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2662                         size += SZ_NLATTR_NEST + /* na_act_index. */
2663                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2664                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2665                                 SZ_NLATTR_TYPE_OF(uint8_t);
2666                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2667                         size += flow_tcf_vxlan_encap_size(actions) +
2668                                 RTE_ALIGN_CEIL /* preceding encap params. */
2669                                 (sizeof(struct flow_tcf_vxlan_encap),
2670                                 MNL_ALIGNTO);
2671                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2672                         break;
2673                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2674                         size += SZ_NLATTR_NEST + /* na_act_index. */
2675                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2676                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2677                                 SZ_NLATTR_TYPE_OF(uint8_t);
2678                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2679                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2680                                 (sizeof(struct flow_tcf_vxlan_decap),
2681                                 MNL_ALIGNTO);
2682                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2683                         break;
2684                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2685                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2686                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2687                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2688                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2689                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2690                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2691                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2692                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2693                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2694                         size += flow_tcf_get_pedit_actions_size(&actions,
2695                                                                 &flags);
2696                         break;
2697                 default:
2698                         DRV_LOG(WARNING,
2699                                 "unsupported action %p type %d,"
2700                                 " items must be validated before flow creation",
2701                                 (const void *)actions, actions->type);
2702                         break;
2703                 }
2704         }
2705         *action_flags = flags;
2706         return size;
2707 }
2708
2709 /**
2710  * Brand rtnetlink buffer with unique handle.
2711  *
2712  * This handle should be unique for a given network interface to avoid
2713  * collisions.
2714  *
2715  * @param nlh
2716  *   Pointer to Netlink message.
2717  * @param handle
2718  *   Unique 32-bit handle to use.
2719  */
2720 static void
2721 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2722 {
2723         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2724
2725         tcm->tcm_handle = handle;
2726         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2727                 (void *)nlh, handle);
2728 }
2729
2730 /**
2731  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2732  * memory required, allocates the memory, initializes Netlink message headers
2733  * and set unique TC message handle.
2734  *
2735  * @param[in] attr
2736  *   Pointer to the flow attributes.
2737  * @param[in] items
2738  *   Pointer to the list of items.
2739  * @param[in] actions
2740  *   Pointer to the list of actions.
2741  * @param[out] error
2742  *   Pointer to the error structure.
2743  *
2744  * @return
2745  *   Pointer to mlx5_flow object on success,
2746  *   otherwise NULL and rte_errno is set.
2747  */
2748 static struct mlx5_flow *
2749 flow_tcf_prepare(const struct rte_flow_attr *attr,
2750                  const struct rte_flow_item items[],
2751                  const struct rte_flow_action actions[],
2752                  struct rte_flow_error *error)
2753 {
2754         size_t size = RTE_ALIGN_CEIL
2755                         (sizeof(struct mlx5_flow),
2756                          alignof(struct flow_tcf_tunnel_hdr)) +
2757                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2758                       MNL_ALIGN(sizeof(struct tcmsg));
2759         struct mlx5_flow *dev_flow;
2760         uint64_t action_flags = 0;
2761         struct nlmsghdr *nlh;
2762         struct tcmsg *tcm;
2763         uint8_t *sp, *tun = NULL;
2764
2765         size += flow_tcf_get_items_size(attr, items, &action_flags);
2766         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2767         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2768         if (!dev_flow) {
2769                 rte_flow_error_set(error, ENOMEM,
2770                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2771                                    "not enough memory to create E-Switch flow");
2772                 return NULL;
2773         }
2774         sp = (uint8_t *)(dev_flow + 1);
2775         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2776                 sp = RTE_PTR_ALIGN
2777                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2778                 tun = sp;
2779                 sp += RTE_ALIGN_CEIL
2780                         (sizeof(struct flow_tcf_vxlan_encap),
2781                         MNL_ALIGNTO);
2782 #ifndef NDEBUG
2783                 size -= RTE_ALIGN_CEIL
2784                         (sizeof(struct flow_tcf_vxlan_encap),
2785                         MNL_ALIGNTO);
2786 #endif
2787         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2788                 sp = RTE_PTR_ALIGN
2789                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2790                 tun = sp;
2791                 sp += RTE_ALIGN_CEIL
2792                         (sizeof(struct flow_tcf_vxlan_decap),
2793                         MNL_ALIGNTO);
2794 #ifndef NDEBUG
2795                 size -= RTE_ALIGN_CEIL
2796                         (sizeof(struct flow_tcf_vxlan_decap),
2797                         MNL_ALIGNTO);
2798 #endif
2799         } else {
2800                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2801         }
2802         nlh = mnl_nlmsg_put_header(sp);
2803         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2804         *dev_flow = (struct mlx5_flow){
2805                 .tcf = (struct mlx5_flow_tcf){
2806 #ifndef NDEBUG
2807                         .nlsize = size - RTE_ALIGN_CEIL
2808                                 (sizeof(struct mlx5_flow),
2809                                  alignof(struct flow_tcf_tunnel_hdr)),
2810 #endif
2811                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2812                         .nlh = nlh,
2813                         .tcm = tcm,
2814                 },
2815         };
2816         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2817                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2818         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2819                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2820         /*
2821          * Generate a reasonably unique handle based on the address of the
2822          * target buffer.
2823          *
2824          * This is straightforward on 32-bit systems where the flow pointer can
2825          * be used directly. Otherwise, its least significant part is taken
2826          * after shifting it by the previous power of two of the pointed buffer
2827          * size.
2828          */
2829         if (sizeof(dev_flow) <= 4)
2830                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2831         else
2832                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2833                                        rte_log2_u32(rte_align32prevpow2(size)));
2834         return dev_flow;
2835 }
2836
2837 /**
2838  * Make adjustments for supporting count actions.
2839  *
2840  * @param[in] dev
2841  *   Pointer to the Ethernet device structure.
2842  * @param[in] dev_flow
2843  *   Pointer to mlx5_flow.
2844  * @param[out] error
2845  *   Pointer to error structure.
2846  *
2847  * @return
2848  *   0 On success else a negative errno value is returned and rte_errno is set.
2849  */
2850 static int
2851 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2852                                   struct mlx5_flow *dev_flow,
2853                                   struct rte_flow_error *error)
2854 {
2855         struct rte_flow *flow = dev_flow->flow;
2856
2857         if (!flow->counter) {
2858                 flow->counter = flow_tcf_counter_new();
2859                 if (!flow->counter)
2860                         return rte_flow_error_set(error, rte_errno,
2861                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2862                                                   NULL,
2863                                                   "cannot get counter"
2864                                                   " context.");
2865         }
2866         return 0;
2867 }
2868
2869 /**
2870  * Convert VXLAN VNI to 32-bit integer.
2871  *
2872  * @param[in] vni
2873  *   VXLAN VNI in 24-bit wire format.
2874  *
2875  * @return
2876  *   VXLAN VNI as a 32-bit integer value in network endian.
2877  */
2878 static inline rte_be32_t
2879 vxlan_vni_as_be32(const uint8_t vni[3])
2880 {
2881         union {
2882                 uint8_t vni[4];
2883                 rte_be32_t dword;
2884         } ret = {
2885                 .vni = { 0, vni[0], vni[1], vni[2] },
2886         };
2887         return ret.dword;
2888 }
2889
2890 /**
2891  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2892  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2893  * in the encapsulation parameters structure. The item must be prevalidated,
2894  * no any validation checks performed by function.
2895  *
2896  * @param[in] spec
2897  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2898  * @param[in] mask
2899  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2900  * @param[out] encap
2901  *   Structure to fill the gathered MAC address data.
2902  */
2903 static void
2904 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2905                                const struct rte_flow_item_eth *mask,
2906                                struct flow_tcf_vxlan_encap *encap)
2907 {
2908         /* Item must be validated before. No redundant checks. */
2909         assert(spec);
2910         if (!mask || !memcmp(&mask->dst,
2911                              &rte_flow_item_eth_mask.dst,
2912                              sizeof(rte_flow_item_eth_mask.dst))) {
2913                 /*
2914                  * Ethernet addresses are not supported by
2915                  * tc as tunnel_key parameters. Destination
2916                  * address is needed to form encap packet
2917                  * header and retrieved by kernel from
2918                  * implicit sources (ARP table, etc),
2919                  * address masks are not supported at all.
2920                  */
2921                 encap->eth.dst = spec->dst;
2922                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2923         }
2924         if (!mask || !memcmp(&mask->src,
2925                              &rte_flow_item_eth_mask.src,
2926                              sizeof(rte_flow_item_eth_mask.src))) {
2927                 /*
2928                  * Ethernet addresses are not supported by
2929                  * tc as tunnel_key parameters. Source ethernet
2930                  * address is ignored anyway.
2931                  */
2932                 encap->eth.src = spec->src;
2933                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2934         }
2935 }
2936
2937 /**
2938  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2939  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2940  * in the encapsulation parameters structure. The item must be prevalidated,
2941  * no any validation checks performed by function.
2942  *
2943  * @param[in] spec
2944  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2945  * @param[out] encap
2946  *   Structure to fill the gathered IPV4 address data.
2947  */
2948 static void
2949 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2950                                 struct flow_tcf_vxlan_encap *encap)
2951 {
2952         /* Item must be validated before. No redundant checks. */
2953         assert(spec);
2954         encap->ipv4.dst = spec->hdr.dst_addr;
2955         encap->ipv4.src = spec->hdr.src_addr;
2956         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2957                        FLOW_TCF_ENCAP_IPV4_DST;
2958 }
2959
2960 /**
2961  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2962  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2963  * in the encapsulation parameters structure. The item must be prevalidated,
2964  * no any validation checks performed by function.
2965  *
2966  * @param[in] spec
2967  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2968  * @param[out] encap
2969  *   Structure to fill the gathered IPV6 address data.
2970  */
2971 static void
2972 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2973                                 struct flow_tcf_vxlan_encap *encap)
2974 {
2975         /* Item must be validated before. No redundant checks. */
2976         assert(spec);
2977         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2978         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2979         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2980                        FLOW_TCF_ENCAP_IPV6_DST;
2981 }
2982
2983 /**
2984  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2985  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2986  * in the encapsulation parameters structure. The item must be prevalidated,
2987  * no any validation checks performed by function.
2988  *
2989  * @param[in] spec
2990  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2991  * @param[in] mask
2992  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2993  * @param[out] encap
2994  *   Structure to fill the gathered UDP port data.
2995  */
2996 static void
2997 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2998                                const struct rte_flow_item_udp *mask,
2999                                struct flow_tcf_vxlan_encap *encap)
3000 {
3001         assert(spec);
3002         encap->udp.dst = spec->hdr.dst_port;
3003         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3004         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3005                 encap->udp.src = spec->hdr.src_port;
3006                 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC;
3007         }
3008 }
3009
3010 /**
3011  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3012  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3013  * in the encapsulation parameters structure. The item must be prevalidated,
3014  * no any validation checks performed by function.
3015  *
3016  * @param[in] spec
3017  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3018  * @param[out] encap
3019  *   Structure to fill the gathered VNI address data.
3020  */
3021 static void
3022 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3023                                struct flow_tcf_vxlan_encap *encap)
3024 {
3025         /* Item must be validated before. Do not redundant checks. */
3026         assert(spec);
3027         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3028         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3029 }
3030
3031 /**
3032  * Populate consolidated encapsulation object from list of pattern items.
3033  *
3034  * Helper function to process configuration of action such as
3035  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3036  * validated, there is no way to return an meaningful error.
3037  *
3038  * @param[in] action
3039  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3040  *   List of pattern items to gather data from.
3041  * @param[out] src
3042  *   Structure to fill gathered data.
3043  */
3044 static void
3045 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3046                            struct flow_tcf_vxlan_encap *encap)
3047 {
3048         union {
3049                 const struct rte_flow_item_eth *eth;
3050                 const struct rte_flow_item_ipv4 *ipv4;
3051                 const struct rte_flow_item_ipv6 *ipv6;
3052                 const struct rte_flow_item_udp *udp;
3053                 const struct rte_flow_item_vxlan *vxlan;
3054         } spec, mask;
3055         const struct rte_flow_item *items;
3056
3057         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3058         assert(action->conf);
3059
3060         items = ((const struct rte_flow_action_vxlan_encap *)
3061                                         action->conf)->definition;
3062         assert(items);
3063         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3064                 switch (items->type) {
3065                 case RTE_FLOW_ITEM_TYPE_VOID:
3066                         break;
3067                 case RTE_FLOW_ITEM_TYPE_ETH:
3068                         mask.eth = items->mask;
3069                         spec.eth = items->spec;
3070                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3071                                                        encap);
3072                         break;
3073                 case RTE_FLOW_ITEM_TYPE_IPV4:
3074                         spec.ipv4 = items->spec;
3075                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3076                         break;
3077                 case RTE_FLOW_ITEM_TYPE_IPV6:
3078                         spec.ipv6 = items->spec;
3079                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3080                         break;
3081                 case RTE_FLOW_ITEM_TYPE_UDP:
3082                         mask.udp = items->mask;
3083                         spec.udp = items->spec;
3084                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3085                                                        encap);
3086                         break;
3087                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3088                         spec.vxlan = items->spec;
3089                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3090                         break;
3091                 default:
3092                         assert(false);
3093                         DRV_LOG(WARNING,
3094                                 "unsupported item %p type %d,"
3095                                 " items must be validated"
3096                                 " before flow creation",
3097                                 (const void *)items, items->type);
3098                         encap->mask = 0;
3099                         return;
3100                 }
3101         }
3102 }
3103
3104 /**
3105  * Translate flow for Linux TC flower and construct Netlink message.
3106  *
 * @param[in] dev
 *   Pointer to the Ethernet device structure.
 * @param[in, out] dev_flow
 *   Pointer to the sub flow.
3111  * @param[in] attr
3112  *   Pointer to the flow attributes.
3113  * @param[in] items
3114  *   Pointer to the list of items.
3115  * @param[in] actions
3116  *   Pointer to the list of actions.
3117  * @param[out] error
3118  *   Pointer to the error structure.
3119  *
3120  * @return
3121  *   0 on success, a negative errno value otherwise and rte_errno is set.
3122  */
3123 static int
3124 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3125                    const struct rte_flow_attr *attr,
3126                    const struct rte_flow_item items[],
3127                    const struct rte_flow_action actions[],
3128                    struct rte_flow_error *error)
3129 {
3130         union {
3131                 const struct rte_flow_item_port_id *port_id;
3132                 const struct rte_flow_item_eth *eth;
3133                 const struct rte_flow_item_vlan *vlan;
3134                 const struct rte_flow_item_ipv4 *ipv4;
3135                 const struct rte_flow_item_ipv6 *ipv6;
3136                 const struct rte_flow_item_tcp *tcp;
3137                 const struct rte_flow_item_udp *udp;
3138                 const struct rte_flow_item_vxlan *vxlan;
3139         } spec, mask;
3140         union {
3141                 const struct rte_flow_action_port_id *port_id;
3142                 const struct rte_flow_action_jump *jump;
3143                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3144                 const struct rte_flow_action_of_set_vlan_vid *
3145                         of_set_vlan_vid;
3146                 const struct rte_flow_action_of_set_vlan_pcp *
3147                         of_set_vlan_pcp;
3148         } conf;
3149         union {
3150                 struct flow_tcf_tunnel_hdr *hdr;
3151                 struct flow_tcf_vxlan_decap *vxlan;
3152         } decap = {
3153                 .hdr = NULL,
3154         };
3155         union {
3156                 struct flow_tcf_tunnel_hdr *hdr;
3157                 struct flow_tcf_vxlan_encap *vxlan;
3158         } encap = {
3159                 .hdr = NULL,
3160         };
3161         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3162         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3163         struct tcmsg *tcm = dev_flow->tcf.tcm;
3164         uint32_t na_act_index_cur;
3165         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3166         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3167         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3168         bool ip_proto_set = 0;
3169         bool tunnel_outer = 0;
3170         struct nlattr *na_flower;
3171         struct nlattr *na_flower_act;
3172         struct nlattr *na_vlan_id = NULL;
3173         struct nlattr *na_vlan_priority = NULL;
3174         uint64_t item_flags = 0;
3175         int ret;
3176
3177         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3178                                                 PTOI_TABLE_SZ_MAX(dev)));
3179         if (dev_flow->tcf.tunnel) {
3180                 switch (dev_flow->tcf.tunnel->type) {
3181                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3182                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3183                         tunnel_outer = 1;
3184                         break;
3185                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3186                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3187                         break;
3188                 /* New tunnel actions can be added here. */
3189                 default:
3190                         assert(false);
3191                         break;
3192                 }
3193         }
3194         nlh = dev_flow->tcf.nlh;
3195         tcm = dev_flow->tcf.tcm;
3196         /* Prepare API must have been called beforehand. */
3197         assert(nlh != NULL && tcm != NULL);
3198         tcm->tcm_family = AF_UNSPEC;
3199         tcm->tcm_ifindex = ptoi[0].ifindex;
3200         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3201         /*
3202          * Priority cannot be zero to prevent the kernel from picking one
3203          * automatically.
3204          */
3205         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3206         if (attr->group > 0)
3207                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3208         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3209         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3210         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3211                 unsigned int i;
3212
3213                 switch (items->type) {
3214                 case RTE_FLOW_ITEM_TYPE_VOID:
3215                         break;
3216                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3217                         mask.port_id = flow_tcf_item_mask
3218                                 (items, &rte_flow_item_port_id_mask,
3219                                  &flow_tcf_mask_supported.port_id,
3220                                  &flow_tcf_mask_empty.port_id,
3221                                  sizeof(flow_tcf_mask_supported.port_id),
3222                                  error);
3223                         assert(mask.port_id);
3224                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3225                                 break;
3226                         spec.port_id = items->spec;
3227                         if (!mask.port_id->id)
3228                                 i = 0;
3229                         else
3230                                 for (i = 0; ptoi[i].ifindex; ++i)
3231                                         if (ptoi[i].port_id == spec.port_id->id)
3232                                                 break;
3233                         assert(ptoi[i].ifindex);
3234                         tcm->tcm_ifindex = ptoi[i].ifindex;
3235                         break;
3236                 case RTE_FLOW_ITEM_TYPE_ETH:
3237                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3238                                       MLX5_FLOW_LAYER_INNER_L2 :
3239                                       MLX5_FLOW_LAYER_OUTER_L2;
3240                         mask.eth = flow_tcf_item_mask
3241                                 (items, &rte_flow_item_eth_mask,
3242                                  &flow_tcf_mask_supported.eth,
3243                                  &flow_tcf_mask_empty.eth,
3244                                  sizeof(flow_tcf_mask_supported.eth),
3245                                  error);
3246                         assert(mask.eth);
3247                         if (mask.eth == &flow_tcf_mask_empty.eth)
3248                                 break;
3249                         spec.eth = items->spec;
3250                         if (mask.eth->type) {
3251                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3252                                         inner_etype = spec.eth->type;
3253                                 else
3254                                         outer_etype = spec.eth->type;
3255                         }
3256                         if (tunnel_outer) {
3257                                 DRV_LOG(WARNING,
3258                                         "outer L2 addresses cannot be"
3259                                         " forced is outer ones for tunnel,"
3260                                         " parameter is ignored");
3261                                 break;
3262                         }
3263                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3264                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3265                                              ETHER_ADDR_LEN,
3266                                              spec.eth->dst.addr_bytes);
3267                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3268                                              ETHER_ADDR_LEN,
3269                                              mask.eth->dst.addr_bytes);
3270                         }
3271                         if (!is_zero_ether_addr(&mask.eth->src)) {
3272                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3273                                              ETHER_ADDR_LEN,
3274                                              spec.eth->src.addr_bytes);
3275                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3276                                              ETHER_ADDR_LEN,
3277                                              mask.eth->src.addr_bytes);
3278                         }
3279                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3280                         break;
3281                 case RTE_FLOW_ITEM_TYPE_VLAN:
3282                         assert(!encap.hdr);
3283                         assert(!decap.hdr);
3284                         assert(!tunnel_outer);
3285                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3286                         mask.vlan = flow_tcf_item_mask
3287                                 (items, &rte_flow_item_vlan_mask,
3288                                  &flow_tcf_mask_supported.vlan,
3289                                  &flow_tcf_mask_empty.vlan,
3290                                  sizeof(flow_tcf_mask_supported.vlan),
3291                                  error);
3292                         assert(mask.vlan);
3293                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3294                                 break;
3295                         spec.vlan = items->spec;
3296                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3297                                outer_etype == RTE_BE16(ETH_P_8021Q));
3298                         outer_etype = RTE_BE16(ETH_P_8021Q);
3299                         if (mask.vlan->inner_type)
3300                                 vlan_etype = spec.vlan->inner_type;
3301                         if (mask.vlan->tci & RTE_BE16(0xe000))
3302                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3303                                                 (rte_be_to_cpu_16
3304                                                  (spec.vlan->tci) >> 13) & 0x7);
3305                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3306                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3307                                                  rte_be_to_cpu_16
3308                                                  (spec.vlan->tci &
3309                                                   RTE_BE16(0x0fff)));
3310                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3311                         break;
3312                 case RTE_FLOW_ITEM_TYPE_IPV4:
3313                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3314                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3315                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3316                         mask.ipv4 = flow_tcf_item_mask
3317                                 (items, &rte_flow_item_ipv4_mask,
3318                                  &flow_tcf_mask_supported.ipv4,
3319                                  &flow_tcf_mask_empty.ipv4,
3320                                  sizeof(flow_tcf_mask_supported.ipv4),
3321                                  error);
3322                         assert(mask.ipv4);
3323                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3324                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3325                                        inner_etype == RTE_BE16(ETH_P_IP));
3326                                 inner_etype = RTE_BE16(ETH_P_IP);
3327                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3328                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3329                                        vlan_etype == RTE_BE16(ETH_P_IP));
3330                                 vlan_etype = RTE_BE16(ETH_P_IP);
3331                         } else {
3332                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3333                                        outer_etype == RTE_BE16(ETH_P_IP));
3334                                 outer_etype = RTE_BE16(ETH_P_IP);
3335                         }
3336                         spec.ipv4 = items->spec;
3337                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3338                                 /*
3339                                  * No way to set IP protocol for outer tunnel
3340                                  * layers. Usually it is fixed, for example,
3341                                  * to UDP for VXLAN/GPE.
3342                                  */
3343                                 assert(spec.ipv4); /* Mask is not empty. */
3344                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3345                                                 spec.ipv4->hdr.next_proto_id);
3346                                 ip_proto_set = 1;
3347                         }
3348                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3349                              (!mask.ipv4->hdr.src_addr &&
3350                               !mask.ipv4->hdr.dst_addr)) {
3351                                 if (!tunnel_outer)
3352                                         break;
3353                                 /*
3354                                  * For tunnel outer we must set outer IP key
3355                                  * anyway, even if the specification/mask is
3356                                  * empty. There is no other way to tell the
3357                                  * kernel about the outer layer protocol.
3358                                  */
3359                                 mnl_attr_put_u32
3360                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3361                                          mask.ipv4->hdr.src_addr);
3362                                 mnl_attr_put_u32
3363                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3364                                          mask.ipv4->hdr.src_addr);
3365                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3366                                 break;
3367                         }
3368                         if (mask.ipv4->hdr.src_addr) {
3369                                 mnl_attr_put_u32
3370                                         (nlh, tunnel_outer ?
3371                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3372                                          TCA_FLOWER_KEY_IPV4_SRC,
3373                                          spec.ipv4->hdr.src_addr);
3374                                 mnl_attr_put_u32
3375                                         (nlh, tunnel_outer ?
3376                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3377                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3378                                          mask.ipv4->hdr.src_addr);
3379                         }
3380                         if (mask.ipv4->hdr.dst_addr) {
3381                                 mnl_attr_put_u32
3382                                         (nlh, tunnel_outer ?
3383                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3384                                          TCA_FLOWER_KEY_IPV4_DST,
3385                                          spec.ipv4->hdr.dst_addr);
3386                                 mnl_attr_put_u32
3387                                         (nlh, tunnel_outer ?
3388                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3389                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3390                                          mask.ipv4->hdr.dst_addr);
3391                         }
3392                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3393                         break;
3394                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3395                         bool ipv6_src, ipv6_dst;
3396
3397                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3398                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3399                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3400                         mask.ipv6 = flow_tcf_item_mask
3401                                 (items, &rte_flow_item_ipv6_mask,
3402                                  &flow_tcf_mask_supported.ipv6,
3403                                  &flow_tcf_mask_empty.ipv6,
3404                                  sizeof(flow_tcf_mask_supported.ipv6),
3405                                  error);
3406                         assert(mask.ipv6);
3407                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3408                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3409                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3410                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3411                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3412                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3413                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3414                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3415                         } else {
3416                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3417                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3418                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3419                         }
3420                         spec.ipv6 = items->spec;
3421                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3422                                 /*
3423                                  * No way to set IP protocol for outer tunnel
3424                                  * layers. Usually it is fixed, for example,
3425                                  * to UDP for VXLAN/GPE.
3426                                  */
3427                                 assert(spec.ipv6); /* Mask is not empty. */
3428                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3429                                                 spec.ipv6->hdr.proto);
3430                                 ip_proto_set = 1;
3431                         }
3432                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3433                                                 (mask.ipv6->hdr.dst_addr);
3434                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3435                                                 (mask.ipv6->hdr.src_addr);
3436                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3437                              (!ipv6_dst && !ipv6_src)) {
3438                                 if (!tunnel_outer)
3439                                         break;
3440                                 /*
3441                                  * For tunnel outer we must set outer IP key
3442                                  * anyway, even if the specification/mask is
3443                                  * empty. There is no other way to tell the
3444                                  * kernel about the outer layer protocol.
3445                                  */
3446                                 mnl_attr_put(nlh,
3447                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3448                                              IPV6_ADDR_LEN,
3449                                              mask.ipv6->hdr.src_addr);
3450                                 mnl_attr_put(nlh,
3451                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3452                                              IPV6_ADDR_LEN,
3453                                              mask.ipv6->hdr.src_addr);
3454                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3455                                 break;
3456                         }
3457                         if (ipv6_src) {
3458                                 mnl_attr_put(nlh, tunnel_outer ?
3459                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3460                                              TCA_FLOWER_KEY_IPV6_SRC,
3461                                              IPV6_ADDR_LEN,
3462                                              spec.ipv6->hdr.src_addr);
3463                                 mnl_attr_put(nlh, tunnel_outer ?
3464                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3465                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3466                                              IPV6_ADDR_LEN,
3467                                              mask.ipv6->hdr.src_addr);
3468                         }
3469                         if (ipv6_dst) {
3470                                 mnl_attr_put(nlh, tunnel_outer ?
3471                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3472                                              TCA_FLOWER_KEY_IPV6_DST,
3473                                              IPV6_ADDR_LEN,
3474                                              spec.ipv6->hdr.dst_addr);
3475                                 mnl_attr_put(nlh, tunnel_outer ?
3476                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3477                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3478                                              IPV6_ADDR_LEN,
3479                                              mask.ipv6->hdr.dst_addr);
3480                         }
3481                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3482                         break;
3483                 }
3484                 case RTE_FLOW_ITEM_TYPE_UDP:
3485                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3486                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3487                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3488                         mask.udp = flow_tcf_item_mask
3489                                 (items, &rte_flow_item_udp_mask,
3490                                  &flow_tcf_mask_supported.udp,
3491                                  &flow_tcf_mask_empty.udp,
3492                                  sizeof(flow_tcf_mask_supported.udp),
3493                                  error);
3494                         assert(mask.udp);
3495                         spec.udp = items->spec;
3496                         if (!tunnel_outer) {
3497                                 if (!ip_proto_set)
3498                                         mnl_attr_put_u8
3499                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3500                                                 IPPROTO_UDP);
3501                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3502                                         break;
3503                         } else {
3504                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3505                                 decap.vxlan->udp_port =
3506                                         rte_be_to_cpu_16
3507                                                 (spec.udp->hdr.dst_port);
3508                         }
3509                         if (mask.udp->hdr.src_port) {
3510                                 mnl_attr_put_u16
3511                                         (nlh, tunnel_outer ?
3512                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3513                                          TCA_FLOWER_KEY_UDP_SRC,
3514                                          spec.udp->hdr.src_port);
3515                                 mnl_attr_put_u16
3516                                         (nlh, tunnel_outer ?
3517                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3518                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3519                                          mask.udp->hdr.src_port);
3520                         }
3521                         if (mask.udp->hdr.dst_port) {
3522                                 mnl_attr_put_u16
3523                                         (nlh, tunnel_outer ?
3524                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3525                                          TCA_FLOWER_KEY_UDP_DST,
3526                                          spec.udp->hdr.dst_port);
3527                                 mnl_attr_put_u16
3528                                         (nlh, tunnel_outer ?
3529                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3530                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3531                                          mask.udp->hdr.dst_port);
3532                         }
3533                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3534                         break;
3535                 case RTE_FLOW_ITEM_TYPE_TCP:
3536                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3537                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3538                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3539                         mask.tcp = flow_tcf_item_mask
3540                                 (items, &rte_flow_item_tcp_mask,
3541                                  &flow_tcf_mask_supported.tcp,
3542                                  &flow_tcf_mask_empty.tcp,
3543                                  sizeof(flow_tcf_mask_supported.tcp),
3544                                  error);
3545                         assert(mask.tcp);
3546                         if (!ip_proto_set)
3547                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3548                                                 IPPROTO_TCP);
3549                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3550                                 break;
3551                         spec.tcp = items->spec;
3552                         if (mask.tcp->hdr.src_port) {
3553                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3554                                                  spec.tcp->hdr.src_port);
3555                                 mnl_attr_put_u16(nlh,
3556                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3557                                                  mask.tcp->hdr.src_port);
3558                         }
3559                         if (mask.tcp->hdr.dst_port) {
3560                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3561                                                  spec.tcp->hdr.dst_port);
3562                                 mnl_attr_put_u16(nlh,
3563                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3564                                                  mask.tcp->hdr.dst_port);
3565                         }
3566                         if (mask.tcp->hdr.tcp_flags) {
3567                                 mnl_attr_put_u16
3568                                         (nlh,
3569                                          TCA_FLOWER_KEY_TCP_FLAGS,
3570                                          rte_cpu_to_be_16
3571                                                 (spec.tcp->hdr.tcp_flags));
3572                                 mnl_attr_put_u16
3573                                         (nlh,
3574                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3575                                          rte_cpu_to_be_16
3576                                                 (mask.tcp->hdr.tcp_flags));
3577                         }
3578                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3579                         break;
3580                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3581                         assert(decap.vxlan);
3582                         tunnel_outer = 0;
3583                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3584                         spec.vxlan = items->spec;
3585                         mnl_attr_put_u32(nlh,
3586                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3587                                          vxlan_vni_as_be32(spec.vxlan->vni));
3588                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3589                         break;
3590                 default:
3591                         return rte_flow_error_set(error, ENOTSUP,
3592                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3593                                                   NULL, "item not supported");
3594                 }
3595         }
3596         /*
3597          * Set the ether_type flower key and tc rule protocol:
3598          * - if there is neither VLAN nor VXLAN the key is taken from
3599          *   eth item directly or deduced from L3 items.
3600          * - if there is vlan item then key is fixed to 802.1q.
3601          * - if there is vxlan item then key is set to inner tunnel type.
3602          * - simultaneous vlan and vxlan items are prohibited.
3603          */
3604         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3605                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3606                                            outer_etype);
3607                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3608                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3609                                 mnl_attr_put_u16(nlh,
3610                                                  TCA_FLOWER_KEY_ETH_TYPE,
3611                                                  inner_etype);
3612                 } else {
3613                         mnl_attr_put_u16(nlh,
3614                                          TCA_FLOWER_KEY_ETH_TYPE,
3615                                          outer_etype);
3616                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3617                             vlan_etype != RTE_BE16(ETH_P_ALL))
3618                                 mnl_attr_put_u16(nlh,
3619                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3620                                                  vlan_etype);
3621                 }
3622                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3623         }
3624         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3625         na_act_index_cur = 1;
3626         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3627                 struct nlattr *na_act_index;
3628                 struct nlattr *na_act;
3629                 unsigned int vlan_act;
3630                 unsigned int i;
3631
3632                 switch (actions->type) {
3633                 case RTE_FLOW_ACTION_TYPE_VOID:
3634                         break;
3635                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3636                         conf.port_id = actions->conf;
3637                         if (conf.port_id->original)
3638                                 i = 0;
3639                         else
3640                                 for (i = 0; ptoi[i].ifindex; ++i)
3641                                         if (ptoi[i].port_id == conf.port_id->id)
3642                                                 break;
3643                         assert(ptoi[i].ifindex);
3644                         na_act_index =
3645                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3646                         assert(na_act_index);
3647                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3648                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3649                         assert(na_act);
3650                         if (encap.hdr) {
3651                                 assert(dev_flow->tcf.tunnel);
3652                                 dev_flow->tcf.tunnel->ifindex_ptr =
3653                                         &((struct tc_mirred *)
3654                                         mnl_attr_get_payload
3655                                         (mnl_nlmsg_get_payload_tail
3656                                                 (nlh)))->ifindex;
3657                         }
3658                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3659                                      sizeof(struct tc_mirred),
3660                                      &(struct tc_mirred){
3661                                         .action = TC_ACT_STOLEN,
3662                                         .eaction = TCA_EGRESS_REDIR,
3663                                         .ifindex = ptoi[i].ifindex,
3664                                      });
3665                         mnl_attr_nest_end(nlh, na_act);
3666                         mnl_attr_nest_end(nlh, na_act_index);
3667                         break;
3668                 case RTE_FLOW_ACTION_TYPE_JUMP:
3669                         conf.jump = actions->conf;
3670                         na_act_index =
3671                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3672                         assert(na_act_index);
3673                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3674                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3675                         assert(na_act);
3676                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3677                                      sizeof(struct tc_gact),
3678                                      &(struct tc_gact){
3679                                         .action = TC_ACT_GOTO_CHAIN |
3680                                                   conf.jump->group,
3681                                      });
3682                         mnl_attr_nest_end(nlh, na_act);
3683                         mnl_attr_nest_end(nlh, na_act_index);
3684                         break;
3685                 case RTE_FLOW_ACTION_TYPE_DROP:
3686                         na_act_index =
3687                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3688                         assert(na_act_index);
3689                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3690                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3691                         assert(na_act);
3692                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3693                                      sizeof(struct tc_gact),
3694                                      &(struct tc_gact){
3695                                         .action = TC_ACT_SHOT,
3696                                      });
3697                         mnl_attr_nest_end(nlh, na_act);
3698                         mnl_attr_nest_end(nlh, na_act_index);
3699                         break;
3700                 case RTE_FLOW_ACTION_TYPE_COUNT:
3701                         /*
3702                          * Driver adds the count action implicitly for
3703                          * each rule it creates.
3704                          */
3705                         ret = flow_tcf_translate_action_count(dev,
3706                                                               dev_flow, error);
3707                         if (ret < 0)
3708                                 return ret;
3709                         break;
3710                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3711                         conf.of_push_vlan = NULL;
3712                         vlan_act = TCA_VLAN_ACT_POP;
3713                         goto action_of_vlan;
3714                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3715                         conf.of_push_vlan = actions->conf;
3716                         vlan_act = TCA_VLAN_ACT_PUSH;
3717                         goto action_of_vlan;
3718                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3719                         conf.of_set_vlan_vid = actions->conf;
3720                         if (na_vlan_id)
3721                                 goto override_na_vlan_id;
3722                         vlan_act = TCA_VLAN_ACT_MODIFY;
3723                         goto action_of_vlan;
3724                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3725                         conf.of_set_vlan_pcp = actions->conf;
3726                         if (na_vlan_priority)
3727                                 goto override_na_vlan_priority;
3728                         vlan_act = TCA_VLAN_ACT_MODIFY;
3729                         goto action_of_vlan;
3730 action_of_vlan:
3731                         na_act_index =
3732                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3733                         assert(na_act_index);
3734                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3735                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3736                         assert(na_act);
3737                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3738                                      sizeof(struct tc_vlan),
3739                                      &(struct tc_vlan){
3740                                         .action = TC_ACT_PIPE,
3741                                         .v_action = vlan_act,
3742                                      });
3743                         if (vlan_act == TCA_VLAN_ACT_POP) {
3744                                 mnl_attr_nest_end(nlh, na_act);
3745                                 mnl_attr_nest_end(nlh, na_act_index);
3746                                 break;
3747                         }
3748                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3749                                 mnl_attr_put_u16(nlh,
3750                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3751                                                  conf.of_push_vlan->ethertype);
3752                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3753                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3754                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3755                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3756                         mnl_attr_nest_end(nlh, na_act);
3757                         mnl_attr_nest_end(nlh, na_act_index);
3758                         if (actions->type ==
3759                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3760 override_na_vlan_id:
3761                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3762                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3763                                         rte_be_to_cpu_16
3764                                         (conf.of_set_vlan_vid->vlan_vid);
3765                         } else if (actions->type ==
3766                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3767 override_na_vlan_priority:
3768                                 na_vlan_priority->nla_type =
3769                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3770                                 *(uint8_t *)mnl_attr_get_payload
3771                                         (na_vlan_priority) =
3772                                         conf.of_set_vlan_pcp->vlan_pcp;
3773                         }
3774                         break;
3775                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3776                         assert(decap.vxlan);
3777                         assert(dev_flow->tcf.tunnel);
3778                         dev_flow->tcf.tunnel->ifindex_ptr =
3779                                 (unsigned int *)&tcm->tcm_ifindex;
3780                         na_act_index =
3781                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3782                         assert(na_act_index);
3783                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3784                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3785                         assert(na_act);
3786                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3787                                 sizeof(struct tc_tunnel_key),
3788                                 &(struct tc_tunnel_key){
3789                                         .action = TC_ACT_PIPE,
3790                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3791                                         });
3792                         mnl_attr_nest_end(nlh, na_act);
3793                         mnl_attr_nest_end(nlh, na_act_index);
3794                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3795                         break;
3796                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3797                         assert(encap.vxlan);
3798                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3799                         na_act_index =
3800                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3801                         assert(na_act_index);
3802                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3803                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3804                         assert(na_act);
3805                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3806                                 sizeof(struct tc_tunnel_key),
3807                                 &(struct tc_tunnel_key){
3808                                         .action = TC_ACT_PIPE,
3809                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3810                                         });
3811                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3812                                 mnl_attr_put_u16(nlh,
3813                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3814                                          encap.vxlan->udp.dst);
3815                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3816                                 mnl_attr_put_u32(nlh,
3817                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3818                                          encap.vxlan->ipv4.src);
3819                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3820                                 mnl_attr_put_u32(nlh,
3821                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3822                                          encap.vxlan->ipv4.dst);
3823                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3824                                 mnl_attr_put(nlh,
3825                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3826                                          sizeof(encap.vxlan->ipv6.src),
3827                                          &encap.vxlan->ipv6.src);
3828                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3829                                 mnl_attr_put(nlh,
3830                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3831                                          sizeof(encap.vxlan->ipv6.dst),
3832                                          &encap.vxlan->ipv6.dst);
3833                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3834                                 mnl_attr_put_u32(nlh,
3835                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3836                                          vxlan_vni_as_be32
3837                                                 (encap.vxlan->vxlan.vni));
3838                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3839                         mnl_attr_nest_end(nlh, na_act);
3840                         mnl_attr_nest_end(nlh, na_act_index);
3841                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3842                         break;
3843                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3844                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3845                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3846                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3847                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3848                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3849                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3850                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3851                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3852                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3853                         na_act_index =
3854                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3855                         flow_tcf_create_pedit_mnl_msg(nlh,
3856                                                       &actions, item_flags);
3857                         mnl_attr_nest_end(nlh, na_act_index);
3858                         break;
3859                 default:
3860                         return rte_flow_error_set(error, ENOTSUP,
3861                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3862                                                   actions,
3863                                                   "action not supported");
3864                 }
3865         }
3866         assert(na_flower);
3867         assert(na_flower_act);
3868         mnl_attr_nest_end(nlh, na_flower_act);
3869         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3870                                         (mnl_nlmsg_get_payload_tail(nlh));
3871         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3872                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3873         mnl_attr_nest_end(nlh, na_flower);
3874         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3875                 dev_flow->tcf.tunnel->ifindex_org =
3876                         *dev_flow->tcf.tunnel->ifindex_ptr;
3877         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3878         return 0;
3879 }
3880
3881 /**
3882  * Send Netlink message with acknowledgment.
3883  *
3884  * @param tcf
3885  *   Flow context to use.
3886  * @param nlh
3887  *   Message to send. This function always raises the NLM_F_ACK flag before
3888  *   sending.
3889  * @param[in] cb
3890  *   Callback handler for received message.
3891  * @param[in] arg
3892  *   Context pointer for callback handler.
3893  *
3894  * @return
3895  *   0 on success, a negative errno value otherwise and rte_errno is set.
3896  */
3897 static int
3898 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3899                 struct nlmsghdr *nlh,
3900                 mnl_cb_t cb, void *arg)
3901 {
3902         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3903         uint32_t seq = tcf->seq++;
3904         int ret, err = 0;
3905
3906         assert(tcf->nl);
3907         assert(tcf->buf);
3908         if (!seq) {
3909                 /* seq 0 is reserved for kernel event-driven notifications. */
3910                 seq = tcf->seq++;
3911         }
3912         nlh->nlmsg_seq = seq;
3913         nlh->nlmsg_flags |= NLM_F_ACK;
3914         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3915         if (ret <= 0) {
3916                 /* Message send error occurres. */
3917                 rte_errno = errno;
3918                 return -rte_errno;
3919         }
3920         nlh = (struct nlmsghdr *)(tcf->buf);
3921         /*
3922          * The following loop postpones non-fatal errors until multipart
3923          * messages are complete.
3924          */
3925         while (true) {
3926                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3927                 if (ret < 0) {
3928                         err = errno;
3929                         /*
3930                          * In case of overflow Will receive till
3931                          * end of multipart message. We may lost part
3932                          * of reply messages but mark and return an error.
3933                          */
3934                         if (err != ENOSPC ||
3935                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3936                             nlh->nlmsg_type == NLMSG_DONE)
3937                                 break;
3938                 } else {
3939                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3940                         if (!ret) {
3941                                 /*
3942                                  * libmnl returns 0 if DONE or
3943                                  * success ACK message found.
3944                                  */
3945                                 break;
3946                         }
3947                         if (ret < 0) {
3948                                 /*
3949                                  * ACK message with error found
3950                                  * or some error occurred.
3951                                  */
3952                                 err = errno;
3953                                 break;
3954                         }
3955                         /* We should continue receiving. */
3956                 }
3957         }
3958         if (!err)
3959                 return 0;
3960         rte_errno = err;
3961         return -err;
3962 }
3963
3964 #define MNL_BUF_EXTRA_SPACE 16
3965 #define MNL_REQUEST_SIZE_MIN 256 /* Lower clamp used by MNL_REQUEST_SIZE. */
3966 #define MNL_REQUEST_SIZE_MAX 2048 /* Upper clamp used by MNL_REQUEST_SIZE. */
3967 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3968                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX) /* Page size clamped to [MIN, MAX]; evaluated at each use. */
3969
3970 /* Data structures used by flow_tcf_xxx_cb() routines. */
3971 struct tcf_nlcb_buf {
3972         LIST_ENTRY(tcf_nlcb_buf) next; /**< Linkage in tcf_nlcb_context::nlbuf. */
3973         uint32_t size; /**< Bytes of msg[] already handed out to commands. */
3974         alignas(struct nlmsghdr)
3975         uint8_t msg[]; /**< Netlink message data. */
3976 };
3977
3978 struct tcf_nlcb_context {
3979         unsigned int ifindex; /**< Base interface index. */
3980         uint32_t bufsize; /**< Capacity in bytes of msg[] in every buffer. */
3981         LIST_HEAD(, tcf_nlcb_buf) nlbuf; /**< Command buffers, newest at the head. */
3982 };
3983
3984 /**
3985  * Allocate space for netlink command in buffer list
3986  *
3987  * @param[in, out] ctx
3988  *   Pointer to callback context with command buffers list.
3989  * @param[in] size
3990  *   Required size of data buffer to be allocated.
3991  *
3992  * @return
3993  *   Pointer to allocated memory, aligned as message header.
3994  *   NULL if some error occurred.
3995  */
3996 static struct nlmsghdr *
3997 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3998 {
3999         struct tcf_nlcb_buf *buf;
4000         struct nlmsghdr *nlh;
4001
4002         size = NLMSG_ALIGN(size);
4003         buf = LIST_FIRST(&ctx->nlbuf);
4004         if (buf && (buf->size + size) <= ctx->bufsize) {
4005                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4006                 buf->size += size;
4007                 return nlh;
4008         }
4009         if (size > ctx->bufsize) {
4010                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4011                 return NULL;
4012         }
4013         buf = rte_malloc(__func__,
4014                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4015                         alignof(struct tcf_nlcb_buf));
4016         if (!buf) {
4017                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4018                 return NULL;
4019         }
4020         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4021         buf->size = size;
4022         nlh = (struct nlmsghdr *)&buf->msg[0];
4023         return nlh;
4024 }
4025
4026 /**
4027  * Send the buffers with prepared netlink commands. Scans the list and
4028  * sends all found buffers. Buffers are sent and freed anyway in order
4029  * to prevent memory leakage if some every message in received packet.
4030  *
4031  * @param[in] tcf
4032  *   Context object initialized by mlx5_flow_tcf_context_create().
4033  * @param[in, out] ctx
4034  *   Pointer to callback context with command buffers list.
4035  *
4036  * @return
4037  *   Zero value on success, negative errno value otherwise
4038  *   and rte_errno is set.
4039  */
4040 static int
4041 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4042                     struct tcf_nlcb_context *ctx)
4043 {
4044         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4045         int ret = 0;
4046
4047         while (bc) {
4048                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4049                 struct nlmsghdr *nlh;
4050                 uint32_t msg = 0;
4051                 int rc;
4052
4053                 while (msg < bc->size) {
4054                         /*
4055                          * Send Netlink commands from buffer in one by one
4056                          * fashion. If we send multiple rule deletion commands
4057                          * in one Netlink message and some error occurs it may
4058                          * cause multiple ACK error messages and break sequence
4059                          * numbers of Netlink communication, because we expect
4060                          * the only one ACK reply.
4061                          */
4062                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4063                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4064                         assert((bc->size - msg) >= nlh->nlmsg_len);
4065                         msg += nlh->nlmsg_len;
4066                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4067                         if (rc) {
4068                                 DRV_LOG(WARNING,
4069                                         "netlink: cleanup error %d", rc);
4070                                 if (!ret)
4071                                         ret = rc;
4072                         }
4073                 }
4074                 rte_free(bc);
4075                 bc = bn;
4076         }
4077         LIST_INIT(&ctx->nlbuf);
4078         return ret;
4079 }
4080
4081 /**
4082  * Collect local IP address rules with scope link attribute  on specified
4083  * network device. This is callback routine called by libmnl mnl_cb_run()
4084  * in loop for every message in received packet.
4085  *
4086  * @param[in] nlh
4087  *   Pointer to reply header.
4088  * @param[in, out] arg
4089  *   Opaque data pointer for this callback.
4090  *
4091  * @return
4092  *   A positive, nonzero value on success, negative errno value otherwise
4093  *   and rte_errno is set.
4094  */
4095 static int
4096 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4097 {
4098         struct tcf_nlcb_context *ctx = arg;
4099         struct nlmsghdr *cmd;
4100         struct ifaddrmsg *ifa;
4101         struct nlattr *na;
4102         struct nlattr *na_local = NULL;
4103         struct nlattr *na_peer = NULL;
4104         unsigned char family;
4105         uint32_t size;
4106
4107         if (nlh->nlmsg_type != RTM_NEWADDR) {
4108                 rte_errno = EINVAL;
4109                 return -rte_errno;
4110         }
4111         ifa = mnl_nlmsg_get_payload(nlh);
4112         family = ifa->ifa_family;
4113         if (ifa->ifa_index != ctx->ifindex ||
4114             ifa->ifa_scope != RT_SCOPE_LINK ||
4115             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4116             (family != AF_INET && family != AF_INET6))
4117                 return 1;
4118         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4119                 switch (mnl_attr_get_type(na)) {
4120                 case IFA_LOCAL:
4121                         na_local = na;
4122                         break;
4123                 case IFA_ADDRESS:
4124                         na_peer = na;
4125                         break;
4126                 }
4127                 if (na_local && na_peer)
4128                         break;
4129         }
4130         if (!na_local || !na_peer)
4131                 return 1;
4132         /* Local rule found with scope link, permanent and assigned peer. */
4133         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4134                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4135                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4136                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4137         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4138         if (!cmd) {
4139                 rte_errno = ENOMEM;
4140                 return -rte_errno;
4141         }
4142         cmd = mnl_nlmsg_put_header(cmd);
4143         cmd->nlmsg_type = RTM_DELADDR;
4144         cmd->nlmsg_flags = NLM_F_REQUEST;
4145         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4146         ifa->ifa_flags = IFA_F_PERMANENT;
4147         ifa->ifa_scope = RT_SCOPE_LINK;
4148         ifa->ifa_index = ctx->ifindex;
4149         if (family == AF_INET) {
4150                 ifa->ifa_family = AF_INET;
4151                 ifa->ifa_prefixlen = 32;
4152                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4153                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4154         } else {
4155                 ifa->ifa_family = AF_INET6;
4156                 ifa->ifa_prefixlen = 128;
4157                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4158                         mnl_attr_get_payload(na_local));
4159                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4160                         mnl_attr_get_payload(na_peer));
4161         }
4162         assert(size == cmd->nlmsg_len);
4163         return 1;
4164 }
4165
4166 /**
4167  * Cleanup the local IP addresses on outer interface.
4168  *
4169  * @param[in] tcf
4170  *   Context object initialized by mlx5_flow_tcf_context_create().
4171  * @param[in] ifindex
4172  *   Network inferface index to perform cleanup.
4173  */
4174 static void
4175 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4176                             unsigned int ifindex)
4177 {
4178         struct nlmsghdr *nlh;
4179         struct ifaddrmsg *ifa;
4180         struct tcf_nlcb_context ctx = {
4181                 .ifindex = ifindex,
4182                 .bufsize = MNL_REQUEST_SIZE,
4183                 .nlbuf = LIST_HEAD_INITIALIZER(),
4184         };
4185         int ret;
4186
4187         assert(ifindex);
4188         /*
4189          * Seek and destroy leftovers of local IP addresses with
4190          * matching properties "scope link".
4191          */
4192         nlh = mnl_nlmsg_put_header(tcf->buf);
4193         nlh->nlmsg_type = RTM_GETADDR;
4194         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4195         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4196         ifa->ifa_family = AF_UNSPEC;
4197         ifa->ifa_index = ifindex;
4198         ifa->ifa_scope = RT_SCOPE_LINK;
4199         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4200         if (ret)
4201                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4202         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4203         if (ret)
4204                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4205 }
4206
4207 /**
4208  * Collect neigh permament rules on specified network device.
4209  * This is callback routine called by libmnl mnl_cb_run() in loop for
4210  * every message in received packet.
4211  *
4212  * @param[in] nlh
4213  *   Pointer to reply header.
4214  * @param[in, out] arg
4215  *   Opaque data pointer for this callback.
4216  *
4217  * @return
4218  *   A positive, nonzero value on success, negative errno value otherwise
4219  *   and rte_errno is set.
4220  */
4221 static int
4222 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4223 {
4224         struct tcf_nlcb_context *ctx = arg;
4225         struct nlmsghdr *cmd;
4226         struct ndmsg *ndm;
4227         struct nlattr *na;
4228         struct nlattr *na_ip = NULL;
4229         struct nlattr *na_mac = NULL;
4230         unsigned char family;
4231         uint32_t size;
4232
4233         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4234                 rte_errno = EINVAL;
4235                 return -rte_errno;
4236         }
4237         ndm = mnl_nlmsg_get_payload(nlh);
4238         family = ndm->ndm_family;
4239         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4240            !(ndm->ndm_state & NUD_PERMANENT) ||
4241            (family != AF_INET && family != AF_INET6))
4242                 return 1;
4243         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4244                 switch (mnl_attr_get_type(na)) {
4245                 case NDA_DST:
4246                         na_ip = na;
4247                         break;
4248                 case NDA_LLADDR:
4249                         na_mac = na;
4250                         break;
4251                 }
4252                 if (na_mac && na_ip)
4253                         break;
4254         }
4255         if (!na_mac || !na_ip)
4256                 return 1;
4257         /* Neigh rule with permenent attribute found. */
4258         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4259                MNL_ALIGN(sizeof(struct ndmsg)) +
4260                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4261                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4262                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4263         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4264         if (!cmd) {
4265                 rte_errno = ENOMEM;
4266                 return -rte_errno;
4267         }
4268         cmd = mnl_nlmsg_put_header(cmd);
4269         cmd->nlmsg_type = RTM_DELNEIGH;
4270         cmd->nlmsg_flags = NLM_F_REQUEST;
4271         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4272         ndm->ndm_ifindex = ctx->ifindex;
4273         ndm->ndm_state = NUD_PERMANENT;
4274         ndm->ndm_flags = 0;
4275         ndm->ndm_type = 0;
4276         if (family == AF_INET) {
4277                 ndm->ndm_family = AF_INET;
4278                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4279         } else {
4280                 ndm->ndm_family = AF_INET6;
4281                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4282                              mnl_attr_get_payload(na_ip));
4283         }
4284         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4285                      mnl_attr_get_payload(na_mac));
4286         assert(size == cmd->nlmsg_len);
4287         return 1;
4288 }
4289
4290 /**
4291  * Cleanup the neigh rules on outer interface.
4292  *
4293  * @param[in] tcf
4294  *   Context object initialized by mlx5_flow_tcf_context_create().
4295  * @param[in] ifindex
4296  *   Network inferface index to perform cleanup.
4297  */
4298 static void
4299 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4300                             unsigned int ifindex)
4301 {
4302         struct nlmsghdr *nlh;
4303         struct ndmsg *ndm;
4304         struct tcf_nlcb_context ctx = {
4305                 .ifindex = ifindex,
4306                 .bufsize = MNL_REQUEST_SIZE,
4307                 .nlbuf = LIST_HEAD_INITIALIZER(),
4308         };
4309         int ret;
4310
4311         assert(ifindex);
4312         /* Seek and destroy leftovers of neigh rules. */
4313         nlh = mnl_nlmsg_put_header(tcf->buf);
4314         nlh->nlmsg_type = RTM_GETNEIGH;
4315         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4316         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4317         ndm->ndm_family = AF_UNSPEC;
4318         ndm->ndm_ifindex = ifindex;
4319         ndm->ndm_state = NUD_PERMANENT;
4320         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4321         if (ret)
4322                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4323         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4324         if (ret)
4325                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4326 }
4327
4328 /**
4329  * Collect indices of VXLAN encap/decap interfaces associated with device.
4330  * This is callback routine called by libmnl mnl_cb_run() in loop for
4331  * every message in received packet.
4332  *
4333  * @param[in] nlh
4334  *   Pointer to reply header.
4335  * @param[in, out] arg
4336  *   Opaque data pointer for this callback.
4337  *
4338  * @return
4339  *   A positive, nonzero value on success, negative errno value otherwise
4340  *   and rte_errno is set.
4341  */
4342 static int
4343 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4344 {
4345         struct tcf_nlcb_context *ctx = arg;
4346         struct nlmsghdr *cmd;
4347         struct ifinfomsg *ifm;
4348         struct nlattr *na;
4349         struct nlattr *na_info = NULL;
4350         struct nlattr *na_vxlan = NULL;
4351         bool found = false;
4352         unsigned int vxindex;
4353         uint32_t size;
4354
4355         if (nlh->nlmsg_type != RTM_NEWLINK) {
4356                 rte_errno = EINVAL;
4357                 return -rte_errno;
4358         }
4359         ifm = mnl_nlmsg_get_payload(nlh);
4360         if (!ifm->ifi_index) {
4361                 rte_errno = EINVAL;
4362                 return -rte_errno;
4363         }
4364         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4365                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4366                         na_info = na;
4367                         break;
4368                 }
4369         if (!na_info)
4370                 return 1;
4371         mnl_attr_for_each_nested(na, na_info) {
4372                 switch (mnl_attr_get_type(na)) {
4373                 case IFLA_INFO_KIND:
4374                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4375                                      mnl_attr_get_len(na)))
4376                                 found = true;
4377                         break;
4378                 case IFLA_INFO_DATA:
4379                         na_vxlan = na;
4380                         break;
4381                 }
4382                 if (found && na_vxlan)
4383                         break;
4384         }
4385         if (!found || !na_vxlan)
4386                 return 1;
4387         found = false;
4388         mnl_attr_for_each_nested(na, na_vxlan) {
4389                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4390                     mnl_attr_get_u32(na) == ctx->ifindex) {
4391                         found = true;
4392                         break;
4393                 }
4394         }
4395         if (!found)
4396                 return 1;
4397         /* Attached VXLAN device found, store the command to delete. */
4398         vxindex = ifm->ifi_index;
4399         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4400                MNL_ALIGN(sizeof(struct ifinfomsg));
4401         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4402         if (!cmd) {
4403                 rte_errno = ENOMEM;
4404                 return -rte_errno;
4405         }
4406         cmd = mnl_nlmsg_put_header(cmd);
4407         cmd->nlmsg_type = RTM_DELLINK;
4408         cmd->nlmsg_flags = NLM_F_REQUEST;
4409         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4410         ifm->ifi_family = AF_UNSPEC;
4411         ifm->ifi_index = vxindex;
4412         assert(size == cmd->nlmsg_len);
4413         return 1;
4414 }
4415
4416 /**
4417  * Cleanup the outer interface. Removes all found vxlan devices
4418  * attached to specified index, flushes the neigh and local IP
4419  * database.
4420  *
4421  * @param[in] tcf
4422  *   Context object initialized by mlx5_flow_tcf_context_create().
4423  * @param[in] ifindex
4424  *   Network inferface index to perform cleanup.
4425  */
4426 static void
4427 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4428                             unsigned int ifindex)
4429 {
4430         struct nlmsghdr *nlh;
4431         struct ifinfomsg *ifm;
4432         struct tcf_nlcb_context ctx = {
4433                 .ifindex = ifindex,
4434                 .bufsize = MNL_REQUEST_SIZE,
4435                 .nlbuf = LIST_HEAD_INITIALIZER(),
4436         };
4437         int ret;
4438
4439         assert(ifindex);
4440         /*
4441          * Seek and destroy leftover VXLAN encap/decap interfaces with
4442          * matching properties.
4443          */
4444         nlh = mnl_nlmsg_put_header(tcf->buf);
4445         nlh->nlmsg_type = RTM_GETLINK;
4446         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4447         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4448         ifm->ifi_family = AF_UNSPEC;
4449         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4450         if (ret)
4451                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4452         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4453         if (ret)
4454                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4455 }
4456
4457 /**
4458  * Emit Netlink message to add/remove local address to the outer device.
4459  * The address being added is visible within the link only (scope link).
4460  *
4461  * Note that an implicit route is maintained by the kernel due to the
4462  * presence of a peer address (IFA_ADDRESS).
4463  *
4464  * These rules are used for encapsultion only and allow to assign
4465  * the outer tunnel source IP address.
4466  *
4467  * @param[in] tcf
4468  *   Libmnl socket context object.
4469  * @param[in] encap
4470  *   Encapsulation properties (source address and its peer).
4471  * @param[in] ifindex
4472  *   Network interface to apply rule.
4473  * @param[in] enable
4474  *   Toggle between add and remove.
4475  * @param[out] error
4476  *   Perform verbose error reporting if not NULL.
4477  *
4478  * @return
4479  *   0 on success, a negative errno value otherwise and rte_errno is set.
4480  */
4481 static int
4482 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4483                     const struct flow_tcf_vxlan_encap *encap,
4484                     unsigned int ifindex,
4485                     bool enable,
4486                     struct rte_flow_error *error)
4487 {
4488         struct nlmsghdr *nlh;
4489         struct ifaddrmsg *ifa;
4490         alignas(struct nlmsghdr)
4491         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4492
4493         nlh = mnl_nlmsg_put_header(buf);
4494         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4495         nlh->nlmsg_flags =
4496                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4497         nlh->nlmsg_seq = 0;
4498         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4499         ifa->ifa_flags = IFA_F_PERMANENT;
4500         ifa->ifa_scope = RT_SCOPE_LINK;
4501         ifa->ifa_index = ifindex;
4502         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4503                 ifa->ifa_family = AF_INET;
4504                 ifa->ifa_prefixlen = 32;
4505                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4506                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4507                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4508                                               encap->ipv4.dst);
4509         } else {
4510                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4511                 ifa->ifa_family = AF_INET6;
4512                 ifa->ifa_prefixlen = 128;
4513                 mnl_attr_put(nlh, IFA_LOCAL,
4514                                   sizeof(encap->ipv6.src),
4515                                   &encap->ipv6.src);
4516                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4517                         mnl_attr_put(nlh, IFA_ADDRESS,
4518                                           sizeof(encap->ipv6.dst),
4519                                           &encap->ipv6.dst);
4520         }
4521         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4522                 return 0;
4523         return rte_flow_error_set(error, rte_errno,
4524                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4525                                   "netlink: cannot complete IFA request"
4526                                   " (ip addr add)");
4527 }
4528
4529 /**
4530  * Emit Netlink message to add/remove neighbor.
4531  *
4532  * @param[in] tcf
4533  *   Libmnl socket context object.
4534  * @param[in] encap
4535  *   Encapsulation properties (destination address).
4536  * @param[in] ifindex
4537  *   Network interface.
4538  * @param[in] enable
4539  *   Toggle between add and remove.
4540  * @param[out] error
4541  *   Perform verbose error reporting if not NULL.
4542  *
4543  * @return
4544  *   0 on success, a negative errno value otherwise and rte_errno is set.
4545  */
4546 static int
4547 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4548                      const struct flow_tcf_vxlan_encap *encap,
4549                      unsigned int ifindex,
4550                      bool enable,
4551                      struct rte_flow_error *error)
4552 {
4553         struct nlmsghdr *nlh;
4554         struct ndmsg *ndm;
4555         alignas(struct nlmsghdr)
4556         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4557
4558         nlh = mnl_nlmsg_put_header(buf);
4559         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4560         nlh->nlmsg_flags =
4561                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4562         nlh->nlmsg_seq = 0;
4563         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4564         ndm->ndm_ifindex = ifindex;
4565         ndm->ndm_state = NUD_PERMANENT;
4566         ndm->ndm_flags = 0;
4567         ndm->ndm_type = 0;
4568         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4569                 ndm->ndm_family = AF_INET;
4570                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4571         } else {
4572                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4573                 ndm->ndm_family = AF_INET6;
4574                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4575                                                  &encap->ipv6.dst);
4576         }
4577         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4578                 DRV_LOG(WARNING,
4579                         "outer ethernet source address cannot be "
4580                         "forced for VXLAN encapsulation");
4581         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4582                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4583                                                     &encap->eth.dst);
4584         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4585                 return 0;
4586         return rte_flow_error_set(error, rte_errno,
4587                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4588                                   "netlink: cannot complete ND request"
4589                                   " (ip neigh)");
4590 }
4591
4592 /**
4593  * Manage the local IP addresses and their peers IP addresses on the
4594  * outer interface for encapsulation purposes. The kernel searches the
4595  * appropriate device for tunnel egress traffic using the outer source
4596  * IP, this IP should be assigned to the outer network device, otherwise
4597  * kernel rejects the rule.
4598  *
4599  * Adds or removes the addresses using the Netlink command like this:
4600  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4601  *
4602  * The addresses are local to the netdev ("scope link"), this reduces
4603  * the risk of conflicts. Note that an implicit route is maintained by
4604  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4605  *
4606  * @param[in] tcf
4607  *   Libmnl socket context object.
4608  * @param[in] iface
4609  *   Object, contains rule database and ifouter index.
4610  * @param[in] dev_flow
4611  *   Flow object, contains the tunnel parameters (for encap only).
4612  * @param[in] enable
4613  *   Toggle between add and remove.
4614  * @param[out] error
4615  *   Perform verbose error reporting if not NULL.
4616  *
4617  * @return
4618  *   0 on success, a negative errno value otherwise and rte_errno is set.
4619  */
4620 static int
4621 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4622                      struct tcf_irule *iface,
4623                      struct mlx5_flow *dev_flow,
4624                      bool enable,
4625                      struct rte_flow_error *error)
4626 {
4627         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4628         struct tcf_local_rule *rule = NULL;
4629         int ret;
4630
4631         assert(encap);
4632         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4633         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4634                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4635                 LIST_FOREACH(rule, &iface->local, next) {
4636                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4637                             encap->ipv4.src == rule->ipv4.src &&
4638                             encap->ipv4.dst == rule->ipv4.dst) {
4639                                 break;
4640                         }
4641                 }
4642         } else {
4643                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4644                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4645                 LIST_FOREACH(rule, &iface->local, next) {
4646                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4647                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4648                                             sizeof(encap->ipv6.src)) &&
4649                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4650                                             sizeof(encap->ipv6.dst))) {
4651                                 break;
4652                         }
4653                 }
4654         }
4655         if (rule) {
4656                 if (enable) {
4657                         rule->refcnt++;
4658                         return 0;
4659                 }
4660                 if (!rule->refcnt || !--rule->refcnt) {
4661                         LIST_REMOVE(rule, next);
4662                         return flow_tcf_rule_local(tcf, encap,
4663                                         iface->ifouter, false, error);
4664                 }
4665                 return 0;
4666         }
4667         if (!enable) {
4668                 DRV_LOG(WARNING, "disabling not existing local rule");
4669                 rte_flow_error_set(error, ENOENT,
4670                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4671                                    "disabling not existing local rule");
4672                 return -ENOENT;
4673         }
4674         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4675                                 alignof(struct tcf_local_rule));
4676         if (!rule) {
4677                 rte_flow_error_set(error, ENOMEM,
4678                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4679                                    "unable to allocate memory for local rule");
4680                 return -rte_errno;
4681         }
4682         *rule = (struct tcf_local_rule){.refcnt = 0,
4683                                         .mask = 0,
4684                                         };
4685         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4686                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4687                            | FLOW_TCF_ENCAP_IPV4_DST;
4688                 rule->ipv4.src = encap->ipv4.src;
4689                 rule->ipv4.dst = encap->ipv4.dst;
4690         } else {
4691                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4692                            | FLOW_TCF_ENCAP_IPV6_DST;
4693                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4694                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4695         }
4696         ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4697         if (ret) {
4698                 rte_free(rule);
4699                 return ret;
4700         }
4701         rule->refcnt++;
4702         LIST_INSERT_HEAD(&iface->local, rule, next);
4703         return 0;
4704 }
4705
4706 /**
4707  * Manage the destination MAC/IP addresses neigh database, kernel uses
4708  * this one to determine the destination MAC address within encapsulation
4709  * header. Adds or removes the entries using the Netlink command like this:
4710  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4711  *
4712  * @param[in] tcf
4713  *   Libmnl socket context object.
4714  * @param[in] iface
4715  *   Object, contains rule database and ifouter index.
4716  * @param[in] dev_flow
4717  *   Flow object, contains the tunnel parameters (for encap only).
4718  * @param[in] enable
4719  *   Toggle between add and remove.
4720  * @param[out] error
4721  *   Perform verbose error reporting if not NULL.
4722  *
4723  * @return
4724  *   0 on success, a negative errno value otherwise and rte_errno is set.
4725  */
4726 static int
4727 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4728                      struct tcf_irule *iface,
4729                      struct mlx5_flow *dev_flow,
4730                      bool enable,
4731                      struct rte_flow_error *error)
4732 {
4733         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4734         struct tcf_neigh_rule *rule = NULL;
4735         int ret;
4736
4737         assert(encap);
4738         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4739         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4740                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4741                 LIST_FOREACH(rule, &iface->neigh, next) {
4742                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4743                             encap->ipv4.dst == rule->ipv4.dst) {
4744                                 break;
4745                         }
4746                 }
4747         } else {
4748                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4749                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4750                 LIST_FOREACH(rule, &iface->neigh, next) {
4751                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4752                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4753                                                 sizeof(encap->ipv6.dst))) {
4754                                 break;
4755                         }
4756                 }
4757         }
4758         if (rule) {
4759                 if (memcmp(&encap->eth.dst, &rule->eth,
4760                            sizeof(encap->eth.dst))) {
4761                         DRV_LOG(WARNING, "Destination MAC differs"
4762                                          " in neigh rule");
4763                         rte_flow_error_set(error, EEXIST,
4764                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4765                                            NULL, "Different MAC address"
4766                                            " neigh rule for the same"
4767                                            " destination IP");
4768                                         return -EEXIST;
4769                 }
4770                 if (enable) {
4771                         rule->refcnt++;
4772                         return 0;
4773                 }
4774                 if (!rule->refcnt || !--rule->refcnt) {
4775                         LIST_REMOVE(rule, next);
4776                         return flow_tcf_rule_neigh(tcf, encap,
4777                                                    iface->ifouter,
4778                                                    false, error);
4779                 }
4780                 return 0;
4781         }
4782         if (!enable) {
4783                 DRV_LOG(WARNING, "Disabling not existing neigh rule");
4784                 rte_flow_error_set(error, ENOENT,
4785                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4786                                    "unable to allocate memory for neigh rule");
4787                 return -ENOENT;
4788         }
4789         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4790                                 alignof(struct tcf_neigh_rule));
4791         if (!rule) {
4792                 rte_flow_error_set(error, ENOMEM,
4793                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4794                                    "unable to allocate memory for neigh rule");
4795                 return -rte_errno;
4796         }
4797         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4798                                         .mask = 0,
4799                                         };
4800         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4801                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4802                 rule->ipv4.dst = encap->ipv4.dst;
4803         } else {
4804                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4805                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4806         }
4807         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4808         ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4809         if (ret) {
4810                 rte_free(rule);
4811                 return ret;
4812         }
4813         rule->refcnt++;
4814         LIST_INSERT_HEAD(&iface->neigh, rule, next);
4815         return 0;
4816 }
4817
/*
 * VXLAN encap rule containers, one entry per outer interface
 * (maintained by flow_tcf_encap_irule_acquire()/_release()).
 */
static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();

/* List of VTEP devices, shared between PMD port instances. */
static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
/* Held by flow_tcf_vtep_acquire() and flow_tcf_vtep_release(). */
static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4824
4825 /**
4826  * Acquire the VXLAN encap rules container for specified interface.
4827  * First looks for the container in the existing ones list, creates
4828  * and initializes the new container if existing not found.
4829  *
4830  * @param[in] tcf
4831  *   Context object initialized by mlx5_flow_tcf_context_create().
4832  * @param[in] ifouter
4833  *   Network interface index to create VXLAN encap rules on.
4834  * @param[out] error
4835  *   Perform verbose error reporting if not NULL.
4836  * @return
4837  *   Rule container pointer on success,
4838  *   NULL otherwise and rte_errno is set.
4839  */
4840 static struct tcf_irule*
4841 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4842                              unsigned int ifouter,
4843                              struct rte_flow_error *error)
4844 {
4845         struct tcf_irule *iface;
4846
4847         /* Look whether the container for encap rules is created. */
4848         assert(ifouter);
4849         LIST_FOREACH(iface, &iface_list_vxlan, next) {
4850                 if (iface->ifouter == ifouter)
4851                         break;
4852         }
4853         if (iface) {
4854                 /* Container already exists, just increment the reference. */
4855                 iface->refcnt++;
4856                 return iface;
4857         }
4858         /* Not found, we should create the new container. */
4859         iface = rte_zmalloc(__func__, sizeof(*iface),
4860                             alignof(struct tcf_irule));
4861         if (!iface) {
4862                 rte_flow_error_set(error, ENOMEM,
4863                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4864                                    "unable to allocate memory for container");
4865                 return NULL;
4866         }
4867         *iface = (struct tcf_irule){
4868                         .local = LIST_HEAD_INITIALIZER(),
4869                         .neigh = LIST_HEAD_INITIALIZER(),
4870                         .ifouter = ifouter,
4871                         .refcnt = 1,
4872         };
4873         /* Interface cleanup for new container created. */
4874         flow_tcf_encap_iface_cleanup(tcf, ifouter);
4875         flow_tcf_encap_local_cleanup(tcf, ifouter);
4876         flow_tcf_encap_neigh_cleanup(tcf, ifouter);
4877         LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
4878         return iface;
4879 }
4880
4881 /**
4882  * Releases VXLAN encap rules container by pointer. Decrements the
4883  * reference cointer and deletes the container if counter is zero.
4884  *
4885  * @param[in] irule
4886  *   VXLAN rule container pointer to release.
4887  */
4888 static void
4889 flow_tcf_encap_irule_release(struct tcf_irule *iface)
4890 {
4891         assert(iface->refcnt);
4892         if (--iface->refcnt == 0) {
4893                 /* Reference counter is zero, delete the container. */
4894                 assert(LIST_EMPTY(&iface->local));
4895                 assert(LIST_EMPTY(&iface->neigh));
4896                 LIST_REMOVE(iface, next);
4897                 rte_free(iface);
4898         }
4899 }
4900
4901 /**
4902  * Deletes VTEP network device.
4903  *
4904  * @param[in] tcf
4905  *   Context object initialized by mlx5_flow_tcf_context_create().
4906  * @param[in] vtep
4907  *   Object represinting the network device to delete. Memory
4908  *   allocated for this object is freed by routine.
4909  */
4910 static void
4911 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4912                      struct tcf_vtep *vtep)
4913 {
4914         struct nlmsghdr *nlh;
4915         struct ifinfomsg *ifm;
4916         alignas(struct nlmsghdr)
4917         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4918                     MNL_BUF_EXTRA_SPACE];
4919         int ret;
4920
4921         assert(!vtep->refcnt);
4922         /* Delete only ifaces those we actually created. */
4923         if (vtep->created && vtep->ifindex) {
4924                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4925                 nlh = mnl_nlmsg_put_header(buf);
4926                 nlh->nlmsg_type = RTM_DELLINK;
4927                 nlh->nlmsg_flags = NLM_F_REQUEST;
4928                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4929                 ifm->ifi_family = AF_UNSPEC;
4930                 ifm->ifi_index = vtep->ifindex;
4931                 assert(sizeof(buf) >= nlh->nlmsg_len);
4932                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4933                 if (ret)
4934                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4935                                          " encap/decap ifindex %u",
4936                                          ifm->ifi_index);
4937         }
4938         rte_free(vtep);
4939 }
4940
4941 /**
4942  * Creates VTEP network device.
4943  *
4944  * @param[in] tcf
4945  *   Context object initialized by mlx5_flow_tcf_context_create().
4946  * @param[in] port
4947  *   UDP port of created VTEP device.
4948  * @param[out] error
4949  *   Perform verbose error reporting if not NULL.
4950  *
4951  * @return
4952  * Pointer to created device structure on success,
4953  * NULL otherwise and rte_errno is set.
4954  */
4955 static struct tcf_vtep*
4956 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4957                      uint16_t port, struct rte_flow_error *error)
4958 {
4959         struct tcf_vtep *vtep;
4960         struct nlmsghdr *nlh;
4961         struct ifinfomsg *ifm;
4962         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4963         alignas(struct nlmsghdr)
4964         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4965                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4966                     SZ_NLATTR_NEST * 2 +
4967                     SZ_NLATTR_STRZ_OF("vxlan") +
4968                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4969                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4970                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4971                     MNL_BUF_EXTRA_SPACE];
4972         struct nlattr *na_info;
4973         struct nlattr *na_vxlan;
4974         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4975         int ret;
4976
4977         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4978         if (!vtep) {
4979                 rte_flow_error_set(error, ENOMEM,
4980                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4981                                    "unable to allocate memory for VTEP");
4982                 return NULL;
4983         }
4984         *vtep = (struct tcf_vtep){
4985                         .port = port,
4986         };
4987         memset(buf, 0, sizeof(buf));
4988         nlh = mnl_nlmsg_put_header(buf);
4989         nlh->nlmsg_type = RTM_NEWLINK;
4990         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE  | NLM_F_EXCL;
4991         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4992         ifm->ifi_family = AF_UNSPEC;
4993         ifm->ifi_type = 0;
4994         ifm->ifi_index = 0;
4995         ifm->ifi_flags = IFF_UP;
4996         ifm->ifi_change = 0xffffffff;
4997         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4998         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4999         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5000         assert(na_info);
5001         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5002         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5003         assert(na_vxlan);
5004 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5005         /*
5006          * RH 7.2 does not support metadata for tunnel device.
5007          * It does not matter because we are going to use the
5008          * hardware offload by mlx5 driver.
5009          */
5010         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5011 #endif
5012         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5013         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5014         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5015 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5016         /*
5017          *  We must specify VNI explicitly if metadata not supported.
5018          *  Note, VNI is transferred with native endianness format.
5019          */
5020         mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5021 #endif
5022         mnl_attr_nest_end(nlh, na_vxlan);
5023         mnl_attr_nest_end(nlh, na_info);
5024         assert(sizeof(buf) >= nlh->nlmsg_len);
5025         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5026         if (ret) {
5027                 DRV_LOG(WARNING,
5028                         "netlink: VTEP %s create failure (%d)",
5029                         name, rte_errno);
5030                 if (rte_errno != EEXIST)
5031                         /*
5032                          * Some unhandled error occurred or device is
5033                          * for encapsulation and cannot be shared.
5034                          */
5035                         goto error;
5036         } else {
5037                 /*
5038                  * Mark device we actually created.
5039                  * We should explicitly delete
5040                  * when we do not need it anymore.
5041                  */
5042                 vtep->created = 1;
5043         }
5044         /* Try to get ifindex of created of pre-existing device. */
5045         ret = if_nametoindex(name);
5046         if (!ret) {
5047                 DRV_LOG(WARNING,
5048                         "VTEP %s failed to get index (%d)", name, errno);
5049                 rte_flow_error_set
5050                         (error, -errno,
5051                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5052                          "netlink: failed to retrieve VTEP ifindex");
5053                 goto error;
5054         }
5055         vtep->ifindex = ret;
5056         memset(buf, 0, sizeof(buf));
5057         nlh = mnl_nlmsg_put_header(buf);
5058         nlh->nlmsg_type = RTM_NEWLINK;
5059         nlh->nlmsg_flags = NLM_F_REQUEST;
5060         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5061         ifm->ifi_family = AF_UNSPEC;
5062         ifm->ifi_type = 0;
5063         ifm->ifi_index = vtep->ifindex;
5064         ifm->ifi_flags = IFF_UP;
5065         ifm->ifi_change = IFF_UP;
5066         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5067         if (ret) {
5068                 rte_flow_error_set(error, -errno,
5069                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5070                                    "netlink: failed to set VTEP link up");
5071                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5072                         name, rte_errno);
5073                 goto clean;
5074         }
5075         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5076         if (ret) {
5077                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5078                 goto clean;
5079         }
5080         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5081         vtep->refcnt = 1;
5082         return vtep;
5083 clean:
5084         flow_tcf_vtep_delete(tcf, vtep);
5085         return NULL;
5086 error:
5087         rte_free(vtep);
5088         return NULL;
5089 }
5090
5091 /**
5092  * Acquire target interface index for VXLAN tunneling decapsulation.
5093  * In order to share the UDP port within the other interfaces the
5094  * VXLAN device created as not attached to any interface (if created).
5095  *
5096  * @param[in] tcf
5097  *   Context object initialized by mlx5_flow_tcf_context_create().
5098  * @param[in] dev_flow
5099  *   Flow tcf object with tunnel structure pointer set.
5100  * @param[out] error
5101  *   Perform verbose error reporting if not NULL.
5102  * @return
5103  *   Interface descriptor pointer on success,
5104  *   NULL otherwise and rte_errno is set.
5105  */
5106 static struct tcf_vtep*
5107 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5108                             struct mlx5_flow *dev_flow,
5109                             struct rte_flow_error *error)
5110 {
5111         struct tcf_vtep *vtep;
5112         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5113
5114         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5115                 if (vtep->port == port)
5116                         break;
5117         }
5118         if (vtep) {
5119                 /* Device exists, just increment the reference counter. */
5120                 vtep->refcnt++;
5121                 assert(vtep->ifindex);
5122                 return vtep;
5123         }
5124         /* No decapsulation device exists, try to create the new one. */
5125         vtep = flow_tcf_vtep_create(tcf, port, error);
5126         if (vtep)
5127                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5128         return vtep;
5129 }
5130
5131 /**
5132  * Aqcuire target interface index for VXLAN tunneling encapsulation.
5133  *
5134  * @param[in] tcf
5135  *   Context object initialized by mlx5_flow_tcf_context_create().
5136  * @param[in] ifouter
5137  *   Network interface index to attach VXLAN encap device to.
5138  * @param[in] dev_flow
5139  *   Flow tcf object with tunnel structure pointer set.
5140  * @param[out] error
5141  *   Perform verbose error reporting if not NULL.
5142  * @return
5143  *   Interface descriptor pointer on success,
5144  *   NULL otherwise and rte_errno is set.
5145  */
5146 static struct tcf_vtep*
5147 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5148                             unsigned int ifouter,
5149                             struct mlx5_flow *dev_flow,
5150                             struct rte_flow_error *error)
5151 {
5152         static uint16_t port;
5153         struct tcf_vtep *vtep;
5154         struct tcf_irule *iface;
5155         int ret;
5156
5157         assert(ifouter);
5158         /* Look whether the VTEP for specified port is created. */
5159         port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5160         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5161                 if (vtep->port == port)
5162                         break;
5163         }
5164         if (vtep) {
5165                 /* VTEP already exists, just increment the reference. */
5166                 vtep->refcnt++;
5167         } else {
5168                 /* Not found, we should create the new VTEP. */
5169                 vtep = flow_tcf_vtep_create(tcf, port, error);
5170                 if (!vtep)
5171                         return NULL;
5172                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5173         }
5174         assert(vtep->ifindex);
5175         iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5176         if (!iface) {
5177                 if (--vtep->refcnt == 0)
5178                         flow_tcf_vtep_delete(tcf, vtep);
5179                 return NULL;
5180         }
5181         dev_flow->tcf.vxlan_encap->iface = iface;
5182         /* Create local ipaddr with peer to specify the outer IPs. */
5183         ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5184         if (!ret) {
5185                 /* Create neigh rule to specify outer destination MAC. */
5186                 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5187                 if (ret)
5188                         flow_tcf_encap_local(tcf, iface,
5189                                              dev_flow, false, error);
5190         }
5191         if (ret) {
5192                 dev_flow->tcf.vxlan_encap->iface = NULL;
5193                 flow_tcf_encap_irule_release(iface);
5194                 if (--vtep->refcnt == 0)
5195                         flow_tcf_vtep_delete(tcf, vtep);
5196                 return NULL;
5197         }
5198         return vtep;
5199 }
5200
5201 /**
5202  * Acquires target interface index for tunneling of any type.
5203  * Creates the new VTEP if needed.
5204  *
5205  * @param[in] tcf
5206  *   Context object initialized by mlx5_flow_tcf_context_create().
5207  * @param[in] ifouter
5208  *   Network interface index to create VXLAN encap rules on.
5209  * @param[in] dev_flow
5210  *   Flow tcf object with tunnel structure pointer set.
5211  * @param[out] error
5212  *   Perform verbose error reporting if not NULL.
5213  * @return
5214  *   Interface descriptor pointer on success,
5215  *   NULL otherwise and rte_errno is set.
5216  */
5217 static struct tcf_vtep*
5218 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5219                       unsigned int ifouter,
5220                       struct mlx5_flow *dev_flow,
5221                       struct rte_flow_error *error)
5222 {
5223         struct tcf_vtep *vtep = NULL;
5224
5225         assert(dev_flow->tcf.tunnel);
5226         pthread_mutex_lock(&vtep_list_mutex);
5227         switch (dev_flow->tcf.tunnel->type) {
5228         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5229                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5230                                                   dev_flow, error);
5231                 break;
5232         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5233                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5234                 break;
5235         default:
5236                 rte_flow_error_set(error, ENOTSUP,
5237                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5238                                    "unsupported tunnel type");
5239                 break;
5240         }
5241         pthread_mutex_unlock(&vtep_list_mutex);
5242         return vtep;
5243 }
5244
5245 /**
5246  * Release tunneling interface by ifindex. Decrements reference
5247  * counter and actually removes the device if counter is zero.
5248  *
5249  * @param[in] tcf
5250  *   Context object initialized by mlx5_flow_tcf_context_create().
5251  * @param[in] vtep
5252  *   VTEP device descriptor structure.
5253  * @param[in] dev_flow
5254  *   Flow tcf object with tunnel structure pointer set.
5255  */
5256 static void
5257 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5258                       struct tcf_vtep *vtep,
5259                       struct mlx5_flow *dev_flow)
5260 {
5261         assert(dev_flow->tcf.tunnel);
5262         pthread_mutex_lock(&vtep_list_mutex);
5263         switch (dev_flow->tcf.tunnel->type) {
5264         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5265                 break;
5266         case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5267                 struct tcf_irule *iface;
5268
5269                 /* Remove the encap ancillary rules first. */
5270                 iface = dev_flow->tcf.vxlan_encap->iface;
5271                 assert(iface);
5272                 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5273                 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5274                 flow_tcf_encap_irule_release(iface);
5275                 dev_flow->tcf.vxlan_encap->iface = NULL;
5276                 break;
5277         }
5278         default:
5279                 assert(false);
5280                 DRV_LOG(WARNING, "Unsupported tunnel type");
5281                 break;
5282         }
5283         assert(vtep->refcnt);
5284         if (--vtep->refcnt == 0) {
5285                 LIST_REMOVE(vtep, next);
5286                 flow_tcf_vtep_delete(tcf, vtep);
5287         }
5288         pthread_mutex_unlock(&vtep_list_mutex);
5289 }
5290
/* Context of the flower rule flags query, see flow_tcf_collect_query_cb(). */
struct tcf_nlcb_query {
        uint32_t handle; /* Handle of the filter the reply must match. */
        uint32_t tc_flags; /* Value of the TCA_FLOWER_FLAGS attribute. */
        uint32_t flags_valid : 1; /* Set once tc_flags has been filled. */
};
5296
5297 /**
5298  * Collect queried rule attributes. This is callback routine called by
5299  * libmnl mnl_cb_run() in loop for every message in received packet.
5300  * Current implementation collects the flower flags only.
5301  *
5302  * @param[in] nlh
5303  *   Pointer to reply header.
5304  * @param[in, out] arg
5305  *   Context pointer for this callback.
5306  *
5307  * @return
5308  *   A positive, nonzero value on success (required by libmnl
5309  *   to continue messages processing).
5310  */
5311 static int
5312 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5313 {
5314         struct tcf_nlcb_query *query = arg;
5315         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5316         struct nlattr *na, *na_opt;
5317         bool flower = false;
5318
5319         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5320             tcm->tcm_handle != query->handle)
5321                 return 1;
5322         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5323                 switch (mnl_attr_get_type(na)) {
5324                 case TCA_KIND:
5325                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5326                                 /* Not flower filter, drop entire message. */
5327                                 return 1;
5328                         }
5329                         flower = true;
5330                         break;
5331                 case TCA_OPTIONS:
5332                         if (!flower) {
5333                                 /* Not flower options, drop entire message. */
5334                                 return 1;
5335                         }
5336                         /* Check nested flower options. */
5337                         mnl_attr_for_each_nested(na_opt, na) {
5338                                 switch (mnl_attr_get_type(na_opt)) {
5339                                 case TCA_FLOWER_FLAGS:
5340                                         query->flags_valid = 1;
5341                                         query->tc_flags =
5342                                                 mnl_attr_get_u32(na_opt);
5343                                         break;
5344                                 }
5345                         }
5346                         break;
5347                 }
5348         }
5349         return 1;
5350 }
5351
5352 /**
5353  * Query a TC flower rule flags via netlink.
5354  *
5355  * @param[in] tcf
5356  *   Context object initialized by mlx5_flow_tcf_context_create().
5357  * @param[in] dev_flow
5358  *   Pointer to the flow.
5359  * @param[out] pflags
5360  *   pointer to the data retrieved by the query.
5361  *
5362  * @return
5363  *   0 on success, a negative errno value otherwise.
5364  */
5365 static int
5366 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5367                      struct mlx5_flow *dev_flow,
5368                      uint32_t *pflags)
5369 {
5370         struct nlmsghdr *nlh;
5371         struct tcmsg *tcm;
5372         struct tcf_nlcb_query query = {
5373                 .handle = dev_flow->tcf.tcm->tcm_handle,
5374         };
5375
5376         nlh = mnl_nlmsg_put_header(tcf->buf);
5377         nlh->nlmsg_type = RTM_GETTFILTER;
5378         nlh->nlmsg_flags = NLM_F_REQUEST;
5379         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5380         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5381         /*
5382          * Ignore Netlink error for filter query operations.
5383          * The reply length is sent by kernel as errno.
5384          * Just check we got the flags option.
5385          */
5386         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5387         if (!query.flags_valid) {
5388                 *pflags = 0;
5389                 return -ENOENT;
5390         }
5391         *pflags = query.tc_flags;
5392         return 0;
5393 }
5394
/**
 * Query the specified rule and verify that its in_hw flag is set.
 *
 * @param[in] tcf
 *   Context object initialized by mlx5_flow_tcf_context_create().
 * @param[in] dev_flow
 *   Pointer to the flow to check.
 *
 * @return
 *   0 if the rule reports TCA_CLS_FLAGS_IN_HW, a negative errno value
 *   otherwise (query failure, or -ENOENT when the flag is not set).
 */
static int
flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
		    struct mlx5_flow *dev_flow)
{
	uint32_t tc_flags;
	int err = flow_tcf_query_flags(tcf, dev_flow, &tc_flags);

	if (err)
		return err;
	if (!(tc_flags & TCA_CLS_FLAGS_IN_HW))
		return -ENOENT;
	return 0;
}
5418
5419 /**
5420  * Remove flow from E-Switch by sending Netlink message.
5421  *
5422  * @param[in] dev
5423  *   Pointer to Ethernet device.
5424  * @param[in, out] flow
5425  *   Pointer to the sub flow.
5426  */
5427 static void
5428 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5429 {
5430         struct priv *priv = dev->data->dev_private;
5431         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5432         struct mlx5_flow *dev_flow;
5433         struct nlmsghdr *nlh;
5434
5435         if (!flow)
5436                 return;
5437         dev_flow = LIST_FIRST(&flow->dev_flows);
5438         if (!dev_flow)
5439                 return;
5440         /* E-Switch flow can't be expanded. */
5441         assert(!LIST_NEXT(dev_flow, next));
5442         if (dev_flow->tcf.applied) {
5443                 nlh = dev_flow->tcf.nlh;
5444                 nlh->nlmsg_type = RTM_DELTFILTER;
5445                 nlh->nlmsg_flags = NLM_F_REQUEST;
5446                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5447                 if (dev_flow->tcf.tunnel) {
5448                         assert(dev_flow->tcf.tunnel->vtep);
5449                         flow_tcf_vtep_release(ctx,
5450                                 dev_flow->tcf.tunnel->vtep,
5451                                 dev_flow);
5452                         dev_flow->tcf.tunnel->vtep = NULL;
5453                 }
5454                 dev_flow->tcf.applied = 0;
5455         }
5456 }
5457
5458 /**
5459  * Apply flow to E-Switch by sending Netlink message.
5460  *
5461  * @param[in] dev
5462  *   Pointer to Ethernet device.
5463  * @param[in, out] flow
5464  *   Pointer to the sub flow.
5465  * @param[out] error
5466  *   Pointer to the error structure.
5467  *
5468  * @return
5469  *   0 on success, a negative errno value otherwise and rte_errno is set.
5470  */
5471 static int
5472 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5473                struct rte_flow_error *error)
5474 {
5475         struct priv *priv = dev->data->dev_private;
5476         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5477         struct mlx5_flow *dev_flow;
5478         struct nlmsghdr *nlh;
5479
5480         dev_flow = LIST_FIRST(&flow->dev_flows);
5481         /* E-Switch flow can't be expanded. */
5482         assert(!LIST_NEXT(dev_flow, next));
5483         if (dev_flow->tcf.applied)
5484                 return 0;
5485         nlh = dev_flow->tcf.nlh;
5486         nlh->nlmsg_type = RTM_NEWTFILTER;
5487         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5488         if (dev_flow->tcf.tunnel) {
5489                 /*
5490                  * Replace the interface index, target for
5491                  * encapsulation, source for decapsulation.
5492                  */
5493                 assert(!dev_flow->tcf.tunnel->vtep);
5494                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5495                 /* Acquire actual VTEP device when rule is being applied. */
5496                 dev_flow->tcf.tunnel->vtep =
5497                         flow_tcf_vtep_acquire(ctx,
5498                                         dev_flow->tcf.tunnel->ifindex_org,
5499                                         dev_flow, error);
5500                 if (!dev_flow->tcf.tunnel->vtep)
5501                         return -rte_errno;
5502                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5503                                 dev_flow->tcf.tunnel->vtep->ifindex,
5504                                 dev_flow->tcf.tunnel->ifindex_org);
5505                 *dev_flow->tcf.tunnel->ifindex_ptr =
5506                         dev_flow->tcf.tunnel->vtep->ifindex;
5507         }
5508         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5509                 dev_flow->tcf.applied = 1;
5510                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5511                         return 0;
5512                 /*
5513                  * Rule was applied without skip_sw flag set.
5514                  * We should check whether the rule was acctually
5515                  * accepted by hardware (have look at in_hw flag).
5516                  */
5517                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5518                         flow_tcf_remove(dev, flow);
5519                         return rte_flow_error_set
5520                                 (error, ENOENT,
5521                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5522                                  "netlink: rule has no in_hw flag set");
5523                 }
5524                 return 0;
5525         }
5526         if (dev_flow->tcf.tunnel) {
5527                 /* Rollback the VTEP configuration if rule apply failed. */
5528                 assert(dev_flow->tcf.tunnel->vtep);
5529                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5530                                       dev_flow);
5531                 dev_flow->tcf.tunnel->vtep = NULL;
5532         }
5533         return rte_flow_error_set(error, rte_errno,
5534                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5535                                   "netlink: failed to create TC flow rule");
5536 }
5537
5538 /**
5539  * Remove flow from E-Switch and release resources of the device flow.
5540  *
5541  * @param[in] dev
5542  *   Pointer to Ethernet device.
5543  * @param[in, out] flow
5544  *   Pointer to the sub flow.
5545  */
5546 static void
5547 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5548 {
5549         struct mlx5_flow *dev_flow;
5550
5551         if (!flow)
5552                 return;
5553         flow_tcf_remove(dev, flow);
5554         if (flow->counter) {
5555                 if (--flow->counter->ref_cnt == 0) {
5556                         rte_free(flow->counter);
5557                         flow->counter = NULL;
5558                 }
5559         }
5560         dev_flow = LIST_FIRST(&flow->dev_flows);
5561         if (!dev_flow)
5562                 return;
5563         /* E-Switch flow can't be expanded. */
5564         assert(!LIST_NEXT(dev_flow, next));
5565         LIST_REMOVE(dev_flow, next);
5566         rte_free(dev_flow);
5567 }
5568
/**
 * Helper for sizing an attribute parse table.
 *
 * @param array
 *   Array of values to use.
 * @param idx
 *   Current location in array; a negative index means "no array entry".
 * @param value
 *   Value to compare with.
 *
 * @return
 *   @p value when @p idx is negative, otherwise the larger of @p value
 *   and the array entry at @p idx.
 */
static uint16_t
flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
{
	if (idx < 0)
		return value;
	return array[idx] > value ? array[idx] : value;
}
5587
/**
 * Parse rtnetlink message attributes, filling the attribute table with
 * pointers to the attributes found.
 *
 * The table is cleared first. For each attribute type only the first
 * occurrence is recorded; attributes whose type exceeds @p max are skipped.
 *
 * @param[out] tb
 *   Attribute table to be filled; must hold at least @p max + 1 entries.
 * @param max
 *   Maximum attribute type (highest valid index) of the attribute table.
 * @param rta
 *   The attributes section in the message to be parsed.
 * @param len
 *   The length of the attributes section in the message.
 */
static void
flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
			 struct rtattr *rta, int len)
{
	memset(tb, 0, sizeof(tb[0]) * (max + 1));
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		unsigned short type = rta->rta_type;

		if (type > max || tb[type])
			continue;
		tb[type] = rta;
	}
}
5614
/**
 * Extract flow counters from a flower action statistics attribute.
 *
 * @param rta
 *   Flower action stats properties in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   Receives the basic counters found in the message; at most
 *   sizeof(*data) bytes are copied.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
				       uint16_t rta_type[], int idx,
				       struct gnet_stats_basic *data)
{
	int tbl_max = flow_tcf_arr_val_max(rta_type, idx, TCA_STATS_BASIC);
	struct rtattr *tbl[tbl_max + 1];
	struct rtattr *basic;

	if (idx < 0 || rta == NULL)
		return -1;
	flow_tcf_nl_parse_rtattr(tbl, tbl_max,
				 RTA_DATA(rta), RTA_PAYLOAD(rta));
	/* Only the basic statistics object can be requested at this level. */
	if (rta_type[idx] != TCA_STATS_BASIC)
		return -1;
	basic = tbl[TCA_STATS_BASIC];
	if (!basic)
		return -1;
	memcpy(data, RTA_DATA(basic),
	       RTE_MIN(RTA_PAYLOAD(basic), sizeof(*data)));
	return 0;
}
5659
/**
 * Parse a single flower action, retrieving the requested action attribute
 * if found.
 *
 * @param arg
 *   Flower action properties in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   Count statistics retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
				     uint16_t rta_type[], int idx, void *data)
{
	int tbl_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
	struct rtattr *tbl[tbl_max + 1];

	if (idx < 0 || arg == NULL)
		return -1;
	flow_tcf_nl_parse_rtattr(tbl, tbl_max,
				 RTA_DATA(arg), RTA_PAYLOAD(arg));
	/* An action without a kind attribute is rejected. */
	if (!tbl[TCA_ACT_KIND])
		return -1;
	if (rta_type[idx] != TCA_ACT_STATS || !tbl[TCA_ACT_STATS])
		return -1;
	/* Descend one level: the next rta_type entry lives inside the stats. */
	return flow_tcf_nl_action_stats_parse_and_get(tbl[TCA_ACT_STATS],
						      rta_type, idx - 1,
						      data);
}
5703
/**
 * Parse the flower action section of the message, retrieving the requested
 * attribute from the first action that provides it.
 *
 * @param arg
 *   Flower actions section in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   Data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
				 uint16_t rta_type[], int idx, void *data)
{
	struct rtattr *acts[TCA_ACT_MAX_PRIO + 1];
	int prio;

	if (idx < 0 || arg == NULL)
		return -1;
	flow_tcf_nl_parse_rtattr(acts, TCA_ACT_MAX_PRIO,
				 RTA_DATA(arg), RTA_PAYLOAD(arg));
	if (rta_type[idx] != TCA_ACT_STATS)
		return -1;
	/*
	 * Flow counters are stored in the actions defined by the flow
	 * and not in the flow itself, so every action slot is searched.
	 *
	 * Note that the index is passed on unchanged, not decremented.
	 */
	for (prio = 0; prio <= TCA_ACT_MAX_PRIO; prio++) {
		if (!acts[prio])
			continue;
		if (flow_tcf_nl_parse_one_action_and_get(acts[prio], rta_type,
							 idx, data) == 0)
			return 0;
	}
	return -1;
}
5754
/**
 * Parse flower classifier options in the message, retrieving the requested
 * attribute if found.
 *
 * @param opt
 *   Flower options section in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   Data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
			       uint16_t rta_type[], int idx, void *data)
{
	int tbl_max = flow_tcf_arr_val_max(rta_type, idx, TCA_FLOWER_ACT);
	struct rtattr *tbl[tbl_max + 1];

	if (idx < 0 || !opt)
		return -1;
	flow_tcf_nl_parse_rtattr(tbl, tbl_max,
				 RTA_DATA(opt), RTA_PAYLOAD(opt));
	if (rta_type[idx] != TCA_FLOWER_ACT || !tbl[TCA_FLOWER_ACT])
		return -1;
	/* Descend into the actions section with the next rta_type entry. */
	return flow_tcf_nl_action_parse_and_get(tbl[TCA_FLOWER_ACT],
						rta_type, idx - 1, data);
}
5796
/**
 * Parse a Netlink reply to a filter query, retrieving the requested object
 * (the flow counters) from a flower filter message.
 *
 * @param cnlh
 *   Message received from Netlink.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   Data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
				 uint16_t rta_type[], int idx, void *data)
{
	struct tcmsg *tcm = NLMSG_DATA(cnlh);
	int attrs_len = cnlh->nlmsg_len;
	int tbl_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
	struct rtattr *tbl[tbl_max + 1];

	if (idx < 0)
		return -1;
	/* Only filter messages are of interest. */
	switch (cnlh->nlmsg_type) {
	case RTM_NEWTFILTER:
	case RTM_GETTFILTER:
	case RTM_DELTFILTER:
		break;
	default:
		return -1;
	}
	/* What remains after the Netlink and tcmsg headers are attributes. */
	attrs_len -= NLMSG_LENGTH(sizeof(*tcm));
	if (attrs_len < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(tbl, tbl_max, TCA_RTA(tcm), attrs_len);
	/* Not a TC flower flow - bail out */
	if (!tbl[TCA_KIND] ||
	    strcmp(RTA_DATA(tbl[TCA_KIND]), "flower") != 0)
		return -1;
	if (rta_type[idx] != TCA_OPTIONS || !tbl[TCA_OPTIONS])
		return -1;
	return flow_tcf_nl_opts_parse_and_get(tbl[TCA_OPTIONS],
					      rta_type, idx - 1, data);
}
5849
5850 /**
5851  * A callback to parse Netlink reply on TC flower query.
5852  *
5853  * @param nlh
5854  *   Message received from Netlink.
5855  * @param[out] data
5856  *   Pointer to data area to be filled by the parsing routine.
5857  *   assumed to be a pointer to struct flow_tcf_stats_basic.
5858  *
5859  * @return
5860  *   MNL_CB_OK value.
5861  */
5862 static int
5863 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5864 {
5865         /*
5866          * The backward sequence of rta_types to pass in order to get
5867          *  to the counters.
5868          */
5869         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5870                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5871         struct flow_tcf_stats_basic *sb_data = data;
5872         union {
5873                 const struct nlmsghdr *c;
5874                 struct nlmsghdr *nc;
5875         } tnlh = { .c = nlh };
5876
5877         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5878                                               RTE_DIM(rta_type) - 1,
5879                                               (void *)&sb_data->counters))
5880                 sb_data->valid = true;
5881         return MNL_CB_OK;
5882 }
5883
5884 /**
5885  * Query a TC flower rule for its statistics via netlink.
5886  *
5887  * @param[in] dev
5888  *   Pointer to Ethernet device.
5889  * @param[in] flow
5890  *   Pointer to the sub flow.
5891  * @param[out] data
5892  *   data retrieved by the query.
5893  * @param[out] error
5894  *   Perform verbose error reporting if not NULL.
5895  *
5896  * @return
5897  *   0 on success, a negative errno value otherwise and rte_errno is set.
5898  */
5899 static int
5900 flow_tcf_query_count(struct rte_eth_dev *dev,
5901                           struct rte_flow *flow,
5902                           void *data,
5903                           struct rte_flow_error *error)
5904 {
5905         struct flow_tcf_stats_basic sb_data;
5906         struct rte_flow_query_count *qc = data;
5907         struct priv *priv = dev->data->dev_private;
5908         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5909         struct mnl_socket *nl = ctx->nl;
5910         struct mlx5_flow *dev_flow;
5911         struct nlmsghdr *nlh;
5912         uint32_t seq = priv->tcf_context->seq++;
5913         ssize_t ret;
5914         assert(qc);
5915
5916         memset(&sb_data, 0, sizeof(sb_data));
5917         dev_flow = LIST_FIRST(&flow->dev_flows);
5918         /* E-Switch flow can't be expanded. */
5919         assert(!LIST_NEXT(dev_flow, next));
5920         if (!dev_flow->flow->counter)
5921                 goto notsup_exit;
5922         nlh = dev_flow->tcf.nlh;
5923         nlh->nlmsg_type = RTM_GETTFILTER;
5924         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5925         nlh->nlmsg_seq = seq;
5926         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5927                 goto error_exit;
5928         do {
5929                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5930                 if (ret <= 0)
5931                         break;
5932                 ret = mnl_cb_run(ctx->buf, ret, seq,
5933                                  mnl_socket_get_portid(nl),
5934                                  flow_tcf_nl_message_get_stats_basic,
5935                                  (void *)&sb_data);
5936         } while (ret > 0);
5937         /* Return the delta from last reset. */
5938         if (sb_data.valid) {
5939                 /* Return the delta from last reset. */
5940                 qc->hits_set = 1;
5941                 qc->bytes_set = 1;
5942                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5943                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5944                 if (qc->reset) {
5945                         flow->counter->hits = sb_data.counters.packets;
5946                         flow->counter->bytes = sb_data.counters.bytes;
5947                 }
5948                 return 0;
5949         }
5950         return rte_flow_error_set(error, EINVAL,
5951                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5952                                   NULL,
5953                                   "flow does not have counter");
5954 error_exit:
5955         return rte_flow_error_set
5956                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5957                          NULL, "netlink: failed to read flow rule counters");
5958 notsup_exit:
5959         return rte_flow_error_set
5960                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5961                          NULL, "counters are not available.");
5962 }
5963
5964 /**
5965  * Query a flow.
5966  *
5967  * @see rte_flow_query()
5968  * @see rte_flow_ops
5969  */
5970 static int
5971 flow_tcf_query(struct rte_eth_dev *dev,
5972                struct rte_flow *flow,
5973                const struct rte_flow_action *actions,
5974                void *data,
5975                struct rte_flow_error *error)
5976 {
5977         int ret = -EINVAL;
5978
5979         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5980                 switch (actions->type) {
5981                 case RTE_FLOW_ACTION_TYPE_VOID:
5982                         break;
5983                 case RTE_FLOW_ACTION_TYPE_COUNT:
5984                         ret = flow_tcf_query_count(dev, flow, data, error);
5985                         break;
5986                 default:
5987                         return rte_flow_error_set(error, ENOTSUP,
5988                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5989                                                   actions,
5990                                                   "action not supported");
5991                 }
5992         }
5993         return ret;
5994 }
5995
5996 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5997         .validate = flow_tcf_validate,
5998         .prepare = flow_tcf_prepare,
5999         .translate = flow_tcf_translate,
6000         .apply = flow_tcf_apply,
6001         .remove = flow_tcf_remove,
6002         .destroy = flow_tcf_destroy,
6003         .query = flow_tcf_query,
6004 };
6005
6006 /**
6007  * Create and configure a libmnl socket for Netlink flow rules.
6008  *
6009  * @return
6010  *   A valid libmnl socket object pointer on success, NULL otherwise and
6011  *   rte_errno is set.
6012  */
6013 static struct mnl_socket *
6014 flow_tcf_mnl_socket_create(void)
6015 {
6016         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6017
6018         if (nl) {
6019                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6020                                       sizeof(int));
6021                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6022                         return nl;
6023         }
6024         rte_errno = errno;
6025         if (nl)
6026                 mnl_socket_close(nl);
6027         return NULL;
6028 }
6029
/**
 * Destroy a libmnl socket.
 *
 * @param nl
 *   Libmnl socket of the @p NETLINK_ROUTE kind; NULL is accepted and
 *   ignored.
 */
static void
flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
{
	if (!nl)
		return;
	mnl_socket_close(nl);
}
6042
6043 /**
6044  * Initialize ingress qdisc of a given network interface.
6045  *
6046  * @param ctx
6047  *   Pointer to tc-flower context to use.
6048  * @param ifindex
6049  *   Index of network interface to initialize.
6050  * @param[out] error
6051  *   Perform verbose error reporting if not NULL.
6052  *
6053  * @return
6054  *   0 on success, a negative errno value otherwise and rte_errno is set.
6055  */
6056 int
6057 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6058                    unsigned int ifindex, struct rte_flow_error *error)
6059 {
6060         struct nlmsghdr *nlh;
6061         struct tcmsg *tcm;
6062         alignas(struct nlmsghdr)
6063         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6064                     SZ_NLATTR_STRZ_OF("ingress") +
6065                     MNL_BUF_EXTRA_SPACE];
6066
6067         /* Destroy existing ingress qdisc and everything attached to it. */
6068         nlh = mnl_nlmsg_put_header(buf);
6069         nlh->nlmsg_type = RTM_DELQDISC;
6070         nlh->nlmsg_flags = NLM_F_REQUEST;
6071         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6072         tcm->tcm_family = AF_UNSPEC;
6073         tcm->tcm_ifindex = ifindex;
6074         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6075         tcm->tcm_parent = TC_H_INGRESS;
6076         assert(sizeof(buf) >= nlh->nlmsg_len);
6077         /* Ignore errors when qdisc is already absent. */
6078         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6079             rte_errno != EINVAL && rte_errno != ENOENT)
6080                 return rte_flow_error_set(error, rte_errno,
6081                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6082                                           "netlink: failed to remove ingress"
6083                                           " qdisc");
6084         /* Create fresh ingress qdisc. */
6085         nlh = mnl_nlmsg_put_header(buf);
6086         nlh->nlmsg_type = RTM_NEWQDISC;
6087         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6088         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6089         tcm->tcm_family = AF_UNSPEC;
6090         tcm->tcm_ifindex = ifindex;
6091         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6092         tcm->tcm_parent = TC_H_INGRESS;
6093         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6094         assert(sizeof(buf) >= nlh->nlmsg_len);
6095         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6096                 return rte_flow_error_set(error, rte_errno,
6097                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6098                                           "netlink: failed to create ingress"
6099                                           " qdisc");
6100         return 0;
6101 }
6102
6103 /**
6104  * Create libmnl context for Netlink flow rules.
6105  *
6106  * @return
6107  *   A valid libmnl socket object pointer on success, NULL otherwise and
6108  *   rte_errno is set.
6109  */
6110 struct mlx5_flow_tcf_context *
6111 mlx5_flow_tcf_context_create(void)
6112 {
6113         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6114                                                         sizeof(*ctx),
6115                                                         sizeof(uint32_t));
6116         if (!ctx)
6117                 goto error;
6118         ctx->nl = flow_tcf_mnl_socket_create();
6119         if (!ctx->nl)
6120                 goto error;
6121         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6122         ctx->buf = rte_zmalloc(__func__,
6123                                ctx->buf_size, sizeof(uint32_t));
6124         if (!ctx->buf)
6125                 goto error;
6126         ctx->seq = random();
6127         return ctx;
6128 error:
6129         mlx5_flow_tcf_context_destroy(ctx);
6130         return NULL;
6131 }
6132
6133 /**
6134  * Destroy a libmnl context.
6135  *
6136  * @param ctx
6137  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6138  */
6139 void
6140 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6141 {
6142         if (!ctx)
6143                 return;
6144         flow_tcf_mnl_socket_destroy(ctx->nl);
6145         rte_free(ctx->buf);
6146         rte_free(ctx);
6147 }