net/mlx5: limit priority range for Linux TC flower driver
drivers/net/mlx5/mlx5_flow_tcf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* offset */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"
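
/*
 * For illustration: assuming the creation code appends the decimal UDP
 * port to this prefix, a VTEP bound to port 30000 would show up as a
 * netdev named "vmlx_30000".
 */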

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
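
/*
 * For illustration: a fully specified outer IPv4 VXLAN header would
 * typically carry FLOW_TCF_ENCAP_ETH_DST | FLOW_TCF_ENCAP_IPV4_SRC |
 * FLOW_TCF_ENCAP_IPV4_DST | FLOW_TCF_ENCAP_UDP_DST |
 * FLOW_TCF_ENCAP_VXLAN_VNI in @p mask.
 */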

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8KB) buffer size ensures that netlink messages will never be
 * truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * outer tunnel iface in order to provide destination MAC address
 * for the VXLAN encapsulation. The neigh rule is implicitly related
 * to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port;
        uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface. */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty;

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
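
/*
 * Worked example: sizeof(struct nlattr) is 4, so SZ_NLATTR_HDR is 4 and
 * SZ_NLATTR_TYPE_OF(uint16_t) is MNL_ALIGN(4 + 2) = 8, i.e. every
 * attribute payload is rounded up to a 4-byte boundary.
 */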

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
 * Priority in the rte_flow attribute starts from 0 and is incremented by 1
 * during translation. This is subject to change: once the restriction is
 * lifted or the range is extended, the maximum priority may be determined
 * by trial-and-error, as the Verbs driver does.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15
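
/*
 * Example of the resulting mapping: rte_flow priority 0 translates to
 * kernel TC priority 1, and rte_flow priority 15
 * (MLX5_TCF_GROUP_PRIORITY_MAX) translates to the kernel maximum of 16.
 */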

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
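
/*
 * Worked example: NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN) is 4 because a
 * 16-byte address needs four 4-byte keys, while
 * NUM_OF_PEDIT_KEYS(TP_PORT_LEN) is 1 because a 2-byte port still
 * occupies one key.
 */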

struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * An E-Switch counter cannot be shared and its ID is unknown,
         * so all counters are currently returned with ID 0. Switching
         * to unique numbers may be a better option in the future.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}
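
/*
 * A 6-byte MAC address does not fit in a single 4-byte pedit key, hence
 * the split above: the first key rewrites bytes 0-3 of the address (a
 * zero mask discards the old value entirely), while the second key
 * rewrites bytes 4-5 and its 0xFFFF0000 mask preserves the two adjacent
 * header bytes (assuming a little-endian host).
 */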

/**
 * Set pedit key of decrease/set ttl
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}
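
/*
 * Note on DEC_TTL above: it is expressed as a pedit ADD of 0xFF on the
 * TTL/hop-limit byte; one-byte arithmetic wraps modulo 256, so adding
 * 255 is equivalent to subtracting 1, and the 0xFFFFFF00 mask confines
 * the rewrite to that byte (assuming a little-endian host).
 */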

/**
 * Set pedit key of transport (TCP/UDP) port value
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* The offset of the src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of ipv6 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of ipv4 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's netlink attributes in a netlink message
 * on a pre-allocated message buffer
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification.
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify header actions should be in one tc-pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
        (*actions)--;
}
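
/*
 * The attribute layout produced above is roughly:
 *
 *   TCA_ACT_KIND ("pedit")
 *   TCA_ACT_OPTIONS (nest)
 *     TCA_PEDIT_PARMS_EX (tc_pedit_sel + nkeys * tc_pedit_key)
 *     TCA_PEDIT_KEYS_EX (nest)
 *       TCA_PEDIT_KEY_EX (nest) with HTYPE and CMD, one per key
 */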

/**
 * Calculate the maximum memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action)
 *
 * @param[in,out] actions
 *   actions specification.
 * @param[in,out] action_flags
 *   actions flags
 * @return
 *   Maximum memory size of one TC-pedit action
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        (*actions)--;
        return pedit_size;
}

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask buffers, in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}
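
/*
 * Typical usage when translating an IPv4 item (sketch):
 *
 *   mask.ipv4 = flow_tcf_item_mask
 *                (items, &rte_flow_item_ipv4_mask,
 *                 &flow_tcf_mask_supported.ipv4,
 *                 &flow_tcf_mask_empty.ipv4,
 *                 sizeof(flow_tcf_mask_supported.ipv4),
 *                 error);
 */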

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}
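
/*
 * For illustration, a two-port switch domain queried from port 1 could
 * fill the table as (ifindex values hypothetical):
 *
 *   ptoi[0] = { .port_id = 1, .ifindex = 7 }   <- caller's own device
 *   ptoi[1] = { .port_id = 0, .ifindex = 6 }
 *   ptoi[2] = { .port_id = 0, .ifindex = 0 }   <- terminator
 *
 * with a return value of 2 (the terminator is not counted).
 */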

/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * group is supported only if kernel supports chain. Don't care about
         * transfer as it is the caller's problem.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "egress is not supported");
        return 0;
}
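
/*
 * Example of attributes accepted by the check above (sketch): group up
 * to MLX5_TCF_GROUP_ID_MAX and priority up to
 * MLX5_TCF_GROUP_PRIORITY_MAX, e.g.:
 *
 *   struct rte_flow_attr attr = {
 *           .group = 2,
 *           .priority = 3,
 *           .ingress = 1,
 *   };
 */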

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for L2 addresses can be empty
                 * because these are optional and not required
                 * directly by the tc rule. The kernel tries to
                 * resolve them on its own.
                 */
                return 0;
        }
        if (!mask) {
                /* If mask is not specified use the default one. */
                mask = &rte_flow_item_eth_mask;
        }
        if (memcmp(&mask->dst,
                   &flow_tcf_mask_empty.eth.dst,
                   sizeof(flow_tcf_mask_empty.eth.dst))) {
                if (memcmp(&mask->dst,
                           &rte_flow_item_eth_mask.dst,
                           sizeof(rte_flow_item_eth_mask.dst)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.dst\" field");
        }
        if (memcmp(&mask->src,
                   &flow_tcf_mask_empty.eth.src,
                   sizeof(flow_tcf_mask_empty.eth.src))) {
                if (memcmp(&mask->src,
                           &rte_flow_item_eth_mask.src,
                           sizeof(rte_flow_item_eth_mask.src)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.src\" field");
        }
        if (mask->type != RTE_BE16(0x0000)) {
                if (mask->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.type\" field");
                DRV_LOG(WARNING,
                        "outer ethernet type field"
                        " cannot be forced for vxlan"
                        " encapsulation, parameter ignored");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv4 address"
                                          " specification for vxlan"
                                          " encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv4_mask;
        if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.dst_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address, so IP destination address must be
                 * specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.src_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv6 address"
                                          " specification for"
                                          " vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv6_mask;
        if (memcmp(&mask->hdr.dst_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.dst_addr,
                           &rte_flow_item_ipv6_mask.hdr.dst_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.dst_addr\" field"
                                         " for vxlan encapsulation");
                /* More IPv6 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address (neighbour or gateway), so IP destination
                 * address must be specified within the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv6 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (memcmp(&mask->hdr.src_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.src_addr,
                           &rte_flow_item_ipv6_mask.hdr.src_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.src_addr\" field"
                                         " for vxlan encapsulation");
                /* More L3 address validation can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer L3 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
 * The routine checks the UDP fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for UDP ports cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL UDP port specification"
                                          " for vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_udp_mask;
        if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
                if (mask->hdr.dst_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.dst_port\" field"
                                         " for vxlan encapsulation");
                if (!spec->hdr.dst_port)
                        return rte_flow_error_set
                                        (error, EINVAL,
                                         RTE_FLOW_ERROR_TYPE_ITEM, item,
                                         "outer UDP remote port cannot be"
                                         " 0 for vxlan encapsulation");
        } else {
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer UDP remote port"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_port != RTE_BE16(0x0000)) {
                if (mask->hdr.src_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.src_port\" field"
                                         " for vxlan encapsulation");
                DRV_LOG(WARNING,
                        "outer UDP source port cannot be"
                        " forced for vxlan encapsulation,"
                        " parameter ignored");
        }
        return 0;
}
1422
1423 /**
1424  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1425  * The routine checks the VNI field to be used in the encapsulation header.
1426  *
1427  * @param[in] item
1428  *   Pointer to the item structure.
1429  * @param[out] error
1430  *   Pointer to the error structure.
1431  *
1432  * @return
1433  *   0 on success, a negative errno value otherwise and rte_errno is set.
1434  **/
1435 static int
1436 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1437                                   struct rte_flow_error *error)
1438 {
1439         const struct rte_flow_item_vxlan *spec = item->spec;
1440         const struct rte_flow_item_vxlan *mask = item->mask;
1441
1442         if (!spec) {
1443                 /* Outer VNI is required by tunnel_key parameter. */
1444                 return rte_flow_error_set(error, EINVAL,
1445                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1446                                           "NULL VNI specification"
1447                                           " for vxlan encapsulation");
1448         }
1449         if (!mask)
1450                 mask = &rte_flow_item_vxlan_mask;
1451         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1452                 return rte_flow_error_set(error, EINVAL,
1453                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1454                                           "outer VNI must be specified "
1455                                           "for vxlan encapsulation");
1456         if (mask->vni[0] != 0xff ||
1457             mask->vni[1] != 0xff ||
1458             mask->vni[2] != 0xff)
1459                 return rte_flow_error_set(error, ENOTSUP,
1460                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1461                                           "no support for partial mask on"
1462                                           " \"vxlan.vni\" field");
1463
1464         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1465                 return rte_flow_error_set(error, EINVAL,
1466                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1467                                           "vxlan vni cannot be 0");
1468         return 0;
1469 }
1470
1471 /**
1472  * Validate VXLAN_ENCAP action item list for E-Switch.
1473  * The routine checks items to be used in the encapsulation header.
1474  *
1475  * @param[in] action
1476  *   Pointer to the VXLAN_ENCAP action structure.
1477  * @param[out] error
1478  *   Pointer to the error structure.
1479  *
1480  * @return
1481  *   0 on success, a negative errno value otherwise and rte_errno is set.
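      *
      * For example, a definition list of the form
      *   ETH / IPV4 (or IPV6) / UDP / VXLAN / END
      * with fully masked outer addresses, UDP destination port and VNI
      * is expected to pass these checks.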
1482  **/
1483 static int
1484 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1485                               struct rte_flow_error *error)
1486 {
1487         const struct rte_flow_item *items;
1488         int ret;
1489         uint32_t item_flags = 0;
1490
1491         if (!action->conf)
1492                 return rte_flow_error_set(error, EINVAL,
1493                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1494                                           "Missing vxlan tunnel"
1495                                           " action configuration");
1496         items = ((const struct rte_flow_action_vxlan_encap *)
1497                                         action->conf)->definition;
1498         if (!items)
1499                 return rte_flow_error_set(error, EINVAL,
1500                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1501                                           "Missing vxlan tunnel"
1502                                           " encapsulation parameters");
1503         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1504                 switch (items->type) {
1505                 case RTE_FLOW_ITEM_TYPE_VOID:
1506                         break;
1507                 case RTE_FLOW_ITEM_TYPE_ETH:
1508                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1509                                                           error);
1510                         if (ret < 0)
1511                                 return ret;
1512                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1513                         if (ret < 0)
1514                                 return ret;
1515                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1516                         break;
1518                 case RTE_FLOW_ITEM_TYPE_IPV4:
1519                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1520                                                            error);
1521                         if (ret < 0)
1522                                 return ret;
1523                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1524                         if (ret < 0)
1525                                 return ret;
1526                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1527                         break;
1528                 case RTE_FLOW_ITEM_TYPE_IPV6:
1529                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1530                                                            error);
1531                         if (ret < 0)
1532                                 return ret;
1533                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1534                         if (ret < 0)
1535                                 return ret;
1536                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1537                         break;
1538                 case RTE_FLOW_ITEM_TYPE_UDP:
1539                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1540                                                            0xFF, error);
1541                         if (ret < 0)
1542                                 return ret;
1543                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1544                         if (ret < 0)
1545                                 return ret;
1546                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1547                         break;
1548                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1549                         ret = mlx5_flow_validate_item_vxlan(items,
1550                                                             item_flags, error);
1551                         if (ret < 0)
1552                                 return ret;
1553                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1554                         if (ret < 0)
1555                                 return ret;
1556                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1557                         break;
1558                 default:
1559                         return rte_flow_error_set
1560                                         (error, ENOTSUP,
1561                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1562                                          "vxlan encap item not supported");
1563                 }
1564         }
1565         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1566                 return rte_flow_error_set(error, EINVAL,
1567                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1568                                           "no outer IP layer found"
1569                                           " for vxlan encapsulation");
1570         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1571                 return rte_flow_error_set(error, EINVAL,
1572                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1573                                           "no outer UDP layer found"
1574                                           " for vxlan encapsulation");
1575         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1576                 return rte_flow_error_set(error, EINVAL,
1577                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1578                                           "no VXLAN VNI found"
1579                                           " for vxlan encapsulation");
1580         return 0;
1581 }
1582
1583 /**
1584  * Validate RTE_FLOW_ITEM_TYPE_IPV4 item if VXLAN_DECAP action
1585  * is present in actions list.
1586  *
1587  * @param[in] ipv4
1588  *   Outer IPv4 address item (if any, NULL otherwise).
1589  * @param[out] error
1590  *   Pointer to the error structure.
1591  *
1592  * @return
1593  *   0 on success, a negative errno value otherwise and rte_errno is set.
1594  **/
1595 static int
1596 flow_tcf_validate_vxlan_decap_ipv4(const struct rte_flow_item *ipv4,
1597                                    struct rte_flow_error *error)
1598 {
1599         const struct rte_flow_item_ipv4 *spec = ipv4->spec;
1600         const struct rte_flow_item_ipv4 *mask = ipv4->mask;
1601
1602         if (!spec) {
1603                 /*
1604                  * Specification for IP addresses cannot be empty
1605                  * because it is required as decap parameter.
1606                  */
1607                 return rte_flow_error_set(error, EINVAL,
1608                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1609                                           "NULL outer ipv4 address"
1610                                           " specification for vxlan"
1611                                           " decapsulation");
1612         }
1613         if (!mask)
1614                 mask = &rte_flow_item_ipv4_mask;
1615         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1616                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1617                         return rte_flow_error_set
1618                                         (error, ENOTSUP,
1619                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1620                                          "no support for partial mask on"
1621                                          " \"ipv4.hdr.dst_addr\" field");
1622                 /* More IP address validations can be put here. */
1623         } else {
1624                 /*
1625                  * Kernel uses the destination IP address
1626                  * to determine the ingress network interface
1627                  * for traffic being decapsulated.
1628                  */
1629                 return rte_flow_error_set(error, EINVAL,
1630                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1631                                           "outer ipv4 destination address"
1632                                           " must be specified for"
1633                                           " vxlan decapsulation");
1634         }
1635         /* Source IP address is optional for decap. */
1636         if (mask->hdr.src_addr != RTE_BE32(0x00000000) &&
1637             mask->hdr.src_addr != RTE_BE32(0xffffffff))
1638                 return rte_flow_error_set(error, ENOTSUP,
1639                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1640                                           "no support for partial mask on"
1641                                           " \"ipv4.hdr.src_addr\" field");
1642         return 0;
1643 }
1644
1645 /**
1646  * Validate RTE_FLOW_ITEM_TYPE_IPV6 item if VXLAN_DECAP action
1647  * is present in actions list.
1648  *
1649  * @param[in] ipv6
1650  *   Outer IPv6 address item (if any, NULL otherwise).
1651  * @param[out] error
1652  *   Pointer to the error structure.
1653  *
1654  * @return
1655  *   0 on success, a negative errno value otherwise and rte_errno is set.
1656  **/
1657 static int
1658 flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6,
1659                                    struct rte_flow_error *error)
1660 {
1661         const struct rte_flow_item_ipv6 *spec = ipv6->spec;
1662         const struct rte_flow_item_ipv6 *mask = ipv6->mask;
1663
1664         if (!spec) {
1665                 /*
1666                  * Specification for IP addresses cannot be empty
1667                  * because it is required as decap parameter.
1668                  */
1669                 return rte_flow_error_set(error, EINVAL,
1670                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1671                                           "NULL outer ipv6 address"
1672                                           " specification for vxlan"
1673                                           " decapsulation");
1674         }
1675         if (!mask)
1676                 mask = &rte_flow_item_ipv6_mask;
1677         if (memcmp(&mask->hdr.dst_addr,
1678                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1679                    IPV6_ADDR_LEN)) {
1680                 if (memcmp(&mask->hdr.dst_addr,
1681                         &rte_flow_item_ipv6_mask.hdr.dst_addr,
1682                         IPV6_ADDR_LEN))
1683                         return rte_flow_error_set
1684                                         (error, ENOTSUP,
1685                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1686                                          "no support for partial mask on"
1687                                          " \"ipv6.hdr.dst_addr\" field");
1688                 /* More IP address validations can be put here. */
1689         } else {
1690                 /*
1691                  * Kernel uses the destination IP address
1692                  * to determine the ingress network interface
1693                  * for traffic being decapsulated.
1694                  */
1695                 return rte_flow_error_set(error, EINVAL,
1696                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1697                                           "outer ipv6 destination address must be "
1698                                           "specified for vxlan decapsulation");
1699         }
1700         /* Source IP address is optional for decap. */
1701         if (memcmp(&mask->hdr.src_addr,
1702                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1703                    IPV6_ADDR_LEN)) {
1704                 if (memcmp(&mask->hdr.src_addr,
1705                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1706                            IPV6_ADDR_LEN))
1707                         return rte_flow_error_set
1708                                         (error, ENOTSUP,
1709                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1710                                          "no support for partial mask on"
1711                                          " \"ipv6.hdr.src_addr\" field");
1712         }
1713         return 0;
1714 }
1715
1716 /**
1717  * Validate RTE_FLOW_ITEM_TYPE_UDP item if VXLAN_DECAP action
1718  * is present in actions list.
1719  *
1720  * @param[in] udp
1721  *   Outer UDP layer item (if any, NULL otherwise).
1722  * @param[out] error
1723  *   Pointer to the error structure.
1724  *
1725  * @return
1726  *   0 on success, a negative errno value otherwise and rte_errno is set.
1727  **/
1728 static int
1729 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1730                                   struct rte_flow_error *error)
1731 {
1732         const struct rte_flow_item_udp *spec = udp->spec;
1733         const struct rte_flow_item_udp *mask = udp->mask;
1734
1735         if (!spec)
1736                 /*
1737                  * Specification for UDP ports cannot be empty
1738                  * because it is required as decap parameter.
1739                  */
1740                 return rte_flow_error_set(error, EINVAL,
1741                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1742                                           "NULL UDP port specification"
1743                                           " for VXLAN decapsulation");
1744         if (!mask)
1745                 mask = &rte_flow_item_udp_mask;
1746         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1747                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1748                         return rte_flow_error_set
1749                                         (error, ENOTSUP,
1750                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1751                                          "no support for partial mask on"
1752                                          " \"udp.hdr.dst_port\" field");
1753                 if (!spec->hdr.dst_port)
1754                         return rte_flow_error_set
1755                                         (error, EINVAL,
1756                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1757                                          "zero decap local UDP port");
1758         } else {
1759                 return rte_flow_error_set(error, EINVAL,
1760                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1761                                           "outer UDP destination port must be "
1762                                           "specified for vxlan decapsulation");
1763         }
1764         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1765                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1766                         return rte_flow_error_set
1767                                         (error, ENOTSUP,
1768                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1769                                          "no support for partial mask on"
1770                                          " \"udp.hdr.src_port\" field");
1771                 DRV_LOG(WARNING,
1772                         "outer UDP local port cannot be "
1773                         "forced for VXLAN decapsulation, "
1774                         "parameter ignored");
1775         }
1776         return 0;
1777 }
1778
1779 /**
1780  * Validate flow for E-Switch.
1781  *
1782  * @param[in] dev
1783  *   Pointer to the Ethernet device structure.
1784  * @param[in] attr
1785  *   Pointer to the flow attributes.
1786  * @param[in] items
1787  *   Pointer to the list of items.
1788  * @param[in] actions
1789  *   Pointer to the list of actions.
1790  * @param[out] error
1791  *   Pointer to the error structure.
1792  *
1793  * @return
1794  *   0 on success, a negative errno value otherwise and rte_errno is set.
1795  */
1796 static int
1797 flow_tcf_validate(struct rte_eth_dev *dev,
1798                   const struct rte_flow_attr *attr,
1799                   const struct rte_flow_item items[],
1800                   const struct rte_flow_action actions[],
1801                   struct rte_flow_error *error)
1802 {
1803         union {
1804                 const struct rte_flow_item_port_id *port_id;
1805                 const struct rte_flow_item_eth *eth;
1806                 const struct rte_flow_item_vlan *vlan;
1807                 const struct rte_flow_item_ipv4 *ipv4;
1808                 const struct rte_flow_item_ipv6 *ipv6;
1809                 const struct rte_flow_item_tcp *tcp;
1810                 const struct rte_flow_item_udp *udp;
1811                 const struct rte_flow_item_vxlan *vxlan;
1812         } spec, mask;
1813         union {
1814                 const struct rte_flow_action_port_id *port_id;
1815                 const struct rte_flow_action_jump *jump;
1816                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1817                 const struct rte_flow_action_of_set_vlan_vid *
1818                         of_set_vlan_vid;
1819                 const struct rte_flow_action_of_set_vlan_pcp *
1820                         of_set_vlan_pcp;
1821                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1822                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1823                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1824         } conf;
1825         uint64_t item_flags = 0;
1826         uint64_t action_flags = 0;
1827         uint8_t next_protocol = -1;
1828         unsigned int tcm_ifindex = 0;
1829         uint8_t pedit_validated = 0;
1830         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1831         struct rte_eth_dev *port_id_dev = NULL;
1832         bool in_port_id_set = false;
1833         int ret;
1834
1835         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1836                                                 PTOI_TABLE_SZ_MAX(dev)));
1837         ret = flow_tcf_validate_attributes(attr, error);
1838         if (ret < 0)
1839                 return ret;
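             /*
              * Actions are scanned first to accumulate action_flags, the
              * item loop below cross-checks the pattern against these
              * flags (e.g. VXLAN decap item requirements).
              */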
1840         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1841                 unsigned int i;
1842                 uint64_t current_action_flag = 0;
1843
1844                 switch (actions->type) {
1845                 case RTE_FLOW_ACTION_TYPE_VOID:
1846                         break;
1847                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1848                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1849                         if (!actions->conf)
1850                                 break;
1851                         conf.port_id = actions->conf;
1852                         if (conf.port_id->original)
1853                                 i = 0;
1854                         else
1855                                 for (i = 0; ptoi[i].ifindex; ++i)
1856                                         if (ptoi[i].port_id == conf.port_id->id)
1857                                                 break;
1858                         if (!ptoi[i].ifindex)
1859                                 return rte_flow_error_set
1860                                         (error, ENODEV,
1861                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1862                                          conf.port_id,
1863                                          "missing data to convert port ID to"
1864                                          " ifindex");
1865                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1866                         break;
1867                 case RTE_FLOW_ACTION_TYPE_JUMP:
1868                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1869                         if (!actions->conf)
1870                                 break;
1871                         conf.jump = actions->conf;
1872                         if (attr->group >= conf.jump->group)
1873                                 return rte_flow_error_set
1874                                         (error, ENOTSUP,
1875                                          RTE_FLOW_ERROR_TYPE_ACTION,
1876                                          actions,
1877                                          "can only jump to a higher group");
1878                         break;
1879                 case RTE_FLOW_ACTION_TYPE_DROP:
1880                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1881                         break;
1882                 case RTE_FLOW_ACTION_TYPE_COUNT:
1883                         break;
1884                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1885                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1886                         break;
1887                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1888                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1889                         break;
1890                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1891                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1892                                 return rte_flow_error_set
1893                                         (error, ENOTSUP,
1894                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1895                                          "vlan modify is not supported,"
1896                                          " set action must follow push action");
1897                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1898                         break;
1899                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1900                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1901                                 return rte_flow_error_set
1902                                         (error, ENOTSUP,
1903                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1904                                          "vlan modify is not supported,"
1905                                          " set action must follow push action");
1906                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1907                         break;
1908                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1909                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1910                         break;
1911                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1912                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1913                         if (ret < 0)
1914                                 return ret;
1915                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1916                         break;
1917                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1918                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1919                         break;
1920                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1921                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1922                         break;
1923                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1924                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1925                         break;
1926                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1927                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1928                         break;
1929                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1930                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1931                         break;
1932                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1933                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1934                         break;
1935                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1936                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1937                         break;
1938                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1939                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1940                         break;
1941                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1942                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1943                         break;
1944                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1945                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1946                         break;
1947                 default:
1948                         return rte_flow_error_set(error, ENOTSUP,
1949                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1950                                                   actions,
1951                                                   "action not supported");
1952                 }
1953                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1954                         if (!actions->conf)
1955                                 return rte_flow_error_set
1956                                         (error, EINVAL,
1957                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1958                                          actions,
1959                                          "action configuration not set");
1960                 }
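                     /*
                      * Pedit (set_*) actions must form one contiguous group:
                      * pedit_validated is raised when a non-pedit action
                      * follows pedit ones, banning any later pedit action.
                      */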
1961                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1962                     pedit_validated)
1963                         return rte_flow_error_set(error, ENOTSUP,
1964                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1965                                                   actions,
1966                                                   "set actions should be "
1967                                                   "listed successively");
1968                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1969                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1970                         pedit_validated = 1;
1971                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1972                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1973                         return rte_flow_error_set(error, EINVAL,
1974                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1975                                                   actions,
1976                                                   "can't have multiple fate"
1977                                                   " actions");
1978                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1979                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1980                         return rte_flow_error_set(error, EINVAL,
1981                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1982                                                   actions,
1983                                                   "can't have multiple vxlan"
1984                                                   " actions");
1985                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1986                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1987                         return rte_flow_error_set(error, ENOTSUP,
1988                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1989                                                   actions,
1990                                                   "can't have vxlan and vlan"
1991                                                   " actions in the same rule");
1992                 action_flags |= current_action_flag;
1993         }
1994         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1995                 unsigned int i;
1996
1997                 if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1998                     items->type != RTE_FLOW_ITEM_TYPE_ETH)
1999                         return rte_flow_error_set(error, ENOTSUP,
2000                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2001                                                   items,
2002                                                   "only L2 inner item"
2003                                                   " is supported");
2004                 switch (items->type) {
2005                 case RTE_FLOW_ITEM_TYPE_VOID:
2006                         break;
2007                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2008                         mask.port_id = flow_tcf_item_mask
2009                                 (items, &rte_flow_item_port_id_mask,
2010                                  &flow_tcf_mask_supported.port_id,
2011                                  &flow_tcf_mask_empty.port_id,
2012                                  sizeof(flow_tcf_mask_supported.port_id),
2013                                  error);
2014                         if (!mask.port_id)
2015                                 return -rte_errno;
2016                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
2017                                 in_port_id_set = 1;
2018                                 break;
2019                         }
2020                         spec.port_id = items->spec;
2021                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
2022                                 return rte_flow_error_set
2023                                         (error, ENOTSUP,
2024                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2025                                          mask.port_id,
2026                                          "no support for partial mask on"
2027                                          " \"id\" field");
2028                         if (!mask.port_id->id)
2029                                 i = 0;
2030                         else
2031                                 for (i = 0; ptoi[i].ifindex; ++i)
2032                                         if (ptoi[i].port_id == spec.port_id->id)
2033                                                 break;
2034                         if (!ptoi[i].ifindex)
2035                                 return rte_flow_error_set
2036                                         (error, ENODEV,
2037                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2038                                          spec.port_id,
2039                                          "missing data to convert port ID to"
2040                                          " ifindex");
2041                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2042                                 return rte_flow_error_set
2043                                         (error, ENOTSUP,
2044                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2045                                          spec.port_id,
2046                                          "cannot match traffic for"
2047                                          " several port IDs through"
2048                                          " a single flow rule");
2049                         tcm_ifindex = ptoi[i].ifindex;
2050                         in_port_id_set = 1;
2051                         break;
2052                 case RTE_FLOW_ITEM_TYPE_ETH:
2053                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2054                                                           error);
2055                         if (ret < 0)
2056                                 return ret;
2057                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2058                                         MLX5_FLOW_LAYER_INNER_L2 :
2059                                         MLX5_FLOW_LAYER_OUTER_L2;
2060                         /* TODO:
2061                          * Redundant check due to different supported mask.
2062                          * Same for the rest of items.
2063                          */
2064                         mask.eth = flow_tcf_item_mask
2065                                 (items, &rte_flow_item_eth_mask,
2066                                  &flow_tcf_mask_supported.eth,
2067                                  &flow_tcf_mask_empty.eth,
2068                                  sizeof(flow_tcf_mask_supported.eth),
2069                                  error);
2070                         if (!mask.eth)
2071                                 return -rte_errno;
2072                         if (mask.eth->type && mask.eth->type !=
2073                             RTE_BE16(0xffff))
2074                                 return rte_flow_error_set
2075                                         (error, ENOTSUP,
2076                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2077                                          mask.eth,
2078                                          "no support for partial mask on"
2079                                          " \"type\" field");
2080                         break;
2081                 case RTE_FLOW_ITEM_TYPE_VLAN:
2082                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2083                                                            error);
2084                         if (ret < 0)
2085                                 return ret;
2086                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2087                         mask.vlan = flow_tcf_item_mask
2088                                 (items, &rte_flow_item_vlan_mask,
2089                                  &flow_tcf_mask_supported.vlan,
2090                                  &flow_tcf_mask_empty.vlan,
2091                                  sizeof(flow_tcf_mask_supported.vlan),
2092                                  error);
2093                         if (!mask.vlan)
2094                                 return -rte_errno;
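                             /*
                              * TCI sub-fields must be matched as a whole: the
                              * PCP bits (0xe000) and VID bits (0x0fff) each
                              * allow only an all-or-nothing mask, likewise
                              * "inner_type".
                              */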
2095                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2096                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2097                               RTE_BE16(0xe000)) ||
2098                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2099                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2100                               RTE_BE16(0x0fff)) ||
2101                             (mask.vlan->inner_type &&
2102                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2103                                 return rte_flow_error_set
2104                                         (error, ENOTSUP,
2105                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2106                                          mask.vlan,
2107                                          "no support for partial masks on"
2108                                          " \"tci\" (PCP and VID parts) and"
2109                                          " \"inner_type\" fields");
2110                         break;
2111                 case RTE_FLOW_ITEM_TYPE_IPV4:
2112                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2113                                                            error);
2114                         if (ret < 0)
2115                                 return ret;
2116                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2117                         mask.ipv4 = flow_tcf_item_mask
2118                                 (items, &rte_flow_item_ipv4_mask,
2119                                  &flow_tcf_mask_supported.ipv4,
2120                                  &flow_tcf_mask_empty.ipv4,
2121                                  sizeof(flow_tcf_mask_supported.ipv4),
2122                                  error);
2123                         if (!mask.ipv4)
2124                                 return -rte_errno;
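                             /*
                              * A fully masked "next_proto_id" supplies the
                              * protocol for validation of the following L4
                              * (UDP/TCP) item, if any.
                              */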
2125                         if (mask.ipv4->hdr.next_proto_id &&
2126                             mask.ipv4->hdr.next_proto_id != 0xff)
2127                                 return rte_flow_error_set
2128                                         (error, ENOTSUP,
2129                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2130                                          mask.ipv4,
2131                                          "no support for partial mask on"
2132                                          " \"hdr.next_proto_id\" field");
2133                         else if (mask.ipv4->hdr.next_proto_id)
2134                                 next_protocol =
2135                                         ((const struct rte_flow_item_ipv4 *)
2136                                          (items->spec))->hdr.next_proto_id;
2137                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2138                                 ret = flow_tcf_validate_vxlan_decap_ipv4
2139                                                                 (items, error);
2140                                 if (ret < 0)
2141                                         return ret;
2142                         }
2143                         break;
2144                 case RTE_FLOW_ITEM_TYPE_IPV6:
2145                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2146                                                            error);
2147                         if (ret < 0)
2148                                 return ret;
2149                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2150                         mask.ipv6 = flow_tcf_item_mask
2151                                 (items, &rte_flow_item_ipv6_mask,
2152                                  &flow_tcf_mask_supported.ipv6,
2153                                  &flow_tcf_mask_empty.ipv6,
2154                                  sizeof(flow_tcf_mask_supported.ipv6),
2155                                  error);
2156                         if (!mask.ipv6)
2157                                 return -rte_errno;
2158                         if (mask.ipv6->hdr.proto &&
2159                             mask.ipv6->hdr.proto != 0xff)
2160                                 return rte_flow_error_set
2161                                         (error, ENOTSUP,
2162                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2163                                          mask.ipv6,
2164                                          "no support for partial mask on"
2165                                          " \"hdr.proto\" field");
2166                         else if (mask.ipv6->hdr.proto)
2167                                 next_protocol =
2168                                         ((const struct rte_flow_item_ipv6 *)
2169                                          (items->spec))->hdr.proto;
2170                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2171                                 ret = flow_tcf_validate_vxlan_decap_ipv6
2172                                                                 (items, error);
2173                                 if (ret < 0)
2174                                         return ret;
2175                         }
2176                         break;
2177                 case RTE_FLOW_ITEM_TYPE_UDP:
2178                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2179                                                           next_protocol, error);
2180                         if (ret < 0)
2181                                 return ret;
2182                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2183                         mask.udp = flow_tcf_item_mask
2184                                 (items, &rte_flow_item_udp_mask,
2185                                  &flow_tcf_mask_supported.udp,
2186                                  &flow_tcf_mask_empty.udp,
2187                                  sizeof(flow_tcf_mask_supported.udp),
2188                                  error);
2189                         if (!mask.udp)
2190                                 return -rte_errno;
2191                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2192                                 ret = flow_tcf_validate_vxlan_decap_udp
2193                                                                 (items, error);
2194                                 if (ret < 0)
2195                                         return ret;
2196                         }
2197                         break;
2198                 case RTE_FLOW_ITEM_TYPE_TCP:
2199                         ret = mlx5_flow_validate_item_tcp
2200                                              (items, item_flags,
2201                                               next_protocol,
2202                                               &flow_tcf_mask_supported.tcp,
2203                                               error);
2204                         if (ret < 0)
2205                                 return ret;
2206                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2207                         mask.tcp = flow_tcf_item_mask
2208                                 (items, &rte_flow_item_tcp_mask,
2209                                  &flow_tcf_mask_supported.tcp,
2210                                  &flow_tcf_mask_empty.tcp,
2211                                  sizeof(flow_tcf_mask_supported.tcp),
2212                                  error);
2213                         if (!mask.tcp)
2214                                 return -rte_errno;
2215                         break;
2216                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2217                         if (!(action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP))
2218                                 return rte_flow_error_set
2219                                         (error, ENOTSUP,
2220                                          RTE_FLOW_ERROR_TYPE_ITEM,
2221                                          items,
2222                                          "vni pattern should be followed by"
2223                                          " vxlan decapsulation action");
2224                         ret = mlx5_flow_validate_item_vxlan(items,
2225                                                             item_flags, error);
2226                         if (ret < 0)
2227                                 return ret;
2228                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2229                         mask.vxlan = flow_tcf_item_mask
2230                                 (items, &rte_flow_item_vxlan_mask,
2231                                  &flow_tcf_mask_supported.vxlan,
2232                                  &flow_tcf_mask_empty.vxlan,
2233                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2234                         if (!mask.vxlan)
2235                                 return -rte_errno;
2236                         if (mask.vxlan->vni[0] != 0xff ||
2237                             mask.vxlan->vni[1] != 0xff ||
2238                             mask.vxlan->vni[2] != 0xff)
2239                                 return rte_flow_error_set
2240                                         (error, ENOTSUP,
2241                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2242                                          mask.vxlan,
2243                                          "no support for partial or "
2244                                          "empty mask on \"vxlan.vni\" field");
2245                         break;
2246                 default:
2247                         return rte_flow_error_set(error, ENOTSUP,
2248                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2249                                                   items, "item not supported");
2250                 }
2251         }
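             /* Cross-check the collected action flags against item flags. */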
2252         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2253             (action_flags & MLX5_FLOW_ACTION_DROP))
2254                 return rte_flow_error_set(error, ENOTSUP,
2255                                           RTE_FLOW_ERROR_TYPE_ACTION,
2256                                           actions,
2257                                           "set action is not compatible with "
2258                                           "drop action");
2259         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2260             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2261                 return rte_flow_error_set(error, ENOTSUP,
2262                                           RTE_FLOW_ERROR_TYPE_ACTION,
2263                                           actions,
2264                                           "set action must be followed by "
2265                                           "port_id action");
2266         if (action_flags &
2267            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2268                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2269                         return rte_flow_error_set(error, EINVAL,
2270                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2271                                                   actions,
2272                                                   "no ipv4 item found in"
2273                                                   " pattern");
2274         }
2275         if (action_flags &
2276            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2277                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2278                         return rte_flow_error_set(error, EINVAL,
2279                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2280                                                   actions,
2281                                                   "no ipv6 item found in"
2282                                                   " pattern");
2283         }
2284         if (action_flags &
2285            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2286                 if (!(item_flags &
2287                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2288                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2289                         return rte_flow_error_set(error, EINVAL,
2290                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2291                                                   actions,
2292                                                   "no TCP/UDP item found in"
2293                                                   " pattern");
2294         }
2295         /*
2296          * FW syndrome (0xA9C090):
2297          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2298          *     forward to the uplink.
2299          */
2300         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2301             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2302             ((struct priv *)port_id_dev->data->dev_private)->representor)
2303                 return rte_flow_error_set(error, ENOTSUP,
2304                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2305                                           "vlan push can only be applied"
2306                                           " when forwarding to uplink port");
2307         /*
2308          * FW syndrome (0x294609):
2309          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2310          *     are supported only while forwarding to vport.
2311          */
2312         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2313             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2314                 return rte_flow_error_set(error, ENOTSUP,
2315                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2316                                           "vlan actions are supported"
2317                                           " only with port_id action");
2318         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2319             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2320                 return rte_flow_error_set(error, ENOTSUP,
2321                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2322                                           "vxlan actions are supported"
2323                                           " only with port_id action");
2324         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2325                 return rte_flow_error_set(error, EINVAL,
2326                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2327                                           "no fate action is found");
2328         if (action_flags &
2329            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2330                 if (!(item_flags &
2331                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2332                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2333                         return rte_flow_error_set(error, EINVAL,
2334                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2335                                                   actions,
2336                                                   "no IP found in pattern");
2337         }
2338         if (action_flags &
2339             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2340                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2341                         return rte_flow_error_set(error, ENOTSUP,
2342                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2343                                                   actions,
2344                                                   "no ethernet found in"
2345                                                   " pattern");
2346         }
2347         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2348                 if (!(item_flags &
2349                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2350                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2351                         return rte_flow_error_set(error, EINVAL,
2352                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2353                                                   NULL,
2354                                                   "no outer IP pattern found"
2355                                                   " for vxlan decap action");
2356                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2357                         return rte_flow_error_set(error, EINVAL,
2358                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2359                                                   NULL,
2360                                                   "no outer UDP pattern found"
2361                                                   " for vxlan decap action");
2362                 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
2363                         return rte_flow_error_set(error, EINVAL,
2364                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2365                                                   NULL,
2366                                                   "no VNI pattern found"
2367                                                   " for vxlan decap action");
2368         }
2369         return 0;
2370 }
2371
2372 /**
2373  * Calculate maximum size of memory for flow items of Linux TC flower and
2374  * extract specified items.
2375  *
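      * @param[in] attr
      *   Pointer to the flow attributes.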
2376  * @param[in] items
2377  *   Pointer to the list of items.
2378  * @param[out] item_flags
2379  *   Pointer to the detected items.
2380  *
2381  * @return
2382  *   Maximum size of memory for items.
2383  */
2384 static int
2385 flow_tcf_get_items_and_size(const struct rte_flow_attr *attr,
2386                             const struct rte_flow_item items[],
2387                             uint64_t *item_flags)
2388 {
2389         int size = 0;
2390         uint64_t flags = 0;
2391
2392         size += SZ_NLATTR_STRZ_OF("flower") +
2393                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2394                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2395         if (attr->group > 0)
2396                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
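             /*
              * Accumulate a worst-case estimate of the Netlink attribute
              * space needed by each item in the flower message.
              */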
2397         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2398                 switch (items->type) {
2399                 case RTE_FLOW_ITEM_TYPE_VOID:
2400                         break;
2401                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2402                         break;
2403                 case RTE_FLOW_ITEM_TYPE_ETH:
2404                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2405                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2406                                 /* dst/src MAC addr and mask. */
2407                         flags |= MLX5_FLOW_LAYER_OUTER_L2;
2408                         break;
2409                 case RTE_FLOW_ITEM_TYPE_VLAN:
2410                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2411                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2412                                 /* VLAN Ether type. */
2413                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2414                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2415                         flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2416                         break;
2417                 case RTE_FLOW_ITEM_TYPE_IPV4:
2418                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2419                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2420                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2421                                 /* dst/src IP addr and mask. */
2422                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2423                         break;
2424                 case RTE_FLOW_ITEM_TYPE_IPV6:
2425                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2426                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2427                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2428                                 /* dst/src IP addr and mask. */
2429                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2430                         break;
2431                 case RTE_FLOW_ITEM_TYPE_UDP:
2432                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2433                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2434                                 /* dst/src port and mask. */
2435                         flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2436                         break;
2437                 case RTE_FLOW_ITEM_TYPE_TCP:
2438                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2439                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2440                                 /* dst/src port and mask. */
2441                         flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2442                         break;
2443                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2444                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2445                         flags |= MLX5_FLOW_LAYER_VXLAN;
2446                         break;
2447                 default:
2448                         DRV_LOG(WARNING,
2449                                 "unsupported item %p type %d,"
2450                                 " items must be validated before flow creation",
2451                                 (const void *)items, items->type);
2452                         break;
2453                 }
2454         }
2455         *item_flags = flags;
2456         return size;
2457 }
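
/*
 * Usage sketch (illustrative only, not part of the driver): sizing the
 * Netlink buffer for a minimal "eth / end" pattern. The attribute and
 * item arrays here are hypothetical; real callers pass through the
 * lists received from rte_flow_create().
 */
static int __rte_unused
flow_tcf_items_size_example(void)
{
        const struct rte_flow_attr attr = { .ingress = 1 };
        const struct rte_flow_item items[] = {
                { .type = RTE_FLOW_ITEM_TYPE_ETH },
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        uint64_t item_flags = 0;

        /* Returns the "flower" kind string, TCA_OPTIONS nest and
         * TCA_CLS_FLAGS_SKIP_SW overhead plus the L2 attributes. */
        return flow_tcf_get_items_and_size(&attr, items, &item_flags);
}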
2458
2459 /**
2460  * Calculate size of memory to store the VXLAN encapsulation
2461  * related items in the Netlink message buffer. The item list
2462  * is specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action
2463  * and should be validated beforehand.
2464  *
2465  * @param[in] action
2466  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2467  *   List of pattern items to scan data from.
2468  *
2469  * @return
2470  *   The size of the part of the Netlink message buffer needed to
2471  *   store the VXLAN encapsulation item attributes.
2472  */
2473 static int
2474 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2475 {
2476         const struct rte_flow_item *items;
2477         int size = 0;
2478
2479         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2480         assert(action->conf);
2481
2482         items = ((const struct rte_flow_action_vxlan_encap *)
2483                                         action->conf)->definition;
2484         assert(items);
2485         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2486                 switch (items->type) {
2487                 case RTE_FLOW_ITEM_TYPE_VOID:
2488                         break;
2489                 case RTE_FLOW_ITEM_TYPE_ETH:
2490                         /* This item does not require message buffer. */
2491                         break;
2492                 case RTE_FLOW_ITEM_TYPE_IPV4:
2493                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2494                         break;
2495                 case RTE_FLOW_ITEM_TYPE_IPV6:
2496                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2497                         break;
2498                 case RTE_FLOW_ITEM_TYPE_UDP: {
2499                         const struct rte_flow_item_udp *udp = items->mask;
2500
2501                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2502                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2503                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2504                         break;
2505                 }
2506                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2507                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2508                         break;
2509                 default:
2510                         assert(false);
2511                         DRV_LOG(WARNING,
2512                                 "unsupported item %p type %d,"
2513                                 " items must be validated"
2514                                 " before flow creation",
2515                                 (const void *)items, items->type);
2516                         return 0;
2517                 }
2518         }
2519         return size;
2520 }
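
/*
 * Sketch (illustrative, not part of the driver): sizing the VXLAN
 * encapsulation attributes for a typical "eth / ipv4 / udp / vxlan /
 * end" definition list. The action object is hypothetical.
 */
static int __rte_unused
flow_tcf_vxlan_encap_size_example(void)
{
        static struct rte_flow_item definition[] = {
                { .type = RTE_FLOW_ITEM_TYPE_ETH },
                { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
                { .type = RTE_FLOW_ITEM_TYPE_UDP },
                { .type = RTE_FLOW_ITEM_TYPE_VXLAN },
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        const struct rte_flow_action_vxlan_encap conf = {
                .definition = definition,
        };
        const struct rte_flow_action action = {
                .type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
                .conf = &conf,
        };

        /* IPv4 contributes two address attributes, UDP one or two port
         * attributes (the source port is counted when the mask is absent
         * or non-zero), VXLAN a single 32-bit VNI attribute. */
        return flow_tcf_vxlan_encap_size(&action);
}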
2521
2522 /**
2523  * Calculate maximum size of memory for flow actions of Linux TC flower and
2524  * extract specified actions.
2525  *
2526  * @param[in] actions
2527  *   Pointer to the list of actions.
2528  * @param[out] action_flags
2529  *   Pointer to the detected actions.
2530  *
2531  * @return
2532  *   Maximum size of memory for actions.
2533  */
2534 static int
2535 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2536                               uint64_t *action_flags)
2537 {
2538         int size = 0;
2539         uint64_t flags = 0;
2540
2541         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2542         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2543                 switch (actions->type) {
2544                 case RTE_FLOW_ACTION_TYPE_VOID:
2545                         break;
2546                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2547                         size += SZ_NLATTR_NEST + /* na_act_index. */
2548                                 SZ_NLATTR_STRZ_OF("mirred") +
2549                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2550                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2551                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2552                         break;
2553                 case RTE_FLOW_ACTION_TYPE_JUMP:
2554                         size += SZ_NLATTR_NEST + /* na_act_index. */
2555                                 SZ_NLATTR_STRZ_OF("gact") +
2556                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2557                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2558                         flags |= MLX5_FLOW_ACTION_JUMP;
2559                         break;
2560                 case RTE_FLOW_ACTION_TYPE_DROP:
2561                         size += SZ_NLATTR_NEST + /* na_act_index. */
2562                                 SZ_NLATTR_STRZ_OF("gact") +
2563                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2564                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2565                         flags |= MLX5_FLOW_ACTION_DROP;
2566                         break;
2567                 case RTE_FLOW_ACTION_TYPE_COUNT:
2568                         break;
2569                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2570                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2571                         goto action_of_vlan;
2572                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2573                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2574                         goto action_of_vlan;
2575                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2576                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2577                         goto action_of_vlan;
2578                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2579                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2580                         goto action_of_vlan;
2581 action_of_vlan:
2582                         size += SZ_NLATTR_NEST + /* na_act_index. */
2583                                 SZ_NLATTR_STRZ_OF("vlan") +
2584                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2585                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2586                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2587                                 /* VLAN protocol. */
2588                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2589                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2590                         break;
2591                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2592                         size += SZ_NLATTR_NEST + /* na_act_index. */
2593                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2594                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2595                                 SZ_NLATTR_TYPE_OF(uint8_t);
2596                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2597                         size += flow_tcf_vxlan_encap_size(actions) +
2598                                 RTE_ALIGN_CEIL /* preceding encap params. */
2599                                 (sizeof(struct flow_tcf_vxlan_encap),
2600                                 MNL_ALIGNTO);
2601                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2602                         break;
2603                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2604                         size += SZ_NLATTR_NEST + /* na_act_index. */
2605                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2606                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2607                                 SZ_NLATTR_TYPE_OF(uint8_t);
2608                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2609                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2610                                 (sizeof(struct flow_tcf_vxlan_decap),
2611                                 MNL_ALIGNTO);
2612                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2613                         break;
2614                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2615                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2616                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2617                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2618                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2619                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2620                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2621                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2622                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2623                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2624                         size += flow_tcf_get_pedit_actions_size(&actions,
2625                                                                 &flags);
2626                         break;
2627                 default:
2628                         DRV_LOG(WARNING,
2629                                 "unsupported action %p type %d,"
2630                                 " actions must be validated before flow creation",
2631                                 (const void *)actions, actions->type);
2632                         break;
2633                 }
2634         }
2635         *action_flags = flags;
2636         return size;
2637 }
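
/*
 * Usage sketch (illustrative): sizing a typical E-Switch action list
 * "port_id / end". The port number is hypothetical.
 */
static int __rte_unused
flow_tcf_actions_size_example(void)
{
        const struct rte_flow_action_port_id port_id = { .id = 1 };
        const struct rte_flow_action actions[] = {
                {
                        .type = RTE_FLOW_ACTION_TYPE_PORT_ID,
                        .conf = &port_id,
                },
                { .type = RTE_FLOW_ACTION_TYPE_END },
        };
        uint64_t action_flags = 0;

        /* Accounts for the TCA_FLOWER_ACT nest plus one "mirred" action:
         * index nest, kind string, options nest and struct tc_mirred. */
        return flow_tcf_get_actions_and_size(actions, &action_flags);
}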
2638
2639 /**
2640  * Brand rtnetlink buffer with unique handle.
2641  *
2642  * This handle should be unique for a given network interface to avoid
2643  * collisions.
2644  *
2645  * @param nlh
2646  *   Pointer to Netlink message.
2647  * @param handle
2648  *   Unique 32-bit handle to use.
2649  */
2650 static void
2651 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2652 {
2653         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2654
2655         tcm->tcm_handle = handle;
2656         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2657                 (void *)nlh, handle);
2658 }
2659
2660 /**
2661  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2662  * memory required, allocates the memory, initializes Netlink message headers
2663  * and sets a unique TC message handle.
2664  *
2665  * @param[in] attr
2666  *   Pointer to the flow attributes.
2667  * @param[in] items
2668  *   Pointer to the list of items.
2669  * @param[in] actions
2670  *   Pointer to the list of actions.
2671  * @param[out] item_flags
2672  *   Pointer to bit mask of all items detected.
2673  * @param[out] action_flags
2674  *   Pointer to bit mask of all actions detected.
2675  * @param[out] error
2676  *   Pointer to the error structure.
2677  *
2678  * @return
2679  *   Pointer to mlx5_flow object on success,
2680  *   otherwise NULL and rte_errno is set.
2681  */
2682 static struct mlx5_flow *
2683 flow_tcf_prepare(const struct rte_flow_attr *attr,
2684                  const struct rte_flow_item items[],
2685                  const struct rte_flow_action actions[],
2686                  uint64_t *item_flags, uint64_t *action_flags,
2687                  struct rte_flow_error *error)
2688 {
2689         size_t size = RTE_ALIGN_CEIL
2690                         (sizeof(struct mlx5_flow),
2691                          alignof(struct flow_tcf_tunnel_hdr)) +
2692                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2693                       MNL_ALIGN(sizeof(struct tcmsg));
2694         struct mlx5_flow *dev_flow;
2695         struct nlmsghdr *nlh;
2696         struct tcmsg *tcm;
2697         uint8_t *sp, *tun = NULL;
2698
2699         size += flow_tcf_get_items_and_size(attr, items, item_flags);
2700         size += flow_tcf_get_actions_and_size(actions, action_flags);
2701         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2702         if (!dev_flow) {
2703                 rte_flow_error_set(error, ENOMEM,
2704                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2705                                    "not enough memory to create E-Switch flow");
2706                 return NULL;
2707         }
2708         sp = (uint8_t *)(dev_flow + 1);
2709         if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2710                 sp = RTE_PTR_ALIGN
2711                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2712                 tun = sp;
2713                 sp += RTE_ALIGN_CEIL
2714                         (sizeof(struct flow_tcf_vxlan_encap),
2715                         MNL_ALIGNTO);
2716 #ifndef NDEBUG
2717                 size -= RTE_ALIGN_CEIL
2718                         (sizeof(struct flow_tcf_vxlan_encap),
2719                         MNL_ALIGNTO);
2720 #endif
2721         } else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2722                 sp = RTE_PTR_ALIGN
2723                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2724                 tun = sp;
2725                 sp += RTE_ALIGN_CEIL
2726                         (sizeof(struct flow_tcf_vxlan_decap),
2727                         MNL_ALIGNTO);
2728 #ifndef NDEBUG
2729                 size -= RTE_ALIGN_CEIL
2730                         (sizeof(struct flow_tcf_vxlan_decap),
2731                         MNL_ALIGNTO);
2732 #endif
2733         } else {
2734                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2735         }
2736         nlh = mnl_nlmsg_put_header(sp);
2737         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2738         *dev_flow = (struct mlx5_flow){
2739                 .tcf = (struct mlx5_flow_tcf){
2740 #ifndef NDEBUG
2741                         .nlsize = size - RTE_ALIGN_CEIL
2742                                 (sizeof(struct mlx5_flow),
2743                                  alignof(struct flow_tcf_tunnel_hdr)),
2744 #endif
2745                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2746                         .nlh = nlh,
2747                         .tcm = tcm,
2748                 },
2749         };
2750         if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2751                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2752         else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2753                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2754         /*
2755          * Generate a reasonably unique handle based on the address of the
2756          * target buffer.
2757          *
2758          * This is straightforward on 32-bit systems where the flow pointer can
2759          * be used directly. Otherwise, its least significant bits are used
2760          * after shifting the pointer right by log2 of the previous power of
2761          * two of the buffer size.
2762          */
2763         if (sizeof(dev_flow) <= 4)
2764                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2765         else
2766                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2767                                        rte_log2_u32(rte_align32prevpow2(size)));
2768         return dev_flow;
2769 }
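
/*
 * Worked example of the branding scheme above (values hypothetical):
 * on a 64-bit system with size == 1500, rte_align32prevpow2(1500) is
 * 1024 and rte_log2_u32(1024) is 10, so the handle becomes
 *
 *   handle = (uintptr_t)dev_flow >> 10;
 *
 * Two live flows are at least `size` bytes apart, hence at least one
 * previous power of two apart, which keeps the shifted handles
 * distinct for a given interface.
 */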
2770
2771 /**
2772  * Make adjustments for supporting count actions.
2773  *
2774  * @param[in] dev
2775  *   Pointer to the Ethernet device structure.
2776  * @param[in] dev_flow
2777  *   Pointer to mlx5_flow.
2778  * @param[out] error
2779  *   Pointer to error structure.
2780  *
2781  * @return
2782  *   0 on success, a negative errno value otherwise and rte_errno is set.
2783  */
2784 static int
2785 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2786                                   struct mlx5_flow *dev_flow,
2787                                   struct rte_flow_error *error)
2788 {
2789         struct rte_flow *flow = dev_flow->flow;
2790
2791         if (!flow->counter) {
2792                 flow->counter = flow_tcf_counter_new();
2793                 if (!flow->counter)
2794                         return rte_flow_error_set(error, rte_errno,
2795                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2796                                                   NULL,
2797                                                   "cannot get counter"
2798                                                   " context.");
2799         }
2800         return 0;
2801 }
2802
2803 /**
2804  * Convert VXLAN VNI to 32-bit integer.
2805  *
2806  * @param[in] vni
2807  *   VXLAN VNI in 24-bit wire format.
2808  *
2809  * @return
2810  *   VXLAN VNI as a 32-bit integer value in network endian.
2811  */
2812 static inline rte_be32_t
2813 vxlan_vni_as_be32(const uint8_t vni[3])
2814 {
2815         union {
2816                 uint8_t vni[4];
2817                 rte_be32_t dword;
2818         } ret = {
2819                 .vni = { 0, vni[0], vni[1], vni[2] },
2820         };
2821         return ret.dword;
2822 }
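
/*
 * Worked example (a sketch, not part of the driver): the 24-bit VNI
 * 0x123456 in wire format becomes the network-endian dword 0x00123456,
 * ready for TCA_FLOWER_KEY_ENC_KEY_ID.
 */
static void __rte_unused
vxlan_vni_as_be32_example(void)
{
        static const uint8_t vni[3] = { 0x12, 0x34, 0x56 };

        assert(vxlan_vni_as_be32(vni) == RTE_BE32(0x00123456));
}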
2823
2824 /**
2825  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2826  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2827  * in the encapsulation parameters structure. The item must be
2828  * prevalidated; no validation checks are performed by this function.
2829  *
2830  * @param[in] spec
2831  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2832  * @param[in] mask
2833  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2834  * @param[out] encap
2835  *   Structure to fill the gathered MAC address data.
2836  */
2837 static void
2838 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2839                                const struct rte_flow_item_eth *mask,
2840                                struct flow_tcf_vxlan_encap *encap)
2841 {
2842         /* Item must be validated before. No redundant checks. */
2843         assert(spec);
2844         if (!mask || !memcmp(&mask->dst,
2845                              &rte_flow_item_eth_mask.dst,
2846                              sizeof(rte_flow_item_eth_mask.dst))) {
2847                 /*
2848                  * Ethernet addresses are not supported by
2849                  * tc as tunnel_key parameters. The destination
2850                  * address is needed to form the encap packet
2851                  * header and is retrieved by the kernel from
2852                  * implicit sources (ARP table, etc.);
2853                  * address masks are not supported at all.
2854                  */
2855                 encap->eth.dst = spec->dst;
2856                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2857         }
2858         if (!mask || !memcmp(&mask->src,
2859                              &rte_flow_item_eth_mask.src,
2860                              sizeof(rte_flow_item_eth_mask.src))) {
2861                 /*
2862                  * Ethernet addresses are not supported by
2863                  * tc as tunnel_key parameters. Source ethernet
2864                  * address is ignored anyway.
2865                  */
2866                 encap->eth.src = spec->src;
2867                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2868         }
2869 }
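
/*
 * Sketch (hypothetical locally administered addresses): a fully masked
 * ETH item from an encap definition list fills both MAC fields.
 */
static void __rte_unused
flow_tcf_parse_vxlan_encap_eth_example(void)
{
        const struct rte_flow_item_eth spec = {
                .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
                .src.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 },
        };
        struct flow_tcf_vxlan_encap encap = { .mask = 0 };

        flow_tcf_parse_vxlan_encap_eth(&spec, &rte_flow_item_eth_mask,
                                       &encap);
        assert(encap.mask == (FLOW_TCF_ENCAP_ETH_DST |
                              FLOW_TCF_ENCAP_ETH_SRC));
}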
2870
2871 /**
2872  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2873  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2874  * in the encapsulation parameters structure. The item must be
2875  * prevalidated; no validation checks are performed by this function.
2876  *
2877  * @param[in] spec
2878  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2879  * @param[out] encap
2880  *   Structure to fill the gathered IPV4 address data.
2881  */
2882 static void
2883 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2884                                 struct flow_tcf_vxlan_encap *encap)
2885 {
2886         /* Item must be validated before. No redundant checks. */
2887         assert(spec);
2888         encap->ipv4.dst = spec->hdr.dst_addr;
2889         encap->ipv4.src = spec->hdr.src_addr;
2890         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2891                        FLOW_TCF_ENCAP_IPV4_DST;
2892 }
2893
2894 /**
2895  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2896  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2897  * in the encapsulation parameters structure. The item must be
2898  * prevalidated; no validation checks are performed by this function.
2899  *
2900  * @param[in] spec
2901  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2902  * @param[out] encap
2903  *   Structure to fill the gathered IPV6 address data.
2904  */
2905 static void
2906 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2907                                 struct flow_tcf_vxlan_encap *encap)
2908 {
2909         /* Item must be validated before. No redundant checks. */
2910         assert(spec);
2911         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2912         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2913         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2914                        FLOW_TCF_ENCAP_IPV6_DST;
2915 }
2916
2917 /**
2918  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2919  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2920  * in the encapsulation parameters structure. The item must be
2921  * prevalidated; no validation checks are performed by this function.
2922  *
2923  * @param[in] spec
2924  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2925  * @param[in] mask
2926  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2927  * @param[out] encap
2928  *   Structure to fill the gathered UDP port data.
2929  */
2930 static void
2931 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2932                                const struct rte_flow_item_udp *mask,
2933                                struct flow_tcf_vxlan_encap *encap)
2934 {
2935         assert(spec);
2936         encap->udp.dst = spec->hdr.dst_port;
2937         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2938         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2939                 encap->udp.src = spec->hdr.src_port;
2940                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2941         }
2942 }
2943
2944 /**
2945  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2946  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2947  * in the encapsulation parameters structure. The item must be
2948  * prevalidated; no validation checks are performed by this function.
2949  *
2950  * @param[in] spec
2951  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2952  * @param[out] encap
2953  *   Structure to fill the gathered VNI address data.
2954  */
2955 static void
2956 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2957                                struct flow_tcf_vxlan_encap *encap)
2958 {
2959         /* Item must be validated before. No redundant checks. */
2960         assert(spec);
2961         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2962         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2963 }
2964
2965 /**
2966  * Populate consolidated encapsulation object from list of pattern items.
2967  *
2968  * Helper function to process configuration of action such as
2969  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
2970  * validated, as there is no way to return a meaningful error.
2971  *
2972  * @param[in] action
2973  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2974  *   List of pattern items to gather data from.
2975  * @param[out] encap
2976  *   Structure to fill gathered data.
2977  */
2978 static void
2979 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
2980                            struct flow_tcf_vxlan_encap *encap)
2981 {
2982         union {
2983                 const struct rte_flow_item_eth *eth;
2984                 const struct rte_flow_item_ipv4 *ipv4;
2985                 const struct rte_flow_item_ipv6 *ipv6;
2986                 const struct rte_flow_item_udp *udp;
2987                 const struct rte_flow_item_vxlan *vxlan;
2988         } spec, mask;
2989         const struct rte_flow_item *items;
2990
2991         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2992         assert(action->conf);
2993
2994         items = ((const struct rte_flow_action_vxlan_encap *)
2995                                         action->conf)->definition;
2996         assert(items);
2997         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2998                 switch (items->type) {
2999                 case RTE_FLOW_ITEM_TYPE_VOID:
3000                         break;
3001                 case RTE_FLOW_ITEM_TYPE_ETH:
3002                         mask.eth = items->mask;
3003                         spec.eth = items->spec;
3004                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3005                                                        encap);
3006                         break;
3007                 case RTE_FLOW_ITEM_TYPE_IPV4:
3008                         spec.ipv4 = items->spec;
3009                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3010                         break;
3011                 case RTE_FLOW_ITEM_TYPE_IPV6:
3012                         spec.ipv6 = items->spec;
3013                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3014                         break;
3015                 case RTE_FLOW_ITEM_TYPE_UDP:
3016                         mask.udp = items->mask;
3017                         spec.udp = items->spec;
3018                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3019                                                        encap);
3020                         break;
3021                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3022                         spec.vxlan = items->spec;
3023                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3024                         break;
3025                 default:
3026                         assert(false);
3027                         DRV_LOG(WARNING,
3028                                 "unsupported item %p type %d,"
3029                                 " items must be validated"
3030                                 " before flow creation",
3031                                 (const void *)items, items->type);
3032                         encap->mask = 0;
3033                         return;
3034                 }
3035         }
3036 }
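
/*
 * Usage sketch (illustrative; addresses, ports and VNI hypothetical):
 * parsing a complete "eth / ipv4 / udp / vxlan / end" definition list
 * into the consolidated encapsulation object.
 */
static void __rte_unused
flow_tcf_vxlan_encap_parse_example(void)
{
        static const struct rte_flow_item_eth eth = {
                .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
        };
        static const struct rte_flow_item_ipv4 ipv4 = {
                .hdr = {
                        .src_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
                        .dst_addr = RTE_BE32(0x0a000002), /* 10.0.0.2 */
                },
        };
        static const struct rte_flow_item_udp udp = {
                .hdr = { .dst_port = RTE_BE16(4789) },
        };
        static const struct rte_flow_item_vxlan vxlan = {
                .vni = { 0x12, 0x34, 0x56 },
        };
        static struct rte_flow_item definition[] = {
                { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
                { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
                { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
                { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        const struct rte_flow_action_vxlan_encap conf = {
                .definition = definition,
        };
        const struct rte_flow_action action = {
                .type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
                .conf = &conf,
        };
        struct flow_tcf_vxlan_encap encap = { .mask = 0 };

        flow_tcf_vxlan_encap_parse(&action, &encap);
        /* encap.mask now carries the ETH, IPV4, UDP and VXLAN_VNI bits. */
}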
3037
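/*
 * For orientation (an informal sketch, not output of this driver): the
 * Netlink message assembled by flow_tcf_translate() below for a pattern
 * like "eth / ipv4 / end" with a "port_id" action corresponds roughly
 * to what the tc(8) command line would request with
 *
 *   tc filter add dev <ifname> ingress protocol all prio <prio> \
 *           flower dst_ip <addr> action mirred egress redirect \
 *           dev <peer>
 *
 * where <prio> is the rte_flow priority plus one and a "chain <group>"
 * clause is added when the flow group is non-zero.
 */
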
3038 /**
3039  * Translate flow for Linux TC flower and construct Netlink message.
3040  *
3041  * @param[in] dev
3042  *   Pointer to the Ethernet device structure.
3043  * @param[in, out] dev_flow
3044  *   Pointer to the sub flow.
3045  * @param[in] attr
3046  *   Pointer to the flow attributes.
3047  * @param[in] items
3048  *   Pointer to the list of items.
3049  * @param[in] actions
3050  *   Pointer to the list of actions.
3051  * @param[out] error
3052  *   Pointer to the error structure.
3053  *
3054  * @return
3055  *   0 on success, a negative errno value otherwise and rte_errno is set.
3056  */
3057 static int
3058 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3059                    const struct rte_flow_attr *attr,
3060                    const struct rte_flow_item items[],
3061                    const struct rte_flow_action actions[],
3062                    struct rte_flow_error *error)
3063 {
3064         union {
3065                 const struct rte_flow_item_port_id *port_id;
3066                 const struct rte_flow_item_eth *eth;
3067                 const struct rte_flow_item_vlan *vlan;
3068                 const struct rte_flow_item_ipv4 *ipv4;
3069                 const struct rte_flow_item_ipv6 *ipv6;
3070                 const struct rte_flow_item_tcp *tcp;
3071                 const struct rte_flow_item_udp *udp;
3072                 const struct rte_flow_item_vxlan *vxlan;
3073         } spec, mask;
3074         union {
3075                 const struct rte_flow_action_port_id *port_id;
3076                 const struct rte_flow_action_jump *jump;
3077                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3078                 const struct rte_flow_action_of_set_vlan_vid *
3079                         of_set_vlan_vid;
3080                 const struct rte_flow_action_of_set_vlan_pcp *
3081                         of_set_vlan_pcp;
3082         } conf;
3083         union {
3084                 struct flow_tcf_tunnel_hdr *hdr;
3085                 struct flow_tcf_vxlan_decap *vxlan;
3086         } decap = {
3087                 .hdr = NULL,
3088         };
3089         union {
3090                 struct flow_tcf_tunnel_hdr *hdr;
3091                 struct flow_tcf_vxlan_encap *vxlan;
3092         } encap = {
3093                 .hdr = NULL,
3094         };
3095         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3096         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3097         struct tcmsg *tcm = dev_flow->tcf.tcm;
3098         uint32_t na_act_index_cur;
3099         bool eth_type_set = 0;
3100         bool vlan_present = 0;
3101         bool vlan_eth_type_set = 0;
3102         bool ip_proto_set = 0;
3103         struct nlattr *na_flower;
3104         struct nlattr *na_flower_act;
3105         struct nlattr *na_vlan_id = NULL;
3106         struct nlattr *na_vlan_priority = NULL;
3107         uint64_t item_flags = 0;
3108         int ret;
3109
3110         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3111                                                 PTOI_TABLE_SZ_MAX(dev)));
3112         if (dev_flow->tcf.tunnel) {
3113                 switch (dev_flow->tcf.tunnel->type) {
3114                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3115                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3116                         break;
3117                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3118                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3119                         break;
3120                 /* New tunnel actions can be added here. */
3121                 default:
3122                         assert(false);
3123                         break;
3124                 }
3125         }
3126         nlh = dev_flow->tcf.nlh;
3127         tcm = dev_flow->tcf.tcm;
3128         /* Prepare API must have been called beforehand. */
3129         assert(nlh != NULL && tcm != NULL);
3130         tcm->tcm_family = AF_UNSPEC;
3131         tcm->tcm_ifindex = ptoi[0].ifindex;
3132         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3133         /*
3134          * Priority cannot be zero to prevent the kernel from picking one
3135          * automatically.
3136          */
3137         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3138                                   RTE_BE16(ETH_P_ALL));
3139         if (attr->group > 0)
3140                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3141         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3142         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3143         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3144                 unsigned int i;
3145
3146                 switch (items->type) {
3147                 case RTE_FLOW_ITEM_TYPE_VOID:
3148                         break;
3149                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3150                         mask.port_id = flow_tcf_item_mask
3151                                 (items, &rte_flow_item_port_id_mask,
3152                                  &flow_tcf_mask_supported.port_id,
3153                                  &flow_tcf_mask_empty.port_id,
3154                                  sizeof(flow_tcf_mask_supported.port_id),
3155                                  error);
3156                         assert(mask.port_id);
3157                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3158                                 break;
3159                         spec.port_id = items->spec;
3160                         if (!mask.port_id->id)
3161                                 i = 0;
3162                         else
3163                                 for (i = 0; ptoi[i].ifindex; ++i)
3164                                         if (ptoi[i].port_id == spec.port_id->id)
3165                                                 break;
3166                         assert(ptoi[i].ifindex);
3167                         tcm->tcm_ifindex = ptoi[i].ifindex;
3168                         break;
3169                 case RTE_FLOW_ITEM_TYPE_ETH:
3170                         item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ?
3171                                       MLX5_FLOW_LAYER_INNER_L2 :
3172                                       MLX5_FLOW_LAYER_OUTER_L2;
3173                         mask.eth = flow_tcf_item_mask
3174                                 (items, &rte_flow_item_eth_mask,
3175                                  &flow_tcf_mask_supported.eth,
3176                                  &flow_tcf_mask_empty.eth,
3177                                  sizeof(flow_tcf_mask_supported.eth),
3178                                  error);
3179                         assert(mask.eth);
3180                         if (mask.eth == &flow_tcf_mask_empty.eth)
3181                                 break;
3182                         spec.eth = items->spec;
3183                         if (decap.vxlan &&
3184                             !(item_flags & MLX5_FLOW_LAYER_VXLAN)) {
3185                                 DRV_LOG(WARNING,
3186                                         "outer L2 addresses cannot be forced"
3187                                         " for vxlan decapsulation, parameter"
3188                                         " ignored");
3189                                 break;
3190                         }
3191                         if (mask.eth->type) {
3192                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3193                                                  spec.eth->type);
3194                                 eth_type_set = 1;
3195                         }
3196                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3197                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3198                                              ETHER_ADDR_LEN,
3199                                              spec.eth->dst.addr_bytes);
3200                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3201                                              ETHER_ADDR_LEN,
3202                                              mask.eth->dst.addr_bytes);
3203                         }
3204                         if (!is_zero_ether_addr(&mask.eth->src)) {
3205                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3206                                              ETHER_ADDR_LEN,
3207                                              spec.eth->src.addr_bytes);
3208                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3209                                              ETHER_ADDR_LEN,
3210                                              mask.eth->src.addr_bytes);
3211                         }
3212                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3213                         break;
3214                 case RTE_FLOW_ITEM_TYPE_VLAN:
3215                         assert(!encap.hdr);
3216                         assert(!decap.hdr);
3217                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3218                         mask.vlan = flow_tcf_item_mask
3219                                 (items, &rte_flow_item_vlan_mask,
3220                                  &flow_tcf_mask_supported.vlan,
3221                                  &flow_tcf_mask_empty.vlan,
3222                                  sizeof(flow_tcf_mask_supported.vlan),
3223                                  error);
3224                         assert(mask.vlan);
3225                         if (!eth_type_set)
3226                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3227                                                  RTE_BE16(ETH_P_8021Q));
3228                         eth_type_set = 1;
3229                         vlan_present = 1;
3230                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3231                                 break;
3232                         spec.vlan = items->spec;
3233                         if (mask.vlan->inner_type) {
3234                                 mnl_attr_put_u16(nlh,
3235                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3236                                                  spec.vlan->inner_type);
3237                                 vlan_eth_type_set = 1;
3238                         }
3239                         if (mask.vlan->tci & RTE_BE16(0xe000))
3240                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3241                                                 (rte_be_to_cpu_16
3242                                                  (spec.vlan->tci) >> 13) & 0x7);
3243                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3244                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3245                                                  rte_be_to_cpu_16
3246                                                  (spec.vlan->tci &
3247                                                   RTE_BE16(0x0fff)));
3248                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3249                         break;
3250                 case RTE_FLOW_ITEM_TYPE_IPV4:
3251                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3252                         mask.ipv4 = flow_tcf_item_mask
3253                                 (items, &rte_flow_item_ipv4_mask,
3254                                  &flow_tcf_mask_supported.ipv4,
3255                                  &flow_tcf_mask_empty.ipv4,
3256                                  sizeof(flow_tcf_mask_supported.ipv4),
3257                                  error);
3258                         assert(mask.ipv4);
3259                         spec.ipv4 = items->spec;
3260                         if (!decap.vxlan) {
3261                                 if (!eth_type_set && !vlan_eth_type_set)
3262                                         mnl_attr_put_u16
3263                                                 (nlh,
3264                                                  vlan_present ?
3265                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3266                                                  TCA_FLOWER_KEY_ETH_TYPE,
3267                                                  RTE_BE16(ETH_P_IP));
3268                                 eth_type_set = 1;
3269                                 vlan_eth_type_set = 1;
3270                                 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
3271                                         break;
3272                                 if (mask.ipv4->hdr.next_proto_id) {
3273                                         mnl_attr_put_u8
3274                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3275                                                  spec.ipv4->hdr.next_proto_id);
3276                                         ip_proto_set = 1;
3277                                 }
3278                         } else {
3279                                 assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
3280                         }
3281                         if (mask.ipv4->hdr.src_addr) {
3282                                 mnl_attr_put_u32
3283                                         (nlh, decap.vxlan ?
3284                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3285                                          TCA_FLOWER_KEY_IPV4_SRC,
3286                                          spec.ipv4->hdr.src_addr);
3287                                 mnl_attr_put_u32
3288                                         (nlh, decap.vxlan ?
3289                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3290                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3291                                          mask.ipv4->hdr.src_addr);
3292                         }
3293                         if (mask.ipv4->hdr.dst_addr) {
3294                                 mnl_attr_put_u32
3295                                         (nlh, decap.vxlan ?
3296                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3297                                          TCA_FLOWER_KEY_IPV4_DST,
3298                                          spec.ipv4->hdr.dst_addr);
3299                                 mnl_attr_put_u32
3300                                         (nlh, decap.vxlan ?
3301                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3302                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3303                                          mask.ipv4->hdr.dst_addr);
3304                         }
3305                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3306                         break;
3307                 case RTE_FLOW_ITEM_TYPE_IPV6:
3308                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3309                         mask.ipv6 = flow_tcf_item_mask
3310                                 (items, &rte_flow_item_ipv6_mask,
3311                                  &flow_tcf_mask_supported.ipv6,
3312                                  &flow_tcf_mask_empty.ipv6,
3313                                  sizeof(flow_tcf_mask_supported.ipv6),
3314                                  error);
3315                         assert(mask.ipv6);
3316                         spec.ipv6 = items->spec;
3317                         if (!decap.vxlan) {
3318                                 if (!eth_type_set || !vlan_eth_type_set) {
3319                                         mnl_attr_put_u16
3320                                                 (nlh,
3321                                                  vlan_present ?
3322                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3323                                                  TCA_FLOWER_KEY_ETH_TYPE,
3324                                                  RTE_BE16(ETH_P_IPV6));
3325                                 }
3326                                 eth_type_set = 1;
3327                                 vlan_eth_type_set = 1;
3328                                 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
3329                                         break;
3330                                 if (mask.ipv6->hdr.proto) {
3331                                         mnl_attr_put_u8
3332                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3333                                                  spec.ipv6->hdr.proto);
3334                                         ip_proto_set = 1;
3335                                 }
3336                         } else {
3337                                 assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
3338                         }
3339                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
3340                                 mnl_attr_put(nlh, decap.vxlan ?
3341                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3342                                              TCA_FLOWER_KEY_IPV6_SRC,
3343                                              IPV6_ADDR_LEN,
3344                                              spec.ipv6->hdr.src_addr);
3345                                 mnl_attr_put(nlh, decap.vxlan ?
3346                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3347                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3348                                              IPV6_ADDR_LEN,
3349                                              mask.ipv6->hdr.src_addr);
3350                         }
3351                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
3352                                 mnl_attr_put(nlh, decap.vxlan ?
3353                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3354                                              TCA_FLOWER_KEY_IPV6_DST,
3355                                              IPV6_ADDR_LEN,
3356                                              spec.ipv6->hdr.dst_addr);
3357                                 mnl_attr_put(nlh, decap.vxlan ?
3358                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3359                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3360                                              IPV6_ADDR_LEN,
3361                                              mask.ipv6->hdr.dst_addr);
3362                         }
3363                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3364                         break;
3365                 case RTE_FLOW_ITEM_TYPE_UDP:
3366                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
3367                         mask.udp = flow_tcf_item_mask
3368                                 (items, &rte_flow_item_udp_mask,
3369                                  &flow_tcf_mask_supported.udp,
3370                                  &flow_tcf_mask_empty.udp,
3371                                  sizeof(flow_tcf_mask_supported.udp),
3372                                  error);
3373                         assert(mask.udp);
3374                         spec.udp = items->spec;
3375                         if (!decap.vxlan) {
3376                                 if (!ip_proto_set)
3377                                         mnl_attr_put_u8
3378                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3379                                                 IPPROTO_UDP);
3380                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3381                                         break;
3382                         } else {
3383                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3384                                 decap.vxlan->udp_port =
3385                                         rte_be_to_cpu_16
3386                                                 (spec.udp->hdr.dst_port);
3387                         }
3388                         if (mask.udp->hdr.src_port) {
3389                                 mnl_attr_put_u16
3390                                         (nlh, decap.vxlan ?
3391                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3392                                          TCA_FLOWER_KEY_UDP_SRC,
3393                                          spec.udp->hdr.src_port);
3394                                 mnl_attr_put_u16
3395                                         (nlh, decap.vxlan ?
3396                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3397                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3398                                          mask.udp->hdr.src_port);
3399                         }
3400                         if (mask.udp->hdr.dst_port) {
3401                                 mnl_attr_put_u16
3402                                         (nlh, decap.vxlan ?
3403                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3404                                          TCA_FLOWER_KEY_UDP_DST,
3405                                          spec.udp->hdr.dst_port);
3406                                 mnl_attr_put_u16
3407                                         (nlh, decap.vxlan ?
3408                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3409                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3410                                          mask.udp->hdr.dst_port);
3411                         }
3412                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3413                         break;
3414                 case RTE_FLOW_ITEM_TYPE_TCP:
3415                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
3416                         mask.tcp = flow_tcf_item_mask
3417                                 (items, &rte_flow_item_tcp_mask,
3418                                  &flow_tcf_mask_supported.tcp,
3419                                  &flow_tcf_mask_empty.tcp,
3420                                  sizeof(flow_tcf_mask_supported.tcp),
3421                                  error);
3422                         assert(mask.tcp);
3423                         if (!ip_proto_set)
3424                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3425                                                 IPPROTO_TCP);
3426                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3427                                 break;
3428                         spec.tcp = items->spec;
3429                         if (mask.tcp->hdr.src_port) {
3430                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3431                                                  spec.tcp->hdr.src_port);
3432                                 mnl_attr_put_u16(nlh,
3433                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3434                                                  mask.tcp->hdr.src_port);
3435                         }
3436                         if (mask.tcp->hdr.dst_port) {
3437                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3438                                                  spec.tcp->hdr.dst_port);
3439                                 mnl_attr_put_u16(nlh,
3440                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3441                                                  mask.tcp->hdr.dst_port);
3442                         }
3443                         if (mask.tcp->hdr.tcp_flags) {
3444                                 mnl_attr_put_u16
3445                                         (nlh,
3446                                          TCA_FLOWER_KEY_TCP_FLAGS,
3447                                          rte_cpu_to_be_16
3448                                                 (spec.tcp->hdr.tcp_flags));
3449                                 mnl_attr_put_u16
3450                                         (nlh,
3451                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3452                                          rte_cpu_to_be_16
3453                                                 (mask.tcp->hdr.tcp_flags));
3454                         }
3455                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3456                         break;
3457                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3458                         assert(decap.vxlan);
3459                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3460                         spec.vxlan = items->spec;
3461                         mnl_attr_put_u32(nlh,
3462                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3463                                          vxlan_vni_as_be32(spec.vxlan->vni));
3464                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3465                         break;
3466                 default:
3467                         return rte_flow_error_set(error, ENOTSUP,
3468                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3469                                                   NULL, "item not supported");
3470                 }
3471         }
3472         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3473         na_act_index_cur = 1;
3474         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3475                 struct nlattr *na_act_index;
3476                 struct nlattr *na_act;
3477                 unsigned int vlan_act;
3478                 unsigned int i;
3479
3480                 switch (actions->type) {
3481                 case RTE_FLOW_ACTION_TYPE_VOID:
3482                         break;
3483                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3484                         conf.port_id = actions->conf;
3485                         if (conf.port_id->original)
3486                                 i = 0;
3487                         else
3488                                 for (i = 0; ptoi[i].ifindex; ++i)
3489                                         if (ptoi[i].port_id == conf.port_id->id)
3490                                                 break;
3491                         assert(ptoi[i].ifindex);
3492                         na_act_index =
3493                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3494                         assert(na_act_index);
3495                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3496                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3497                         assert(na_act);
3498                         if (encap.hdr) {
3499                                 assert(dev_flow->tcf.tunnel);
3500                                 dev_flow->tcf.tunnel->ifindex_ptr =
3501                                         &((struct tc_mirred *)
3502                                         mnl_attr_get_payload
3503                                         (mnl_nlmsg_get_payload_tail
3504                                                 (nlh)))->ifindex;
3505                         }
3506                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3507                                      sizeof(struct tc_mirred),
3508                                      &(struct tc_mirred){
3509                                         .action = TC_ACT_STOLEN,
3510                                         .eaction = TCA_EGRESS_REDIR,
3511                                         .ifindex = ptoi[i].ifindex,
3512                                      });
3513                         mnl_attr_nest_end(nlh, na_act);
3514                         mnl_attr_nest_end(nlh, na_act_index);
3515                         break;
3516                 case RTE_FLOW_ACTION_TYPE_JUMP:
3517                         conf.jump = actions->conf;
3518                         na_act_index =
3519                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3520                         assert(na_act_index);
3521                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3522                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3523                         assert(na_act);
3524                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3525                                      sizeof(struct tc_gact),
3526                                      &(struct tc_gact){
3527                                         .action = TC_ACT_GOTO_CHAIN |
3528                                                   conf.jump->group,
3529                                      });
3530                         mnl_attr_nest_end(nlh, na_act);
3531                         mnl_attr_nest_end(nlh, na_act_index);
3532                         break;
3533                 case RTE_FLOW_ACTION_TYPE_DROP:
3534                         na_act_index =
3535                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3536                         assert(na_act_index);
3537                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3538                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3539                         assert(na_act);
3540                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3541                                      sizeof(struct tc_gact),
3542                                      &(struct tc_gact){
3543                                         .action = TC_ACT_SHOT,
3544                                      });
3545                         mnl_attr_nest_end(nlh, na_act);
3546                         mnl_attr_nest_end(nlh, na_act_index);
3547                         break;
3548                 case RTE_FLOW_ACTION_TYPE_COUNT:
3549                         /*
3550                          * Driver adds the count action implicitly for
3551                          * each rule it creates.
3552                          */
3553                         ret = flow_tcf_translate_action_count(dev,
3554                                                               dev_flow, error);
3555                         if (ret < 0)
3556                                 return ret;
3557                         break;
3558                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3559                         conf.of_push_vlan = NULL;
3560                         vlan_act = TCA_VLAN_ACT_POP;
3561                         goto action_of_vlan;
3562                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3563                         conf.of_push_vlan = actions->conf;
3564                         vlan_act = TCA_VLAN_ACT_PUSH;
3565                         goto action_of_vlan;
3566                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3567                         conf.of_set_vlan_vid = actions->conf;
3568                         if (na_vlan_id)
3569                                 goto override_na_vlan_id;
3570                         vlan_act = TCA_VLAN_ACT_MODIFY;
3571                         goto action_of_vlan;
3572                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3573                         conf.of_set_vlan_pcp = actions->conf;
3574                         if (na_vlan_priority)
3575                                 goto override_na_vlan_priority;
3576                         vlan_act = TCA_VLAN_ACT_MODIFY;
3577                         goto action_of_vlan;
3578 action_of_vlan:
3579                         na_act_index =
3580                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3581                         assert(na_act_index);
3582                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3583                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3584                         assert(na_act);
3585                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3586                                      sizeof(struct tc_vlan),
3587                                      &(struct tc_vlan){
3588                                         .action = TC_ACT_PIPE,
3589                                         .v_action = vlan_act,
3590                                      });
3591                         if (vlan_act == TCA_VLAN_ACT_POP) {
3592                                 mnl_attr_nest_end(nlh, na_act);
3593                                 mnl_attr_nest_end(nlh, na_act_index);
3594                                 break;
3595                         }
3596                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3597                                 mnl_attr_put_u16(nlh,
3598                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3599                                                  conf.of_push_vlan->ethertype);
3600                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3601                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3602                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3603                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3604                         mnl_attr_nest_end(nlh, na_act);
3605                         mnl_attr_nest_end(nlh, na_act_index);
3606                         if (actions->type ==
3607                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3608 override_na_vlan_id:
3609                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3610                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3611                                         rte_be_to_cpu_16
3612                                         (conf.of_set_vlan_vid->vlan_vid);
3613                         } else if (actions->type ==
3614                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3615 override_na_vlan_priority:
3616                                 na_vlan_priority->nla_type =
3617                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3618                                 *(uint8_t *)mnl_attr_get_payload
3619                                         (na_vlan_priority) =
3620                                         conf.of_set_vlan_pcp->vlan_pcp;
3621                         }
3622                         break;
3623                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3624                         assert(decap.vxlan);
3625                         assert(dev_flow->tcf.tunnel);
3626                         dev_flow->tcf.tunnel->ifindex_ptr =
3627                                 (unsigned int *)&tcm->tcm_ifindex;
3628                         na_act_index =
3629                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3630                         assert(na_act_index);
3631                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3632                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3633                         assert(na_act);
3634                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3635                                 sizeof(struct tc_tunnel_key),
3636                                 &(struct tc_tunnel_key){
3637                                         .action = TC_ACT_PIPE,
3638                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3639                                         });
3640                         mnl_attr_nest_end(nlh, na_act);
3641                         mnl_attr_nest_end(nlh, na_act_index);
3642                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3643                         break;
3644                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3645                         assert(encap.vxlan);
3646                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3647                         na_act_index =
3648                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3649                         assert(na_act_index);
3650                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3651                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3652                         assert(na_act);
3653                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3654                                 sizeof(struct tc_tunnel_key),
3655                                 &(struct tc_tunnel_key){
3656                                         .action = TC_ACT_PIPE,
3657                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3658                                         });
3659                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3660                                 mnl_attr_put_u16(nlh,
3661                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3662                                          encap.vxlan->udp.dst);
3663                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3664                                 mnl_attr_put_u32(nlh,
3665                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3666                                          encap.vxlan->ipv4.src);
3667                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3668                                 mnl_attr_put_u32(nlh,
3669                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3670                                          encap.vxlan->ipv4.dst);
3671                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3672                                 mnl_attr_put(nlh,
3673                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3674                                          sizeof(encap.vxlan->ipv6.src),
3675                                          &encap.vxlan->ipv6.src);
3676                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3677                                 mnl_attr_put(nlh,
3678                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3679                                          sizeof(encap.vxlan->ipv6.dst),
3680                                          &encap.vxlan->ipv6.dst);
3681                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3682                                 mnl_attr_put_u32(nlh,
3683                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3684                                          vxlan_vni_as_be32
3685                                                 (encap.vxlan->vxlan.vni));
3686                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3687                         mnl_attr_nest_end(nlh, na_act);
3688                         mnl_attr_nest_end(nlh, na_act_index);
3689                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3690                         break;
3691                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3692                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3693                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3694                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3695                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3696                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3697                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3698                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3699                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3700                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3701                         na_act_index =
3702                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3703                         flow_tcf_create_pedit_mnl_msg(nlh,
3704                                                       &actions, item_flags);
3705                         mnl_attr_nest_end(nlh, na_act_index);
3706                         break;
3707                 default:
3708                         return rte_flow_error_set(error, ENOTSUP,
3709                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3710                                                   actions,
3711                                                   "action not supported");
3712                 }
3713         }
3714         assert(na_flower);
3715         assert(na_flower_act);
3716         mnl_attr_nest_end(nlh, na_flower_act);
3717         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3718                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3719         mnl_attr_nest_end(nlh, na_flower);
3720         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3721                 dev_flow->tcf.tunnel->ifindex_org =
3722                         *dev_flow->tcf.tunnel->ifindex_ptr;
3723         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3724         return 0;
3725 }
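
/*
 * For illustration only: the flower rule assembled above roughly
 * corresponds to a tc(8) command of the following shape (matches
 * elided, "5" is a placeholder ifindex for a PORT_ID/mirred action):
 *
 *   tc filter add dev <ifindex> ingress protocol <proto> \
 *       flower <matches> skip_sw \
 *       action mirred egress redirect dev 5
 *
 * As the code above shows, the skip_sw flag is omitted for VXLAN decap
 * rules, which therefore may also be handled in software.
 */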
3726
3727 /**
3728  * Send Netlink message with acknowledgment.
3729  *
3730  * @param tcf
3731  *   Flow context to use.
3732  * @param nlh
3733  *   Message to send. This function raises the NLM_F_ACK flag before
3734  *   sending when no explicit message length is given.
3735  * @param[in] msglen
3736  *   Message length. The message buffer may contain multiple commands, so
3737  *   the nlmsg_len field does not always correspond to the actual message
3738  *   length. If 0 is specified, the nlmsg_len field in the header is used.
3739  * @param[in] cb
3740  *   Callback handler for received message.
3741  * @param[in] arg
3742  *   Context pointer for callback handler.
3743  *
3744  * @return
3745  *   0 on success, a negative errno value otherwise and rte_errno is set.
3746  */
3747 static int
3748 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3749                 struct nlmsghdr *nlh,
3750                 uint32_t msglen,
3751                 mnl_cb_t cb, void *arg)
3752 {
3753         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3754         uint32_t seq = tcf->seq++;
3755         int err, ret;
3756
3757         assert(tcf->nl);
3758         assert(tcf->buf);
3759         if (!seq)
3760                 /* seq 0 is reserved for kernel event-driven notifications. */
3761                 seq = tcf->seq++;
3762         nlh->nlmsg_seq = seq;
3763         if (!msglen) {
3764                 msglen = nlh->nlmsg_len;
3765                 nlh->nlmsg_flags |= NLM_F_ACK;
3766         }
3767         ret = mnl_socket_sendto(tcf->nl, nlh, msglen);
3768         err = (ret <= 0) ? errno : 0;
3769         nlh = (struct nlmsghdr *)(tcf->buf);
3770         /*
3771          * The following loop postpones non-fatal errors until multipart
3772          * messages are complete.
3773          */
3774         if (ret > 0)
3775                 while (true) {
3776                         ret = mnl_socket_recvfrom(tcf->nl, tcf->buf,
3777                                                   tcf->buf_size);
3778                         if (ret < 0) {
3779                                 err = errno;
3780                                 if (err != ENOSPC)
3781                                         break;
3782                         }
3783                         if (!err) {
3784                                 ret = mnl_cb_run(nlh, ret, seq, portid,
3785                                                  cb, arg);
3786                                 if (ret < 0) {
3787                                         err = errno;
3788                                         break;
3789                                 }
3790                         }
3791                         /* Receive until the end of a multipart message. */
3792                         if (!(nlh->nlmsg_flags & NLM_F_MULTI) ||
3793                               nlh->nlmsg_type == NLMSG_DONE)
3794                                 break;
3795                 }
3796         if (!err)
3797                 return 0;
3798         rte_errno = err;
3799         return -err;
3800 }
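
/*
 * A minimal sketch of the libmnl request/acknowledgment pattern that
 * flow_tcf_nl_ack() wraps; sock, buf and buf_size stand for the context
 * fields used above. mnl_cb_run() validates seq/portid and converts
 * NLMSG_ERROR replies into negative return values:
 *
 *      ssize_t ret;
 *
 *      nlh->nlmsg_flags |= NLM_F_ACK;
 *      nlh->nlmsg_seq = seq;
 *      if (mnl_socket_sendto(sock, nlh, nlh->nlmsg_len) < 0)
 *              return -errno;
 *      do {
 *              ret = mnl_socket_recvfrom(sock, buf, buf_size);
 *              if (ret < 0)
 *                      return -errno;
 *              ret = mnl_cb_run(buf, ret, seq,
 *                               mnl_socket_get_portid(sock), NULL, NULL);
 *      } while (ret > 0);
 *      return ret;
 */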
3801
3802 #define MNL_BUF_EXTRA_SPACE 16
3803 #define MNL_REQUEST_SIZE_MIN 256
3804 #define MNL_REQUEST_SIZE_MAX 2048
3805 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3806                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
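
/*
 * With a typical 4 KiB page size this evaluates to
 * RTE_MIN(RTE_MAX(4096, 256), 2048) = 2048, i.e. the page size is
 * clamped into the [256, 2048] byte range.
 */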
3807
3808 /* Data structures used by flow_tcf_xxx_cb() routines. */
3809 struct tcf_nlcb_buf {
3810         LIST_ENTRY(tcf_nlcb_buf) next;
3811         uint32_t size;
3812         alignas(struct nlmsghdr)
3813         uint8_t msg[]; /**< Netlink message data. */
3814 };
3815
3816 struct tcf_nlcb_context {
3817         unsigned int ifindex; /**< Base interface index. */
3818         uint32_t bufsize;
3819         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3820 };
3821
3822 /**
3823  * Allocate space for a netlink command in the buffer list.
3824  *
3825  * @param[in, out] ctx
3826  *   Pointer to callback context with command buffers list.
3827  * @param[in] size
3828  *   Required size of data buffer to be allocated.
3829  *
3830  * @return
3831  *   Pointer to allocated memory, aligned as a message header.
3832  *   NULL if an error occurred.
3833  */
3834 static struct nlmsghdr *
3835 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3836 {
3837         struct tcf_nlcb_buf *buf;
3838         struct nlmsghdr *nlh;
3839
3840         size = NLMSG_ALIGN(size);
3841         buf = LIST_FIRST(&ctx->nlbuf);
3842         if (buf && (buf->size + size) <= ctx->bufsize) {
3843                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3844                 buf->size += size;
3845                 return nlh;
3846         }
3847         if (size > ctx->bufsize) {
3848                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3849                 return NULL;
3850         }
3851         buf = rte_malloc(__func__,
3852                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3853                         alignof(struct tcf_nlcb_buf));
3854         if (!buf) {
3855                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3856                 return NULL;
3857         }
3858         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3859         buf->size = size;
3860         nlh = (struct nlmsghdr *)&buf->msg[0];
3861         return nlh;
3862 }
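
/*
 * A minimal usage sketch of the bump allocator above, as seen in the
 * collect callbacks below (RTM_DELADDR and the attribute sizes are
 * examples only):
 *
 *      struct nlmsghdr *cmd;
 *
 *      cmd = flow_tcf_alloc_nlcmd(ctx,
 *                                 MNL_ALIGN(sizeof(struct nlmsghdr)) +
 *                                 MNL_ALIGN(sizeof(struct ifaddrmsg)));
 *      if (!cmd)
 *              return -ENOMEM;
 *      cmd = mnl_nlmsg_put_header(cmd);
 *      cmd->nlmsg_type = RTM_DELADDR;
 *      cmd->nlmsg_flags = NLM_F_REQUEST;
 */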
3863
3864 /**
3865  * Set the NLM_F_ACK flag in the last netlink command in the buffer.
3866  * Only the last command in the buffer will be acknowledged by the kernel.
3867  *
3868  * @param[in, out] buf
3869  *   Pointer to buffer with netlink commands.
3870  */
3871 static void
3872 flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
3873 {
3874         struct nlmsghdr *nlh;
3875         uint32_t size = 0;
3876
3877         assert(buf->size);
3878         do {
3879                 nlh = (struct nlmsghdr *)&buf->msg[size];
3880                 size += NLMSG_ALIGN(nlh->nlmsg_len);
3881                 if (size >= buf->size) {
3882                         nlh->nlmsg_flags |= NLM_F_ACK;
3883                         break;
3884                 }
3885         } while (true);
3886 }
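
/*
 * For example, with three queued commands of aligned lengths 32, 48 and
 * 40 bytes (buf->size == 120), the loop above visits offsets 0, 32 and
 * 80; at offset 80 the running size reaches 120, so only that last
 * command gets NLM_F_ACK.
 */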
3887
3888 /**
3889  * Send the buffers with prepared netlink commands. Scans the list and
3890  * sends all found buffers. Buffers are sent and freed unconditionally in
3891  * order to prevent memory leakage if an error occurs for some message.
3892  *
3893  * @param[in] tcf
3894  *   Context object initialized by mlx5_flow_tcf_context_create().
3895  * @param[in, out] ctx
3896  *   Pointer to callback context with command buffers list.
3897  *
3898  * @return
3899  *   Zero value on success, negative errno value otherwise
3900  *   and rte_errno is set.
3901  */
3902 static int
3903 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
3904                     struct tcf_nlcb_context *ctx)
3905 {
3906         struct tcf_nlcb_buf *bc, *bn;
3907         struct nlmsghdr *nlh;
3908         int ret = 0;
3909
3910         bc = LIST_FIRST(&ctx->nlbuf);
3911         while (bc) {
3912                 int rc;
3913
3914                 bn = LIST_NEXT(bc, next);
3915                 if (bc->size) {
3916                         flow_tcf_setack_nlcmd(bc);
3917                         nlh = (struct nlmsghdr *)&bc->msg;
3918                         rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL);
3919                         if (rc && !ret)
3920                                 ret = rc;
3921                 }
3922                 rte_free(bc);
3923                 bc = bn;
3924         }
3925         LIST_INIT(&ctx->nlbuf);
3926         return ret;
3927 }
3928
3929 /**
3930  * Collect local IP address rules with the scope link attribute on the
3931  * specified network device. This is a callback routine invoked by libmnl
3932  * mnl_cb_run() in a loop for every message in a received packet.
3933  *
3934  * @param[in] nlh
3935  *   Pointer to reply header.
3936  * @param[in, out] arg
3937  *   Opaque data pointer for this callback.
3938  *
3939  * @return
3940  *   A positive, nonzero value on success, negative errno value otherwise
3941  *   and rte_errno is set.
3942  */
3943 static int
3944 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
3945 {
3946         struct tcf_nlcb_context *ctx = arg;
3947         struct nlmsghdr *cmd;
3948         struct ifaddrmsg *ifa;
3949         struct nlattr *na;
3950         struct nlattr *na_local = NULL;
3951         struct nlattr *na_peer = NULL;
3952         unsigned char family;
3953
3954         if (nlh->nlmsg_type != RTM_NEWADDR) {
3955                 rte_errno = EINVAL;
3956                 return -rte_errno;
3957         }
3958         ifa = mnl_nlmsg_get_payload(nlh);
3959         family = ifa->ifa_family;
3960         if (ifa->ifa_index != ctx->ifindex ||
3961             ifa->ifa_scope != RT_SCOPE_LINK ||
3962             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
3963             (family != AF_INET && family != AF_INET6))
3964                 return 1;
3965         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
3966                 switch (mnl_attr_get_type(na)) {
3967                 case IFA_LOCAL:
3968                         na_local = na;
3969                         break;
3970                 case IFA_ADDRESS:
3971                         na_peer = na;
3972                         break;
3973                 }
3974                 if (na_local && na_peer)
3975                         break;
3976         }
3977         if (!na_local || !na_peer)
3978                 return 1;
3979         /* Local rule found with scope link, permanent and assigned peer. */
3980         cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
3981                                         MNL_ALIGN(sizeof(struct ifaddrmsg)) +
3982                                         (family == AF_INET6
3983                                         ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
3984                                         : 2 * SZ_NLATTR_TYPE_OF(uint32_t)));
3985         if (!cmd) {
3986                 rte_errno = ENOMEM;
3987                 return -rte_errno;
3988         }
3989         cmd = mnl_nlmsg_put_header(cmd);
3990         cmd->nlmsg_type = RTM_DELADDR;
3991         cmd->nlmsg_flags = NLM_F_REQUEST;
3992         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
3993         ifa->ifa_flags = IFA_F_PERMANENT;
3994         ifa->ifa_scope = RT_SCOPE_LINK;
3995         ifa->ifa_index = ctx->ifindex;
3996         if (family == AF_INET) {
3997                 ifa->ifa_family = AF_INET;
3998                 ifa->ifa_prefixlen = 32;
3999                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4000                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4001         } else {
4002                 ifa->ifa_family = AF_INET6;
4003                 ifa->ifa_prefixlen = 128;
4004                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4005                         mnl_attr_get_payload(na_local));
4006                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4007                         mnl_attr_get_payload(na_peer));
4008         }
4009         return 1;
4010 }
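
/*
 * Each command queued above is the Netlink equivalent of (addresses are
 * placeholders, /128 is used instead of /32 for IPv6):
 *
 *   ip addr del <local_ip>/32 peer <peer_ip> scope link dev <ifindex>
 */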
4011
4012 /**
4013  * Cleanup the local IP addresses on the outer interface.
4014  *
4015  * @param[in] tcf
4016  *   Context object initialized by mlx5_flow_tcf_context_create().
4017  * @param[in] ifindex
4018  *   Network interface index on which to perform cleanup.
4019  */
4020 static void
4021 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4022                             unsigned int ifindex)
4023 {
4024         struct nlmsghdr *nlh;
4025         struct ifaddrmsg *ifa;
4026         struct tcf_nlcb_context ctx = {
4027                 .ifindex = ifindex,
4028                 .bufsize = MNL_REQUEST_SIZE,
4029                 .nlbuf = LIST_HEAD_INITIALIZER(),
4030         };
4031         int ret;
4032
4033         assert(ifindex);
4034         /*
4035          * Seek and destroy leftover local IP addresses with
4036          * the matching "scope link" property.
4037          */
4038         nlh = mnl_nlmsg_put_header(tcf->buf);
4039         nlh->nlmsg_type = RTM_GETADDR;
4040         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4041         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4042         ifa->ifa_family = AF_UNSPEC;
4043         ifa->ifa_index = ifindex;
4044         ifa->ifa_scope = RT_SCOPE_LINK;
4045         ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
4046         if (ret)
4047                 DRV_LOG(WARNING, "netlink: query address list error %d", ret);
4048         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4049         if (ret)
4050                 DRV_LOG(WARNING, "netlink: address delete error %d", ret);
4051 }
4052
4053 /**
4054  * Collect permanent neigh rules on the specified network device.
4055  * This is a callback routine invoked by libmnl mnl_cb_run() in a loop
4056  * for every message in a received packet.
4057  *
4058  * @param[in] nlh
4059  *   Pointer to reply header.
4060  * @param[in, out] arg
4061  *   Opaque data pointer for this callback.
4062  *
4063  * @return
4064  *   A positive, nonzero value on success, negative errno value otherwise
4065  *   and rte_errno is set.
4066  */
4067 static int
4068 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4069 {
4070         struct tcf_nlcb_context *ctx = arg;
4071         struct nlmsghdr *cmd;
4072         struct ndmsg *ndm;
4073         struct nlattr *na;
4074         struct nlattr *na_ip = NULL;
4075         struct nlattr *na_mac = NULL;
4076         unsigned char family;
4077
4078         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4079                 rte_errno = EINVAL;
4080                 return -rte_errno;
4081         }
4082         ndm = mnl_nlmsg_get_payload(nlh);
4083         family = ndm->ndm_family;
4084         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4085            !(ndm->ndm_state & NUD_PERMANENT) ||
4086            (family != AF_INET && family != AF_INET6))
4087                 return 1;
4088         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4089                 switch (mnl_attr_get_type(na)) {
4090                 case NDA_DST:
4091                         na_ip = na;
4092                         break;
4093                 case NDA_LLADDR:
4094                         na_mac = na;
4095                         break;
4096                 }
4097                 if (na_mac && na_ip)
4098                         break;
4099         }
4100         if (!na_mac || !na_ip)
4101                 return 1;
4102         /* Neigh rule with permanent attribute found. */
4103         cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
4104                                         MNL_ALIGN(sizeof(struct ndmsg)) +
4105                                         SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4106                                         (family == AF_INET6
4107                                         ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4108                                         : SZ_NLATTR_TYPE_OF(uint32_t)));
4109         if (!cmd) {
4110                 rte_errno = ENOMEM;
4111                 return -rte_errno;
4112         }
4113         cmd = mnl_nlmsg_put_header(cmd);
4114         cmd->nlmsg_type = RTM_DELNEIGH;
4115         cmd->nlmsg_flags = NLM_F_REQUEST;
4116         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4117         ndm->ndm_ifindex = ctx->ifindex;
4118         ndm->ndm_state = NUD_PERMANENT;
4119         ndm->ndm_flags = 0;
4120         ndm->ndm_type = 0;
4121         if (family == AF_INET) {
4122                 ndm->ndm_family = AF_INET;
4123                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4124         } else {
4125                 ndm->ndm_family = AF_INET6;
4126                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4127                              mnl_attr_get_payload(na_ip));
4128         }
4129         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4130                      mnl_attr_get_payload(na_mac));
4131         return 1;
4132 }
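
/*
 * Each command queued above is the Netlink equivalent of (addresses are
 * placeholders):
 *
 *   ip neigh del <dst_ip> lladdr <dst_mac> dev <ifindex> nud permanent
 */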
4133
4134 /**
4135  * Cleanup the neigh rules on the outer interface.
4136  *
4137  * @param[in] tcf
4138  *   Context object initialized by mlx5_flow_tcf_context_create().
4139  * @param[in] ifindex
4140  *   Network interface index on which to perform cleanup.
4141  */
4142 static void
4143 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4144                             unsigned int ifindex)
4145 {
4146         struct nlmsghdr *nlh;
4147         struct ndmsg *ndm;
4148         struct tcf_nlcb_context ctx = {
4149                 .ifindex = ifindex,
4150                 .bufsize = MNL_REQUEST_SIZE,
4151                 .nlbuf = LIST_HEAD_INITIALIZER(),
4152         };
4153         int ret;
4154
4155         assert(ifindex);
4156         /* Seek and destroy leftover neigh rules. */
4157         nlh = mnl_nlmsg_put_header(tcf->buf);
4158         nlh->nlmsg_type = RTM_GETNEIGH;
4159         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4160         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4161         ndm->ndm_family = AF_UNSPEC;
4162         ndm->ndm_ifindex = ifindex;
4163         ndm->ndm_state = NUD_PERMANENT;
4164         ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
4165         if (ret)
4166                 DRV_LOG(WARNING, "netlink: query neigh list error %d", ret);
4167         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4168         if (ret)
4169                 DRV_LOG(WARNING, "netlink: neigh delete error %d", ret);
4170 }
4171
4172 /**
4173  * Collect indices of VXLAN encap/decap interfaces associated with the
4174  * device. This is a callback routine invoked by libmnl mnl_cb_run() in
4175  * a loop for every message in a received packet.
4176  *
4177  * @param[in] nlh
4178  *   Pointer to reply header.
4179  * @param[in, out] arg
4180  *   Opaque data pointer for this callback.
4181  *
4182  * @return
4183  *   A positive, nonzero value on success, negative errno value otherwise
4184  *   and rte_errno is set.
4185  */
4186 static int
4187 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4188 {
4189         struct tcf_nlcb_context *ctx = arg;
4190         struct nlmsghdr *cmd;
4191         struct ifinfomsg *ifm;
4192         struct nlattr *na;
4193         struct nlattr *na_info = NULL;
4194         struct nlattr *na_vxlan = NULL;
4195         bool found = false;
4196         unsigned int vxindex;
4197
4198         if (nlh->nlmsg_type != RTM_NEWLINK) {
4199                 rte_errno = EINVAL;
4200                 return -rte_errno;
4201         }
4202         ifm = mnl_nlmsg_get_payload(nlh);
4203         if (!ifm->ifi_index) {
4204                 rte_errno = EINVAL;
4205                 return -rte_errno;
4206         }
4207         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4208                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4209                         na_info = na;
4210                         break;
4211                 }
4212         if (!na_info)
4213                 return 1;
4214         mnl_attr_for_each_nested(na, na_info) {
4215                 switch (mnl_attr_get_type(na)) {
4216                 case IFLA_INFO_KIND:
4217                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4218                                      mnl_attr_get_len(na)))
4219                                 found = true;
4220                         break;
4221                 case IFLA_INFO_DATA:
4222                         na_vxlan = na;
4223                         break;
4224                 }
4225                 if (found && na_vxlan)
4226                         break;
4227         }
4228         if (!found || !na_vxlan)
4229                 return 1;
4230         found = false;
4231         mnl_attr_for_each_nested(na, na_vxlan) {
4232                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4233                     mnl_attr_get_u32(na) == ctx->ifindex) {
4234                         found = true;
4235                         break;
4236                 }
4237         }
4238         if (!found)
4239                 return 1;
4240         /* Attached VXLAN device found, store the command to delete. */
4241         vxindex = ifm->ifi_index;
4242         cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
4243                                         MNL_ALIGN(sizeof(struct ifinfomsg)));
4244         if (!cmd) {
4245                 rte_errno = ENOMEM;
4246                 return -rte_errno;
4247         }
4248         cmd = mnl_nlmsg_put_header(cmd);
4249         cmd->nlmsg_type = RTM_DELLINK;
4250         cmd->nlmsg_flags = NLM_F_REQUEST;
4251         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4252         ifm->ifi_family = AF_UNSPEC;
4253         ifm->ifi_index = vxindex;
4254         return 1;
4255 }
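
/*
 * The queued RTM_DELLINK command is the Netlink equivalent of:
 *
 *   ip link del <vxlan_device>
 */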
4256
4257 /**
4258  * Cleanup the outer interface. Removes all found VXLAN devices
4259  * attached to the specified index, flushes the neigh and local IP
4260  * databases.
4261  *
4262  * @param[in] tcf
4263  *   Context object initialized by mlx5_flow_tcf_context_create().
4264  * @param[in] ifindex
4265  *   Network interface index on which to perform cleanup.
4266  */
4267 static void
4268 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4269                             unsigned int ifindex)
4270 {
4271         struct nlmsghdr *nlh;
4272         struct ifinfomsg *ifm;
4273         struct tcf_nlcb_context ctx = {
4274                 .ifindex = ifindex,
4275                 .bufsize = MNL_REQUEST_SIZE,
4276                 .nlbuf = LIST_HEAD_INITIALIZER(),
4277         };
4278         int ret;
4279
4280         assert(ifindex);
4281         /*
4282          * Seek and destroy leftover VXLAN encap/decap interfaces with
4283          * matching properties.
4284          */
4285         nlh = mnl_nlmsg_put_header(tcf->buf);
4286         nlh->nlmsg_type = RTM_GETLINK;
4287         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4288         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4289         ifm->ifi_family = AF_UNSPEC;
4290         ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx);
4291         if (ret)
4292                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4293         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4294         if (ret)
4295                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4296 }
4297
4298 /**
4299  * Emit a Netlink message to add/remove a local address on the outer device.
4300  * The address being added is visible within the link only (scope link).
4301  *
4302  * Note that an implicit route is maintained by the kernel due to the
4303  * presence of a peer address (IFA_ADDRESS).
4304  *
4305  * These rules are used for encapsulation only and allow assigning
4306  * the outer tunnel source IP address.
4307  *
4308  * @param[in] tcf
4309  *   Libmnl socket context object.
4310  * @param[in] encap
4311  *   Encapsulation properties (source address and its peer).
4312  * @param[in] ifindex
4313  *   Network interface to apply rule.
4314  * @param[in] enable
4315  *   Toggle between add and remove.
4316  * @param[out] error
4317  *   Perform verbose error reporting if not NULL.
4318  *
4319  * @return
4320  *   0 on success, a negative errno value otherwise and rte_errno is set.
4321  */
4322 static int
4323 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4324                     const struct flow_tcf_vxlan_encap *encap,
4325                     unsigned int ifindex,
4326                     bool enable,
4327                     struct rte_flow_error *error)
4328 {
4329         struct nlmsghdr *nlh;
4330         struct ifaddrmsg *ifa;
4331         alignas(struct nlmsghdr)
4332         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4333
4334         nlh = mnl_nlmsg_put_header(buf);
4335         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4336         nlh->nlmsg_flags =
4337                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4338         nlh->nlmsg_seq = 0;
4339         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4340         ifa->ifa_flags = IFA_F_PERMANENT;
4341         ifa->ifa_scope = RT_SCOPE_LINK;
4342         ifa->ifa_index = ifindex;
4343         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4344                 ifa->ifa_family = AF_INET;
4345                 ifa->ifa_prefixlen = 32;
4346                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4347                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4348                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4349                                               encap->ipv4.dst);
4350         } else {
4351                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4352                 ifa->ifa_family = AF_INET6;
4353                 ifa->ifa_prefixlen = 128;
4354                 mnl_attr_put(nlh, IFA_LOCAL,
4355                                   sizeof(encap->ipv6.src),
4356                                   &encap->ipv6.src);
4357                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4358                         mnl_attr_put(nlh, IFA_ADDRESS,
4359                                           sizeof(encap->ipv6.dst),
4360                                           &encap->ipv6.dst);
4361         }
4362         if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
4363                 return 0;
4364         return rte_flow_error_set(error, rte_errno,
4365                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4366                                   "netlink: cannot complete IFA request"
4367                                   " (ip addr add)");
4368 }
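
/*
 * Note that NLM_F_CREATE | NLM_F_REPLACE makes the enable case behave
 * like "ip addr replace" rather than plain "ip addr add": an existing
 * matching address entry is silently updated instead of failing.
 */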
4369
4370 /**
4371  * Emit a Netlink message to add/remove a neighbor entry.
4372  *
4373  * @param[in] tcf
4374  *   Libmnl socket context object.
4375  * @param[in] encap
4376  *   Encapsulation properties (destination address).
4377  * @param[in] ifindex
4378  *   Network interface.
4379  * @param[in] enable
4380  *   Toggle between add and remove.
4381  * @param[out] error
4382  *   Perform verbose error reporting if not NULL.
4383  *
4384  * @return
4385  *   0 on success, a negative errno value otherwise and rte_errno is set.
4386  */
4387 static int
4388 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4389                      const struct flow_tcf_vxlan_encap *encap,
4390                      unsigned int ifindex,
4391                      bool enable,
4392                      struct rte_flow_error *error)
4393 {
4394         struct nlmsghdr *nlh;
4395         struct ndmsg *ndm;
4396         alignas(struct nlmsghdr)
4397         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4398
4399         nlh = mnl_nlmsg_put_header(buf);
4400         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4401         nlh->nlmsg_flags =
4402                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4403         nlh->nlmsg_seq = 0;
4404         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4405         ndm->ndm_ifindex = ifindex;
4406         ndm->ndm_state = NUD_PERMANENT;
4407         ndm->ndm_flags = 0;
4408         ndm->ndm_type = 0;
4409         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4410                 ndm->ndm_family = AF_INET;
4411                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4412         } else {
4413                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4414                 ndm->ndm_family = AF_INET6;
4415                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4416                                                  &encap->ipv6.dst);
4417         }
4418         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4419                 DRV_LOG(WARNING,
4420                         "outer ethernet source address cannot be "
4421                         "forced for VXLAN encapsulation");
4422         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4423                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4424                                                     &encap->eth.dst);
4425         if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
4426                 return 0;
4427         return rte_flow_error_set(error, rte_errno,
4428                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4429                                   "netlink: cannot complete ND request"
4430                                   " (ip neigh)");
4431 }
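
/*
 * As with flow_tcf_rule_local(), NLM_F_CREATE | NLM_F_REPLACE makes the
 * enable case behave like "ip neigh replace", so re-adding an entry
 * with a new lladdr updates it in place.
 */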
4432
4433 /**
4434  * Manage the local IP addresses and their peer IP addresses on the
4435  * outer interface for encapsulation purposes. The kernel searches for
4436  * the appropriate device for tunnel egress traffic using the outer
4437  * source IP; this IP must be assigned to the outer network device,
4438  * otherwise the kernel rejects the rule.
4439  *
4440  * Adds or removes the addresses using the Netlink command like this:
4441  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4442  *
4443  * The addresses are local to the netdev ("scope link"), which reduces
4444  * the risk of conflicts. Note that an implicit route is maintained by
4445  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4446  *
4447  * @param[in] tcf
4448  *   Libmnl socket context object.
4449  * @param[in] vtep
4450  *   VTEP object, contains rule database and ifouter index.
4451  * @param[in] dev_flow
4452  *   Flow object, contains the tunnel parameters (for encap only).
4453  * @param[in] enable
4454  *   Toggle between add and remove.
4455  * @param[out] error
4456  *   Perform verbose error reporting if not NULL.
4457  *
4458  * @return
4459  *   0 on success, a negative errno value otherwise and rte_errno is set.
4460  */
4461 static int
4462 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4463                      struct tcf_vtep *vtep,
4464                      struct mlx5_flow *dev_flow,
4465                      bool enable,
4466                      struct rte_flow_error *error)
4467 {
4468         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4469         struct tcf_local_rule *rule;
4470         bool found = false;
4471         int ret;
4472
4473         assert(encap);
4474         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4475         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4476                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4477                 LIST_FOREACH(rule, &vtep->local, next) {
4478                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4479                             encap->ipv4.src == rule->ipv4.src &&
4480                             encap->ipv4.dst == rule->ipv4.dst) {
4481                                 found = true;
4482                                 break;
4483                         }
4484                 }
4485         } else {
4486                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4487                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4488                 LIST_FOREACH(rule, &vtep->local, next) {
4489                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4490                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4491                                             sizeof(encap->ipv6.src)) &&
4492                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4493                                             sizeof(encap->ipv6.dst))) {
4494                                 found = true;
4495                                 break;
4496                         }
4497                 }
4498         }
4499         if (found) {
4500                 if (enable) {
4501                         rule->refcnt++;
4502                         return 0;
4503                 }
4504                 if (!rule->refcnt || !--rule->refcnt) {
4505                         LIST_REMOVE(rule, next);
4506                         return flow_tcf_rule_local(tcf, encap,
4507                                         vtep->ifouter, false, error);
4508                 }
4509                 return 0;
4510         }
4511         if (!enable) {
4512                 DRV_LOG(WARNING, "disabling non-existing local rule");
4513                 rte_flow_error_set(error, ENOENT,
4514                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4515                                    "disabling non-existing local rule");
4516                 return -ENOENT;
4517         }
4518         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4519                                 alignof(struct tcf_local_rule));
4520         if (!rule) {
4521                 rte_flow_error_set(error, ENOMEM,
4522                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4523                                    "unable to allocate memory for local rule");
4524                 return -rte_errno;
4525         }
4526         *rule = (struct tcf_local_rule){.refcnt = 0,
4527                                         .mask = 0,
4528                                         };
4529         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4530                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4531                            | FLOW_TCF_ENCAP_IPV4_DST;
4532                 rule->ipv4.src = encap->ipv4.src;
4533                 rule->ipv4.dst = encap->ipv4.dst;
4534         } else {
4535                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4536                            | FLOW_TCF_ENCAP_IPV6_DST;
4537                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4538                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4539         }
4540         ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
4541         if (ret) {
4542                 rte_free(rule);
4543                 return ret;
4544         }
4545         rule->refcnt++;
4546         LIST_INSERT_HEAD(&vtep->local, rule, next);
4547         return 0;
4548 }
4549
4550 /**
4551  * Manage the destination MAC/IP address neigh database; the kernel uses
4552  * it to determine the destination MAC address within the encapsulation
4553  * header. Adds or removes entries using a Netlink command like this:
4554  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4555  *
4556  * @param[in] tcf
4557  *   Libmnl socket context object.
4558  * @param[in] vtep
4559  *   VTEP object, contains rule database and ifouter index.
4560  * @param[in] dev_flow
4561  *   Flow object, contains the tunnel parameters (for encap only).
4562  * @param[in] enable
4563  *   Toggle between add and remove.
4564  * @param[out] error
4565  *   Perform verbose error reporting if not NULL.
4566  *
4567  * @return
4568  *   0 on success, a negative errno value otherwise and rte_errno is set.
4569  */
4570 static int
4571 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4572                      struct tcf_vtep *vtep,
4573                      struct mlx5_flow *dev_flow,
4574                      bool enable,
4575                      struct rte_flow_error *error)
4576 {
4577         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4578         struct tcf_neigh_rule *rule;
4579         bool found = false;
4580         int ret;
4581
4582         assert(encap);
4583         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4584         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4585                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4586                 LIST_FOREACH(rule, &vtep->neigh, next) {
4587                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4588                             encap->ipv4.dst == rule->ipv4.dst) {
4589                                 found = true;
4590                                 break;
4591                         }
4592                 }
4593         } else {
4594                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4595                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4596                 LIST_FOREACH(rule, &vtep->neigh, next) {
4597                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4598                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4599                                                 sizeof(encap->ipv6.dst))) {
4600                                 found = true;
4601                                 break;
4602                         }
4603                 }
4604         }
4605         if (found) {
4606                 if (memcmp(&encap->eth.dst, &rule->eth,
4607                            sizeof(encap->eth.dst))) {
4608                         DRV_LOG(WARNING, "Destination MAC differs"
4609                                          " in neigh rule");
4610                         rte_flow_error_set(error, EEXIST,
4611                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4612                                            NULL, "Different MAC address"
4613                                            " neigh rule for the same"
4614                                            " destination IP");
4615                         return -EEXIST;
4616                 }
4617                 if (enable) {
4618                         rule->refcnt++;
4619                         return 0;
4620                 }
4621                 if (!rule->refcnt || !--rule->refcnt) {
4622                         LIST_REMOVE(rule, next);
4623                         return flow_tcf_rule_neigh(tcf, encap,
4624                                                    vtep->ifouter,
4625                                                    false, error);
4626                 }
4627                 return 0;
4628         }
4629         if (!enable) {
4630                 DRV_LOG(WARNING, "disabling non-existing neigh rule");
4631                 rte_flow_error_set(error, ENOENT,
4632                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4633                                    "disabling non-existing neigh rule");
4634                 return -ENOENT;
4635         }
4636         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4637                                 alignof(struct tcf_neigh_rule));
4638         if (!rule) {
4639                 rte_flow_error_set(error, ENOMEM,
4640                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4641                                    "unable to allocate memory for neigh rule");
4642                 return -rte_errno;
4643         }
4644         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4645                                         .mask = 0,
4646                                         };
4647         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4648                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4649                 rule->ipv4.dst = encap->ipv4.dst;
4650         } else {
4651                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4652                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4653         }
4654         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4655         ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
4656         if (ret) {
4657                 rte_free(rule);
4658                 return ret;
4659         }
4660         rule->refcnt++;
4661         LIST_INSERT_HEAD(&vtep->neigh, rule, next);
4662         return 0;
4663 }
4664
4665 /* VTEP device list is shared between PMD port instances. */
4666 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4667 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4668
4669 /**
4670  * Deletes VTEP network device.
4671  *
4672  * @param[in] tcf
4673  *   Context object initialized by mlx5_flow_tcf_context_create().
4674  * @param[in] vtep
4675  *   Object representing the network device to delete. Memory
4676  *   allocated for this object is freed by this routine.
4677  */
4678 static void
4679 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4680                      struct tcf_vtep *vtep)
4681 {
4682         struct nlmsghdr *nlh;
4683         struct ifinfomsg *ifm;
4684         alignas(struct nlmsghdr)
4685         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4686                     MNL_BUF_EXTRA_SPACE];
4687         int ret;
4688
4689         assert(!vtep->refcnt);
4690         /* Delete only interfaces that we actually created. */
4691         if (vtep->created && vtep->ifindex) {
4692                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4693                 nlh = mnl_nlmsg_put_header(buf);
4694                 nlh->nlmsg_type = RTM_DELLINK;
4695                 nlh->nlmsg_flags = NLM_F_REQUEST;
4696                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4697                 ifm->ifi_family = AF_UNSPEC;
4698                 ifm->ifi_index = vtep->ifindex;
4699                 assert(sizeof(buf) >= nlh->nlmsg_len);
4700                 ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
4701                 if (ret)
4702                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4703                                          " encap/decap ifindex %u",
4704                                          ifm->ifi_index);
4705         }
4706         rte_free(vtep);
4707 }
4708
4709 /**
4710  * Creates VTEP network device.
4711  *
4712  * @param[in] tcf
4713  *   Context object initialized by mlx5_flow_tcf_context_create().
4714  * @param[in] ifouter
4715  *   Outer interface to attach the newly created VXLAN device to.
4716  *   If zero, the VXLAN device will not be attached to any device.
4717  *   Such VTEPs are used for decapsulation and can be pre-created
4718  *   and shared between processes.
4719  * @param[in] port
4720  *   UDP port of created VTEP device.
4721  * @param[out] error
4722  *   Perform verbose error reporting if not NULL.
4723  *
4724  * @return
4725  *   Pointer to created device structure on success,
4726  *   NULL otherwise and rte_errno is set.
4727  */
4728 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4729 static struct tcf_vtep*
4730 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4731                      unsigned int ifouter,
4732                      uint16_t port, struct rte_flow_error *error)
4733 {
4734         struct tcf_vtep *vtep;
4735         struct nlmsghdr *nlh;
4736         struct ifinfomsg *ifm;
4737         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4738         alignas(struct nlmsghdr)
4739         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4740                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4741                     SZ_NLATTR_NEST * 2 +
4742                     SZ_NLATTR_STRZ_OF("vxlan") +
4743                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4744                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4745                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4746                     MNL_BUF_EXTRA_SPACE];
4747         struct nlattr *na_info;
4748         struct nlattr *na_vxlan;
4749         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4750         int ret;
4751
4752         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4753         if (!vtep) {
4754                 rte_flow_error_set(error, ENOMEM,
4755                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4756                                    "unable to allocate memory for VTEP");
4757                 return NULL;
4758         }
4759         *vtep = (struct tcf_vtep){
4760                         .port = port,
4761                         .local = LIST_HEAD_INITIALIZER(),
4762                         .neigh = LIST_HEAD_INITIALIZER(),
4763         };
4764         memset(buf, 0, sizeof(buf));
4765         nlh = mnl_nlmsg_put_header(buf);
4766         nlh->nlmsg_type = RTM_NEWLINK;
4767         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4768         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4769         ifm->ifi_family = AF_UNSPEC;
4770         ifm->ifi_type = 0;
4771         ifm->ifi_index = 0;
4772         ifm->ifi_flags = IFF_UP;
4773         ifm->ifi_change = 0xffffffff;
4774         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4775         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4776         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4777         assert(na_info);
4778         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4779         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4780         assert(na_vxlan);
4781         if (ifouter)
4782                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4783         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4784         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4785         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4786         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4787         mnl_attr_nest_end(nlh, na_vxlan);
4788         mnl_attr_nest_end(nlh, na_info);
4789         assert(sizeof(buf) >= nlh->nlmsg_len);
4790         ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
4791         if (ret) {
4792                 DRV_LOG(WARNING,
4793                         "netlink: VTEP %s create failure (%d)",
4794                         name, rte_errno);
4795                 if (rte_errno != EEXIST || ifouter)
4796                         /*
4797                          * Some unhandled error occurred or device is
4798                          * for encapsulation and cannot be shared.
4799                          */
4800                         goto error;
4801         } else {
4802                 /*
4803                  * Mark the device as one we actually created; it must be
4804                  * explicitly deleted when it is no longer needed.
4806                  */
4807                 vtep->created = 1;
4808         }
4809         /* Try to get the ifindex of the created or pre-existing device. */
4810         ret = if_nametoindex(name);
4811         if (!ret) {
4812                 DRV_LOG(WARNING,
4813                         "VTEP %s failed to get index (%d)", name, errno);
4814                 rte_flow_error_set
4815                         (error, errno,
4816                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4817                          "netlink: failed to retrieve VTEP ifindex");
4818                 goto error;
4819         }
4820         vtep->ifindex = ret;
4821         vtep->ifouter = ifouter;
4822         memset(buf, 0, sizeof(buf));
4823         nlh = mnl_nlmsg_put_header(buf);
4824         nlh->nlmsg_type = RTM_NEWLINK;
4825         nlh->nlmsg_flags = NLM_F_REQUEST;
4826         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4827         ifm->ifi_family = AF_UNSPEC;
4828         ifm->ifi_type = 0;
4829         ifm->ifi_index = vtep->ifindex;
4830         ifm->ifi_flags = IFF_UP;
4831         ifm->ifi_change = IFF_UP;
4832         ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
4833         if (ret) {
4834                 rte_flow_error_set(error, rte_errno,
4835                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4836                                    "netlink: failed to set VTEP link up");
4837                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
4838                         name, rte_errno);
4839                 goto clean;
4840         }
4841         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
4842         if (ret) {
4843                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
4844                 goto clean;
4845         }
4846         DRV_LOG(INFO, "VTEP created (%d, %d)", vtep->port, vtep->ifindex);
4847         vtep->refcnt = 1;
4848         return vtep;
4849 clean:
4850         flow_tcf_vtep_delete(tcf, vtep);
4851         return NULL;
4852 error:
4853         rte_free(vtep);
4854         return NULL;
4855 }
4856 #else
4857 static struct tcf_vtep*
4858 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
4859                      unsigned int ifouter __rte_unused,
4860                      uint16_t port __rte_unused,
4861                      struct rte_flow_error *error)
4862 {
4863         rte_flow_error_set(error, ENOTSUP,
4864                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4865                            "netlink: failed to create VTEP, "
4866                            "vxlan metadata are not supported by kernel");
4867         return NULL;
4868 }
4869 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
4870
4871 /**
4872  * Acquire target interface index for VXLAN tunneling decapsulation.
4873  * In order to share the UDP port among other interfaces, the VXLAN device
4874  * is created detached from any interface (if it is created at all).
4875  *
4876  * @param[in] tcf
4877  *   Context object initialized by mlx5_flow_tcf_context_create().
4878  * @param[in] dev_flow
4879  *   Flow tcf object with tunnel structure pointer set.
4880  * @param[out] error
4881  *   Perform verbose error reporting if not NULL.
4882  * @return
4883  *   Interface descriptor pointer on success,
4884  *   NULL otherwise and rte_errno is set.
4885  */
4886 static struct tcf_vtep*
4887 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4888                             struct mlx5_flow *dev_flow,
4889                             struct rte_flow_error *error)
4890 {
4891         struct tcf_vtep *vtep;
4892         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
4893
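        /* Decap VTEPs are shared, look up an existing device by UDP port. */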
4894         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4895                 if (vtep->port == port)
4896                         break;
4897         }
4898         if (vtep && vtep->ifouter) {
4899                 rte_flow_error_set(error, EEXIST,
4900                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4901                                    "Failed to create decap VTEP with specified"
4902                                    " UDP port, attached device exists");
4903                 return NULL;
4904         }
4905         if (vtep) {
4906                 /* Device exists, just increment the reference counter. */
4907                 vtep->refcnt++;
4908                 assert(vtep->ifindex);
4909                 return vtep;
4910         }
4911         /* No decapsulation device exists, try to create the new one. */
4912         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
4913         if (vtep)
4914                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4915         return vtep;
4916 }
4917
4918 /**
4919  * Acquire target interface index for VXLAN tunneling encapsulation.
4920  *
4921  * @param[in] tcf
4922  *   Context object initialized by mlx5_flow_tcf_context_create().
4923  * @param[in] ifouter
4924  *   Network interface index to attach VXLAN encap device to.
4925  * @param[in] dev_flow
4926  *   Flow tcf object with tunnel structure pointer set.
4927  * @param[out] error
4928  *   Perform verbose error reporting if not NULL.
4929  * @return
4930  *   Interface descriptor pointer on success,
4931  *   NULL otherwise and rte_errno is set.
4932  */
4933 static struct tcf_vtep*
4934 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4935                             unsigned int ifouter,
4936                             struct mlx5_flow *dev_flow __rte_unused,
4937                             struct rte_flow_error *error)
4938 {
4939         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
4940         struct tcf_vtep *vtep;
4941         int ret;
4942
4943         assert(ifouter);
4944         /* Check whether a VTEP attached to this interface already exists. */
4945         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4946                 if (vtep->ifouter == ifouter)
4947                         break;
4948         }
4949         if (vtep) {
4950                 /* VTEP already exists, just increment the reference. */
4951                 vtep->refcnt++;
4952         } else {
4953                 uint16_t pcnt;
4954
4955                 /* Not found, we should create the new attached VTEP. */
4956                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
4957                 flow_tcf_encap_local_cleanup(tcf, ifouter);
4958                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
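                /*
                 * Allocate a UDP port for the new VTEP from the
                 * [MLX5_VXLAN_PORT_MIN, MLX5_VXLAN_PORT_MAX] range in
                 * round-robin fashion, skipping ports taken by other
                 * VTEPs and retrying while the kernel reports EEXIST
                 * (the port may be occupied outside of the PMD).
                 */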
4959                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
4960                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
4961                         encap_port++;
4962                         /* Wrap the UDP port index around. */
4963                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
4964                             encap_port > MLX5_VXLAN_PORT_MAX)
4965                                 encap_port = MLX5_VXLAN_PORT_MIN;
4966                         /* Check whether the UDP port is already in use. */
4967                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4968                                 if (vtep->port == encap_port)
4969                                         break;
4970                         }
4971                         if (vtep) {
4972                                 /* Port is in use, try the next one. */
4973                                 vtep = NULL;
4974                                 continue;
4975                         }
4976                         vtep = flow_tcf_vtep_create(tcf, ifouter,
4977                                                     encap_port, error);
4978                         if (vtep) {
4979                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4980                                 break;
4981                         }
4982                         if (rte_errno != EEXIST)
4983                                 break;
4984                 }
4985                 if (!vtep)
4986                         return NULL;
4987         }
4988         assert(vtep->ifouter == ifouter);
4989         assert(vtep->ifindex);
4990         /* Create local IP address rule with peer to specify the outer IPs. */
4991         ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
4992         if (!ret) {
4993                 /* Create neigh rule to specify outer destination MAC. */
4994                 ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
4995                 if (ret)
4996                         flow_tcf_encap_local(tcf, vtep,
4997                                              dev_flow, false, error);
4998         }
4999         if (ret) {
5000                 if (--vtep->refcnt == 0)
5001                         flow_tcf_vtep_delete(tcf, vtep);
5002                 return NULL;
5003         }
5004         return vtep;
5005 }
5006
5007 /**
5008  * Acquire target interface index for tunneling of any type.
5009  * Create the new VTEP if needed.
5010  *
5011  * @param[in] tcf
5012  *   Context object initialized by mlx5_flow_tcf_context_create().
5013  * @param[in] ifouter
5014  *   Network interface index to attach VXLAN encap device to.
5015  * @param[in] dev_flow
5016  *   Flow tcf object with tunnel structure pointer set.
5017  * @param[out] error
5018  *   Perform verbose error reporting if not NULL.
5019  * @return
5020  *   Interface descriptor pointer on success,
5021  *   NULL otherwise and rte_errno is set.
5022  */
5023 static struct tcf_vtep*
5024 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5025                       unsigned int ifouter,
5026                       struct mlx5_flow *dev_flow,
5027                       struct rte_flow_error *error)
5028 {
5029         struct tcf_vtep *vtep = NULL;
5030
5031         assert(dev_flow->tcf.tunnel);
5032         pthread_mutex_lock(&vtep_list_mutex);
5033         switch (dev_flow->tcf.tunnel->type) {
5034         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5035                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5036                                                    dev_flow, error);
5037                 break;
5038         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5039                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5040                 break;
5041         default:
5042                 rte_flow_error_set(error, ENOTSUP,
5043                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5044                                    "unsupported tunnel type");
5045                 break;
5046         }
5047         pthread_mutex_unlock(&vtep_list_mutex);
5048         return vtep;
5049 }
5050
5051 /**
5052  * Release tunneling interface by ifindex. Decrements reference
5053  * counter and actually removes the device if counter is zero.
5054  *
5055  * @param[in] tcf
5056  *   Context object initialized by mlx5_flow_tcf_context_create().
5057  * @param[in] vtep
5058  *   VTEP device descriptor structure.
5059  * @param[in] dev_flow
5060  *   Flow tcf object with tunnel structure pointer set.
5061  */
5062 static void
5063 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5064                       struct tcf_vtep *vtep,
5065                       struct mlx5_flow *dev_flow)
5066 {
5067         assert(dev_flow->tcf.tunnel);
5068         pthread_mutex_lock(&vtep_list_mutex);
5069         switch (dev_flow->tcf.tunnel->type) {
5070         case FLOW_TCF_TUNACT_VXLAN_DECAP:
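                /* Decap VTEPs have no ancillary rules to remove. */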
5071                 break;
5072         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5073                 /* Remove the encap ancillary rules first. */
5074                 flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
5075                 flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
5076                 break;
5077         default:
5078                 assert(false);
5079                 DRV_LOG(WARNING, "Unsupported tunnel type");
5080                 break;
5081         }
5082         assert(vtep->refcnt);
5083         if (--vtep->refcnt == 0) {
5084                 LIST_REMOVE(vtep, next);
5085                 flow_tcf_vtep_delete(tcf, vtep);
5086         }
5087         pthread_mutex_unlock(&vtep_list_mutex);
5088 }
5089
5090
5091 /**
5092  * Apply flow to E-Switch by sending Netlink message.
5093  *
5094  * @param[in] dev
5095  *   Pointer to Ethernet device.
5096  * @param[in, out] flow
5097  *   Pointer to the sub flow.
5098  * @param[out] error
5099  *   Pointer to the error structure.
5100  *
5101  * @return
5102  *   0 on success, a negative errno value otherwise and rte_errno is set.
5103  */
5104 static int
5105 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5106                struct rte_flow_error *error)
5107 {
5108         struct priv *priv = dev->data->dev_private;
5109         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5110         struct mlx5_flow *dev_flow;
5111         struct nlmsghdr *nlh;
5112
5113         dev_flow = LIST_FIRST(&flow->dev_flows);
5114         /* E-Switch flow can't be expanded. */
5115         assert(!LIST_NEXT(dev_flow, next));
5116         if (dev_flow->tcf.applied)
5117                 return 0;
5118         nlh = dev_flow->tcf.nlh;
5119         nlh->nlmsg_type = RTM_NEWTFILTER;
5120         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5121         if (dev_flow->tcf.tunnel) {
5122                 /*
5123                  * Replace the interface index, target for
5124                  * encapsulation, source for decapsulation.
5125                  */
5126                 assert(!dev_flow->tcf.tunnel->vtep);
5127                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5128                 /* Acquire actual VTEP device when rule is being applied. */
5129                 dev_flow->tcf.tunnel->vtep =
5130                         flow_tcf_vtep_acquire(ctx,
5131                                         dev_flow->tcf.tunnel->ifindex_org,
5132                                         dev_flow, error);
5133                 if (!dev_flow->tcf.tunnel->vtep)
5134                         return -rte_errno;
5135                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5136                                 dev_flow->tcf.tunnel->ifindex_org,
5137                                 dev_flow->tcf.tunnel->vtep->ifindex);
5138                 *dev_flow->tcf.tunnel->ifindex_ptr =
5139                         dev_flow->tcf.tunnel->vtep->ifindex;
5140         }
5141         if (!flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) {
5142                 dev_flow->tcf.applied = 1;
5143                 return 0;
5144         }
5145         return rte_flow_error_set(error, rte_errno,
5146                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5147                                   "netlink: failed to create TC flow rule");
5148 }
5149
5150 /**
5151  * Remove flow from E-Switch by sending Netlink message.
5152  *
5153  * @param[in] dev
5154  *   Pointer to Ethernet device.
5155  * @param[in, out] flow
5156  *   Pointer to the sub flow.
5157  */
5158 static void
5159 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5160 {
5161         struct priv *priv = dev->data->dev_private;
5162         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5163         struct mlx5_flow *dev_flow;
5164         struct nlmsghdr *nlh;
5165
5166         if (!flow)
5167                 return;
5168         dev_flow = LIST_FIRST(&flow->dev_flows);
5169         if (!dev_flow)
5170                 return;
5171         /* E-Switch flow can't be expanded. */
5172         assert(!LIST_NEXT(dev_flow, next));
5173         if (dev_flow->tcf.applied) {
5174                 nlh = dev_flow->tcf.nlh;
5175                 nlh->nlmsg_type = RTM_DELTFILTER;
5176                 nlh->nlmsg_flags = NLM_F_REQUEST;
5177                 flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL);
5178                 if (dev_flow->tcf.tunnel) {
5179                         assert(dev_flow->tcf.tunnel->vtep);
5180                         flow_tcf_vtep_release(ctx,
5181                                 dev_flow->tcf.tunnel->vtep,
5182                                 dev_flow);
5183                         dev_flow->tcf.tunnel->vtep = NULL;
5184                 }
5185                 dev_flow->tcf.applied = 0;
5186         }
5187 }
5188
5189 /**
5190  * Remove flow from E-Switch and release resources of the device flow.
5191  *
5192  * @param[in] dev
5193  *   Pointer to Ethernet device.
5194  * @param[in, out] flow
5195  *   Pointer to the sub flow.
5196  */
5197 static void
5198 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5199 {
5200         struct mlx5_flow *dev_flow;
5201
5202         if (!flow)
5203                 return;
5204         flow_tcf_remove(dev, flow);
5205         if (flow->counter) {
5206                 if (--flow->counter->ref_cnt == 0) {
5207                         rte_free(flow->counter);
5208                         flow->counter = NULL;
5209                 }
5210         }
5211         dev_flow = LIST_FIRST(&flow->dev_flows);
5212         if (!dev_flow)
5213                 return;
5214         /* E-Switch flow can't be expanded. */
5215         assert(!LIST_NEXT(dev_flow, next));
5216         LIST_REMOVE(dev_flow, next);
5217         rte_free(dev_flow);
5218 }
5219
5220 /**
5221  * Helper routine to compute the space required for a parse buffer.
5222  *
5223  * @param array
5224  *   Array of values to use.
5225  * @param idx
5226  *   Current location in array.
5227  * @param value
5228  *   Value to compare with.
5229  *
5230  * @return
5231  *   The maximum between the given value and the array value on index.
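 *
 *   For example, with rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
 *   TCA_FLOWER_ACT, TCA_OPTIONS } and idx = 3 this returns TCA_OPTIONS,
 *   which then sizes the attribute table for the outermost parse level.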
5232  */
5233 static uint16_t
5234 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5235 {
5236         return idx < 0 ? value : RTE_MAX(array[idx], value);
5237 }
5238
5239 /**
5240  * Parse rtnetlink message attributes filling the attribute table with the info
5241  * retrieved.
5242  *
5243  * @param tb
5244  *   Attribute table to be filled.
5245  * @param[out] max
5246  *   Maximum entry in the attribute table.
5247  * @param rta
5248  *   The attributes section in the message to be parsed.
5249  * @param len
5250  *   The length of the attributes section in the message.
5251  */
5252 static void
5253 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5254                          struct rtattr *rta, int len)
5255 {
5256         unsigned short type;
5257         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5258         while (RTA_OK(rta, len)) {
5259                 type = rta->rta_type;
5260                 if (type <= max && !tb[type])
5261                         tb[type] = rta;
5262                 rta = RTA_NEXT(rta, len);
5263         }
5264 }
5265
5266 /**
5267  * Extract flow counters from flower action.
5268  *
5269  * @param rta
5270  *   Flower action stats properties in the Netlink message received.
5271  * @param rta_type
5272  *   The backward sequence of rta_types, as written in the attribute table,
5273  *   we need to traverse in order to get to the requested object.
5274  * @param idx
5275  *   Current location in rta_type table.
5276  * @param[out] data
5277  *   Data holding the count statistics of the rte_flow retrieved from
5278  *   the message.
5279  *
5280  * @return
5281  *   0 if data was found and retrieved, -1 otherwise.
5282  */
5283 static int
5284 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5285                                        uint16_t rta_type[], int idx,
5286                                        struct gnet_stats_basic *data)
5287 {
5288         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5289                                                  TCA_STATS_BASIC);
5290         struct rtattr *tbs[tca_stats_max + 1];
5291
5292         if (rta == NULL || idx < 0)
5293                 return -1;
5294         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5295                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5296         switch (rta_type[idx]) {
5297         case TCA_STATS_BASIC:
5298                 if (tbs[TCA_STATS_BASIC]) {
5299                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5300                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5301                                sizeof(*data)));
5302                         return 0;
5303                 }
5304                 break;
5305         default:
5306                 break;
5307         }
5308         return -1;
5309 }
5310
5311 /**
5312  * Parse flower single action retrieving the requested action attribute,
5313  * if found.
5314  *
5315  * @param arg
5316  *   Flower action properties in the Netlink message received.
5317  * @param rta_type
5318  *   The backward sequence of rta_types, as written in the attribute table,
5319  *   we need to traverse in order to get to the requested object.
5320  * @param idx
5321  *   Current location in rta_type table.
5322  * @param[out] data
5323  *   Count statistics retrieved from the message query.
5324  *
5325  * @return
5326  *   0 if data was found and retrieved, -1 otherwise.
5327  */
5328 static int
5329 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5330                                      uint16_t rta_type[], int idx, void *data)
5331 {
5332         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5333         struct rtattr *tb[tca_act_max + 1];
5334
5335         if (arg == NULL || idx < 0)
5336                 return -1;
5337         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5338                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5339         if (tb[TCA_ACT_KIND] == NULL)
5340                 return -1;
5341         switch (rta_type[idx]) {
5342         case TCA_ACT_STATS:
5343                 if (tb[TCA_ACT_STATS])
5344                         return flow_tcf_nl_action_stats_parse_and_get
5345                                         (tb[TCA_ACT_STATS],
5346                                          rta_type, --idx,
5347                                          (struct gnet_stats_basic *)data);
5348                 break;
5349         default:
5350                 break;
5351         }
5352         return -1;
5353 }
5354
5355 /**
5356  * Parse flower action section in the message retrieving the requested
5357  * attribute from the first action that provides it.
5358  *
5359  * @param arg
5360  *   Flower actions section in the Netlink message received.
5361  * @param rta_type
5362  *   The backward sequence of rta_types, as written in the attribute table,
5363  *   we need to traverse in order to get to the requested object.
5364  * @param idx
5365  *   Current location in rta_type table.
5366  * @param[out] data
5367  *   Data retrieved from the message query.
5368  *
5369  * @return
5370  *   0 if data was found and retrieved, -1 otherwise.
5371  */
5372 static int
5373 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5374                                  uint16_t rta_type[], int idx, void *data)
5375 {
5376         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5377         int i;
5378
5379         if (arg == NULL || idx < 0)
5380                 return -1;
5381         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5382                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5383         switch (rta_type[idx]) {
5384         /*
5385          * Flow counters are stored in the actions defined by the flow
5386          * and not in the flow itself, therefore we need to traverse the
5387          * flower chain of actions in search of them.
5388          *
5389          * Note that the index is not decremented here.
5390          */
5391         case TCA_ACT_STATS:
5392                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5393                         if (tb[i] &&
5394                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5395                                                               rta_type,
5396                                                               idx, data))
5397                                 return 0;
5398                 }
5399                 break;
5400         default:
5401                 break;
5402         }
5403         return -1;
5404 }
5405
5406 /**
5407  * Parse flower classifier options in the message, retrieving the requested
5408  * attribute if found.
5409  *
5410  * @param opt
5411  *   Flower section in the Netlink message received.
5412  * @param rta_type
5413  *   The backward sequence of rta_types, as written in the attribute table,
5414  *   we need to traverse in order to get to the requested object.
5415  * @param idx
5416  *   Current location in rta_type table.
5417  * @param[out] data
5418  *   Data retrieved from the message query.
5419  *
5420  * @return
5421  *   0 if data was found and retrieved, -1 otherwise.
5422  */
5423 static int
5424 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5425                                uint16_t rta_type[], int idx, void *data)
5426 {
5427         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5428                                                   TCA_FLOWER_ACT);
5429         struct rtattr *tb[tca_flower_max + 1];
5430
5431         if (!opt || idx < 0)
5432                 return -1;
5433         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5434                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5435         switch (rta_type[idx]) {
5436         case TCA_FLOWER_ACT:
5437                 if (tb[TCA_FLOWER_ACT])
5438                         return flow_tcf_nl_action_parse_and_get
5439                                                         (tb[TCA_FLOWER_ACT],
5440                                                          rta_type, --idx, data);
5441                 break;
5442         default:
5443                 break;
5444         }
5445         return -1;
5446 }
5447
5448 /**
5449  * Parse Netlink reply on filter query, retrieving the flow counters.
5450  *
5451  * @param cnlh
5452  *   Message received from Netlink.
5453  * @param rta_type
5454  *   The backward sequence of rta_types, as written in the attribute table,
5455  *   we need to traverse in order to get to the requested object.
5456  * @param idx
5457  *   Current location in rta_type table.
5458  * @param[out] data
5459  *   Data retrieved from the message query.
5460  *
5461  * @return
5462  *   0 if data was found and retrieved, -1 otherwise.
5463  */
5464 static int
5465 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5466                                  uint16_t rta_type[], int idx, void *data)
5467 {
5468         struct nlmsghdr *nlh = cnlh;
5469         struct tcmsg *t = NLMSG_DATA(nlh);
5470         int len = nlh->nlmsg_len;
5471         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5472         struct rtattr *tb[tca_max + 1];
5473
5474         if (idx < 0)
5475                 return -1;
5476         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5477             nlh->nlmsg_type != RTM_GETTFILTER &&
5478             nlh->nlmsg_type != RTM_DELTFILTER)
5479                 return -1;
5480         len -= NLMSG_LENGTH(sizeof(*t));
5481         if (len < 0)
5482                 return -1;
5483         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5484         /* Not a TC flower flow - bail out. */
5485         if (!tb[TCA_KIND] ||
5486             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5487                 return -1;
5488         switch (rta_type[idx]) {
5489         case TCA_OPTIONS:
5490                 if (tb[TCA_OPTIONS])
5491                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5492                                                               rta_type,
5493                                                               --idx, data);
5494                 break;
5495         default:
5496                 break;
5497         }
5498         return -1;
5499 }
5500
5501 /**
5502  * A callback to parse Netlink reply on TC flower query.
5503  *
5504  * @param nlh
5505  *   Message received from Netlink.
5506  * @param[out] data
5507  *   Pointer to data area to be filled by the parsing routine.
5508  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
5509  *
5510  * @return
5511  *   MNL_CB_OK value.
5512  */
5513 static int
5514 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5515 {
5516         /*
5517          * The backward sequence of rta_types to pass in order to get
5518          * to the counters.
5519          */
5520         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5521                                 TCA_FLOWER_ACT, TCA_OPTIONS };
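        /*
         * The array is traversed from its last entry (TCA_OPTIONS) down to
         * its first (TCA_STATS_BASIC), descending one nesting level of the
         * reply per step.
         */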
5522         struct flow_tcf_stats_basic *sb_data = data;
5523         union {
5524                 const struct nlmsghdr *c;
5525                 struct nlmsghdr *nc;
5526         } tnlh = { .c = nlh };
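        /*
         * The union only discards the const qualifier from the message
         * pointer without an explicit cast; the parsing helpers take a
         * non-const pointer but do not modify the message.
         */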
5527
5528         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5529                                               RTE_DIM(rta_type) - 1,
5530                                               (void *)&sb_data->counters))
5531                 sb_data->valid = true;
5532         return MNL_CB_OK;
5533 }
5534
5535 /**
5536  * Query a TC flower rule for its statistics via netlink.
5537  *
5538  * @param[in] dev
5539  *   Pointer to Ethernet device.
5540  * @param[in] flow
5541  *   Pointer to the sub flow.
5542  * @param[out] data
5543  *   Data retrieved by the query.
5544  * @param[out] error
5545  *   Perform verbose error reporting if not NULL.
5546  *
5547  * @return
5548  *   0 on success, a negative errno value otherwise and rte_errno is set.
5549  */
5550 static int
5551 flow_tcf_query_count(struct rte_eth_dev *dev,
5552                           struct rte_flow *flow,
5553                           void *data,
5554                           struct rte_flow_error *error)
5555 {
5556         struct flow_tcf_stats_basic sb_data = { 0 };
5557         struct rte_flow_query_count *qc = data;
5558         struct priv *priv = dev->data->dev_private;
5559         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5560         struct mnl_socket *nl = ctx->nl;
5561         struct mlx5_flow *dev_flow;
5562         struct nlmsghdr *nlh;
5563         uint32_t seq = ctx->seq++;
5564         ssize_t ret;
5565         assert(qc);
5566
5567         dev_flow = LIST_FIRST(&flow->dev_flows);
5568         /* E-Switch flow can't be expanded. */
5569         assert(!LIST_NEXT(dev_flow, next));
5570         if (!dev_flow->flow->counter)
5571                 goto notsup_exit;
5572         nlh = dev_flow->tcf.nlh;
5573         nlh->nlmsg_type = RTM_GETTFILTER;
5574         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5575         nlh->nlmsg_seq = seq;
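        /*
         * NLM_F_ECHO asks the kernel to send the filter back in the reply;
         * the echoed message carries the action statistics parsed below.
         */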
5576         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5577                 goto error_exit;
5578         do {
5579                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5580                 if (ret <= 0)
5581                         break;
5582                 ret = mnl_cb_run(ctx->buf, ret, seq,
5583                                  mnl_socket_get_portid(nl),
5584                                  flow_tcf_nl_message_get_stats_basic,
5585                                  (void *)&sb_data);
5586         } while (ret > 0);
5588         if (sb_data.valid) {
5589                 /* Return the delta from last reset. */
5590                 qc->hits_set = 1;
5591                 qc->bytes_set = 1;
5592                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5593                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5594                 if (qc->reset) {
5595                         flow->counter->hits = sb_data.counters.packets;
5596                         flow->counter->bytes = sb_data.counters.bytes;
5597                 }
5598                 return 0;
5599         }
5600         return rte_flow_error_set(error, EINVAL,
5601                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5602                                   NULL,
5603                                   "flow does not have counter");
5604 error_exit:
5605         return rte_flow_error_set
5606                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5607                          NULL, "netlink: failed to read flow rule counters");
5608 notsup_exit:
5609         return rte_flow_error_set
5610                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5611                          NULL, "counters are not available");
5612 }
5613
5614 /**
5615  * Query a flow.
5616  *
5617  * @see rte_flow_query()
5618  * @see rte_flow_ops
5619  */
5620 static int
5621 flow_tcf_query(struct rte_eth_dev *dev,
5622                struct rte_flow *flow,
5623                const struct rte_flow_action *actions,
5624                void *data,
5625                struct rte_flow_error *error)
5626 {
5627         int ret = -EINVAL;
5628
5629         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5630                 switch (actions->type) {
5631                 case RTE_FLOW_ACTION_TYPE_VOID:
5632                         break;
5633                 case RTE_FLOW_ACTION_TYPE_COUNT:
5634                         ret = flow_tcf_query_count(dev, flow, data, error);
5635                         break;
5636                 default:
5637                         return rte_flow_error_set(error, ENOTSUP,
5638                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5639                                                   actions,
5640                                                   "action not supported");
5641                 }
5642         }
5643         return ret;
5644 }
5645
5646 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5647         .validate = flow_tcf_validate,
5648         .prepare = flow_tcf_prepare,
5649         .translate = flow_tcf_translate,
5650         .apply = flow_tcf_apply,
5651         .remove = flow_tcf_remove,
5652         .destroy = flow_tcf_destroy,
5653         .query = flow_tcf_query,
5654 };
5655
5656 /**
5657  * Create and configure a libmnl socket for Netlink flow rules.
5658  *
5659  * @return
5660  *   A valid libmnl socket object pointer on success, NULL otherwise and
5661  *   rte_errno is set.
5662  */
5663 static struct mnl_socket *
5664 flow_tcf_mnl_socket_create(void)
5665 {
5666         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
5667
5668         if (nl) {
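                /*
                 * NETLINK_CAP_ACK caps acknowledgements to the bare Netlink
                 * header, reducing the receive buffer size they require.
                 */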
5669                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
5670                                       sizeof(int));
5671                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
5672                         return nl;
5673         }
5674         rte_errno = errno;
5675         if (nl)
5676                 mnl_socket_close(nl);
5677         return NULL;
5678 }
5679
5680 /**
5681  * Destroy a libmnl socket.
5682  *
5683  * @param nl
5684  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5685  */
5686 static void
5687 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
5688 {
5689         if (nl)
5690                 mnl_socket_close(nl);
5691 }
5692
5693 /**
5694  * Initialize ingress qdisc of a given network interface.
5695  *
5696  * @param ctx
5697  *   Pointer to tc-flower context to use.
5698  * @param ifindex
5699  *   Index of network interface to initialize.
5700  * @param[out] error
5701  *   Perform verbose error reporting if not NULL.
5702  *
5703  * @return
5704  *   0 on success, a negative errno value otherwise and rte_errno is set.
5705  */
5706 int
5707 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
5708                    unsigned int ifindex, struct rte_flow_error *error)
5709 {
5710         struct nlmsghdr *nlh;
5711         struct tcmsg *tcm;
5712         alignas(struct nlmsghdr)
5713         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
5714                     SZ_NLATTR_STRZ_OF("ingress") +
5715                     MNL_BUF_EXTRA_SPACE];
5716
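        /*
         * The two requests below are roughly (not literally) equivalent to:
         *
         *   tc qdisc del dev <ifname> ingress
         *   tc qdisc add dev <ifname> ingress
         */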
5717         /* Destroy existing ingress qdisc and everything attached to it. */
5718         nlh = mnl_nlmsg_put_header(buf);
5719         nlh->nlmsg_type = RTM_DELQDISC;
5720         nlh->nlmsg_flags = NLM_F_REQUEST;
5721         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5722         tcm->tcm_family = AF_UNSPEC;
5723         tcm->tcm_ifindex = ifindex;
5724         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5725         tcm->tcm_parent = TC_H_INGRESS;
5726         assert(sizeof(buf) >= nlh->nlmsg_len);
5727         /* Ignore errors when qdisc is already absent. */
5728         if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL) &&
5729             rte_errno != EINVAL && rte_errno != ENOENT)
5730                 return rte_flow_error_set(error, rte_errno,
5731                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5732                                           "netlink: failed to remove ingress"
5733                                           " qdisc");
5734         /* Create fresh ingress qdisc. */
5735         nlh = mnl_nlmsg_put_header(buf);
5736         nlh->nlmsg_type = RTM_NEWQDISC;
5737         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5738         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5739         tcm->tcm_family = AF_UNSPEC;
5740         tcm->tcm_ifindex = ifindex;
5741         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5742         tcm->tcm_parent = TC_H_INGRESS;
5743         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
5744         assert(sizeof(buf) >= nlh->nlmsg_len);
5745         if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL))
5746                 return rte_flow_error_set(error, rte_errno,
5747                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5748                                           "netlink: failed to create ingress"
5749                                           " qdisc");
5750         return 0;
5751 }
5752
5753 /**
5754  * Create libmnl context for Netlink flow rules.
5755  *
5756  * @return
5757  *   A valid libmnl socket object pointer on success, NULL otherwise and
5758  *   rte_errno is set.
5759  */
5760 struct mlx5_flow_tcf_context *
5761 mlx5_flow_tcf_context_create(void)
5762 {
5763         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
5764                                                         sizeof(*ctx),
5765                                                         sizeof(uint32_t));
5766         if (!ctx)
5767                 goto error;
5768         ctx->nl = flow_tcf_mnl_socket_create();
5769         if (!ctx->nl)
5770                 goto error;
5771         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
5772         ctx->buf = rte_zmalloc(__func__,
5773                                ctx->buf_size, sizeof(uint32_t));
5774         if (!ctx->buf)
5775                 goto error;
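        /* Randomize the initial sequence number carried in requests. */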
5776         ctx->seq = random();
5777         return ctx;
5778 error:
5779         mlx5_flow_tcf_context_destroy(ctx);
5780         return NULL;
5781 }
5782
5783 /**
5784  * Destroy a libmnl context.
5785  *
5786  * @param ctx
5787  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5788  */
5789 void
5790 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
5791 {
5792         if (!ctx)
5793                 return;
5794         flow_tcf_mnl_socket_destroy(ctx->nl);
5795         rte_free(ctx->buf);
5796         rte_free(ctx);
5797 }