net/mlx5: add E-switch VXLAN rule cleanup routines
drivers/net/mlx5/mlx5_flow_tcf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
	tc_gen;
	int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
	TCA_PEDIT_UNSPEC,
	TCA_PEDIT_TM,
	TCA_PEDIT_PARMS,
	TCA_PEDIT_PAD,
	TCA_PEDIT_PARMS_EX,
	TCA_PEDIT_KEYS_EX,
	TCA_PEDIT_KEY_EX,
	__TCA_PEDIT_MAX
};

enum {
	TCA_PEDIT_KEY_EX_HTYPE = 1,
	TCA_PEDIT_KEY_EX_CMD = 2,
	__TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
	TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
	TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
	TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
	TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
	TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
	TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
	__PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
	TCA_PEDIT_KEY_EX_CMD_SET = 0,
	TCA_PEDIT_KEY_EX_CMD_ADD = 1,
	__PEDIT_CMD_MAX,
};

struct tc_pedit_key {
	__u32 mask; /* AND */
	__u32 val; /* XOR */
	__u32 off; /* offset */
	__u32 at;
	__u32 offmask;
	__u32 shift;
};

__extension__
struct tc_pedit_sel {
	tc_gen;
	unsigned char nkeys;
	unsigned char flags;
	struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
	tc_gen;
	int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"

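/*
 * Illustrative note (an assumption based on the MLX5_VXLAN_DEVICE_PFX
 * definition above, not a quote of the driver's VTEP creation code):
 * VTEP netdev names are presumably built from the prefix plus the UDP
 * port, e.g. for port 30000:
 *
 *	char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 5];
 *
 *	snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
 *	(yielding "vmlx_30000")
 */
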
/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
	FLOW_TCF_TUNACT_VXLAN_DECAP,
	FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)

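/*
 * For illustration only: a fully specified IPv4 VXLAN encapsulation
 * would typically carry a @p mask such as
 *
 *	FLOW_TCF_ENCAP_ETH_DST | FLOW_TCF_ENCAP_IPV4_SRC |
 *	FLOW_TCF_ENCAP_IPV4_DST | FLOW_TCF_ENCAP_UDP_DST |
 *	FLOW_TCF_ENCAP_VXLAN_VNI
 *
 * where each bit marks the corresponding field of
 * struct flow_tcf_vxlan_encap (defined below) as valid.
 */
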
/**
 * Structure for holding netlink context.
 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8KB) buffer size ensures that netlink messages will never be
 * truncated.
 */
struct mlx5_flow_tcf_context {
	struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
	uint32_t seq; /* Message sequence number. */
	uint32_t buf_size; /* Message buffer size. */
	uint8_t *buf; /* Message buffer. */
};

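/*
 * Illustrative sketch only (the driver's actual context constructor is
 * not part of this excerpt): a hypothetical initialization with plain
 * libmnl calls could look like
 *
 *	struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
 *
 *	if (nl && !mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID)) {
 *		ctx->nl = nl;
 *		ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
 *		ctx->buf = rte_zmalloc(__func__, ctx->buf_size, 0);
 *		ctx->seq = random();
 *	}
 */
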
/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * outer tunnel iface in order to provide destination MAC address
 * for the VXLAN encapsulation. The neigh rule is implicitly related
 * to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
	LIST_ENTRY(tcf_neigh_rule) next;
	uint32_t refcnt;
	struct ether_addr eth;
	uint16_t mask;
	union {
		struct {
			rte_be32_t dst;
		} ipv4;
		struct {
			uint8_t dst[IPV6_ADDR_LEN];
		} ipv6;
	};
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
	LIST_ENTRY(tcf_local_rule) next;
	uint32_t refcnt;
	uint16_t mask;
	union {
		struct {
			rte_be32_t dst;
			rte_be32_t src;
		} ipv4;
		struct {
			uint8_t dst[IPV6_ADDR_LEN];
			uint8_t src[IPV6_ADDR_LEN];
		} ipv6;
	};
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
	LIST_ENTRY(tcf_vtep) next;
	LIST_HEAD(, tcf_neigh_rule) neigh;
	LIST_HEAD(, tcf_local_rule) local;
	uint32_t refcnt;
	unsigned int ifindex; /**< Own interface index. */
	unsigned int ifouter; /**< Index of device attached to. */
	uint16_t port;
	uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
	uint32_t type; /**< Tunnel action type. */
	struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
	unsigned int ifindex_org; /**< Original dst/src interface. */
	unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
	struct flow_tcf_tunnel_hdr hdr;
	uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
	struct flow_tcf_tunnel_hdr hdr;
	uint32_t mask;
	struct {
		struct ether_addr dst;
		struct ether_addr src;
	} eth;
	union {
		struct {
			rte_be32_t dst;
			rte_be32_t src;
		} ipv4;
		struct {
			uint8_t dst[IPV6_ADDR_LEN];
			uint8_t src[IPV6_ADDR_LEN];
		} ipv6;
	};
	struct {
		rte_be16_t src;
		rte_be16_t dst;
	} udp;
	struct {
		uint8_t vni[3];
	} vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
	bool valid;
	struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
	struct rte_flow_item_port_id port_id;
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
	struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty;

/** Supported masks for known item types. */
static const struct {
	struct rte_flow_item_port_id port_id;
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
	struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
	.port_id = {
		.id = 0xffffffff,
	},
	.eth = {
		.type = RTE_BE16(0xffff),
		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
	},
	.vlan = {
		/* PCP and VID only, no DEI. */
		.tci = RTE_BE16(0xefff),
		.inner_type = RTE_BE16(0xffff),
	},
	.ipv4.hdr = {
		.next_proto_id = 0xff,
		.src_addr = RTE_BE32(0xffffffff),
		.dst_addr = RTE_BE32(0xffffffff),
	},
	.ipv6.hdr = {
		.proto = 0xff,
		.src_addr =
			"\xff\xff\xff\xff\xff\xff\xff\xff"
			"\xff\xff\xff\xff\xff\xff\xff\xff",
		.dst_addr =
			"\xff\xff\xff\xff\xff\xff\xff\xff"
			"\xff\xff\xff\xff\xff\xff\xff\xff",
	},
	.tcp.hdr = {
		.src_port = RTE_BE16(0xffff),
		.dst_port = RTE_BE16(0xffff),
		.tcp_flags = 0xff,
	},
	.udp.hdr = {
		.src_port = RTE_BE16(0xffff),
		.dst_port = RTE_BE16(0xffff),
	},
	.vxlan = {
		.vni = "\xff\xff\xff",
	},
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)

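/*
 * Worked example (illustrative): with the standard 4-byte netlink
 * attribute header and alignment, a "pedit" action kind string costs
 * SZ_NLATTR_STRZ_OF("pedit") = MNL_ALIGN(4 + strlen("pedit") + 1) =
 * MNL_ALIGN(10) = 12 bytes, and a 16-bit attribute costs
 * SZ_NLATTR_TYPE_OF(uint16_t) = MNL_ALIGN(4 + 2) = 8 bytes.
 */
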
#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
	uint16_t port_id; /**< DPDK port ID. */
	unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation of the driver/firmware. */
#define MLX5_TCF_GROUP_ID_MAX 3
#define MLX5_TCF_GROUP_PRIORITY_MAX 14

#define MLX5_TCF_FATE_ACTIONS \
	(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
	 MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
	(MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
	(MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
	(MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
	 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
	 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
	 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
	 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
	(MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
	 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
	 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
	(((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))

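/*
 * For illustration: NUM_OF_PEDIT_KEYS() rounds up to whole 4-byte pedit
 * keys, so an IPv6 address rewrite (IPV6_ADDR_LEN = 16) needs 4 keys,
 * a MAC rewrite (ETHER_ADDR_LEN = 6) needs 2, and a TTL rewrite
 * (TTL_LEN = 1) still consumes one full key.
 */
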
struct pedit_key_ex {
	enum pedit_header_type htype;
	enum pedit_cmd cmd;
};

struct pedit_parser {
	struct tc_pedit_sel sel;
	struct tc_pedit_key keys[MAX_PEDIT_KEYS];
	struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
	struct mlx5_flow_counter *cnt;

	/*
	 * E-switch counters cannot be shared and their IDs are unknown,
	 * so all counters are currently returned with ID 0. Switching
	 * to unique IDs may be better in the future.
	 */
	struct mlx5_flow_counter tmpl = {
		.ref_cnt = 1,
	};
	cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
	if (!cnt) {
		rte_errno = ENOMEM;
		return NULL;
	}
	*cnt = tmpl;
	/* Implicit counter, do not add to list. */
	return cnt;
}

/**
 * Set pedit key of MAC address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
			   struct pedit_parser *p_parser)
{
	int idx = p_parser->sel.nkeys;
	uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
					offsetof(struct ether_hdr, s_addr) :
					offsetof(struct ether_hdr, d_addr);
	const struct rte_flow_action_set_mac *conf =
		(const struct rte_flow_action_set_mac *)actions->conf;

	p_parser->keys[idx].off = off;
	/* ~UINT32_MAX == 0: no original bits are preserved. */
	p_parser->keys[idx].mask = ~UINT32_MAX;
	p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
	p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
	memcpy(&p_parser->keys[idx].val,
		conf->mac_addr, SZ_PEDIT_KEY_VAL);
	idx++;
	p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
	/* Rewrite only the remaining 2 MAC address bytes of this word. */
	p_parser->keys[idx].mask = 0xFFFF0000;
	p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
	p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
	memcpy(&p_parser->keys[idx].val,
		conf->mac_addr + SZ_PEDIT_KEY_VAL,
		ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
	p_parser->sel.nkeys = (++idx);
}

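/*
 * Background note for the key setters above and below: to the author's
 * understanding, the kernel pedit action applies each SET key roughly as
 *
 *	*(u32 *)(header + key->off) =
 *		((*(u32 *)(header + key->off)) & key->mask) ^ key->val;
 *
 * so a mask of 0 replaces the whole 32-bit word with @p val, while set
 * bits in @p mask preserve the corresponding original bits. This is
 * stated here only to explain the mask/val values used in these helpers,
 * not as a quote of the kernel sources.
 */
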
/**
 * Set pedit key for TTL decrease or set
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
				struct pedit_parser *p_parser,
				uint64_t item_flags)
{
	int idx = p_parser->sel.nkeys;

	p_parser->keys[idx].mask = 0xFFFFFF00;
	if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
		p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
		p_parser->keys[idx].off =
			offsetof(struct ipv4_hdr, time_to_live);
	}
	if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
		p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
		p_parser->keys[idx].off =
			offsetof(struct ipv6_hdr, hop_limits);
	}
	if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
		p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
		/* Adding 0xFF to the TTL byte decrements it by one. */
		p_parser->keys[idx].val = 0x000000FF;
	} else {
		p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
		p_parser->keys[idx].val =
			(__u32)((const struct rte_flow_action_set_ttl *)
			 actions->conf)->ttl_value;
	}
	p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
				struct pedit_parser *p_parser,
				uint64_t item_flags)
{
	int idx = p_parser->sel.nkeys;

	if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
		p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
	if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
		p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
	p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
	/* The offset of the src/dst port is the same for TCP and UDP. */
	p_parser->keys[idx].off =
		actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
		offsetof(struct tcp_hdr, src_port) :
		offsetof(struct tcp_hdr, dst_port);
	p_parser->keys[idx].mask = 0xFFFF0000;
	p_parser->keys[idx].val =
		(__u32)((const struct rte_flow_action_set_tp *)
				actions->conf)->port;
	p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of ipv6 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
				 struct pedit_parser *p_parser)
{
	int idx = p_parser->sel.nkeys;
	int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
	int off_base =
		actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
		offsetof(struct ipv6_hdr, src_addr) :
		offsetof(struct ipv6_hdr, dst_addr);
	const struct rte_flow_action_set_ipv6 *conf =
		(const struct rte_flow_action_set_ipv6 *)actions->conf;

	for (int i = 0; i < keys; i++, idx++) {
		p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
		p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
		p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
		p_parser->keys[idx].mask = ~UINT32_MAX;
		memcpy(&p_parser->keys[idx].val,
			conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
			SZ_PEDIT_KEY_VAL);
	}
	p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of ipv4 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
				 struct pedit_parser *p_parser)
{
	int idx = p_parser->sel.nkeys;

	p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
	p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
	p_parser->keys[idx].off =
		actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
		offsetof(struct ipv4_hdr, src_addr) :
		offsetof(struct ipv4_hdr, dst_addr);
	p_parser->keys[idx].mask = ~UINT32_MAX;
	p_parser->keys[idx].val =
		((const struct rte_flow_action_set_ipv4 *)
		 actions->conf)->ipv4_addr;
	p_parser->sel.nkeys = (++idx);
}

/**
 * Build the pedit netlink attributes in the pre-allocated
 * netlink message buffer.
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification.
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
			      const struct rte_flow_action **actions,
			      uint64_t item_flags)
{
	struct pedit_parser p_parser;
	struct nlattr *na_act_options;
	struct nlattr *na_pedit_keys;

	memset(&p_parser, 0, sizeof(p_parser));
	mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
	na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
	/* All modify header actions should be in one tc-pedit action. */
	for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
		switch ((*actions)->type) {
		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
			flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
			flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
		case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
			flow_tcf_pedit_key_set_tp_port(*actions,
							&p_parser, item_flags);
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TTL:
		case RTE_FLOW_ACTION_TYPE_DEC_TTL:
			flow_tcf_pedit_key_set_dec_ttl(*actions,
							&p_parser, item_flags);
			break;
		case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
		case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
			flow_tcf_pedit_key_set_mac(*actions, &p_parser);
			break;
		default:
			goto pedit_mnl_msg_done;
		}
	}
pedit_mnl_msg_done:
	p_parser.sel.action = TC_ACT_PIPE;
	mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
		     sizeof(p_parser.sel) +
		     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
		     &p_parser);
	na_pedit_keys =
		mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
	for (int i = 0; i < p_parser.sel.nkeys; i++) {
		struct nlattr *na_pedit_key =
			mnl_attr_nest_start(nl,
					    TCA_PEDIT_KEY_EX | NLA_F_NESTED);
		mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
				 p_parser.keys_ex[i].htype);
		mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
				 p_parser.keys_ex[i].cmd);
		mnl_attr_nest_end(nl, na_pedit_key);
	}
	mnl_attr_nest_end(nl, na_pedit_keys);
	mnl_attr_nest_end(nl, na_act_options);
	/* Step back so the caller's loop increment lands on the action
	 * that stopped the scan above.
	 */
	(*actions)--;
}

/**
 * Calculate the maximum memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   actions specification.
 * @param[in,out] action_flags
 *   actions flags
 * @return
 *   Max memory size of one TC-pedit action
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
				uint64_t *action_flags)
{
	int pedit_size = 0;
	int keys = 0;
	uint64_t flags = 0;

	pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
		      SZ_NLATTR_STRZ_OF("pedit") +
		      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
	for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
		switch ((*actions)->type) {
		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
			keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
			keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
			keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
			keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
			/* TCP is the same as UDP. */
			keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
			flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
			/* TCP is the same as UDP. */
			keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
			flags |= MLX5_FLOW_ACTION_SET_TP_DST;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TTL:
			keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
			flags |= MLX5_FLOW_ACTION_SET_TTL;
			break;
		case RTE_FLOW_ACTION_TYPE_DEC_TTL:
			keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
			flags |= MLX5_FLOW_ACTION_DEC_TTL;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
			keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
			keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
			break;
		default:
			goto get_pedit_action_size_done;
		}
	}
get_pedit_action_size_done:
	/* TCA_PEDIT_PARMS_EX */
	pedit_size +=
		SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
				  keys * sizeof(struct tc_pedit_key));
	pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
	pedit_size += keys *
		      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
		      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
		       SZ_NLATTR_DATA_OF(2));
	(*action_flags) |= flags;
	/* Step back so the caller's loop increment lands on the action
	 * that stopped the scan above.
	 */
	(*actions)--;
	return pedit_size;
}

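/*
 * Illustrative caller sketch (hypothetical; the real size-estimation
 * loop is not part of this excerpt): the helper above consumes the
 * whole run of consecutive pedit actions through the cursor it is
 * given, so a caller would typically do
 *
 *	uint64_t flags = 0;
 *	int size = 0;
 *
 *	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
 *		switch (actions->type) {
 *		case RTE_FLOW_ACTION_TYPE_SET_TTL:
 *			size += flow_tcf_get_pedit_actions_size(&actions,
 *								&flags);
 *			break;
 *		default:
 *			break;
 *		}
 *	}
 */
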
/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the masks, in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
		   const void *mask_supported, const void *mask_empty,
		   size_t mask_size, struct rte_flow_error *error)
{
	const uint8_t *mask;
	size_t i;

	/* item->last and item->mask cannot exist without item->spec. */
	if (!item->spec && (item->mask || item->last)) {
		rte_flow_error_set(error, EINVAL,
				   RTE_FLOW_ERROR_TYPE_ITEM, item,
				   "\"mask\" or \"last\" field provided without"
				   " a corresponding \"spec\"");
		return NULL;
	}
	/* No spec, no mask, no problem. */
	if (!item->spec)
		return mask_empty;
	mask = item->mask ? item->mask : mask_default;
	assert(mask);
	/*
	 * Single-pass check to make sure that:
	 * - Mask is supported, no bits are set outside mask_supported.
	 * - Both item->spec and item->last are included in mask.
	 */
	for (i = 0; i != mask_size; ++i) {
		if (!mask[i])
			continue;
		if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
		    ((const uint8_t *)mask_supported)[i]) {
			rte_flow_error_set(error, ENOTSUP,
					   RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
					   "unsupported field found"
					   " in \"mask\"");
			return NULL;
		}
		if (item->last &&
		    (((const uint8_t *)item->spec)[i] & mask[i]) !=
		    (((const uint8_t *)item->last)[i] & mask[i])) {
			rte_flow_error_set(error, EINVAL,
					   RTE_FLOW_ERROR_TYPE_ITEM_LAST,
					   item->last,
					   "range between \"spec\" and \"last\""
					   " not comprised in \"mask\"");
			return NULL;
		}
	}
	return mask;
}

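/*
 * Illustrative usage sketch (hypothetical caller): for an Ethernet
 * item, the tables defined earlier in this file would be passed as
 *
 *	const struct rte_flow_item_eth *mask =
 *		flow_tcf_item_mask(item, &rte_flow_item_eth_mask,
 *				   &flow_tcf_mask_supported.eth,
 *				   &flow_tcf_mask_empty.eth,
 *				   sizeof(flow_tcf_mask_empty.eth),
 *				   error);
 *
 *	if (!mask)
 *		return -rte_errno;
 */
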
/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
			  unsigned int len)
{
	unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
	uint16_t port_id[n + 1];
	unsigned int i;
	unsigned int own = 0;

	/* At least one port is needed when no switch domain is present. */
	if (!n) {
		n = 1;
		port_id[0] = dev->data->port_id;
	} else {
		n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
	}
	if (n > len)
		return 0;
	for (i = 0; i != n; ++i) {
		struct rte_eth_dev_info dev_info;

		rte_eth_dev_info_get(port_id[i], &dev_info);
		if (port_id[i] == dev->data->port_id)
			own = i;
		ptoi[i].port_id = port_id[i];
		ptoi[i].ifindex = dev_info.if_index;
	}
	/* Ensure first entry of ptoi[] is the current device. */
	if (own) {
		ptoi[n] = ptoi[0];
		ptoi[0] = ptoi[own];
		ptoi[own] = ptoi[n];
	}
	/* An entry with zero ifindex terminates ptoi[]. */
	ptoi[n].port_id = 0;
	ptoi[n].ifindex = 0;
	return n;
}

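/*
 * Illustrative usage sketch (hypothetical caller, error handling
 * simplified): PTOI_TABLE_SZ_MAX() reserves room for all sibling ports
 * plus the terminating zero entry, so callers would allocate the table
 * on the stack:
 *
 *	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
 *
 *	if (!flow_tcf_build_ptoi_table(dev, ptoi, PTOI_TABLE_SZ_MAX(dev)))
 *		return -EINVAL; (hypothetical error handling)
 */
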
/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
			     struct rte_flow_error *error)
{
	/*
	 * Supported attributes: groups, some priorities and ingress only.
	 * Group is supported only if the kernel supports chains. Don't
	 * care about transfer as it is the caller's problem.
	 */
	if (attr->group > MLX5_TCF_GROUP_ID_MAX)
		return rte_flow_error_set(error, ENOTSUP,
					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
					  "group ID larger than "
					  RTE_STR(MLX5_TCF_GROUP_ID_MAX)
					  " isn't supported");
	else if (attr->group > 0 &&
		 attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
		return rte_flow_error_set(error, ENOTSUP,
					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
					  attr,
					  "lowest priority level is "
					  RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
					  " when group is configured");
	else if (attr->priority > 0xfffe)
		return rte_flow_error_set(error, ENOTSUP,
					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
					  attr,
					  "lowest priority level is 0xfffe");
	if (!attr->ingress)
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
					  attr, "only ingress is supported");
	if (attr->egress)
		return rte_flow_error_set(error, ENOTSUP,
					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
					  attr, "egress is not supported");
	return 0;
}

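/*
 * For illustration: given the limits above, attributes such as
 * { .group = 2, .priority = 3, .ingress = 1 } pass this check, while
 * a group above MLX5_TCF_GROUP_ID_MAX, a priority above
 * MLX5_TCF_GROUP_PRIORITY_MAX inside a non-zero group, or .egress = 1
 * are all rejected.
 */
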
/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
				  struct rte_flow_error *error)
{
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;

	if (!spec) {
		/*
		 * Specification for L2 addresses can be empty
		 * because these addresses are optional and not
		 * required directly by the tc rule. The kernel
		 * tries to resolve them on its own.
		 */
		return 0;
	}
	if (!mask) {
		/* If mask is not specified use the default one. */
		mask = &rte_flow_item_eth_mask;
	}
	if (memcmp(&mask->dst,
		   &flow_tcf_mask_empty.eth.dst,
		   sizeof(flow_tcf_mask_empty.eth.dst))) {
		if (memcmp(&mask->dst,
			   &rte_flow_item_eth_mask.dst,
			   sizeof(rte_flow_item_eth_mask.dst)))
			return rte_flow_error_set
				(error, ENOTSUP,
				 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
				 "no support for partial mask on"
				 " \"eth.dst\" field");
	}
	if (memcmp(&mask->src,
		   &flow_tcf_mask_empty.eth.src,
		   sizeof(flow_tcf_mask_empty.eth.src))) {
		if (memcmp(&mask->src,
			   &rte_flow_item_eth_mask.src,
			   sizeof(rte_flow_item_eth_mask.src)))
			return rte_flow_error_set
				(error, ENOTSUP,
				 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
				 "no support for partial mask on"
				 " \"eth.src\" field");
	}
	if (mask->type != RTE_BE16(0x0000)) {
		if (mask->type != RTE_BE16(0xffff))
			return rte_flow_error_set
				(error, ENOTSUP,
				 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
				 "no support for partial mask on"
				 " \"eth.type\" field");
		DRV_LOG(WARNING,
			"outer ethernet type field"
			" cannot be forced for vxlan"
			" encapsulation, parameter ignored");
	}
	return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
				   struct rte_flow_error *error)
{
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;

	if (!spec) {
		/*
		 * Specification for IP addresses cannot be empty
		 * because it is required by tunnel_key parameter.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "NULL outer ipv4 address"
					  " specification for vxlan"
					  " encapsulation");
	}
	if (!mask)
		mask = &rte_flow_item_ipv4_mask;
	if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
		if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
			return rte_flow_error_set
				(error, ENOTSUP,
				 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
				 "no support for partial mask on"
				 " \"ipv4.hdr.dst_addr\" field"
				 " for vxlan encapsulation");
		/* More IPv4 address validations can be put here. */
	} else {
		/*
		 * Kernel uses the destination IP address to determine
		 * the routing path and obtain the MAC destination
		 * address, so IP destination address must be
		 * specified in the tc rule.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "outer ipv4 destination address"
					  " must be specified for"
					  " vxlan encapsulation");
	}
	if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
		if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
			return rte_flow_error_set
				(error, ENOTSUP,
				 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
				 "no support for partial mask on"
				 " \"ipv4.hdr.src_addr\" field"
				 " for vxlan encapsulation");
		/* More IPv4 address validations can be put here. */
	} else {
		/*
		 * Kernel uses the source IP address to select the
		 * interface for egress encapsulated traffic, so
		 * it must be specified in the tc rule.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "outer ipv4 source address"
					  " must be specified for"
					  " vxlan encapsulation");
	}
	return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
				   struct rte_flow_error *error)
{
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;

	if (!spec) {
		/*
		 * Specification for IP addresses cannot be empty
		 * because it is required by tunnel_key parameter.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "NULL outer ipv6 address"
					  " specification for"
					  " vxlan encapsulation");
	}
	if (!mask)
		mask = &rte_flow_item_ipv6_mask;
	if (memcmp(&mask->hdr.dst_addr,
		   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
		   IPV6_ADDR_LEN)) {
		if (memcmp(&mask->hdr.dst_addr,
			   &rte_flow_item_ipv6_mask.hdr.dst_addr,
			   IPV6_ADDR_LEN))
			return rte_flow_error_set
					(error, ENOTSUP,
					 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
					 "no support for partial mask on"
					 " \"ipv6.hdr.dst_addr\" field"
					 " for vxlan encapsulation");
		/* More IPv6 address validations can be put here. */
	} else {
		/*
		 * Kernel uses the destination IP address to determine
		 * the routing path and obtain the MAC destination
		 * address (neighbor or gateway), so IP destination
		 * address must be specified within the tc rule.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "outer ipv6 destination address"
					  " must be specified for"
					  " vxlan encapsulation");
	}
	if (memcmp(&mask->hdr.src_addr,
		   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
		   IPV6_ADDR_LEN)) {
		if (memcmp(&mask->hdr.src_addr,
			   &rte_flow_item_ipv6_mask.hdr.src_addr,
			   IPV6_ADDR_LEN))
			return rte_flow_error_set
					(error, ENOTSUP,
					 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
					 "no support for partial mask on"
					 " \"ipv6.hdr.src_addr\" field"
					 " for vxlan encapsulation");
		/* More IPv6 address validations can be put here. */
	} else {
		/*
		 * Kernel uses the source IP address to select the
		 * interface for egress encapsulated traffic, so
		 * it must be specified in the tc rule.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "outer ipv6 source address"
					  " must be specified for"
					  " vxlan encapsulation");
	}
	return 0;
}

1353 /**
1354  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1355  * The routine checks the UDP fields to be used in encapsulation header.
1356  *
1357  * @param[in] item
1358  *   Pointer to the item structure.
1359  * @param[out] error
1360  *   Pointer to the error structure.
1361  *
1362  * @return
1363  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1364  **/
1365 static int
1366 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1367                                   struct rte_flow_error *error)
1368 {
1369         const struct rte_flow_item_udp *spec = item->spec;
1370         const struct rte_flow_item_udp *mask = item->mask;
1371
1372         if (!spec) {
1373                 /*
1374                  * Specification for UDP ports cannot be empty
1375                  * because it is required by tunnel_key parameter.
1376                  */
1377                 return rte_flow_error_set(error, EINVAL,
1378                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1379                                           "NULL UDP port specification "
1380                                           " for vxlan encapsulation");
1381         }
1382         if (!mask)
1383                 mask = &rte_flow_item_udp_mask;
1384         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1385                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1386                         return rte_flow_error_set
1387                                         (error, ENOTSUP,
1388                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1389                                          "no support for partial mask on"
1390                                          " \"udp.hdr.dst_port\" field"
1391                                          " for vxlan encapsulation");
1392                 if (!spec->hdr.dst_port)
1393                         return rte_flow_error_set
1394                                         (error, EINVAL,
1395                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1396                                          "outer UDP remote port cannot be"
1397                                          " 0 for vxlan encapsulation");
1398         } else {
1399                 return rte_flow_error_set(error, EINVAL,
1400                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1401                                           "outer UDP remote port"
1402                                           " must be specified for"
1403                                           " vxlan encapsulation");
1404         }
1405         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1406                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1407                         return rte_flow_error_set
1408                                         (error, ENOTSUP,
1409                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1410                                          "no support for partial mask on"
1411                                          " \"udp.hdr.src_port\" field"
1412                                          " for vxlan encapsulation");
1413                 DRV_LOG(WARNING,
1414                         "outer UDP source port cannot be"
1415                         " forced for vxlan encapsulation,"
1416                         " parameter ignored");
1417         }
1418         return 0;
1419 }
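
/*
 * A minimal sketch of a UDP item accepted by the check above. The
 * destination port value is an assumption here (the IANA-assigned
 * VXLAN port 4789); any nonzero port under a full mask passes, and
 * a source port, if given, is ignored with a warning:
 *
 * @code
 * static const struct rte_flow_item_udp encap_udp_spec = {
 *	.hdr = { .dst_port = RTE_BE16(4789) },
 * };
 * static const struct rte_flow_item_udp encap_udp_mask = {
 *	.hdr = { .dst_port = RTE_BE16(0xffff) },
 * };
 * static const struct rte_flow_item encap_udp_item = {
 *	.type = RTE_FLOW_ITEM_TYPE_UDP,
 *	.spec = &encap_udp_spec,
 *	.mask = &encap_udp_mask,
 * };
 * @endcode
 */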
1420
1421 /**
1422  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1423  * The routine checks the VNI field to be used in the encapsulation header.
1424  *
1425  * @param[in] item
1426  *   Pointer to the item structure.
1427  * @param[out] error
1428  *   Pointer to the error structure.
1429  *
1430  * @return
1431  *   0 on success, a negative errno value otherwise and rte_errno is set.
1432  */
1433 static int
1434 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1435                                   struct rte_flow_error *error)
1436 {
1437         const struct rte_flow_item_vxlan *spec = item->spec;
1438         const struct rte_flow_item_vxlan *mask = item->mask;
1439
1440         if (!spec) {
1441                 /* Outer VNI is required by the tunnel_key parameter. */
1442                 return rte_flow_error_set(error, EINVAL,
1443                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1444                                           "NULL VNI specification"
1445                                           " for vxlan encapsulation");
1446         }
1447         if (!mask)
1448                 mask = &rte_flow_item_vxlan_mask;
1449         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1450                 return rte_flow_error_set(error, EINVAL,
1451                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1452                                           "outer VNI must be specified "
1453                                           "for vxlan encapsulation");
1454         if (mask->vni[0] != 0xff ||
1455             mask->vni[1] != 0xff ||
1456             mask->vni[2] != 0xff)
1457                 return rte_flow_error_set(error, ENOTSUP,
1458                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1459                                           "no support for partial mask on"
1460                                           " \"vxlan.vni\" field");
1461
1462         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1463                 return rte_flow_error_set(error, EINVAL,
1464                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1465                                           "vxlan vni cannot be 0");
1466         return 0;
1467 }
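
/*
 * A minimal sketch of a VXLAN item accepted by the check above. The
 * VNI value 42 is an arbitrary assumption; the 24-bit VNI must be
 * nonzero and covered by a full mask:
 *
 * @code
 * static const struct rte_flow_item_vxlan encap_vxlan_spec = {
 *	.vni = { 0x00, 0x00, 0x2a },
 * };
 * static const struct rte_flow_item_vxlan encap_vxlan_mask = {
 *	.vni = { 0xff, 0xff, 0xff },
 * };
 * static const struct rte_flow_item encap_vxlan_item = {
 *	.type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *	.spec = &encap_vxlan_spec,
 *	.mask = &encap_vxlan_mask,
 * };
 * @endcode
 */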
1468
1469 /**
1470  * Validate VXLAN_ENCAP action item list for E-Switch.
1471  * The routine checks the items to be used in the encapsulation header.
1472  *
1473  * @param[in] action
1474  *   Pointer to the VXLAN_ENCAP action structure.
1475  * @param[out] error
1476  *   Pointer to the error structure.
1477  *
1478  * @return
1479  *   0 on success, a negative errno value otherwise and rte_errno is set.
1480  */
1481 static int
1482 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1483                               struct rte_flow_error *error)
1484 {
1485         const struct rte_flow_item *items;
1486         int ret;
1487         uint32_t item_flags = 0;
1488
1489         if (!action->conf)
1490                 return rte_flow_error_set(error, EINVAL,
1491                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1492                                           "Missing vxlan tunnel"
1493                                           " action configuration");
1494         items = ((const struct rte_flow_action_vxlan_encap *)
1495                                         action->conf)->definition;
1496         if (!items)
1497                 return rte_flow_error_set(error, EINVAL,
1498                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1499                                           "Missing vxlan tunnel"
1500                                           " encapsulation parameters");
1501         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1502                 switch (items->type) {
1503                 case RTE_FLOW_ITEM_TYPE_VOID:
1504                         break;
1505                 case RTE_FLOW_ITEM_TYPE_ETH:
1506                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1507                                                           error);
1508                         if (ret < 0)
1509                                 return ret;
1510                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1511                         if (ret < 0)
1512                                 return ret;
1513                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1514                         break;
1516                 case RTE_FLOW_ITEM_TYPE_IPV4:
1517                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1518                                                            error);
1519                         if (ret < 0)
1520                                 return ret;
1521                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1522                         if (ret < 0)
1523                                 return ret;
1524                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1525                         break;
1526                 case RTE_FLOW_ITEM_TYPE_IPV6:
1527                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1528                                                            error);
1529                         if (ret < 0)
1530                                 return ret;
1531                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1532                         if (ret < 0)
1533                                 return ret;
1534                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1535                         break;
1536                 case RTE_FLOW_ITEM_TYPE_UDP:
1537                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1538                                                            0xFF, error);
1539                         if (ret < 0)
1540                                 return ret;
1541                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1542                         if (ret < 0)
1543                                 return ret;
1544                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1545                         break;
1546                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1547                         ret = mlx5_flow_validate_item_vxlan(items,
1548                                                             item_flags, error);
1549                         if (ret < 0)
1550                                 return ret;
1551                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1552                         if (ret < 0)
1553                                 return ret;
1554                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1555                         break;
1556                 default:
1557                         return rte_flow_error_set
1558                                         (error, ENOTSUP,
1559                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1560                                          "vxlan encap item not supported");
1561                 }
1562         }
1563         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1564                 return rte_flow_error_set(error, EINVAL,
1565                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1566                                           "no outer IP layer found"
1567                                           " for vxlan encapsulation");
1568         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1569                 return rte_flow_error_set(error, EINVAL,
1570                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1571                                           "no outer UDP layer found"
1572                                           " for vxlan encapsulation");
1573         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1574                 return rte_flow_error_set(error, EINVAL,
1575                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1576                                           "no VXLAN VNI found"
1577                                           " for vxlan encapsulation");
1578         return 0;
1579 }
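
/*
 * A sketch of a complete VXLAN_ENCAP item list satisfying the
 * validation above. The encap_*_spec and encap_*_mask objects are
 * placeholders assumed to be built as in the examples next to the
 * per-item validators. Outer L2, a single outer L3, outer UDP and
 * the VNI are all mandatory, and the list must end with an END item:
 *
 * @code
 * struct rte_flow_item encap_definition[] = {
 *	{ .type = RTE_FLOW_ITEM_TYPE_ETH,
 *	  .spec = &encap_eth_spec, .mask = &encap_eth_mask },
 *	{ .type = RTE_FLOW_ITEM_TYPE_IPV4,
 *	  .spec = &encap_ipv4_spec, .mask = &encap_ipv4_mask },
 *	{ .type = RTE_FLOW_ITEM_TYPE_UDP,
 *	  .spec = &encap_udp_spec, .mask = &encap_udp_mask },
 *	{ .type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *	  .spec = &encap_vxlan_spec, .mask = &encap_vxlan_mask },
 *	{ .type = RTE_FLOW_ITEM_TYPE_END },
 * };
 * const struct rte_flow_action_vxlan_encap encap_conf = {
 *	.definition = encap_definition,
 * };
 * const struct rte_flow_action encap_action = {
 *	.type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
 *	.conf = &encap_conf,
 * };
 * @endcode
 */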
1580
1581 /**
1582  * Validate RTE_FLOW_ITEM_TYPE_IPV4 item if VXLAN_DECAP action
1583  * is present in actions list.
1584  *
1585  * @param[in] ipv4
1586  *   Outer IPv4 address item (if any, NULL otherwise).
1587  * @param[out] error
1588  *   Pointer to the error structure.
1589  *
1590  * @return
1591  *   0 on success, a negative errno value otherwise and rte_errno is set.
1592  */
1593 static int
1594 flow_tcf_validate_vxlan_decap_ipv4(const struct rte_flow_item *ipv4,
1595                                    struct rte_flow_error *error)
1596 {
1597         const struct rte_flow_item_ipv4 *spec = ipv4->spec;
1598         const struct rte_flow_item_ipv4 *mask = ipv4->mask;
1599
1600         if (!spec) {
1601                 /*
1602                  * Specification for IP addresses cannot be empty
1603                  * because it is required as a decap parameter.
1604                  */
1605                 return rte_flow_error_set(error, EINVAL,
1606                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1607                                           "NULL outer ipv4 address"
1608                                           " specification for vxlan"
1609                                           " decapsulation");
1610         }
1611         if (!mask)
1612                 mask = &rte_flow_item_ipv4_mask;
1613         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1614                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1615                         return rte_flow_error_set
1616                                         (error, ENOTSUP,
1617                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1618                                          "no support for partial mask on"
1619                                          " \"ipv4.hdr.dst_addr\" field");
1620                 /* More IP address validations can be put here. */
1621         } else {
1622                 /*
1623                  * Kernel uses the destination IP address
1624                  * to determine the ingress network interface
1625                  * for traffic being decapsulated.
1626                  */
1627                 return rte_flow_error_set(error, EINVAL,
1628                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1629                                           "outer ipv4 destination address"
1630                                           " must be specified for"
1631                                           " vxlan decapsulation");
1632         }
1633         /* Source IP address is optional for decap. */
1634         if (mask->hdr.src_addr != RTE_BE32(0x00000000) &&
1635             mask->hdr.src_addr != RTE_BE32(0xffffffff))
1636                 return rte_flow_error_set(error, ENOTSUP,
1637                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1638                                           "no support for partial mask on"
1639                                           " \"ipv4.hdr.src_addr\" field");
1640         return 0;
1641 }
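
/*
 * A sketch of an outer IPv4 item passing the decap check above. The
 * destination address (192.168.0.1, an arbitrary assumption) must be
 * fully masked because the kernel resolves the ingress interface
 * from it; the source address may be omitted:
 *
 * @code
 * static const struct rte_flow_item_ipv4 decap_ipv4_spec = {
 *	.hdr = { .dst_addr = RTE_BE32(0xc0a80001) },
 * };
 * static const struct rte_flow_item_ipv4 decap_ipv4_mask = {
 *	.hdr = { .dst_addr = RTE_BE32(0xffffffff) },
 * };
 * static const struct rte_flow_item decap_ipv4_item = {
 *	.type = RTE_FLOW_ITEM_TYPE_IPV4,
 *	.spec = &decap_ipv4_spec,
 *	.mask = &decap_ipv4_mask,
 * };
 * @endcode
 */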
1642
1643 /**
1644  * Validate RTE_FLOW_ITEM_TYPE_IPV6 item if VXLAN_DECAP action
1645  * is present in actions list.
1646  *
1647  * @param[in] ipv6
1648  *   Outer IPv6 address item (if any, NULL otherwise).
1649  * @param[out] error
1650  *   Pointer to the error structure.
1651  *
1652  * @return
1653  *   0 on success, a negative errno value otherwise and rte_errno is set.
1654  */
1655 static int
1656 flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6,
1657                                    struct rte_flow_error *error)
1658 {
1659         const struct rte_flow_item_ipv6 *spec = ipv6->spec;
1660         const struct rte_flow_item_ipv6 *mask = ipv6->mask;
1661
1662         if (!spec) {
1663                 /*
1664                  * Specification for IP addresses cannot be empty
1665                  * because it is required as a decap parameter.
1666                  */
1667                 return rte_flow_error_set(error, EINVAL,
1668                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1669                                           "NULL outer ipv6 address"
1670                                           " specification for vxlan"
1671                                           " decapsulation");
1672         }
1673         if (!mask)
1674                 mask = &rte_flow_item_ipv6_mask;
1675         if (memcmp(&mask->hdr.dst_addr,
1676                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1677                    IPV6_ADDR_LEN)) {
1678                 if (memcmp(&mask->hdr.dst_addr,
1679                         &rte_flow_item_ipv6_mask.hdr.dst_addr,
1680                         IPV6_ADDR_LEN))
1681                         return rte_flow_error_set
1682                                         (error, ENOTSUP,
1683                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1684                                          "no support for partial mask on"
1685                                          " \"ipv6.hdr.dst_addr\" field");
1686                 /* More IP address validations can be put here. */
1687         } else {
1688                 /*
1689                  * Kernel uses the destination IP address
1690                  * to determine the ingress network interface
1691                  * for traffic being decapsulated.
1692                  */
1693                 return rte_flow_error_set(error, EINVAL,
1694                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1695                                           "outer ipv6 destination address must be "
1696                                           "specified for vxlan decapsulation");
1697         }
1698         /* Source IP address is optional for decap. */
1699         if (memcmp(&mask->hdr.src_addr,
1700                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1701                    IPV6_ADDR_LEN)) {
1702                 if (memcmp(&mask->hdr.src_addr,
1703                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1704                            IPV6_ADDR_LEN))
1705                         return rte_flow_error_set
1706                                         (error, ENOTSUP,
1707                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1708                                          "no support for partial mask on"
1709                                          " \"ipv6.hdr.src_addr\" field");
1710         }
1711         return 0;
1712 }
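
/*
 * A sketch of an outer IPv6 item passing the decap check above. The
 * address (2001:db8::1, from the documentation range) is an
 * assumption; leaving the mask NULL selects the default
 * rte_flow_item_ipv6_mask, which fully covers both addresses and is
 * therefore accepted:
 *
 * @code
 * static const struct rte_flow_item_ipv6 decap_ipv6_spec = {
 *	.hdr = {
 *		.dst_addr = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
 *			      0, 0, 0, 0, 0, 0, 0, 0x01 },
 *	},
 * };
 * static const struct rte_flow_item decap_ipv6_item = {
 *	.type = RTE_FLOW_ITEM_TYPE_IPV6,
 *	.spec = &decap_ipv6_spec,
 * };
 * @endcode
 */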
1713
1714 /**
1715  * Validate RTE_FLOW_ITEM_TYPE_UDP item if VXLAN_DECAP action
1716  * is present in actions list.
1717  *
1718  * @param[in] udp
1719  *   Outer UDP layer item (if any, NULL otherwise).
1720  * @param[out] error
1721  *   Pointer to the error structure.
1722  *
1723  * @return
1724  *   0 on success, a negative errno value otherwise and rte_errno is set.
1725  */
1726 static int
1727 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1728                                   struct rte_flow_error *error)
1729 {
1730         const struct rte_flow_item_udp *spec = udp->spec;
1731         const struct rte_flow_item_udp *mask = udp->mask;
1732
1733         if (!spec)
1734                 /*
1735                  * Specification for UDP ports cannot be empty
1736                  * because it is required as a decap parameter.
1737                  */
1738                 return rte_flow_error_set(error, EINVAL,
1739                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1740                                           "NULL UDP port specification"
1741                                           " for VXLAN decapsulation");
1742         if (!mask)
1743                 mask = &rte_flow_item_udp_mask;
1744         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1745                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1746                         return rte_flow_error_set
1747                                         (error, ENOTSUP,
1748                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1749                                          "no support for partial mask on"
1750                                          " \"udp.hdr.dst_port\" field");
1751                 if (!spec->hdr.dst_port)
1752                         return rte_flow_error_set
1753                                         (error, EINVAL,
1754                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1755                                          "zero decap local UDP port");
1756         } else {
1757                 return rte_flow_error_set(error, EINVAL,
1758                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1759                                           "outer UDP destination port must be "
1760                                           "specified for vxlan decapsulation");
1761         }
1762         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1763                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1764                         return rte_flow_error_set
1765                                         (error, ENOTSUP,
1766                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1767                                          "no support for partial mask on"
1768                                          " \"udp.hdr.src_port\" field");
1769                 DRV_LOG(WARNING,
1770                         "outer UDP local port cannot be "
1771                         "forced for VXLAN decapsulation, "
1772                         "parameter ignored");
1773         }
1774         return 0;
1775 }
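
/*
 * A sketch of an outer UDP item passing the decap check above,
 * assuming the tunnel listens on the IANA-assigned VXLAN port 4789;
 * the destination (local) port must be nonzero and fully masked,
 * while the source port is best left unspecified:
 *
 * @code
 * static const struct rte_flow_item_udp decap_udp_spec = {
 *	.hdr = { .dst_port = RTE_BE16(4789) },
 * };
 * static const struct rte_flow_item_udp decap_udp_mask = {
 *	.hdr = { .dst_port = RTE_BE16(0xffff) },
 * };
 * @endcode
 */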
1776
1777 /**
1778  * Validate flow for E-Switch.
1779  *
1780  * @param[in] dev
1781  *   Pointer to the Ethernet device structure.
1782  * @param[in] attr
1783  *   Pointer to the flow attributes.
1784  * @param[in] items
1785  *   Pointer to the list of items.
1786  * @param[in] actions
1787  *   Pointer to the list of actions.
1788  * @param[out] error
1789  *   Pointer to the error structure.
1790  *
1791  * @return
1792  *   0 on success, a negative errno value otherwise and rte_errno is set.
1793  */
1794 static int
1795 flow_tcf_validate(struct rte_eth_dev *dev,
1796                   const struct rte_flow_attr *attr,
1797                   const struct rte_flow_item items[],
1798                   const struct rte_flow_action actions[],
1799                   struct rte_flow_error *error)
1800 {
1801         union {
1802                 const struct rte_flow_item_port_id *port_id;
1803                 const struct rte_flow_item_eth *eth;
1804                 const struct rte_flow_item_vlan *vlan;
1805                 const struct rte_flow_item_ipv4 *ipv4;
1806                 const struct rte_flow_item_ipv6 *ipv6;
1807                 const struct rte_flow_item_tcp *tcp;
1808                 const struct rte_flow_item_udp *udp;
1809                 const struct rte_flow_item_vxlan *vxlan;
1810         } spec, mask;
1811         union {
1812                 const struct rte_flow_action_port_id *port_id;
1813                 const struct rte_flow_action_jump *jump;
1814                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1815                 const struct rte_flow_action_of_set_vlan_vid *
1816                         of_set_vlan_vid;
1817                 const struct rte_flow_action_of_set_vlan_pcp *
1818                         of_set_vlan_pcp;
1819                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1820                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1821                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1822         } conf;
1823         uint64_t item_flags = 0;
1824         uint64_t action_flags = 0;
1825         uint8_t next_protocol = -1;
1826         unsigned int tcm_ifindex = 0;
1827         uint8_t pedit_validated = 0;
1828         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1829         struct rte_eth_dev *port_id_dev = NULL;
1830         bool in_port_id_set = false;
1831         int ret;
1832
1833         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1834                                                 PTOI_TABLE_SZ_MAX(dev)));
1835         ret = flow_tcf_validate_attributes(attr, error);
1836         if (ret < 0)
1837                 return ret;
1838         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1839                 unsigned int i;
1840                 uint64_t current_action_flag = 0;
1841
1842                 switch (actions->type) {
1843                 case RTE_FLOW_ACTION_TYPE_VOID:
1844                         break;
1845                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1846                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1847                         if (!actions->conf)
1848                                 break;
1849                         conf.port_id = actions->conf;
1850                         if (conf.port_id->original)
1851                                 i = 0;
1852                         else
1853                                 for (i = 0; ptoi[i].ifindex; ++i)
1854                                         if (ptoi[i].port_id == conf.port_id->id)
1855                                                 break;
1856                         if (!ptoi[i].ifindex)
1857                                 return rte_flow_error_set
1858                                         (error, ENODEV,
1859                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1860                                          conf.port_id,
1861                                          "missing data to convert port ID to"
1862                                          " ifindex");
1863                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1864                         break;
1865                 case RTE_FLOW_ACTION_TYPE_JUMP:
1866                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1867                         if (!actions->conf)
1868                                 break;
1869                         conf.jump = actions->conf;
1870                         if (attr->group >= conf.jump->group)
1871                                 return rte_flow_error_set
1872                                         (error, ENOTSUP,
1873                                          RTE_FLOW_ERROR_TYPE_ACTION,
1874                                          actions,
1875                                          "can jump only to a higher group");
1876                         break;
1877                 case RTE_FLOW_ACTION_TYPE_DROP:
1878                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1879                         break;
1880                 case RTE_FLOW_ACTION_TYPE_COUNT:
1881                         break;
1882                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1883                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1884                         break;
1885                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1886                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1887                         break;
1888                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1889                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1890                                 return rte_flow_error_set
1891                                         (error, ENOTSUP,
1892                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1893                                          "vlan modify is not supported,"
1894                                          " set action must follow push action");
1895                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1896                         break;
1897                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1898                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1899                                 return rte_flow_error_set
1900                                         (error, ENOTSUP,
1901                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1902                                          "vlan modify is not supported,"
1903                                          " set action must follow push action");
1904                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1905                         break;
1906                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1907                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1908                         break;
1909                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1910                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1911                         if (ret < 0)
1912                                 return ret;
1913                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1914                         break;
1915                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1916                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1917                         break;
1918                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1919                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1920                         break;
1921                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1922                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1923                         break;
1924                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1925                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1926                         break;
1927                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1928                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1929                         break;
1930                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1931                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1932                         break;
1933                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1934                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1935                         break;
1936                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1937                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1938                         break;
1939                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1940                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1941                         break;
1942                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1943                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1944                         break;
1945                 default:
1946                         return rte_flow_error_set(error, ENOTSUP,
1947                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1948                                                   actions,
1949                                                   "action not supported");
1950                 }
1951                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1952                         if (!actions->conf)
1953                                 return rte_flow_error_set
1954                                         (error, EINVAL,
1955                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1956                                          actions,
1957                                          "action configuration not set");
1958                 }
1959                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1960                     pedit_validated)
1961                         return rte_flow_error_set(error, ENOTSUP,
1962                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1963                                                   actions,
1964                                                   "set actions should be "
1965                                                   "listed successively");
1966                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1967                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1968                         pedit_validated = 1;
1969                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1970                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1971                         return rte_flow_error_set(error, EINVAL,
1972                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1973                                                   actions,
1974                                                   "can't have multiple fate"
1975                                                   " actions");
1976                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1977                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1978                         return rte_flow_error_set(error, EINVAL,
1979                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1980                                                   actions,
1981                                                   "can't have multiple vxlan"
1982                                                   " actions");
1983                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1984                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1985                         return rte_flow_error_set(error, ENOTSUP,
1986                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1987                                                   actions,
1988                                                   "can't have vxlan and vlan"
1989                                                   " actions in the same rule");
1990                 action_flags |= current_action_flag;
1991         }
1992         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1993                 unsigned int i;
1994
1995                 if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1996                     items->type != RTE_FLOW_ITEM_TYPE_ETH)
1997                         return rte_flow_error_set(error, ENOTSUP,
1998                                                   RTE_FLOW_ERROR_TYPE_ITEM,
1999                                                   items,
2000                                                   "only L2 inner item"
2001                                                   " is supported");
2002                 switch (items->type) {
2003                 case RTE_FLOW_ITEM_TYPE_VOID:
2004                         break;
2005                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2006                         mask.port_id = flow_tcf_item_mask
2007                                 (items, &rte_flow_item_port_id_mask,
2008                                  &flow_tcf_mask_supported.port_id,
2009                                  &flow_tcf_mask_empty.port_id,
2010                                  sizeof(flow_tcf_mask_supported.port_id),
2011                                  error);
2012                         if (!mask.port_id)
2013                                 return -rte_errno;
2014                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
2015                                 in_port_id_set = 1;
2016                                 break;
2017                         }
2018                         spec.port_id = items->spec;
2019                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
2020                                 return rte_flow_error_set
2021                                         (error, ENOTSUP,
2022                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2023                                          mask.port_id,
2024                                          "no support for partial mask on"
2025                                          " \"id\" field");
2026                         if (!mask.port_id->id)
2027                                 i = 0;
2028                         else
2029                                 for (i = 0; ptoi[i].ifindex; ++i)
2030                                         if (ptoi[i].port_id == spec.port_id->id)
2031                                                 break;
2032                         if (!ptoi[i].ifindex)
2033                                 return rte_flow_error_set
2034                                         (error, ENODEV,
2035                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2036                                          spec.port_id,
2037                                          "missing data to convert port ID to"
2038                                          " ifindex");
2039                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2040                                 return rte_flow_error_set
2041                                         (error, ENOTSUP,
2042                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2043                                          spec.port_id,
2044                                          "cannot match traffic for"
2045                                          " several port IDs through"
2046                                          " a single flow rule");
2047                         tcm_ifindex = ptoi[i].ifindex;
2048                         in_port_id_set = 1;
2049                         break;
2050                 case RTE_FLOW_ITEM_TYPE_ETH:
2051                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2052                                                           error);
2053                         if (ret < 0)
2054                                 return ret;
2055                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2056                                         MLX5_FLOW_LAYER_INNER_L2 :
2057                                         MLX5_FLOW_LAYER_OUTER_L2;
2058                         /* TODO:
2059                          * Redundant check due to different supported mask.
2060                          * Same for the rest of items.
2061                          */
2062                         mask.eth = flow_tcf_item_mask
2063                                 (items, &rte_flow_item_eth_mask,
2064                                  &flow_tcf_mask_supported.eth,
2065                                  &flow_tcf_mask_empty.eth,
2066                                  sizeof(flow_tcf_mask_supported.eth),
2067                                  error);
2068                         if (!mask.eth)
2069                                 return -rte_errno;
2070                         if (mask.eth->type && mask.eth->type !=
2071                             RTE_BE16(0xffff))
2072                                 return rte_flow_error_set
2073                                         (error, ENOTSUP,
2074                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2075                                          mask.eth,
2076                                          "no support for partial mask on"
2077                                          " \"type\" field");
2078                         break;
2079                 case RTE_FLOW_ITEM_TYPE_VLAN:
2080                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2081                                                            error);
2082                         if (ret < 0)
2083                                 return ret;
2084                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2085                         mask.vlan = flow_tcf_item_mask
2086                                 (items, &rte_flow_item_vlan_mask,
2087                                  &flow_tcf_mask_supported.vlan,
2088                                  &flow_tcf_mask_empty.vlan,
2089                                  sizeof(flow_tcf_mask_supported.vlan),
2090                                  error);
2091                         if (!mask.vlan)
2092                                 return -rte_errno;
2093                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2094                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2095                               RTE_BE16(0xe000)) ||
2096                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2097                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2098                               RTE_BE16(0x0fff)) ||
2099                             (mask.vlan->inner_type &&
2100                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2101                                 return rte_flow_error_set
2102                                         (error, ENOTSUP,
2103                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2104                                          mask.vlan,
2105                                          "no support for partial masks on"
2106                                          " \"tci\" (PCP and VID parts) and"
2107                                          " \"inner_type\" fields");
2108                         break;
2109                 case RTE_FLOW_ITEM_TYPE_IPV4:
2110                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2111                                                            error);
2112                         if (ret < 0)
2113                                 return ret;
2114                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2115                         mask.ipv4 = flow_tcf_item_mask
2116                                 (items, &rte_flow_item_ipv4_mask,
2117                                  &flow_tcf_mask_supported.ipv4,
2118                                  &flow_tcf_mask_empty.ipv4,
2119                                  sizeof(flow_tcf_mask_supported.ipv4),
2120                                  error);
2121                         if (!mask.ipv4)
2122                                 return -rte_errno;
2123                         if (mask.ipv4->hdr.next_proto_id &&
2124                             mask.ipv4->hdr.next_proto_id != 0xff)
2125                                 return rte_flow_error_set
2126                                         (error, ENOTSUP,
2127                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2128                                          mask.ipv4,
2129                                          "no support for partial mask on"
2130                                          " \"hdr.next_proto_id\" field");
2131                         else if (mask.ipv4->hdr.next_proto_id)
2132                                 next_protocol =
2133                                         ((const struct rte_flow_item_ipv4 *)
2134                                          (items->spec))->hdr.next_proto_id;
2135                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2136                                 ret = flow_tcf_validate_vxlan_decap_ipv4
2137                                                                 (items, error);
2138                                 if (ret < 0)
2139                                         return ret;
2140                         }
2141                         break;
2142                 case RTE_FLOW_ITEM_TYPE_IPV6:
2143                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2144                                                            error);
2145                         if (ret < 0)
2146                                 return ret;
2147                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2148                         mask.ipv6 = flow_tcf_item_mask
2149                                 (items, &rte_flow_item_ipv6_mask,
2150                                  &flow_tcf_mask_supported.ipv6,
2151                                  &flow_tcf_mask_empty.ipv6,
2152                                  sizeof(flow_tcf_mask_supported.ipv6),
2153                                  error);
2154                         if (!mask.ipv6)
2155                                 return -rte_errno;
2156                         if (mask.ipv6->hdr.proto &&
2157                             mask.ipv6->hdr.proto != 0xff)
2158                                 return rte_flow_error_set
2159                                         (error, ENOTSUP,
2160                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2161                                          mask.ipv6,
2162                                          "no support for partial mask on"
2163                                          " \"hdr.proto\" field");
2164                         else if (mask.ipv6->hdr.proto)
2165                                 next_protocol =
2166                                         ((const struct rte_flow_item_ipv6 *)
2167                                          (items->spec))->hdr.proto;
2168                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2169                                 ret = flow_tcf_validate_vxlan_decap_ipv6
2170                                                                 (items, error);
2171                                 if (ret < 0)
2172                                         return ret;
2173                         }
2174                         break;
2175                 case RTE_FLOW_ITEM_TYPE_UDP:
2176                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2177                                                           next_protocol, error);
2178                         if (ret < 0)
2179                                 return ret;
2180                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2181                         mask.udp = flow_tcf_item_mask
2182                                 (items, &rte_flow_item_udp_mask,
2183                                  &flow_tcf_mask_supported.udp,
2184                                  &flow_tcf_mask_empty.udp,
2185                                  sizeof(flow_tcf_mask_supported.udp),
2186                                  error);
2187                         if (!mask.udp)
2188                                 return -rte_errno;
2189                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2190                                 ret = flow_tcf_validate_vxlan_decap_udp
2191                                                                 (items, error);
2192                                 if (ret < 0)
2193                                         return ret;
2194                         }
2195                         break;
2196                 case RTE_FLOW_ITEM_TYPE_TCP:
2197                         ret = mlx5_flow_validate_item_tcp
2198                                              (items, item_flags,
2199                                               next_protocol,
2200                                               &flow_tcf_mask_supported.tcp,
2201                                               error);
2202                         if (ret < 0)
2203                                 return ret;
2204                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2205                         mask.tcp = flow_tcf_item_mask
2206                                 (items, &rte_flow_item_tcp_mask,
2207                                  &flow_tcf_mask_supported.tcp,
2208                                  &flow_tcf_mask_empty.tcp,
2209                                  sizeof(flow_tcf_mask_supported.tcp),
2210                                  error);
2211                         if (!mask.tcp)
2212                                 return -rte_errno;
2213                         break;
2214                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2215                         if (!(action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP))
2216                                 return rte_flow_error_set
2217                                         (error, ENOTSUP,
2218                                          RTE_FLOW_ERROR_TYPE_ITEM,
2219                                          items,
2220                                          "vni pattern should be followed by"
2221                                          " vxlan decapsulation action");
2222                         ret = mlx5_flow_validate_item_vxlan(items,
2223                                                             item_flags, error);
2224                         if (ret < 0)
2225                                 return ret;
2226                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2227                         mask.vxlan = flow_tcf_item_mask
2228                                 (items, &rte_flow_item_vxlan_mask,
2229                                  &flow_tcf_mask_supported.vxlan,
2230                                  &flow_tcf_mask_empty.vxlan,
2231                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2232                         if (!mask.vxlan)
2233                                 return -rte_errno;
2234                         if (mask.vxlan->vni[0] != 0xff ||
2235                             mask.vxlan->vni[1] != 0xff ||
2236                             mask.vxlan->vni[2] != 0xff)
2237                                 return rte_flow_error_set
2238                                         (error, ENOTSUP,
2239                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2240                                          mask.vxlan,
2241                                          "no support for partial or "
2242                                          "empty mask on \"vxlan.vni\" field");
2243                         break;
2244                 default:
2245                         return rte_flow_error_set(error, ENOTSUP,
2246                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2247                                                   items, "item not supported");
2248                 }
2249         }
2250         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2251             (action_flags & MLX5_FLOW_ACTION_DROP))
2252                 return rte_flow_error_set(error, ENOTSUP,
2253                                           RTE_FLOW_ERROR_TYPE_ACTION,
2254                                           actions,
2255                                           "set action is not compatible with "
2256                                           "drop action");
2257         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2258             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2259                 return rte_flow_error_set(error, ENOTSUP,
2260                                           RTE_FLOW_ERROR_TYPE_ACTION,
2261                                           actions,
2262                                           "set action must be followed by "
2263                                           "port_id action");
2264         if (action_flags &
2265            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2266                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2267                         return rte_flow_error_set(error, EINVAL,
2268                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2269                                                   actions,
2270                                                   "no ipv4 item found in"
2271                                                   " pattern");
2272         }
2273         if (action_flags &
2274            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2275                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2276                         return rte_flow_error_set(error, EINVAL,
2277                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2278                                                   actions,
2279                                                   "no ipv6 item found in"
2280                                                   " pattern");
2281         }
2282         if (action_flags &
2283            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2284                 if (!(item_flags &
2285                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2286                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2287                         return rte_flow_error_set(error, EINVAL,
2288                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2289                                                   actions,
2290                                                   "no TCP/UDP item found in"
2291                                                   " pattern");
2292         }
2293         /*
2294          * FW syndrome (0xA9C090):
2295          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2296          *     forwarded to the uplink.
2297          */
2298         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2299             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2300             ((struct priv *)port_id_dev->data->dev_private)->representor)
2301                 return rte_flow_error_set(error, ENOTSUP,
2302                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2303                                           "vlan push can only be applied"
2304                                           " when forwarding to uplink port");
2305         /*
2306          * FW syndrome (0x294609):
2307          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2308          *     are supported only while forwarding to vport.
2309          */
2310         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2311             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2312                 return rte_flow_error_set(error, ENOTSUP,
2313                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2314                                           "vlan actions are supported"
2315                                           " only with port_id action");
2316         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2317             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2318                 return rte_flow_error_set(error, ENOTSUP,
2319                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2320                                           "vxlan actions are supported"
2321                                           " only with port_id action");
2322         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2323                 return rte_flow_error_set(error, EINVAL,
2324                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2325                                           "no fate action is found");
2326         if (action_flags &
2327            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2328                 if (!(item_flags &
2329                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2330                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2331                         return rte_flow_error_set(error, EINVAL,
2332                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2333                                                   actions,
2334                                                   "no IP found in pattern");
2335         }
2336         if (action_flags &
2337             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2338                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2339                         return rte_flow_error_set(error, ENOTSUP,
2340                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2341                                                   actions,
2342                                                   "no ethernet found in"
2343                                                   " pattern");
2344         }
2345         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2346                 if (!(item_flags &
2347                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2348                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2349                         return rte_flow_error_set(error, EINVAL,
2350                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2351                                                   NULL,
2352                                                   "no outer IP pattern found"
2353                                                   " for vxlan decap action");
2354                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2355                         return rte_flow_error_set(error, EINVAL,
2356                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2357                                                   NULL,
2358                                                   "no outer UDP pattern found"
2359                                                   " for vxlan decap action");
2360                 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
2361                         return rte_flow_error_set(error, EINVAL,
2362                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2363                                                   NULL,
2364                                                   "no VNI pattern found"
2365                                                   " for vxlan decap action");
2366         }
2367         return 0;
2368 }
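
/*
 * A sketch of a minimal action/pattern pair accepted by the
 * validation above, assuming the flow attributes themselves validate
 * and DPDK port 1 (an arbitrary value) maps to an ifindex. At least
 * one fate action such as PORT_ID, DROP or JUMP is mandatory; an
 * ETH item with empty spec suffices as a pattern:
 *
 * @code
 * const struct rte_flow_action_port_id port_conf = { .id = 1 };
 * const struct rte_flow_action actions[] = {
 *	{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &port_conf },
 *	{ .type = RTE_FLOW_ACTION_TYPE_END },
 * };
 * const struct rte_flow_item pattern[] = {
 *	{ .type = RTE_FLOW_ITEM_TYPE_ETH },
 *	{ .type = RTE_FLOW_ITEM_TYPE_END },
 * };
 * @endcode
 */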
2369
2370 /**
2371  * Calculate maximum size of memory for flow items of Linux TC flower and
2372  * extract specified items.
2373  *
 * @param[in] attr
 *   Pointer to the flow attributes.
2374  * @param[in] items
2375  *   Pointer to the list of items.
2376  * @param[out] item_flags
2377  *   Pointer to the detected items.
2378  *
2379  * @return
2380  *   Maximum size of memory for items.
2381  */
2382 static int
2383 flow_tcf_get_items_and_size(const struct rte_flow_attr *attr,
2384                             const struct rte_flow_item items[],
2385                             uint64_t *item_flags)
2386 {
2387         int size = 0;
2388         uint64_t flags = 0;
2389
2390         size += SZ_NLATTR_STRZ_OF("flower") +
2391                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2392                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2393         if (attr->group > 0)
2394                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2395         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2396                 switch (items->type) {
2397                 case RTE_FLOW_ITEM_TYPE_VOID:
2398                         break;
2399                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2400                         break;
2401                 case RTE_FLOW_ITEM_TYPE_ETH:
2402                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2403                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2404                                 /* dst/src MAC addr and mask. */
2405                         flags |= MLX5_FLOW_LAYER_OUTER_L2;
2406                         break;
2407                 case RTE_FLOW_ITEM_TYPE_VLAN:
2408                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2409                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2410                                 /* VLAN Ether type. */
2411                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2412                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2413                         flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2414                         break;
2415                 case RTE_FLOW_ITEM_TYPE_IPV4:
2416                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2417                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2418                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2419                                 /* dst/src IP addr and mask. */
2420                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2421                         break;
2422                 case RTE_FLOW_ITEM_TYPE_IPV6:
2423                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2424                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2425                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2426                                 /* dst/src IP addr and mask. */
2427                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2428                         break;
2429                 case RTE_FLOW_ITEM_TYPE_UDP:
2430                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2431                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2432                                 /* dst/src port and mask. */
2433                         flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2434                         break;
2435                 case RTE_FLOW_ITEM_TYPE_TCP:
2436                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2437                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2438                                 /* dst/src port and mask. */
2439                         flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2440                         break;
2441                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2442                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2443                         flags |= MLX5_FLOW_LAYER_VXLAN;
2444                         break;
2445                 default:
2446                         DRV_LOG(WARNING,
2447                                 "unsupported item %p type %d,"
2448                                 " items must be validated before flow creation",
2449                                 (const void *)items, items->type);
2450                         break;
2451                 }
2452         }
2453         *item_flags = flags;
2454         return size;
2455 }
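/*
 * Sizing example (illustrative, assuming the SZ_NLATTR_* helpers yield
 * an MNL-aligned attribute header plus the aligned payload): an
 * eth / ipv4 / udp pattern reserves room for
 *   - ETH:  ether type + dst/src MAC addresses and masks,
 *   - IPV4: ether type + IP proto + dst/src addresses and masks,
 *   - UDP:  IP proto + dst/src ports and masks,
 * on top of the "flower"/TCA_OPTIONS/flags base above. The result is an
 * upper bound; attributes that end up unused simply leave part of the
 * allocated buffer unwritten.
 */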
2456
2457 /**
2458  * Calculate the size of memory to store the VXLAN encapsulation
2459  * related items in the Netlink message buffer. The item list
2460  * is specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2461  * The item list should be validated.
2462  *
2463  * @param[in] action
2464  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2465  *   List of pattern items to scan data from.
2466  *
2467  * @return
2468  *   The size of the part of the Netlink message buffer needed to
2469  *   store the VXLAN encapsulation item attributes.
2470  */
2471 static int
2472 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2473 {
2474         const struct rte_flow_item *items;
2475         int size = 0;
2476
2477         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2478         assert(action->conf);
2479
2480         items = ((const struct rte_flow_action_vxlan_encap *)
2481                                         action->conf)->definition;
2482         assert(items);
2483         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2484                 switch (items->type) {
2485                 case RTE_FLOW_ITEM_TYPE_VOID:
2486                         break;
2487                 case RTE_FLOW_ITEM_TYPE_ETH:
2488                         /* This item does not require message buffer. */
2489                         break;
2490                 case RTE_FLOW_ITEM_TYPE_IPV4:
2491                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2492                         break;
2493                 case RTE_FLOW_ITEM_TYPE_IPV6:
2494                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2495                         break;
2496                 case RTE_FLOW_ITEM_TYPE_UDP: {
2497                         const struct rte_flow_item_udp *udp = items->mask;
2498
2499                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2500                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2501                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2502                         break;
2503                 }
2504                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2505                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2506                         break;
2507                 default:
2508                         assert(false);
2509                         DRV_LOG(WARNING,
2510                                 "unsupported item %p type %d,"
2511                                 " items must be validated"
2512                                 " before flow creation",
2513                                 (const void *)items, items->type);
2514                         return 0;
2515                 }
2516         }
2517         return size;
2518 }
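/*
 * Example (illustrative): for an encap definition of eth / ipv4 / udp /
 * vxlan items with no UDP source port mask, this reserves two IPv4
 * address attributes, destination and source UDP port attributes and
 * the 32-bit VNI attribute. The ETH item needs no buffer space because
 * tunnel_key takes no Ethernet addresses; the kernel resolves them
 * itself (see the parsing helpers below).
 */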
2519
2520 /**
2521  * Calculate maximum size of memory for flow actions of Linux TC flower and
2522  * extract specified actions.
2523  *
2524  * @param[in] actions
2525  *   Pointer to the list of actions.
2526  * @param[out] action_flags
2527  *   Pointer to the detected actions.
2528  *
2529  * @return
2530  *   Maximum size of memory for actions.
2531  */
2532 static int
2533 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2534                               uint64_t *action_flags)
2535 {
2536         int size = 0;
2537         uint64_t flags = 0;
2538
2539         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2540         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2541                 switch (actions->type) {
2542                 case RTE_FLOW_ACTION_TYPE_VOID:
2543                         break;
2544                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2545                         size += SZ_NLATTR_NEST + /* na_act_index. */
2546                                 SZ_NLATTR_STRZ_OF("mirred") +
2547                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2548                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2549                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2550                         break;
2551                 case RTE_FLOW_ACTION_TYPE_JUMP:
2552                         size += SZ_NLATTR_NEST + /* na_act_index. */
2553                                 SZ_NLATTR_STRZ_OF("gact") +
2554                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2555                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2556                         flags |= MLX5_FLOW_ACTION_JUMP;
2557                         break;
2558                 case RTE_FLOW_ACTION_TYPE_DROP:
2559                         size += SZ_NLATTR_NEST + /* na_act_index. */
2560                                 SZ_NLATTR_STRZ_OF("gact") +
2561                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2562                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2563                         flags |= MLX5_FLOW_ACTION_DROP;
2564                         break;
2565                 case RTE_FLOW_ACTION_TYPE_COUNT:
2566                         break;
2567                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2568                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2569                         goto action_of_vlan;
2570                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2571                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2572                         goto action_of_vlan;
2573                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2574                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2575                         goto action_of_vlan;
2576                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2577                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2578                         goto action_of_vlan;
2579 action_of_vlan:
2580                         size += SZ_NLATTR_NEST + /* na_act_index. */
2581                                 SZ_NLATTR_STRZ_OF("vlan") +
2582                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2583                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2584                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2585                                 /* VLAN protocol. */
2586                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2587                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2588                         break;
2589                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2590                         size += SZ_NLATTR_NEST + /* na_act_index. */
2591                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2592                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2593                                 SZ_NLATTR_TYPE_OF(uint8_t);
2594                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2595                         size += flow_tcf_vxlan_encap_size(actions) +
2596                                 RTE_ALIGN_CEIL /* preceding encap params. */
2597                                 (sizeof(struct flow_tcf_vxlan_encap),
2598                                 MNL_ALIGNTO);
2599                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2600                         break;
2601                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2602                         size += SZ_NLATTR_NEST + /* na_act_index. */
2603                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2604                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2605                                 SZ_NLATTR_TYPE_OF(uint8_t);
2606                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2607                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2608                                 (sizeof(struct flow_tcf_vxlan_decap),
2609                                 MNL_ALIGNTO);
2610                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2611                         break;
2612                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2613                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2614                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2615                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2616                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2617                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2618                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2619                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2620                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2621                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2622                         size += flow_tcf_get_pedit_actions_size(&actions,
2623                                                                 &flags);
2624                         break;
2625                 default:
2626                         DRV_LOG(WARNING,
2627                                 "unsupported action %p type %d,"
2628                                 " actions must be validated before flow creation",
2629                                 (const void *)actions, actions->type);
2630                         break;
2631                 }
2632         }
2633         *action_flags = flags;
2634         return size;
2635 }
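/*
 * Illustrative example: a single port_id fate action reserves the
 * action index nest, the "mirred" kind string, the TCA_ACT_OPTIONS nest
 * and a struct tc_mirred payload. As with items, the returned size is
 * an upper bound on the Netlink message, not an exact fit.
 */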
2636
2637 /**
2638  * Brand rtnetlink buffer with unique handle.
2639  *
2640  * This handle should be unique for a given network interface to avoid
2641  * collisions.
2642  *
2643  * @param nlh
2644  *   Pointer to Netlink message.
2645  * @param handle
2646  *   Unique 32-bit handle to use.
2647  */
2648 static void
2649 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2650 {
2651         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2652
2653         tcm->tcm_handle = handle;
2654         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2655                 (void *)nlh, handle);
2656 }
2657
2658 /**
2659  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2660  * memory required, allocates the memory, initializes Netlink message headers
2661  * and sets a unique TC message handle.
2662  *
2663  * @param[in] attr
2664  *   Pointer to the flow attributes.
2665  * @param[in] items
2666  *   Pointer to the list of items.
2667  * @param[in] actions
2668  *   Pointer to the list of actions.
2669  * @param[out] item_flags
2670  *   Pointer to bit mask of all items detected.
2671  * @param[out] action_flags
2672  *   Pointer to bit mask of all actions detected.
2673  * @param[out] error
2674  *   Pointer to the error structure.
2675  *
2676  * @return
2677  *   Pointer to mlx5_flow object on success,
2678  *   otherwise NULL and rte_errno is set.
2679  */
2680 static struct mlx5_flow *
2681 flow_tcf_prepare(const struct rte_flow_attr *attr,
2682                  const struct rte_flow_item items[],
2683                  const struct rte_flow_action actions[],
2684                  uint64_t *item_flags, uint64_t *action_flags,
2685                  struct rte_flow_error *error)
2686 {
2687         size_t size = RTE_ALIGN_CEIL
2688                         (sizeof(struct mlx5_flow),
2689                          alignof(struct flow_tcf_tunnel_hdr)) +
2690                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2691                       MNL_ALIGN(sizeof(struct tcmsg));
2692         struct mlx5_flow *dev_flow;
2693         struct nlmsghdr *nlh;
2694         struct tcmsg *tcm;
2695         uint8_t *sp, *tun = NULL;
2696
2697         size += flow_tcf_get_items_and_size(attr, items, item_flags);
2698         size += flow_tcf_get_actions_and_size(actions, action_flags);
2699         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2700         if (!dev_flow) {
2701                 rte_flow_error_set(error, ENOMEM,
2702                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2703                                    "not enough memory to create E-Switch flow");
2704                 return NULL;
2705         }
2706         sp = (uint8_t *)(dev_flow + 1);
2707         if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2708                 sp = RTE_PTR_ALIGN
2709                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2710                 tun = sp;
2711                 sp += RTE_ALIGN_CEIL
2712                         (sizeof(struct flow_tcf_vxlan_encap),
2713                         MNL_ALIGNTO);
2714 #ifndef NDEBUG
2715                 size -= RTE_ALIGN_CEIL
2716                         (sizeof(struct flow_tcf_vxlan_encap),
2717                         MNL_ALIGNTO);
2718 #endif
2719         } else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2720                 sp = RTE_PTR_ALIGN
2721                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2722                 tun = sp;
2723                 sp += RTE_ALIGN_CEIL
2724                         (sizeof(struct flow_tcf_vxlan_decap),
2725                         MNL_ALIGNTO);
2726 #ifndef NDEBUG
2727                 size -= RTE_ALIGN_CEIL
2728                         (sizeof(struct flow_tcf_vxlan_decap),
2729                         MNL_ALIGNTO);
2730 #endif
2731         } else {
2732                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2733         }
2734         nlh = mnl_nlmsg_put_header(sp);
2735         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2736         *dev_flow = (struct mlx5_flow){
2737                 .tcf = (struct mlx5_flow_tcf){
2738 #ifndef NDEBUG
2739                         .nlsize = size - RTE_ALIGN_CEIL
2740                                 (sizeof(struct mlx5_flow),
2741                                  alignof(struct flow_tcf_tunnel_hdr)),
2742 #endif
2743                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2744                         .nlh = nlh,
2745                         .tcm = tcm,
2746                 },
2747         };
2748         if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2749                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2750         else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2751                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2752         /*
2753          * Generate a reasonably unique handle based on the address of the
2754          * target buffer.
2755          *
2756          * This is straightforward on 32-bit systems where the flow pointer can
2757          * be used directly. Otherwise, its least significant part is
2758          * taken after shifting it right by log2 of the previous power
2759          * of two of the pointed buffer size.
2760          */
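        /*
         * Illustrative example: for a 1500-byte buffer the previous
         * power of two is 1024, so a 64-bit pointer is shifted right by
         * rte_log2_u32(1024) = 10 bits before the 32-bit truncation.
         * Live buffers of this size never overlap, so their addresses
         * differ by at least the buffer size and map to distinct
         * shifted values; only the final truncation may still collide.
         */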
2761         if (sizeof(dev_flow) <= 4)
2762                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2763         else
2764                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2765                                        rte_log2_u32(rte_align32prevpow2(size)));
2766         return dev_flow;
2767 }
2768
2769 /**
2770  * Make adjustments for supporting count actions.
2771  *
2772  * @param[in] dev
2773  *   Pointer to the Ethernet device structure.
2774  * @param[in] dev_flow
2775  *   Pointer to mlx5_flow.
2776  * @param[out] error
2777  *   Pointer to error structure.
2778  *
2779  * @return
2780  *   0 on success, a negative errno value otherwise and rte_errno is set.
2781  */
2782 static int
2783 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2784                                   struct mlx5_flow *dev_flow,
2785                                   struct rte_flow_error *error)
2786 {
2787         struct rte_flow *flow = dev_flow->flow;
2788
2789         if (!flow->counter) {
2790                 flow->counter = flow_tcf_counter_new();
2791                 if (!flow->counter)
2792                         return rte_flow_error_set(error, rte_errno,
2793                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2794                                                   NULL,
2795                                                   "cannot get counter"
2796                                                   " context.");
2797         }
2798         return 0;
2799 }
2800
2801 /**
2802  * Convert VXLAN VNI to 32-bit integer.
2803  *
2804  * @param[in] vni
2805  *   VXLAN VNI in 24-bit wire format.
2806  *
2807  * @return
2808  *   VXLAN VNI as a 32-bit integer value in network endian.
2809  */
2810 static inline rte_be32_t
2811 vxlan_vni_as_be32(const uint8_t vni[3])
2812 {
2813         union {
2814                 uint8_t vni[4];
2815                 rte_be32_t dword;
2816         } ret = {
2817                 .vni = { 0, vni[0], vni[1], vni[2] },
2818         };
2819         return ret.dword;
2820 }
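/*
 * Example: vni = {0x12, 0x34, 0x56} yields the byte sequence
 * {0x00, 0x12, 0x34, 0x56}, i.e. VNI 0x123456 in network byte order,
 * which is the layout TCA_FLOWER_KEY_ENC_KEY_ID expects below.
 */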
2821
2822 /**
2823  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2824  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2825  * in the encapsulation parameters structure. The item must be prevalidated,
2826  * no validation checks are performed by this function.
2827  *
2828  * @param[in] spec
2829  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2830  * @param[in] mask
2831  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2832  * @param[out] encap
2833  *   Structure to fill the gathered MAC address data.
2834  */
2835 static void
2836 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2837                                const struct rte_flow_item_eth *mask,
2838                                struct flow_tcf_vxlan_encap *encap)
2839 {
2840         /* Item must be validated before. No redundant checks. */
2841         assert(spec);
2842         if (!mask || !memcmp(&mask->dst,
2843                              &rte_flow_item_eth_mask.dst,
2844                              sizeof(rte_flow_item_eth_mask.dst))) {
2845                 /*
2846                  * Ethernet addresses are not supported by
2847                  * tc as tunnel_key parameters. Destination
2848                  * address is needed to form encap packet
2849                  * header and retrieved by kernel from
2850                  * implicit sources (ARP table, etc),
2851                  * address masks are not supported at all.
2852                  */
2853                 encap->eth.dst = spec->dst;
2854                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2855         }
2856         if (!mask || !memcmp(&mask->src,
2857                              &rte_flow_item_eth_mask.src,
2858                              sizeof(rte_flow_item_eth_mask.src))) {
2859                 /*
2860                  * Ethernet addresses are not supported by
2861                  * tc as tunnel_key parameters. Source ethernet
2862                  * address is ignored anyway.
2863                  */
2864                 encap->eth.src = spec->src;
2865                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2866         }
2867 }
2868
2869 /**
2870  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2871  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2872  * in the encapsulation parameters structure. The item must be prevalidated,
2873  * no validation checks are performed by this function.
2874  *
2875  * @param[in] spec
2876  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2877  * @param[out] encap
2878  *   Structure to fill the gathered IPV4 address data.
2879  */
2880 static void
2881 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2882                                 struct flow_tcf_vxlan_encap *encap)
2883 {
2884         /* Item must be validated before. No redundant checks. */
2885         assert(spec);
2886         encap->ipv4.dst = spec->hdr.dst_addr;
2887         encap->ipv4.src = spec->hdr.src_addr;
2888         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2889                        FLOW_TCF_ENCAP_IPV4_DST;
2890 }
2891
2892 /**
2893  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2894  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2895  * in the encapsulation parameters structure. The item must be prevalidated,
2896  * no validation checks are performed by this function.
2897  *
2898  * @param[in] spec
2899  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2900  * @param[out] encap
2901  *   Structure to fill the gathered IPV6 address data.
2902  */
2903 static void
2904 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2905                                 struct flow_tcf_vxlan_encap *encap)
2906 {
2907         /* Item must be validated before. No redundant checks. */
2908         assert(spec);
2909         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2910         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2911         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2912                        FLOW_TCF_ENCAP_IPV6_DST;
2913 }
2914
2915 /**
2916  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2917  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2918  * in the encapsulation parameters structure. The item must be prevalidated,
2919  * no validation checks are performed by this function.
2920  *
2921  * @param[in] spec
2922  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2923  * @param[in] mask
2924  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2925  * @param[out] encap
2926  *   Structure to fill the gathered UDP port data.
2927  */
2928 static void
2929 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2930                                const struct rte_flow_item_udp *mask,
2931                                struct flow_tcf_vxlan_encap *encap)
2932 {
2933         assert(spec);
2934         encap->udp.dst = spec->hdr.dst_port;
2935         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2936         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2937                 encap->udp.src = spec->hdr.src_port;
2938                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2939         }
2940 }
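/*
 * Note on the mask convention above: an all-zero source port mask
 * explicitly requests an unspecified source port, so
 * FLOW_TCF_ENCAP_UDP_SRC stays unset and the kernel chooses the port
 * itself (typically derived from the inner packet headers for entropy).
 */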
2941
2942 /**
2943  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2944  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2945  * in the encapsulation parameters structure. The item must be prevalidated,
2946  * no validation checks are performed by this function.
2947  *
2948  * @param[in] spec
2949  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2950  * @param[out] encap
2951  *   Structure to fill the gathered VNI address data.
2952  */
2953 static void
2954 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2955                                struct flow_tcf_vxlan_encap *encap)
2956 {
2957         /* Item must be validated before. No redundant checks. */
2958         assert(spec);
2959         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2960         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2961 }
2962
2963 /**
2964  * Populate consolidated encapsulation object from list of pattern items.
2965  *
2966  * Helper function to process configuration of action such as
2967  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
2968  * validated, as there is no way to return a meaningful error.
2969  *
2970  * @param[in] action
2971  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2972  *   List of pattern items to gather data from.
2973  * @param[out] encap
2974  *   Structure to fill gathered data.
2975  */
2976 static void
2977 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
2978                            struct flow_tcf_vxlan_encap *encap)
2979 {
2980         union {
2981                 const struct rte_flow_item_eth *eth;
2982                 const struct rte_flow_item_ipv4 *ipv4;
2983                 const struct rte_flow_item_ipv6 *ipv6;
2984                 const struct rte_flow_item_udp *udp;
2985                 const struct rte_flow_item_vxlan *vxlan;
2986         } spec, mask;
2987         const struct rte_flow_item *items;
2988
2989         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2990         assert(action->conf);
2991
2992         items = ((const struct rte_flow_action_vxlan_encap *)
2993                                         action->conf)->definition;
2994         assert(items);
2995         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2996                 switch (items->type) {
2997                 case RTE_FLOW_ITEM_TYPE_VOID:
2998                         break;
2999                 case RTE_FLOW_ITEM_TYPE_ETH:
3000                         mask.eth = items->mask;
3001                         spec.eth = items->spec;
3002                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3003                                                        encap);
3004                         break;
3005                 case RTE_FLOW_ITEM_TYPE_IPV4:
3006                         spec.ipv4 = items->spec;
3007                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3008                         break;
3009                 case RTE_FLOW_ITEM_TYPE_IPV6:
3010                         spec.ipv6 = items->spec;
3011                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3012                         break;
3013                 case RTE_FLOW_ITEM_TYPE_UDP:
3014                         mask.udp = items->mask;
3015                         spec.udp = items->spec;
3016                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3017                                                        encap);
3018                         break;
3019                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3020                         spec.vxlan = items->spec;
3021                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3022                         break;
3023                 default:
3024                         assert(false);
3025                         DRV_LOG(WARNING,
3026                                 "unsupported item %p type %d,"
3027                                 " items must be validated"
3028                                 " before flow creation",
3029                                 (const void *)items, items->type);
3030                         encap->mask = 0;
3031                         return;
3032                 }
3033         }
3034 }
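/*
 * Illustrative example of the action configuration consumed above (all
 * names are local to this sketch):
 *
 *   struct rte_flow_item vxlan_def[] = {
 *           { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *   struct rte_flow_action_vxlan_encap conf = {
 *           .definition = vxlan_def,
 *   };
 *
 * Each helper above ORs its FLOW_TCF_ENCAP_* bit into encap->mask so
 * the Netlink translation knows which tunnel_key attributes to emit.
 */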
3035
3036 /**
3037  * Translate flow for Linux TC flower and construct Netlink message.
3038  *
3039  * @param[in] dev
3040  *   Pointer to the Ethernet device structure.
3041  * @param[in, out] dev_flow
3042  *   Pointer to the sub flow.
3043  * @param[in] attr
3044  *   Pointer to the flow attributes.
3045  * @param[in] items
3046  *   Pointer to the list of items.
3047  * @param[in] actions
3048  *   Pointer to the list of actions.
3049  * @param[out] error
3050  *   Pointer to the error structure.
3051  *
3052  * @return
3053  *   0 on success, a negative errno value otherwise and rte_errno is set.
3054  */
3055 static int
3056 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3057                    const struct rte_flow_attr *attr,
3058                    const struct rte_flow_item items[],
3059                    const struct rte_flow_action actions[],
3060                    struct rte_flow_error *error)
3061 {
3062         union {
3063                 const struct rte_flow_item_port_id *port_id;
3064                 const struct rte_flow_item_eth *eth;
3065                 const struct rte_flow_item_vlan *vlan;
3066                 const struct rte_flow_item_ipv4 *ipv4;
3067                 const struct rte_flow_item_ipv6 *ipv6;
3068                 const struct rte_flow_item_tcp *tcp;
3069                 const struct rte_flow_item_udp *udp;
3070                 const struct rte_flow_item_vxlan *vxlan;
3071         } spec, mask;
3072         union {
3073                 const struct rte_flow_action_port_id *port_id;
3074                 const struct rte_flow_action_jump *jump;
3075                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3076                 const struct rte_flow_action_of_set_vlan_vid *
3077                         of_set_vlan_vid;
3078                 const struct rte_flow_action_of_set_vlan_pcp *
3079                         of_set_vlan_pcp;
3080         } conf;
3081         union {
3082                 struct flow_tcf_tunnel_hdr *hdr;
3083                 struct flow_tcf_vxlan_decap *vxlan;
3084         } decap = {
3085                 .hdr = NULL,
3086         };
3087         union {
3088                 struct flow_tcf_tunnel_hdr *hdr;
3089                 struct flow_tcf_vxlan_encap *vxlan;
3090         } encap = {
3091                 .hdr = NULL,
3092         };
3093         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3094         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3095         struct tcmsg *tcm = dev_flow->tcf.tcm;
3096         uint32_t na_act_index_cur;
3097         bool eth_type_set = 0;
3098         bool vlan_present = 0;
3099         bool vlan_eth_type_set = 0;
3100         bool ip_proto_set = 0;
3101         struct nlattr *na_flower;
3102         struct nlattr *na_flower_act;
3103         struct nlattr *na_vlan_id = NULL;
3104         struct nlattr *na_vlan_priority = NULL;
3105         uint64_t item_flags = 0;
3106         int ret;
3107
3108         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3109                                                 PTOI_TABLE_SZ_MAX(dev)));
3110         if (dev_flow->tcf.tunnel) {
3111                 switch (dev_flow->tcf.tunnel->type) {
3112                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3113                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3114                         break;
3115                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3116                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3117                         break;
3118                 /* New tunnel actions can be added here. */
3119                 default:
3120                         assert(false);
3121                         break;
3122                 }
3123         }
3124         nlh = dev_flow->tcf.nlh;
3125         tcm = dev_flow->tcf.tcm;
3126         /* Prepare API must have been called beforehand. */
3127         assert(nlh != NULL && tcm != NULL);
3128         tcm->tcm_family = AF_UNSPEC;
3129         tcm->tcm_ifindex = ptoi[0].ifindex;
3130         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3131         /*
3132          * Priority cannot be zero to prevent the kernel from picking one
3133          * automatically.
3134          */
3135         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3136                                   RTE_BE16(ETH_P_ALL));
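        /*
         * E.g. (illustrative): attr->priority == 0 becomes TC priority 1
         * in the upper 16 bits of tcm_info, while the lower 16 bits
         * carry the ETH_P_ALL protocol the classifier applies to.
         */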
3137         if (attr->group > 0)
3138                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3139         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3140         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3141         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3142                 unsigned int i;
3143
3144                 switch (items->type) {
3145                 case RTE_FLOW_ITEM_TYPE_VOID:
3146                         break;
3147                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3148                         mask.port_id = flow_tcf_item_mask
3149                                 (items, &rte_flow_item_port_id_mask,
3150                                  &flow_tcf_mask_supported.port_id,
3151                                  &flow_tcf_mask_empty.port_id,
3152                                  sizeof(flow_tcf_mask_supported.port_id),
3153                                  error);
3154                         assert(mask.port_id);
3155                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3156                                 break;
3157                         spec.port_id = items->spec;
3158                         if (!mask.port_id->id)
3159                                 i = 0;
3160                         else
3161                                 for (i = 0; ptoi[i].ifindex; ++i)
3162                                         if (ptoi[i].port_id == spec.port_id->id)
3163                                                 break;
3164                         assert(ptoi[i].ifindex);
3165                         tcm->tcm_ifindex = ptoi[i].ifindex;
3166                         break;
3167                 case RTE_FLOW_ITEM_TYPE_ETH:
3168                         item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ?
3169                                       MLX5_FLOW_LAYER_INNER_L2 :
3170                                       MLX5_FLOW_LAYER_OUTER_L2;
3171                         mask.eth = flow_tcf_item_mask
3172                                 (items, &rte_flow_item_eth_mask,
3173                                  &flow_tcf_mask_supported.eth,
3174                                  &flow_tcf_mask_empty.eth,
3175                                  sizeof(flow_tcf_mask_supported.eth),
3176                                  error);
3177                         assert(mask.eth);
3178                         if (mask.eth == &flow_tcf_mask_empty.eth)
3179                                 break;
3180                         spec.eth = items->spec;
3181                         if (decap.vxlan &&
3182                             !(item_flags & MLX5_FLOW_LAYER_VXLAN)) {
3183                                 DRV_LOG(WARNING,
3184                                         "outer L2 addresses cannot be forced"
3185                                         " for vxlan decapsulation, parameter"
3186                                         " ignored");
3187                                 break;
3188                         }
3189                         if (mask.eth->type) {
3190                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3191                                                  spec.eth->type);
3192                                 eth_type_set = 1;
3193                         }
3194                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3195                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3196                                              ETHER_ADDR_LEN,
3197                                              spec.eth->dst.addr_bytes);
3198                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3199                                              ETHER_ADDR_LEN,
3200                                              mask.eth->dst.addr_bytes);
3201                         }
3202                         if (!is_zero_ether_addr(&mask.eth->src)) {
3203                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3204                                              ETHER_ADDR_LEN,
3205                                              spec.eth->src.addr_bytes);
3206                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3207                                              ETHER_ADDR_LEN,
3208                                              mask.eth->src.addr_bytes);
3209                         }
3210                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3211                         break;
3212                 case RTE_FLOW_ITEM_TYPE_VLAN:
3213                         assert(!encap.hdr);
3214                         assert(!decap.hdr);
3215                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3216                         mask.vlan = flow_tcf_item_mask
3217                                 (items, &rte_flow_item_vlan_mask,
3218                                  &flow_tcf_mask_supported.vlan,
3219                                  &flow_tcf_mask_empty.vlan,
3220                                  sizeof(flow_tcf_mask_supported.vlan),
3221                                  error);
3222                         assert(mask.vlan);
3223                         if (!eth_type_set)
3224                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3225                                                  RTE_BE16(ETH_P_8021Q));
3226                         eth_type_set = 1;
3227                         vlan_present = 1;
3228                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3229                                 break;
3230                         spec.vlan = items->spec;
3231                         if (mask.vlan->inner_type) {
3232                                 mnl_attr_put_u16(nlh,
3233                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3234                                                  spec.vlan->inner_type);
3235                                 vlan_eth_type_set = 1;
3236                         }
3237                         if (mask.vlan->tci & RTE_BE16(0xe000))
3238                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3239                                                 (rte_be_to_cpu_16
3240                                                  (spec.vlan->tci) >> 13) & 0x7);
3241                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3242                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3243                                                  rte_be_to_cpu_16
3244                                                  (spec.vlan->tci &
3245                                                   RTE_BE16(0x0fff)));
3246                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3247                         break;
3248                 case RTE_FLOW_ITEM_TYPE_IPV4:
3249                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3250                         mask.ipv4 = flow_tcf_item_mask
3251                                 (items, &rte_flow_item_ipv4_mask,
3252                                  &flow_tcf_mask_supported.ipv4,
3253                                  &flow_tcf_mask_empty.ipv4,
3254                                  sizeof(flow_tcf_mask_supported.ipv4),
3255                                  error);
3256                         assert(mask.ipv4);
3257                         spec.ipv4 = items->spec;
3258                         if (!decap.vxlan) {
3259                                 if (!eth_type_set && !vlan_eth_type_set)
3260                                         mnl_attr_put_u16
3261                                                 (nlh,
3262                                                  vlan_present ?
3263                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3264                                                  TCA_FLOWER_KEY_ETH_TYPE,
3265                                                  RTE_BE16(ETH_P_IP));
3266                                 eth_type_set = 1;
3267                                 vlan_eth_type_set = 1;
3268                                 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
3269                                         break;
3270                                 if (mask.ipv4->hdr.next_proto_id) {
3271                                         mnl_attr_put_u8
3272                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3273                                                  spec.ipv4->hdr.next_proto_id);
3274                                         ip_proto_set = 1;
3275                                 }
3276                         } else {
3277                                 assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
3278                         }
3279                         if (mask.ipv4->hdr.src_addr) {
3280                                 mnl_attr_put_u32
3281                                         (nlh, decap.vxlan ?
3282                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3283                                          TCA_FLOWER_KEY_IPV4_SRC,
3284                                          spec.ipv4->hdr.src_addr);
3285                                 mnl_attr_put_u32
3286                                         (nlh, decap.vxlan ?
3287                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3288                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3289                                          mask.ipv4->hdr.src_addr);
3290                         }
3291                         if (mask.ipv4->hdr.dst_addr) {
3292                                 mnl_attr_put_u32
3293                                         (nlh, decap.vxlan ?
3294                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3295                                          TCA_FLOWER_KEY_IPV4_DST,
3296                                          spec.ipv4->hdr.dst_addr);
3297                                 mnl_attr_put_u32
3298                                         (nlh, decap.vxlan ?
3299                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3300                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3301                                          mask.ipv4->hdr.dst_addr);
3302                         }
3303                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3304                         break;
3305                 case RTE_FLOW_ITEM_TYPE_IPV6:
3306                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3307                         mask.ipv6 = flow_tcf_item_mask
3308                                 (items, &rte_flow_item_ipv6_mask,
3309                                  &flow_tcf_mask_supported.ipv6,
3310                                  &flow_tcf_mask_empty.ipv6,
3311                                  sizeof(flow_tcf_mask_supported.ipv6),
3312                                  error);
3313                         assert(mask.ipv6);
3314                         spec.ipv6 = items->spec;
3315                         if (!decap.vxlan) {
3316                                 if (!eth_type_set || !vlan_eth_type_set) {
3317                                         mnl_attr_put_u16
3318                                                 (nlh,
3319                                                  vlan_present ?
3320                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3321                                                  TCA_FLOWER_KEY_ETH_TYPE,
3322                                                  RTE_BE16(ETH_P_IPV6));
3323                                 }
3324                                 eth_type_set = 1;
3325                                 vlan_eth_type_set = 1;
3326                                 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
3327                                         break;
3328                                 if (mask.ipv6->hdr.proto) {
3329                                         mnl_attr_put_u8
3330                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3331                                                  spec.ipv6->hdr.proto);
3332                                         ip_proto_set = 1;
3333                                 }
3334                         } else {
3335                                 assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
3336                         }
3337                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
3338                                 mnl_attr_put(nlh, decap.vxlan ?
3339                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3340                                              TCA_FLOWER_KEY_IPV6_SRC,
3341                                              IPV6_ADDR_LEN,
3342                                              spec.ipv6->hdr.src_addr);
3343                                 mnl_attr_put(nlh, decap.vxlan ?
3344                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3345                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3346                                              IPV6_ADDR_LEN,
3347                                              mask.ipv6->hdr.src_addr);
3348                         }
3349                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
3350                                 mnl_attr_put(nlh, decap.vxlan ?
3351                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3352                                              TCA_FLOWER_KEY_IPV6_DST,
3353                                              IPV6_ADDR_LEN,
3354                                              spec.ipv6->hdr.dst_addr);
3355                                 mnl_attr_put(nlh, decap.vxlan ?
3356                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3357                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3358                                              IPV6_ADDR_LEN,
3359                                              mask.ipv6->hdr.dst_addr);
3360                         }
3361                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3362                         break;
3363                 case RTE_FLOW_ITEM_TYPE_UDP:
3364                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
3365                         mask.udp = flow_tcf_item_mask
3366                                 (items, &rte_flow_item_udp_mask,
3367                                  &flow_tcf_mask_supported.udp,
3368                                  &flow_tcf_mask_empty.udp,
3369                                  sizeof(flow_tcf_mask_supported.udp),
3370                                  error);
3371                         assert(mask.udp);
3372                         spec.udp = items->spec;
3373                         if (!decap.vxlan) {
3374                                 if (!ip_proto_set)
3375                                         mnl_attr_put_u8
3376                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3377                                                 IPPROTO_UDP);
3378                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3379                                         break;
3380                         } else {
3381                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3382                                 decap.vxlan->udp_port =
3383                                         rte_be_to_cpu_16
3384                                                 (spec.udp->hdr.dst_port);
3385                         }
3386                         if (mask.udp->hdr.src_port) {
3387                                 mnl_attr_put_u16
3388                                         (nlh, decap.vxlan ?
3389                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3390                                          TCA_FLOWER_KEY_UDP_SRC,
3391                                          spec.udp->hdr.src_port);
3392                                 mnl_attr_put_u16
3393                                         (nlh, decap.vxlan ?
3394                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3395                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3396                                          mask.udp->hdr.src_port);
3397                         }
3398                         if (mask.udp->hdr.dst_port) {
3399                                 mnl_attr_put_u16
3400                                         (nlh, decap.vxlan ?
3401                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3402                                          TCA_FLOWER_KEY_UDP_DST,
3403                                          spec.udp->hdr.dst_port);
3404                                 mnl_attr_put_u16
3405                                         (nlh, decap.vxlan ?
3406                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3407                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3408                                          mask.udp->hdr.dst_port);
3409                         }
3410                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3411                         break;
3412                 case RTE_FLOW_ITEM_TYPE_TCP:
3413                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
3414                         mask.tcp = flow_tcf_item_mask
3415                                 (items, &rte_flow_item_tcp_mask,
3416                                  &flow_tcf_mask_supported.tcp,
3417                                  &flow_tcf_mask_empty.tcp,
3418                                  sizeof(flow_tcf_mask_supported.tcp),
3419                                  error);
3420                         assert(mask.tcp);
3421                         if (!ip_proto_set)
3422                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3423                                                 IPPROTO_TCP);
3424                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3425                                 break;
3426                         spec.tcp = items->spec;
3427                         if (mask.tcp->hdr.src_port) {
3428                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3429                                                  spec.tcp->hdr.src_port);
3430                                 mnl_attr_put_u16(nlh,
3431                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3432                                                  mask.tcp->hdr.src_port);
3433                         }
3434                         if (mask.tcp->hdr.dst_port) {
3435                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3436                                                  spec.tcp->hdr.dst_port);
3437                                 mnl_attr_put_u16(nlh,
3438                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3439                                                  mask.tcp->hdr.dst_port);
3440                         }
3441                         if (mask.tcp->hdr.tcp_flags) {
3442                                 mnl_attr_put_u16
3443                                         (nlh,
3444                                          TCA_FLOWER_KEY_TCP_FLAGS,
3445                                          rte_cpu_to_be_16
3446                                                 (spec.tcp->hdr.tcp_flags));
3447                                 mnl_attr_put_u16
3448                                         (nlh,
3449                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3450                                          rte_cpu_to_be_16
3451                                                 (mask.tcp->hdr.tcp_flags));
3452                         }
3453                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3454                         break;
3455                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3456                         assert(decap.vxlan);
3457                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3458                         spec.vxlan = items->spec;
3459                         mnl_attr_put_u32(nlh,
3460                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3461                                          vxlan_vni_as_be32(spec.vxlan->vni));
3462                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3463                         break;
3464                 default:
3465                         return rte_flow_error_set(error, ENOTSUP,
3466                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3467                                                   NULL, "item not supported");
3468                 }
3469         }
3470         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3471         na_act_index_cur = 1;
3472         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3473                 struct nlattr *na_act_index;
3474                 struct nlattr *na_act;
3475                 unsigned int vlan_act;
3476                 unsigned int i;
3477
3478                 switch (actions->type) {
3479                 case RTE_FLOW_ACTION_TYPE_VOID:
3480                         break;
3481                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3482                         conf.port_id = actions->conf;
3483                         if (conf.port_id->original)
3484                                 i = 0;
3485                         else
3486                                 for (i = 0; ptoi[i].ifindex; ++i)
3487                                         if (ptoi[i].port_id == conf.port_id->id)
3488                                                 break;
3489                         assert(ptoi[i].ifindex);
3490                         na_act_index =
3491                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3492                         assert(na_act_index);
3493                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3494                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3495                         assert(na_act);
3496                         if (encap.hdr) {
3497                                 assert(dev_flow->tcf.tunnel);
3498                                 dev_flow->tcf.tunnel->ifindex_ptr =
3499                                         &((struct tc_mirred *)
3500                                         mnl_attr_get_payload
3501                                         (mnl_nlmsg_get_payload_tail
3502                                                 (nlh)))->ifindex;
3503                         }
3504                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3505                                      sizeof(struct tc_mirred),
3506                                      &(struct tc_mirred){
3507                                         .action = TC_ACT_STOLEN,
3508                                         .eaction = TCA_EGRESS_REDIR,
3509                                         .ifindex = ptoi[i].ifindex,
3510                                      });
3511                         mnl_attr_nest_end(nlh, na_act);
3512                         mnl_attr_nest_end(nlh, na_act_index);
3513                         break;
3514                 case RTE_FLOW_ACTION_TYPE_JUMP:
3515                         conf.jump = actions->conf;
3516                         na_act_index =
3517                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3518                         assert(na_act_index);
3519                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3520                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3521                         assert(na_act);
3522                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3523                                      sizeof(struct tc_gact),
3524                                      &(struct tc_gact){
3525                                         .action = TC_ACT_GOTO_CHAIN |
3526                                                   conf.jump->group,
3527                                      });
3528                         mnl_attr_nest_end(nlh, na_act);
3529                         mnl_attr_nest_end(nlh, na_act_index);
3530                         break;
3531                 case RTE_FLOW_ACTION_TYPE_DROP:
3532                         na_act_index =
3533                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3534                         assert(na_act_index);
3535                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3536                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3537                         assert(na_act);
3538                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3539                                      sizeof(struct tc_gact),
3540                                      &(struct tc_gact){
3541                                         .action = TC_ACT_SHOT,
3542                                      });
3543                         mnl_attr_nest_end(nlh, na_act);
3544                         mnl_attr_nest_end(nlh, na_act_index);
3545                         break;
3546                 case RTE_FLOW_ACTION_TYPE_COUNT:
3547                         /*
3548                          * Driver adds the count action implicitly for
3549                          * each rule it creates.
3550                          */
3551                         ret = flow_tcf_translate_action_count(dev,
3552                                                               dev_flow, error);
3553                         if (ret < 0)
3554                                 return ret;
3555                         break;
3556                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3557                         conf.of_push_vlan = NULL;
3558                         vlan_act = TCA_VLAN_ACT_POP;
3559                         goto action_of_vlan;
3560                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3561                         conf.of_push_vlan = actions->conf;
3562                         vlan_act = TCA_VLAN_ACT_PUSH;
3563                         goto action_of_vlan;
3564                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3565                         conf.of_set_vlan_vid = actions->conf;
3566                         if (na_vlan_id)
3567                                 goto override_na_vlan_id;
3568                         vlan_act = TCA_VLAN_ACT_MODIFY;
3569                         goto action_of_vlan;
3570                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3571                         conf.of_set_vlan_pcp = actions->conf;
3572                         if (na_vlan_priority)
3573                                 goto override_na_vlan_priority;
3574                         vlan_act = TCA_VLAN_ACT_MODIFY;
3575                         goto action_of_vlan;
3576 action_of_vlan:
3577                         na_act_index =
3578                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3579                         assert(na_act_index);
3580                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3581                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3582                         assert(na_act);
3583                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3584                                      sizeof(struct tc_vlan),
3585                                      &(struct tc_vlan){
3586                                         .action = TC_ACT_PIPE,
3587                                         .v_action = vlan_act,
3588                                      });
3589                         if (vlan_act == TCA_VLAN_ACT_POP) {
3590                                 mnl_attr_nest_end(nlh, na_act);
3591                                 mnl_attr_nest_end(nlh, na_act_index);
3592                                 break;
3593                         }
3594                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3595                                 mnl_attr_put_u16(nlh,
3596                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3597                                                  conf.of_push_vlan->ethertype);
3598                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3599                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3600                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3601                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3602                         mnl_attr_nest_end(nlh, na_act);
3603                         mnl_attr_nest_end(nlh, na_act_index);
3604                         if (actions->type ==
3605                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3606 override_na_vlan_id:
3607                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3608                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3609                                         rte_be_to_cpu_16
3610                                         (conf.of_set_vlan_vid->vlan_vid);
3611                         } else if (actions->type ==
3612                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3613 override_na_vlan_priority:
3614                                 na_vlan_priority->nla_type =
3615                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3616                                 *(uint8_t *)mnl_attr_get_payload
3617                                         (na_vlan_priority) =
3618                                         conf.of_set_vlan_pcp->vlan_pcp;
3619                         }
3620                         break;
3621                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3622                         assert(decap.vxlan);
3623                         assert(dev_flow->tcf.tunnel);
3624                         dev_flow->tcf.tunnel->ifindex_ptr =
3625                                 (unsigned int *)&tcm->tcm_ifindex;
3626                         na_act_index =
3627                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3628                         assert(na_act_index);
3629                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3630                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3631                         assert(na_act);
3632                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3633                                 sizeof(struct tc_tunnel_key),
3634                                 &(struct tc_tunnel_key){
3635                                         .action = TC_ACT_PIPE,
3636                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3637                                         });
3638                         mnl_attr_nest_end(nlh, na_act);
3639                         mnl_attr_nest_end(nlh, na_act_index);
3640                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3641                         break;
3642                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3643                         assert(encap.vxlan);
3644                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3645                         na_act_index =
3646                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3647                         assert(na_act_index);
3648                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3649                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3650                         assert(na_act);
3651                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3652                                 sizeof(struct tc_tunnel_key),
3653                                 &(struct tc_tunnel_key){
3654                                         .action = TC_ACT_PIPE,
3655                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3656                                         });
3657                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3658                                 mnl_attr_put_u16(nlh,
3659                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3660                                          encap.vxlan->udp.dst);
3661                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3662                                 mnl_attr_put_u32(nlh,
3663                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3664                                          encap.vxlan->ipv4.src);
3665                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3666                                 mnl_attr_put_u32(nlh,
3667                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3668                                          encap.vxlan->ipv4.dst);
3669                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3670                                 mnl_attr_put(nlh,
3671                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3672                                          sizeof(encap.vxlan->ipv6.src),
3673                                          &encap.vxlan->ipv6.src);
3674                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3675                                 mnl_attr_put(nlh,
3676                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3677                                          sizeof(encap.vxlan->ipv6.dst),
3678                                          &encap.vxlan->ipv6.dst);
3679                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3680                                 mnl_attr_put_u32(nlh,
3681                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3682                                          vxlan_vni_as_be32
3683                                                 (encap.vxlan->vxlan.vni));
3684                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3685                         mnl_attr_nest_end(nlh, na_act);
3686                         mnl_attr_nest_end(nlh, na_act_index);
3687                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3688                         break;
3689                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3690                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3691                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3692                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3693                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3694                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3695                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3696                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3697                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3698                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3699                         na_act_index =
3700                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3701                         flow_tcf_create_pedit_mnl_msg(nlh,
3702                                                       &actions, item_flags);
3703                         mnl_attr_nest_end(nlh, na_act_index);
3704                         break;
3705                 default:
3706                         return rte_flow_error_set(error, ENOTSUP,
3707                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3708                                                   actions,
3709                                                   "action not supported");
3710                 }
3711         }
3712         assert(na_flower);
3713         assert(na_flower_act);
3714         mnl_attr_nest_end(nlh, na_flower_act);
3715         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3716                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3717         mnl_attr_nest_end(nlh, na_flower);
3718         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3719                 dev_flow->tcf.tunnel->ifindex_org =
3720                         *dev_flow->tcf.tunnel->ifindex_ptr;
3721         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3722         return 0;
3723 }
3724
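/*
 * For reference, a rule translated by the routine above with VXLAN
 * encapsulation roughly corresponds to the following tc command
 * (illustrative sketch only; device names and addresses are hypothetical):
 *
 *   tc filter add dev <port> protocol ip ingress prio 1 flower skip_sw \
 *      <matches> \
 *      action tunnel_key set src_ip 10.1.1.1 dst_ip 10.1.1.2 \
 *             dst_port 4789 id 100 nocsum \
 *      action mirred egress redirect dev <vtep>
 */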
3725 /**
3726  * Send Netlink message with acknowledgment.
3727  *
3728  * @param tcf
3729  *   Flow context to use.
3730  * @param nlh
3731  *   Message to send. This function always raises the NLM_F_ACK flag before
3732  *   sending.
3733  * @param[in] msglen
3734  *   Message length. The message buffer may contain multiple commands, so
3735  *   the nlmsg_len field does not always correspond to the actual length.
3736  *   If 0 is specified, the nlmsg_len field in the header is used instead.
3737  * @param[in] cb
3738  *   Callback handler for received message.
3739  * @param[in] arg
3740  *   Context pointer for callback handler.
3741  *
3742  * @return
3743  *   0 on success, a negative errno value otherwise and rte_errno is set.
3744  */
3745 static int
3746 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3747                 struct nlmsghdr *nlh,
3748                 uint32_t msglen,
3749                 mnl_cb_t cb, void *arg)
3750 {
3751         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3752         uint32_t seq = tcf->seq++;
3753         int err, ret;
3754
3755         assert(tcf->nl);
3756         assert(tcf->buf);
3757         if (!seq)
3758                 /* seq 0 is reserved for kernel event-driven notifications. */
3759                 seq = tcf->seq++;
3760         nlh->nlmsg_seq = seq;
3761         if (!msglen) {
3762                 msglen = nlh->nlmsg_len;
3763                 nlh->nlmsg_flags |= NLM_F_ACK;
3764         }
3765         ret = mnl_socket_sendto(tcf->nl, nlh, msglen);
3766         err = (ret <= 0) ? errno : 0;
3767         nlh = (struct nlmsghdr *)(tcf->buf);
3768         /*
3769          * The following loop postpones non-fatal errors until multipart
3770          * messages are complete.
3771          */
3772         if (ret > 0)
3773                 while (true) {
3774                         ret = mnl_socket_recvfrom(tcf->nl, tcf->buf,
3775                                                   tcf->buf_size);
3776                         if (ret < 0) {
3777                                 err = errno;
3778                                 if (err != ENOSPC)
3779                                         break;
3780                         }
3781                         if (!err) {
3782                                 ret = mnl_cb_run(nlh, ret, seq, portid,
3783                                                  cb, arg);
3784                                 if (ret < 0) {
3785                                         err = errno;
3786                                         break;
3787                                 }
3788                         }
3789                         /* Keep receiving until the end of the multipart message. */
3790                         if (!(nlh->nlmsg_flags & NLM_F_MULTI) ||
3791                               nlh->nlmsg_type == NLMSG_DONE)
3792                                 break;
3793                 }
3794         if (!err)
3795                 return 0;
3796         rte_errno = err;
3797         return -err;
3798 }
3799
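/*
 * Minimal usage sketch for flow_tcf_nl_ack(), not referenced by the
 * driver code: dumps the kernel link table through the shared context
 * buffer. The function name is hypothetical and shown for illustration
 * of the calling convention only.
 */
static int __rte_unused
flow_tcf_nl_ack_sketch(struct mlx5_flow_tcf_context *tcf)
{
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(tcf->buf);
	struct ifinfomsg *ifm;

	nlh->nlmsg_type = RTM_GETLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;
	/* msglen 0: take the length from nlmsg_len and request an ack. */
	return flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
}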
3800 #define MNL_BUF_EXTRA_SPACE 16
3801 #define MNL_REQUEST_SIZE_MIN 256
3802 #define MNL_REQUEST_SIZE_MAX 2048
3803 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3804                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
3805
3806 /* Data structures used by flow_tcf_xxx_cb() routines. */
3807 struct tcf_nlcb_buf {
3808         LIST_ENTRY(tcf_nlcb_buf) next;
3809         uint32_t size;
3810         alignas(struct nlmsghdr)
3811         uint8_t msg[]; /**< Netlink message data. */
3812 };
3813
3814 struct tcf_nlcb_context {
3815         unsigned int ifindex; /**< Base interface index. */
3816         uint32_t bufsize;
3817         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3818 };
3819
3820 /**
3821  * Allocate space for a netlink command in the buffer list.
3822  *
3823  * @param[in, out] ctx
3824  *   Pointer to callback context with command buffers list.
3825  * @param[in] size
3826  *   Required size of data buffer to be allocated.
3827  *
3828  * @return
3829  *   Pointer to allocated memory, aligned as a message header.
3830  *   NULL if an error occurred.
3831  */
3832 static struct nlmsghdr *
3833 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3834 {
3835         struct tcf_nlcb_buf *buf;
3836         struct nlmsghdr *nlh;
3837
3838         size = NLMSG_ALIGN(size);
3839         buf = LIST_FIRST(&ctx->nlbuf);
3840         if (buf && (buf->size + size) <= ctx->bufsize) {
3841                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3842                 buf->size += size;
3843                 return nlh;
3844         }
3845         if (size > ctx->bufsize) {
3846                 DRV_LOG(WARNING, "netlink: requested command buffer is too long");
3847                 return NULL;
3848         }
3849         buf = rte_malloc(__func__,
3850                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3851                         alignof(struct tcf_nlcb_buf));
3852         if (!buf) {
3853                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3854                 return NULL;
3855         }
3856         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3857         buf->size = size;
3858         nlh = (struct nlmsghdr *)&buf->msg[0];
3859         return nlh;
3860 }
3861
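/*
 * Illustrative sketch (hypothetical helper, not referenced by the
 * driver): reserves room for one RTM_DELLINK command in the callback
 * context and fills its header. The command is actually emitted later
 * by flow_tcf_send_nlcmd().
 */
static struct nlmsghdr * __rte_unused
flow_tcf_nlcmd_sketch(struct tcf_nlcb_context *ctx, unsigned int ifindex)
{
	struct nlmsghdr *cmd;
	struct ifinfomsg *ifm;

	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
					MNL_ALIGN(sizeof(struct ifinfomsg)));
	if (!cmd)
		return NULL;
	cmd = mnl_nlmsg_put_header(cmd);
	cmd->nlmsg_type = RTM_DELLINK;
	cmd->nlmsg_flags = NLM_F_REQUEST;
	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;
	ifm->ifi_index = ifindex;
	return cmd;
}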
3862 /**
3863  * Set the NLM_F_ACK flag in the last netlink command in the buffer.
3864  * Only the last command in the buffer is acknowledged by the kernel.
3865  *
3866  * @param[in, out] buf
3867  *   Pointer to buffer with netlink commands.
3868  */
3869 static void
3870 flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
3871 {
3872         struct nlmsghdr *nlh;
3873         uint32_t size = 0;
3874
3875         assert(buf->size);
3876         do {
3877                 nlh = (struct nlmsghdr *)&buf->msg[size];
3878                 size += NLMSG_ALIGN(nlh->nlmsg_len);
3879                 if (size >= buf->size) {
3880                         nlh->nlmsg_flags |= NLM_F_ACK;
3881                         break;
3882                 }
3883         } while (true);
3884 }
3885
3886 /**
3887  * Send the buffers with prepared netlink commands. Scans the list and
3888  * sends all found buffers. Buffers are sent and freed regardless of
3889  * send errors in order to prevent memory leaks.
3890  *
3891  * @param[in] tcf
3892  *   Context object initialized by mlx5_flow_tcf_context_create().
3893  * @param[in, out] ctx
3894  *   Pointer to callback context with command buffers list.
3895  *
3896  * @return
3897  *   Zero value on success, negative errno value otherwise
3898  *   and rte_errno is set.
3899  */
3900 static int
3901 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
3902                     struct tcf_nlcb_context *ctx)
3903 {
3904         struct tcf_nlcb_buf *bc, *bn;
3905         struct nlmsghdr *nlh;
3906         int ret = 0;
3907
3908         bc = LIST_FIRST(&ctx->nlbuf);
3909         while (bc) {
3910                 int rc;
3911
3912                 bn = LIST_NEXT(bc, next);
3913                 if (bc->size) {
3914                         flow_tcf_setack_nlcmd(bc);
3915                         nlh = (struct nlmsghdr *)&bc->msg;
3916                         rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL);
3917                         if (rc && !ret)
3918                                 ret = rc;
3919                 }
3920                 rte_free(bc);
3921                 bc = bn;
3922         }
3923         LIST_INIT(&ctx->nlbuf);
3924         return ret;
3925 }
3926
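/*
 * Hedged sketch of how the helpers above are combined; "collect_cb"
 * stands for one of the flow_tcf_collect_*_cb() callbacks below:
 *
 *   struct tcf_nlcb_context ctx = {
 *           .ifindex = ifindex,
 *           .bufsize = MNL_REQUEST_SIZE,
 *           .nlbuf = LIST_HEAD_INITIALIZER(),
 *   };
 *
 *   ... build an RTM_GET* dump request in tcf->buf ...
 *   flow_tcf_nl_ack(tcf, nlh, 0, collect_cb, &ctx);
 *   flow_tcf_send_nlcmd(tcf, &ctx);
 */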
3927 /**
3928  * Collect local IP address rules with the scope link attribute on the
3929  * specified network device. This is a callback routine invoked by libmnl
3930  * mnl_cb_run() in a loop for every message in the received packet.
3931  *
3932  * @param[in] nlh
3933  *   Pointer to reply header.
3934  * @param[in, out] arg
3935  *   Opaque data pointer for this callback.
3936  *
3937  * @return
3938  *   A positive, nonzero value on success, negative errno value otherwise
3939  *   and rte_errno is set.
3940  */
3941 static int
3942 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
3943 {
3944         struct tcf_nlcb_context *ctx = arg;
3945         struct nlmsghdr *cmd;
3946         struct ifaddrmsg *ifa;
3947         struct nlattr *na;
3948         struct nlattr *na_local = NULL;
3949         struct nlattr *na_peer = NULL;
3950         unsigned char family;
3951
3952         if (nlh->nlmsg_type != RTM_NEWADDR) {
3953                 rte_errno = EINVAL;
3954                 return -rte_errno;
3955         }
3956         ifa = mnl_nlmsg_get_payload(nlh);
3957         family = ifa->ifa_family;
3958         if (ifa->ifa_index != ctx->ifindex ||
3959             ifa->ifa_scope != RT_SCOPE_LINK ||
3960             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
3961             (family != AF_INET && family != AF_INET6))
3962                 return 1;
3963         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
3964                 switch (mnl_attr_get_type(na)) {
3965                 case IFA_LOCAL:
3966                         na_local = na;
3967                         break;
3968                 case IFA_ADDRESS:
3969                         na_peer = na;
3970                         break;
3971                 }
3972                 if (na_local && na_peer)
3973                         break;
3974         }
3975         if (!na_local || !na_peer)
3976                 return 1;
3977         /* Local rule found with scope link, permanent and assigned peer. */
3978         cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
3979                                         MNL_ALIGN(sizeof(struct ifaddrmsg)) +
3980                                         (family == AF_INET6
3981                                         ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
3982                                         : 2 * SZ_NLATTR_TYPE_OF(uint32_t)));
3983         if (!cmd) {
3984                 rte_errno = ENOMEM;
3985                 return -rte_errno;
3986         }
3987         cmd = mnl_nlmsg_put_header(cmd);
3988         cmd->nlmsg_type = RTM_DELADDR;
3989         cmd->nlmsg_flags = NLM_F_REQUEST;
3990         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
3991         ifa->ifa_flags = IFA_F_PERMANENT;
3992         ifa->ifa_scope = RT_SCOPE_LINK;
3993         ifa->ifa_index = ctx->ifindex;
3994         if (family == AF_INET) {
3995                 ifa->ifa_family = AF_INET;
3996                 ifa->ifa_prefixlen = 32;
3997                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
3998                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
3999         } else {
4000                 ifa->ifa_family = AF_INET6;
4001                 ifa->ifa_prefixlen = 128;
4002                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4003                         mnl_attr_get_payload(na_local));
4004                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4005                         mnl_attr_get_payload(na_peer));
4006         }
4007         return 1;
4008 }
4009
4010 /**
4011  * Clean up the local IP addresses on the outer interface.
4012  *
4013  * @param[in] tcf
4014  *   Context object initialized by mlx5_flow_tcf_context_create().
4015  * @param[in] ifindex
4016  *   Network interface index to perform cleanup on.
4017  */
4018 static void
4019 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4020                             unsigned int ifindex)
4021 {
4022         struct nlmsghdr *nlh;
4023         struct ifaddrmsg *ifa;
4024         struct tcf_nlcb_context ctx = {
4025                 .ifindex = ifindex,
4026                 .bufsize = MNL_REQUEST_SIZE,
4027                 .nlbuf = LIST_HEAD_INITIALIZER(),
4028         };
4029         int ret;
4030
4031         assert(ifindex);
4032         /*
4033          * Seek and destroy leftover local IP addresses with
4034          * the matching property "scope link".
4035          */
4036         nlh = mnl_nlmsg_put_header(tcf->buf);
4037         nlh->nlmsg_type = RTM_GETADDR;
4038         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4039         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4040         ifa->ifa_family = AF_UNSPEC;
4041         ifa->ifa_index = ifindex;
4042         ifa->ifa_scope = RT_SCOPE_LINK;
4043         ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
4044         if (ret)
4045                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4046         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4047         if (ret)
4048                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4049 }
4050
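/*
 * The cleanup above is roughly equivalent to this iproute2 sequence
 * (illustrative only; names and addresses are hypothetical):
 *
 *   ip addr show dev <ifouter> scope link
 *   ip addr del <src_ip> peer <dst_ip> dev <ifouter> scope link
 */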
4051 /**
4052  * Collect permanent neigh rules on the specified network device.
4053  * This is a callback routine invoked by libmnl mnl_cb_run() in a loop
4054  * for every message in the received packet.
4055  *
4056  * @param[in] nlh
4057  *   Pointer to reply header.
4058  * @param[in, out] arg
4059  *   Opaque data pointer for this callback.
4060  *
4061  * @return
4062  *   A positive, nonzero value on success, negative errno value otherwise
4063  *   and rte_errno is set.
4064  */
4065 static int
4066 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4067 {
4068         struct tcf_nlcb_context *ctx = arg;
4069         struct nlmsghdr *cmd;
4070         struct ndmsg *ndm;
4071         struct nlattr *na;
4072         struct nlattr *na_ip = NULL;
4073         struct nlattr *na_mac = NULL;
4074         unsigned char family;
4075
4076         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4077                 rte_errno = EINVAL;
4078                 return -rte_errno;
4079         }
4080         ndm = mnl_nlmsg_get_payload(nlh);
4081         family = ndm->ndm_family;
4082         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4083            !(ndm->ndm_state & NUD_PERMANENT) ||
4084            (family != AF_INET && family != AF_INET6))
4085                 return 1;
4086         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4087                 switch (mnl_attr_get_type(na)) {
4088                 case NDA_DST:
4089                         na_ip = na;
4090                         break;
4091                 case NDA_LLADDR:
4092                         na_mac = na;
4093                         break;
4094                 }
4095                 if (na_mac && na_ip)
4096                         break;
4097         }
4098         if (!na_mac || !na_ip)
4099                 return 1;
4100         /* Neigh rule with permanent attribute found. */
4101         cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
4102                                         MNL_ALIGN(sizeof(struct ndmsg)) +
4103                                         SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4104                                         (family == AF_INET6
4105                                         ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4106                                         : SZ_NLATTR_TYPE_OF(uint32_t)));
4107         if (!cmd) {
4108                 rte_errno = ENOMEM;
4109                 return -rte_errno;
4110         }
4111         cmd = mnl_nlmsg_put_header(cmd);
4112         cmd->nlmsg_type = RTM_DELNEIGH;
4113         cmd->nlmsg_flags = NLM_F_REQUEST;
4114         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4115         ndm->ndm_ifindex = ctx->ifindex;
4116         ndm->ndm_state = NUD_PERMANENT;
4117         ndm->ndm_flags = 0;
4118         ndm->ndm_type = 0;
4119         if (family == AF_INET) {
4120                 ndm->ndm_family = AF_INET;
4121                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4122         } else {
4123                 ndm->ndm_family = AF_INET6;
4124                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4125                              mnl_attr_get_payload(na_ip));
4126         }
4127         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4128                      mnl_attr_get_payload(na_mac));
4129         return 1;
4130 }
4131
4132 /**
4133  * Clean up the neigh rules on the outer interface.
4134  *
4135  * @param[in] tcf
4136  *   Context object initialized by mlx5_flow_tcf_context_create().
4137  * @param[in] ifindex
4138  *   Network interface index to perform cleanup on.
4139  */
4140 static void
4141 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4142                             unsigned int ifindex)
4143 {
4144         struct nlmsghdr *nlh;
4145         struct ndmsg *ndm;
4146         struct tcf_nlcb_context ctx = {
4147                 .ifindex = ifindex,
4148                 .bufsize = MNL_REQUEST_SIZE,
4149                 .nlbuf = LIST_HEAD_INITIALIZER(),
4150         };
4151         int ret;
4152
4153         assert(ifindex);
4154         /* Seek and destroy leftover neigh rules. */
4155         nlh = mnl_nlmsg_put_header(tcf->buf);
4156         nlh->nlmsg_type = RTM_GETNEIGH;
4157         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4158         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4159         ndm->ndm_family = AF_UNSPEC;
4160         ndm->ndm_ifindex = ifindex;
4161         ndm->ndm_state = NUD_PERMANENT;
4162         ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
4163         if (ret)
4164                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4165         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4166         if (ret)
4167                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4168 }
4169
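/*
 * Roughly equivalent iproute2 sequence (illustrative only; names and
 * addresses are hypothetical):
 *
 *   ip neigh show dev <ifouter> nud permanent
 *   ip neigh del <dst_ip> lladdr <dst_mac> dev <ifouter>
 */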
4170 /**
4171  * Collect indices of VXLAN encap/decap interfaces associated with the
4172  * device. This is a callback routine invoked by libmnl mnl_cb_run() in
4173  * a loop for every message in the received packet.
4174  *
4175  * @param[in] nlh
4176  *   Pointer to reply header.
4177  * @param[in, out] arg
4178  *   Opaque data pointer for this callback.
4179  *
4180  * @return
4181  *   A positive, nonzero value on success, negative errno value otherwise
4182  *   and rte_errno is set.
4183  */
4184 static int
4185 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4186 {
4187         struct tcf_nlcb_context *ctx = arg;
4188         struct nlmsghdr *cmd;
4189         struct ifinfomsg *ifm;
4190         struct nlattr *na;
4191         struct nlattr *na_info = NULL;
4192         struct nlattr *na_vxlan = NULL;
4193         bool found = false;
4194         unsigned int vxindex;
4195
4196         if (nlh->nlmsg_type != RTM_NEWLINK) {
4197                 rte_errno = EINVAL;
4198                 return -rte_errno;
4199         }
4200         ifm = mnl_nlmsg_get_payload(nlh);
4201         if (!ifm->ifi_index) {
4202                 rte_errno = EINVAL;
4203                 return -rte_errno;
4204         }
4205         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4206                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4207                         na_info = na;
4208                         break;
4209                 }
4210         if (!na_info)
4211                 return 1;
4212         mnl_attr_for_each_nested(na, na_info) {
4213                 switch (mnl_attr_get_type(na)) {
4214                 case IFLA_INFO_KIND:
4215                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4216                                      mnl_attr_get_len(na)))
4217                                 found = true;
4218                         break;
4219                 case IFLA_INFO_DATA:
4220                         na_vxlan = na;
4221                         break;
4222                 }
4223                 if (found && na_vxlan)
4224                         break;
4225         }
4226         if (!found || !na_vxlan)
4227                 return 1;
4228         found = false;
4229         mnl_attr_for_each_nested(na, na_vxlan) {
4230                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4231                     mnl_attr_get_u32(na) == ctx->ifindex) {
4232                         found = true;
4233                         break;
4234                 }
4235         }
4236         if (!found)
4237                 return 1;
4238         /* Attached VXLAN device found, store the command to delete. */
4239         vxindex = ifm->ifi_index;
4240         cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
4241                                         MNL_ALIGN(sizeof(struct ifinfomsg)));
4242         if (!cmd) {
4243                 rte_errno = ENOMEM;
4244                 return -rte_errno;
4245         }
4246         cmd = mnl_nlmsg_put_header(cmd);
4247         cmd->nlmsg_type = RTM_DELLINK;
4248         cmd->nlmsg_flags = NLM_F_REQUEST;
4249         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4250         ifm->ifi_family = AF_UNSPEC;
4251         ifm->ifi_index = vxindex;
4252         return 1;
4253 }
4254
4255 /**
4256  * Clean up the outer interface. Removes all found VXLAN devices
4257  * attached to the specified index, flushes the neigh and local IP
4258  * databases.
4259  *
4260  * @param[in] tcf
4261  *   Context object initialized by mlx5_flow_tcf_context_create().
4262  * @param[in] ifindex
4263  *   Network interface index to perform cleanup on.
4264  */
4265 static void
4266 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4267                             unsigned int ifindex)
4268 {
4269         struct nlmsghdr *nlh;
4270         struct ifinfomsg *ifm;
4271         struct tcf_nlcb_context ctx = {
4272                 .ifindex = ifindex,
4273                 .bufsize = MNL_REQUEST_SIZE,
4274                 .nlbuf = LIST_HEAD_INITIALIZER(),
4275         };
4276         int ret;
4277
4278         assert(ifindex);
4279         /*
4280          * Seek and destroy leftover VXLAN encap/decap interfaces with
4281          * matching properties.
4282          */
4283         nlh = mnl_nlmsg_put_header(tcf->buf);
4284         nlh->nlmsg_type = RTM_GETLINK;
4285         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4286         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4287         ifm->ifi_family = AF_UNSPEC;
4288         ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx);
4289         if (ret)
4290                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4291         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4292         if (ret)
4293                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4294 }
4295
4296 /**
4297  * Emit Netlink message to add/remove local address to the outer device.
4298  * The address being added is visible within the link only (scope link).
4299  *
4300  * Note that an implicit route is maintained by the kernel due to the
4301  * presence of a peer address (IFA_ADDRESS).
4302  *
4303  * These rules are used for encapsulation only and allow assigning
4304  * the outer tunnel source IP address.
4305  *
4306  * @param[in] tcf
4307  *   Libmnl socket context object.
4308  * @param[in] encap
4309  *   Encapsulation properties (source address and its peer).
4310  * @param[in] ifindex
4311  *   Network interface to apply rule.
4312  * @param[in] enable
4313  *   Toggle between add and remove.
4314  * @param[out] error
4315  *   Perform verbose error reporting if not NULL.
4316  *
4317  * @return
4318  *   0 on success, a negative errno value otherwise and rte_errno is set.
4319  */
4320 static int
4321 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4322                     const struct flow_tcf_vxlan_encap *encap,
4323                     unsigned int ifindex,
4324                     bool enable,
4325                     struct rte_flow_error *error)
4326 {
4327         struct nlmsghdr *nlh;
4328         struct ifaddrmsg *ifa;
4329         alignas(struct nlmsghdr)
4330         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4331
4332         nlh = mnl_nlmsg_put_header(buf);
4333         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4334         nlh->nlmsg_flags =
4335                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4336         nlh->nlmsg_seq = 0;
4337         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4338         ifa->ifa_flags = IFA_F_PERMANENT;
4339         ifa->ifa_scope = RT_SCOPE_LINK;
4340         ifa->ifa_index = ifindex;
4341         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4342                 ifa->ifa_family = AF_INET;
4343                 ifa->ifa_prefixlen = 32;
4344                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4345                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4346                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4347                                               encap->ipv4.dst);
4348         } else {
4349                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4350                 ifa->ifa_family = AF_INET6;
4351                 ifa->ifa_prefixlen = 128;
4352                 mnl_attr_put(nlh, IFA_LOCAL,
4353                                   sizeof(encap->ipv6.src),
4354                                   &encap->ipv6.src);
4355                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4356                         mnl_attr_put(nlh, IFA_ADDRESS,
4357                                           sizeof(encap->ipv6.dst),
4358                                           &encap->ipv6.dst);
4359         }
4360         if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
4361                 return 0;
4362         return rte_flow_error_set(error, rte_errno,
4363                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4364                                   "netlink: cannot complete IFA request"
4365                                   " (ip addr add)");
4366 }
4367
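/*
 * With enable == true, the message composed above corresponds to, e.g.
 * (hedged example; addresses are made up):
 *
 *   ip addr add 10.1.1.1/32 peer 10.1.1.2 scope link dev <ifouter>
 */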
4368 /**
4369  * Emit Netlink message to add/remove neighbor.
4370  *
4371  * @param[in] tcf
4372  *   Libmnl socket context object.
4373  * @param[in] encap
4374  *   Encapsulation properties (destination address).
4375  * @param[in] ifindex
4376  *   Network interface.
4377  * @param[in] enable
4378  *   Toggle between add and remove.
4379  * @param[out] error
4380  *   Perform verbose error reporting if not NULL.
4381  *
4382  * @return
4383  *   0 on success, a negative errno value otherwise and rte_errno is set.
4384  */
4385 static int
4386 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4387                      const struct flow_tcf_vxlan_encap *encap,
4388                      unsigned int ifindex,
4389                      bool enable,
4390                      struct rte_flow_error *error)
4391 {
4392         struct nlmsghdr *nlh;
4393         struct ndmsg *ndm;
4394         alignas(struct nlmsghdr)
4395         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4396
4397         nlh = mnl_nlmsg_put_header(buf);
4398         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4399         nlh->nlmsg_flags =
4400                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4401         nlh->nlmsg_seq = 0;
4402         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4403         ndm->ndm_ifindex = ifindex;
4404         ndm->ndm_state = NUD_PERMANENT;
4405         ndm->ndm_flags = 0;
4406         ndm->ndm_type = 0;
4407         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4408                 ndm->ndm_family = AF_INET;
4409                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4410         } else {
4411                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4412                 ndm->ndm_family = AF_INET6;
4413                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4414                                                  &encap->ipv6.dst);
4415         }
4416         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4417                 DRV_LOG(WARNING,
4418                         "outer ethernet source address cannot be "
4419                         "forced for VXLAN encapsulation");
4420         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4421                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4422                                                     &encap->eth.dst);
4423         if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
4424                 return 0;
4425         return rte_flow_error_set(error, rte_errno,
4426                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4427                                   "netlink: cannot complete ND request"
4428                                   " (ip neigh)");
4429 }
4430
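/*
 * With enable == true, the message composed above corresponds to, e.g.
 * (hedged example; addresses are made up):
 *
 *   ip neigh add 10.1.1.2 lladdr 00:11:22:33:44:55 dev <ifouter> \
 *      nud permanent
 */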
4431 /**
4432  * Manage the local IP addresses and their peer IP addresses on the
4433  * outer interface for encapsulation purposes. The kernel searches the
4434  * appropriate device for tunnel egress traffic using the outer source
4435  * IP, this IP should be assigned to the outer network device, otherwise
4436  * kernel rejects the rule.
4437  *
4438  * Adds or removes the addresses using the Netlink command like this:
4439  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4440  *
4441  * The addresses are local to the netdev ("scope link"), this reduces
4442  * the risk of conflicts. Note that an implicit route is maintained by
4443  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4444  *
4445  * @param[in] tcf
4446  *   Libmnl socket context object.
4447  * @param[in] vtep
4448  *   VTEP object, contains rule database and ifouter index.
4449  * @param[in] dev_flow
4450  *   Flow object, contains the tunnel parameters (for encap only).
4451  * @param[in] enable
4452  *   Toggle between add and remove.
4453  * @param[out] error
4454  *   Perform verbose error reporting if not NULL.
4455  *
4456  * @return
4457  *   0 on success, a negative errno value otherwise and rte_errno is set.
4458  */
4459 static int
4460 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4461                      struct tcf_vtep *vtep,
4462                      struct mlx5_flow *dev_flow,
4463                      bool enable,
4464                      struct rte_flow_error *error)
4465 {
4466         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4467         struct tcf_local_rule *rule;
4468         bool found = false;
4469         int ret;
4470
4471         assert(encap);
4472         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4473         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4474                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4475                 LIST_FOREACH(rule, &vtep->local, next) {
4476                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4477                             encap->ipv4.src == rule->ipv4.src &&
4478                             encap->ipv4.dst == rule->ipv4.dst) {
4479                                 found = true;
4480                                 break;
4481                         }
4482                 }
4483         } else {
4484                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4485                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4486                 LIST_FOREACH(rule, &vtep->local, next) {
4487                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4488                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4489                                             sizeof(encap->ipv6.src)) &&
4490                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4491                                             sizeof(encap->ipv6.dst))) {
4492                                 found = true;
4493                                 break;
4494                         }
4495                 }
4496         }
4497         if (found) {
4498                 if (enable) {
4499                         rule->refcnt++;
4500                         return 0;
4501                 }
4502                 if (!rule->refcnt || !--rule->refcnt) {
4503                         LIST_REMOVE(rule, next);
4504                         return flow_tcf_rule_local(tcf, encap,
4505                                         vtep->ifouter, false, error);
4506                 }
4507                 return 0;
4508         }
4509         if (!enable) {
4510                 DRV_LOG(WARNING, "disabling non-existent local rule");
4511                 rte_flow_error_set(error, ENOENT,
4512                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4513                                    "disabling non-existent local rule");
4514                 return -ENOENT;
4515         }
4516         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4517                                 alignof(struct tcf_local_rule));
4518         if (!rule) {
4519                 rte_flow_error_set(error, ENOMEM,
4520                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4521                                    "unable to allocate memory for local rule");
4522                 return -rte_errno;
4523         }
4524         *rule = (struct tcf_local_rule){.refcnt = 0,
4525                                         .mask = 0,
4526                                         };
4527         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4528                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4529                            | FLOW_TCF_ENCAP_IPV4_DST;
4530                 rule->ipv4.src = encap->ipv4.src;
4531                 rule->ipv4.dst = encap->ipv4.dst;
4532         } else {
4533                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4534                            | FLOW_TCF_ENCAP_IPV6_DST;
4535                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4536                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4537         }
4538         ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
4539         if (ret) {
4540                 rte_free(rule);
4541                 return ret;
4542         }
4543         rule->refcnt++;
4544         LIST_INSERT_HEAD(&vtep->local, rule, next);
4545         return 0;
4546 }
4547
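/*
 * Typical pairing for the rule database management above (sketch of the
 * assumed usage, not a verbatim call site):
 *
 *   flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);   <- apply
 *   ...
 *   flow_tcf_encap_local(tcf, vtep, dev_flow, false, error);  <- remove
 *
 * The reference counter ensures the kernel rule is emitted only for the
 * first user and removed only with the last one.
 */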
4548 /**
4549  * Manage the destination MAC/IP addresses neigh database; the kernel uses
4550  * it to determine the destination MAC address within the encapsulation
4551  * header. Adds or removes the entries using a Netlink command like this:
4552  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4553  *
4554  * @param[in] tcf
4555  *   Libmnl socket context object.
4556  * @param[in] vtep
4557  *   VTEP object, contains rule database and ifouter index.
4558  * @param[in] dev_flow
4559  *   Flow object, contains the tunnel parameters (for encap only).
4560  * @param[in] enable
4561  *   Toggle between add and remove.
4562  * @param[out] error
4563  *   Perform verbose error reporting if not NULL.
4564  *
4565  * @return
4566  *   0 on success, a negative errno value otherwise and rte_errno is set.
4567  */
4568 static int
4569 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4570                      struct tcf_vtep *vtep,
4571                      struct mlx5_flow *dev_flow,
4572                      bool enable,
4573                      struct rte_flow_error *error)
4574 {
4575         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4576         struct tcf_neigh_rule *rule;
4577         bool found = false;
4578         int ret;
4579
4580         assert(encap);
4581         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4582         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4583                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4584                 LIST_FOREACH(rule, &vtep->neigh, next) {
4585                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4586                             encap->ipv4.dst == rule->ipv4.dst) {
4587                                 found = true;
4588                                 break;
4589                         }
4590                 }
4591         } else {
4592                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4593                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4594                 LIST_FOREACH(rule, &vtep->neigh, next) {
4595                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4596                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4597                                                 sizeof(encap->ipv6.dst))) {
4598                                 found = true;
4599                                 break;
4600                         }
4601                 }
4602         }
4603         if (found) {
4604                 if (memcmp(&encap->eth.dst, &rule->eth,
4605                            sizeof(encap->eth.dst))) {
4606                         DRV_LOG(WARNING, "Destination MAC differs"
4607                                          " in neigh rule");
4608                         rte_flow_error_set(error, EEXIST,
4609                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4610                                            NULL, "Different MAC address"
4611                                            " neigh rule for the same"
4612                                            " destination IP");
4613                         return -EEXIST;
4614                 }
4615                 if (enable) {
4616                         rule->refcnt++;
4617                         return 0;
4618                 }
4619                 if (!rule->refcnt || !--rule->refcnt) {
4620                         LIST_REMOVE(rule, next);
4621                         return flow_tcf_rule_neigh(tcf, encap,
4622                                                    vtep->ifouter,
4623                                                    false, error);
4624                 }
4625                 return 0;
4626         }
4627         if (!enable) {
4628                 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4629                 rte_flow_error_set(error, ENOENT,
4630                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4631                                    "disabling non-existent neigh rule");
4632                 return -ENOENT;
4633         }
4634         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4635                                 alignof(struct tcf_neigh_rule));
4636         if (!rule) {
4637                 rte_flow_error_set(error, ENOMEM,
4638                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4639                                    "unable to allocate memory for neigh rule");
4640                 return -rte_errno;
4641         }
4642         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4643                                         .mask = 0,
4644                                         };
4645         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4646                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4647                 rule->ipv4.dst = encap->ipv4.dst;
4648         } else {
4649                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4650                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4651         }
4652         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4653         ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
4654         if (ret) {
4655                 rte_free(rule);
4656                 return ret;
4657         }
4658         rule->refcnt++;
4659         LIST_INSERT_HEAD(&vtep->neigh, rule, next);
4660         return 0;
4661 }
4662
4663 /* VTEP device list is shared between PMD port instances. */
4664 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4665 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4666
4667 /**
4668  * Deletes VTEP network device.
4669  *
4670  * @param[in] tcf
4671  *   Context object initialized by mlx5_flow_tcf_context_create().
4672  * @param[in] vtep
4673  *   Object representing the network device to delete. Memory
4674  *   allocated for this object is freed by this routine.
4675  */
4676 static void
4677 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4678                      struct tcf_vtep *vtep)
4679 {
4680         struct nlmsghdr *nlh;
4681         struct ifinfomsg *ifm;
4682         alignas(struct nlmsghdr)
4683         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4684                     MNL_BUF_EXTRA_SPACE];
4685         int ret;
4686
4687         assert(!vtep->refcnt);
4688         /* Delete only interfaces that we actually created. */
4689         if (vtep->created && vtep->ifindex) {
4690                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4691                 nlh = mnl_nlmsg_put_header(buf);
4692                 nlh->nlmsg_type = RTM_DELLINK;
4693                 nlh->nlmsg_flags = NLM_F_REQUEST;
4694                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4695                 ifm->ifi_family = AF_UNSPEC;
4696                 ifm->ifi_index = vtep->ifindex;
4697                 assert(sizeof(buf) >= nlh->nlmsg_len);
4698                 ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
4699                 if (ret)
4700                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4701                                          " encap/decap ifindex %u",
4702                                          ifm->ifi_index);
4703         }
4704         rte_free(vtep);
4705 }
4706
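/*
 * The deletion above is equivalent to "ip link del dev <vtep>"; only
 * devices with the "created" flag set are removed this way, while
 * pre-existing shared devices are left intact.
 */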
4707 /**
4708  * Creates VTEP network device.
4709  *
4710  * @param[in] tcf
4711  *   Context object initialized by mlx5_flow_tcf_context_create().
4712  * @param[in] ifouter
4713  *   Outer interface to attach the newly created VXLAN device to.
4714  *   If zero, the VXLAN device will not be attached to any device.
4715  *   These VTEPs are used for decapsulation and can be precreated
4716  *   and shared between processes.
4717  * @param[in] port
4718  *   UDP port of created VTEP device.
4719  * @param[out] error
4720  *   Perform verbose error reporting if not NULL.
4721  *
4722  * @return
4723  * Pointer to created device structure on success,
4724  * NULL otherwise and rte_errno is set.
4725  */
4726 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4727 static struct tcf_vtep*
4728 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4729                      unsigned int ifouter,
4730                      uint16_t port, struct rte_flow_error *error)
4731 {
4732         struct tcf_vtep *vtep;
4733         struct nlmsghdr *nlh;
4734         struct ifinfomsg *ifm;
4735         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4736         alignas(struct nlmsghdr)
4737         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4738                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4739                     SZ_NLATTR_NEST * 2 +
4740                     SZ_NLATTR_STRZ_OF("vxlan") +
4741                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4742                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4743                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4744                     MNL_BUF_EXTRA_SPACE];
4745         struct nlattr *na_info;
4746         struct nlattr *na_vxlan;
4747         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4748         int ret;
4749
4750         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4751         if (!vtep) {
4752                 rte_flow_error_set(error, ENOMEM,
4753                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4754                                    "unable to allocate memory for VTEP");
4755                 return NULL;
4756         }
4757         *vtep = (struct tcf_vtep){
4758                         .port = port,
4759                         .local = LIST_HEAD_INITIALIZER(),
4760                         .neigh = LIST_HEAD_INITIALIZER(),
4761         };
4762         memset(buf, 0, sizeof(buf));
4763         nlh = mnl_nlmsg_put_header(buf);
4764         nlh->nlmsg_type = RTM_NEWLINK;
4765         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4766         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4767         ifm->ifi_family = AF_UNSPEC;
4768         ifm->ifi_type = 0;
4769         ifm->ifi_index = 0;
4770         ifm->ifi_flags = IFF_UP;
4771         ifm->ifi_change = 0xffffffff;
4772         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4773         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4774         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4775         assert(na_info);
4776         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4777         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4778         assert(na_vxlan);
4779         if (ifouter)
4780                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4781         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4782         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4783         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4784         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4785         mnl_attr_nest_end(nlh, na_vxlan);
4786         mnl_attr_nest_end(nlh, na_info);
4787         assert(sizeof(buf) >= nlh->nlmsg_len);
4788         ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
4789         if (ret) {
4790                 DRV_LOG(WARNING,
4791                         "netlink: VTEP %s create failure (%d)",
4792                         name, rte_errno);
4793                 if (rte_errno != EEXIST || ifouter)
4794                         /*
4795                          * Some unhandled error occurred or device is
4796                          * for encapsulation and cannot be shared.
4797                          */
4798                         goto error;
4799         } else {
4800                 /*
4801                  * Mark the device as actually created by us.
4802                  * It should be explicitly deleted
4803                  * when it is not needed anymore.
4804                  */
4805                 vtep->created = 1;
4806         }
4807         /* Try to get ifindex of the created or pre-existing device. */
4808         ret = if_nametoindex(name);
4809         if (!ret) {
4810                 DRV_LOG(WARNING,
4811                         "VTEP %s failed to get index (%d)", name, errno);
4812                 rte_flow_error_set
4813                         (error, errno,
4814                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4815                          "netlink: failed to retrieve VTEP ifindex");
4816                 goto error;
4817         }
4818         vtep->ifindex = ret;
4819         vtep->ifouter = ifouter;
4820         memset(buf, 0, sizeof(buf));
4821         nlh = mnl_nlmsg_put_header(buf);
4822         nlh->nlmsg_type = RTM_NEWLINK;
4823         nlh->nlmsg_flags = NLM_F_REQUEST;
4824         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4825         ifm->ifi_family = AF_UNSPEC;
4826         ifm->ifi_type = 0;
4827         ifm->ifi_index = vtep->ifindex;
4828         ifm->ifi_flags = IFF_UP;
4829         ifm->ifi_change = IFF_UP;
4830         ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
4831         if (ret) {
4832                 rte_flow_error_set(error, rte_errno,
4833                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4834                                    "netlink: failed to set VTEP link up");
4835                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
4836                         name, rte_errno);
4837                 goto clean;
4838         }
4839         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
4840         if (ret) {
4841                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
4842                 goto clean;
4843         }
4844         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
4845         vtep->refcnt = 1;
4846         return vtep;
4847 clean:
4848         flow_tcf_vtep_delete(tcf, vtep);
4849         return NULL;
4850 error:
4851         rte_free(vtep);
4852         return NULL;
4853 }
4854 #else
4855 static struct tcf_vtep*
4856 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
4857                      unsigned int ifouter __rte_unused,
4858                      uint16_t port __rte_unused,
4859                      struct rte_flow_error *error)
4860 {
4861         rte_flow_error_set(error, ENOTSUP,
4862                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4863                            "netlink: failed to create VTEP, "
4864                            "vxlan metadata is not supported by the kernel");
4865         return NULL;
4866 }
4867 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
4868
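/*
 * For reference, the Netlink requests assembled in the routine above
 * are roughly what the following iproute2 commands would send (device
 * name and port are illustrative only):
 *
 *   ip link add vmlx_4789 type vxlan external nolearning \
 *           udp6zerocsumrx dstport 4789
 *   ip link set vmlx_4789 up
 *
 * The "external" keyword maps to IFLA_VXLAN_COLLECT_METADATA and lets
 * a single VTEP serve multiple tunnels, with per-flow tunnel parameters
 * supplied by TC rules instead of being fixed at device creation time.
 */
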
4869 /**
4870  * Acquire target interface index for VXLAN tunneling decapsulation.
4871  * In order to share the UDP port among the other interfaces the
4872  * VXLAN device is created unattached to any interface (if created).
4873  *
4874  * @param[in] tcf
4875  *   Context object initialized by mlx5_flow_tcf_context_create().
4876  * @param[in] dev_flow
4877  *   Flow tcf object with tunnel structure pointer set.
4878  * @param[out] error
4879  *   Perform verbose error reporting if not NULL.
4880  * @return
4881  *   Interface descriptor pointer on success,
4882  *   NULL otherwise and rte_errno is set.
4883  */
4884 static struct tcf_vtep*
4885 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4886                             struct mlx5_flow *dev_flow,
4887                             struct rte_flow_error *error)
4888 {
4889         struct tcf_vtep *vtep;
4890         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
4891
4892         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4893                 if (vtep->port == port)
4894                         break;
4895         }
4896         if (vtep && vtep->ifouter) {
4897                 rte_flow_error_set(error, EEXIST,
4898                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4899                                    "Failed to create decap VTEP with specified"
4900                                    " UDP port, attached device exists");
4901                 return NULL;
4902         }
4903         if (vtep) {
4904                 /* Device exists, just increment the reference counter. */
4905                 vtep->refcnt++;
4906                 assert(vtep->ifindex);
4907                 return vtep;
4908         }
4909         /* No decapsulation device exists, try to create a new one. */
4910         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
4911         if (vtep)
4912                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4913         return vtep;
4914 }
4915
4916 /**
4917  * Acquire target interface index for VXLAN tunneling encapsulation.
4918  *
4919  * @param[in] tcf
4920  *   Context object initialized by mlx5_flow_tcf_context_create().
4921  * @param[in] ifouter
4922  *   Network interface index to attach VXLAN encap device to.
4923  * @param[in] dev_flow
4924  *   Flow tcf object with tunnel structure pointer set.
4925  * @param[out] error
4926  *   Perform verbose error reporting if not NULL.
4927  * @return
4928  *   Interface descriptor pointer on success,
4929  *   NULL otherwise and rte_errno is set.
4930  */
4931 static struct tcf_vtep*
4932 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4933                             unsigned int ifouter,
4934                             struct mlx5_flow *dev_flow __rte_unused,
4935                             struct rte_flow_error *error)
4936 {
4937         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
4938         struct tcf_vtep *vtep;
4939         int ret;
4940
4941         assert(ifouter);
4942         /* Check whether the attached VTEP for encap already exists. */
4943         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4944                 if (vtep->ifouter == ifouter)
4945                         break;
4946         }
4947         if (vtep) {
4948                 /* VTEP already exists, just increment the reference. */
4949                 vtep->refcnt++;
4950         } else {
4951                 uint16_t pcnt;
4952
4953                 /* Not found, we should create the new attached VTEP. */
4954                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
4955                 flow_tcf_encap_local_cleanup(tcf, ifouter);
4956                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
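                /*
                 * Allocate a UDP port for the new VTEP round-robin
                 * from the [MLX5_VXLAN_PORT_MIN, MLX5_VXLAN_PORT_MAX]
                 * range, skipping ports already used by VTEPs in the
                 * list. An EEXIST reply from the kernel presumably
                 * means the port is bound by a vxlan device created
                 * outside this driver, so the next candidate is tried.
                 */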
4957                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
4958                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
4959                         encap_port++;
4960                         /* Wrap around the UDP port index. */
4961                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
4962                             encap_port > MLX5_VXLAN_PORT_MAX)
4963                                 encap_port = MLX5_VXLAN_PORT_MIN;
4964                         /* Check whether the UDP port is already in use. */
4965                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4966                                 if (vtep->port == encap_port)
4967                                         break;
4968                         }
4969                         if (vtep) {
4970                                 /* Port is in use, try the next one. */
4971                                 vtep = NULL;
4972                                 continue;
4973                         }
4974                         vtep = flow_tcf_vtep_create(tcf, ifouter,
4975                                                     encap_port, error);
4976                         if (vtep) {
4977                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4978                                 break;
4979                         }
4980                         if (rte_errno != EEXIST)
4981                                 break;
4982                 }
4983                 if (!vtep)
4984                         return NULL;
4985         }
4986         assert(vtep->ifouter == ifouter);
4987         assert(vtep->ifindex);
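        /*
         * The two ancillary rules below are created with manual
         * rollback: if the neigh rule fails, the local address rule
         * created just before it is removed (the "false" argument
         * requests rule removal).
         */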
4988         /* Create local ipaddr with peer to specify the outer IPs. */
4989         ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
4990         if (!ret) {
4991                 /* Create neigh rule to specify outer destination MAC. */
4992                 ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
4993                 if (ret)
4994                         flow_tcf_encap_local(tcf, vtep,
4995                                              dev_flow, false, error);
4996         }
4997         if (ret) {
4998                 if (--vtep->refcnt == 0)
4999                         flow_tcf_vtep_delete(tcf, vtep);
5000                 return NULL;
5001         }
5002         return vtep;
5003 }
5004
5005 /**
5006  * Acquires target interface index for tunneling of any type.
5007  * Creates the new VTEP if needed.
5008  *
5009  * @param[in] tcf
5010  *   Context object initialized by mlx5_flow_tcf_context_create().
5011  * @param[in] ifouter
5012  *   Network interface index to attach VXLAN encap device to.
5013  * @param[in] dev_flow
5014  *   Flow tcf object with tunnel structure pointer set.
5015  * @param[out] error
5016  *   Perform verbose error reporting if not NULL.
5017  * @return
5018  *   Interface descriptor pointer on success,
5019  *   NULL otherwise and rte_errno is set.
5020  */
5021 static struct tcf_vtep*
5022 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5023                       unsigned int ifouter,
5024                       struct mlx5_flow *dev_flow,
5025                       struct rte_flow_error *error)
5026 {
5027         struct tcf_vtep *vtep = NULL;
5028
5029         assert(dev_flow->tcf.tunnel);
5030         pthread_mutex_lock(&vtep_list_mutex);
5031         switch (dev_flow->tcf.tunnel->type) {
5032         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5033                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5034                                                   dev_flow, error);
5035                 break;
5036         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5037                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5038                 break;
5039         default:
5040                 rte_flow_error_set(error, ENOTSUP,
5041                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5042                                    "unsupported tunnel type");
5043                 break;
5044         }
5045         pthread_mutex_unlock(&vtep_list_mutex);
5046         return vtep;
5047 }
5048
5049 /**
5050  * Release the tunneling interface. Decrements the reference
5051  * counter and actually removes the device if counter is zero.
5052  *
5053  * @param[in] tcf
5054  *   Context object initialized by mlx5_flow_tcf_context_create().
5055  * @param[in] vtep
5056  *   VTEP device descriptor structure.
5057  * @param[in] dev_flow
5058  *   Flow tcf object with tunnel structure pointer set.
5059  */
5060 static void
5061 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5062                       struct tcf_vtep *vtep,
5063                       struct mlx5_flow *dev_flow)
5064 {
5065         assert(dev_flow->tcf.tunnel);
5066         pthread_mutex_lock(&vtep_list_mutex);
5067         switch (dev_flow->tcf.tunnel->type) {
5068         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5069                 break;
5070         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5071                 /* Remove the encap ancillary rules first. */
5072                 flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
5073                 flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
5074                 break;
5075         default:
5076                 assert(false);
5077                 DRV_LOG(WARNING, "Unsupported tunnel type");
5078                 break;
5079         }
5080         assert(vtep->refcnt);
5081         if (--vtep->refcnt == 0) {
5082                 LIST_REMOVE(vtep, next);
5083                 flow_tcf_vtep_delete(tcf, vtep);
5084         }
5085         pthread_mutex_unlock(&vtep_list_mutex);
5086 }
5087
5088
5089 /**
5090  * Apply flow to E-Switch by sending Netlink message.
5091  *
5092  * @param[in] dev
5093  *   Pointer to Ethernet device.
5094  * @param[in, out] flow
5095  *   Pointer to the sub flow.
5096  * @param[out] error
5097  *   Pointer to the error structure.
5098  *
5099  * @return
5100  *   0 on success, a negative errno value otherwise and rte_errno is set.
5101  */
5102 static int
5103 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5104                struct rte_flow_error *error)
5105 {
5106         struct priv *priv = dev->data->dev_private;
5107         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5108         struct mlx5_flow *dev_flow;
5109         struct nlmsghdr *nlh;
5110
5111         dev_flow = LIST_FIRST(&flow->dev_flows);
5112         /* E-Switch flow can't be expanded. */
5113         assert(!LIST_NEXT(dev_flow, next));
5114         if (dev_flow->tcf.applied)
5115                 return 0;
5116         nlh = dev_flow->tcf.nlh;
5117         nlh->nlmsg_type = RTM_NEWTFILTER;
5118         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
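        /*
         * This is roughly what "tc filter add dev <ifname> ingress
         * ... flower ..." would send; NLM_F_EXCL makes the kernel
         * reject the request if an identical filter already exists.
         */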
5119         if (dev_flow->tcf.tunnel) {
5120                 /*
5121                  * Replace the interface index, target for
5122                  * encapsulation, source for decapsulation.
5123                  */
5124                 assert(!dev_flow->tcf.tunnel->vtep);
5125                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5126                 /* Acquire actual VTEP device when rule is being applied. */
5127                 dev_flow->tcf.tunnel->vtep =
5128                         flow_tcf_vtep_acquire(ctx,
5129                                         dev_flow->tcf.tunnel->ifindex_org,
5130                                         dev_flow, error);
5131                 if (!dev_flow->tcf.tunnel->vtep)
5132                         return -rte_errno;
5133                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5134                                 dev_flow->tcf.tunnel->vtep->ifindex,
5135                                 dev_flow->tcf.tunnel->ifindex_org);
5136                 *dev_flow->tcf.tunnel->ifindex_ptr =
5137                         dev_flow->tcf.tunnel->vtep->ifindex;
5138         }
5139         if (!flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) {
5140                 dev_flow->tcf.applied = 1;
5141                 return 0;
5142         }
5143         return rte_flow_error_set(error, rte_errno,
5144                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5145                                   "netlink: failed to create TC flow rule");
5146 }
5147
5148 /**
5149  * Remove flow from E-Switch by sending Netlink message.
5150  *
5151  * @param[in] dev
5152  *   Pointer to Ethernet device.
5153  * @param[in, out] flow
5154  *   Pointer to the sub flow.
5155  */
5156 static void
5157 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5158 {
5159         struct priv *priv = dev->data->dev_private;
5160         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5161         struct mlx5_flow *dev_flow;
5162         struct nlmsghdr *nlh;
5163
5164         if (!flow)
5165                 return;
5166         dev_flow = LIST_FIRST(&flow->dev_flows);
5167         if (!dev_flow)
5168                 return;
5169         /* E-Switch flow can't be expanded. */
5170         assert(!LIST_NEXT(dev_flow, next));
5171         if (dev_flow->tcf.applied) {
5172                 nlh = dev_flow->tcf.nlh;
5173                 nlh->nlmsg_type = RTM_DELTFILTER;
5174                 nlh->nlmsg_flags = NLM_F_REQUEST;
5175                 flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL);
5176                 if (dev_flow->tcf.tunnel) {
5177                         assert(dev_flow->tcf.tunnel->vtep);
5178                         flow_tcf_vtep_release(ctx,
5179                                 dev_flow->tcf.tunnel->vtep,
5180                                 dev_flow);
5181                         dev_flow->tcf.tunnel->vtep = NULL;
5182                 }
5183                 dev_flow->tcf.applied = 0;
5184         }
5185 }
5186
5187 /**
5188  * Remove flow from E-Switch and release resources of the device flow.
5189  *
5190  * @param[in] dev
5191  *   Pointer to Ethernet device.
5192  * @param[in, out] flow
5193  *   Pointer to the sub flow.
5194  */
5195 static void
5196 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5197 {
5198         struct mlx5_flow *dev_flow;
5199
5200         if (!flow)
5201                 return;
5202         flow_tcf_remove(dev, flow);
5203         if (flow->counter) {
5204                 if (--flow->counter->ref_cnt == 0) {
5205                         rte_free(flow->counter);
5206                         flow->counter = NULL;
5207                 }
5208         }
5209         dev_flow = LIST_FIRST(&flow->dev_flows);
5210         if (!dev_flow)
5211                 return;
5212         /* E-Switch flow can't be expanded. */
5213         assert(!LIST_NEXT(dev_flow, next));
5214         LIST_REMOVE(dev_flow, next);
5215         rte_free(dev_flow);
5216 }
5217
5218 /**
5219  * Helper routine for figuring out the table size required for a parse buffer.
5220  *
5221  * @param array
5222  *   Array of values to use.
5223  * @param idx
5224  *   Current location in array.
5225  * @param value
5226  *   Value to compare with.
5227  *
5228  * @return
5229  *   The maximum between the given value and the array value on index.
5230  */
5231 static uint16_t
5232 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5233 {
5234         return idx < 0 ? value : RTE_MAX(array[idx], value);
5235 }
5236
5237 /**
5238  * Parse rtnetlink message attributes filling the attribute table with the info
5239  * retrieved.
5240  *
5241  * @param tb
5242  *   Attribute table to be filled.
5243  * @param max
5244  *   Maximum entry in the attribute table.
5245  * @param rta
5246  *   The attributes section in the message to be parsed.
5247  * @param len
5248  *   The length of the attributes section in the message.
5249  */
5250 static void
5251 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5252                          struct rtattr *rta, int len)
5253 {
5254         unsigned short type;
5255         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5256         while (RTA_OK(rta, len)) {
5257                 type = rta->rta_type;
5258                 if (type <= max && !tb[type])
5259                         tb[type] = rta;
5260                 rta = RTA_NEXT(rta, len);
5261         }
5262 }
5263
5264 /**
5265  * Extract flow counters from flower action.
5266  *
5267  * @param rta
5268  *   flower action stats properties in the Netlink message received.
5269  * @param rta_type
5270  *   The backward sequence of rta_types, as written in the attribute table,
5271  *   that we need to traverse in order to reach the requested object.
5272  * @param idx
5273  *   Current location in rta_type table.
5274  * @param[out] data
5275  *   data holding the count statistics of the rte_flow retrieved from
5276  *   the message.
5277  *
5278  * @return
5279  *   0 if data was found and retrieved, -1 otherwise.
5280  */
5281 static int
5282 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5283                                        uint16_t rta_type[], int idx,
5284                                        struct gnet_stats_basic *data)
5285 {
5286         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5287                                                  TCA_STATS_BASIC);
5288         struct rtattr *tbs[tca_stats_max + 1];
5289
5290         if (rta == NULL || idx < 0)
5291                 return -1;
5292         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5293                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5294         switch (rta_type[idx]) {
5295         case TCA_STATS_BASIC:
5296                 if (tbs[TCA_STATS_BASIC]) {
5297                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5298                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5299                                sizeof(*data)));
5300                         return 0;
5301                 }
5302                 break;
5303         default:
5304                 break;
5305         }
5306         return -1;
5307 }
5308
5309 /**
5310  * Parse flower single action retrieving the requested action attribute,
5311  * if found.
5312  *
5313  * @param arg
5314  *   flower action properties in the Netlink message received.
5315  * @param rta_type
5316  *   The backward sequence of rta_types, as written in the attribute table,
5317  *   that we need to traverse in order to reach the requested object.
5318  * @param idx
5319  *   Current location in rta_type table.
5320  * @param[out] data
5321  *   Count statistics retrieved from the message query.
5322  *
5323  * @return
5324  *   0 if data was found and retrieved, -1 otherwise.
5325  */
5326 static int
5327 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5328                                      uint16_t rta_type[], int idx, void *data)
5329 {
5330         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5331         struct rtattr *tb[tca_act_max + 1];
5332
5333         if (arg == NULL || idx < 0)
5334                 return -1;
5335         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5336                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5337         if (tb[TCA_ACT_KIND] == NULL)
5338                 return -1;
5339         switch (rta_type[idx]) {
5340         case TCA_ACT_STATS:
5341                 if (tb[TCA_ACT_STATS])
5342                         return flow_tcf_nl_action_stats_parse_and_get
5343                                         (tb[TCA_ACT_STATS],
5344                                          rta_type, --idx,
5345                                          (struct gnet_stats_basic *)data);
5346                 break;
5347         default:
5348                 break;
5349         }
5350         return -1;
5351 }
5352
5353 /**
5354  * Parse flower action section in the message retrieving the requested
5355  * attribute from the first action that provides it.
5356  *
5357  * @param arg
5358  *   flower action section in the Netlink message received.
5359  * @param rta_type
5360  *   The backward sequence of rta_types, as written in the attribute table,
5361  *   that we need to traverse in order to reach the requested object.
5362  * @param idx
5363  *   Current location in rta_type table.
5364  * @param[out] data
5365  *   data retrieved from the message query.
5366  *
5367  * @return
5368  *   0 if data was found and retrieved, -1 otherwise.
5369  */
5370 static int
5371 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5372                                  uint16_t rta_type[], int idx, void *data)
5373 {
5374         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5375         int i;
5376
5377         if (arg == NULL || idx < 0)
5378                 return -1;
5379         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5380                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5381         switch (rta_type[idx]) {
5382         /*
5383          * flow counters are stored in the actions defined by the flow
5384          * and not in the flow itself, therefore we need to traverse the
5385          * flower chain of actions in search for them.
5386          *
5387          * Note that the index is not decremented here.
5388          */
5389         case TCA_ACT_STATS:
5390                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5391                         if (tb[i] &&
5392                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5393                                                               rta_type,
5394                                                               idx, data))
5395                                 return 0;
5396                 }
5397                 break;
5398         default:
5399                 break;
5400         }
5401         return -1;
5402 }
5403
5404 /**
5405  * Parse flower classifier options in the message, retrieving the requested
5406  * attribute if found.
5407  *
5408  * @param opt
5409  *   flower section in the Netlink message received.
5410  * @param rta_type
5411  *   The backward sequence of rta_types, as written in the attribute table,
5412  *   that we need to traverse in order to reach the requested object.
5413  * @param idx
5414  *   Current location in rta_type table.
5415  * @param[out] data
5416  *   data retrieved from the message query.
5417  *
5418  * @return
5419  *   0 if data was found and retrieved, -1 otherwise.
5420  */
5421 static int
5422 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5423                                uint16_t rta_type[], int idx, void *data)
5424 {
5425         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5426                                                   TCA_FLOWER_ACT);
5427         struct rtattr *tb[tca_flower_max + 1];
5428
5429         if (!opt || idx < 0)
5430                 return -1;
5431         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5432                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5433         switch (rta_type[idx]) {
5434         case TCA_FLOWER_ACT:
5435                 if (tb[TCA_FLOWER_ACT])
5436                         return flow_tcf_nl_action_parse_and_get
5437                                                         (tb[TCA_FLOWER_ACT],
5438                                                          rta_type, --idx, data);
5439                 break;
5440         default:
5441                 break;
5442         }
5443         return -1;
5444 }
5445
5446 /**
5447  * Parse Netlink reply on filter query, retrieving the flow counters.
5448  *
5449  * @param cnlh
5450  *   Message received from Netlink.
5451  * @param rta_type
5452  *   The backward sequence of rta_types, as written in the attribute table,
5453  *   that we need to traverse in order to reach the requested object.
5454  * @param idx
5455  *   Current location in rta_type table.
5456  * @param[out] data
5457  *   data retrieved from the message query.
5458  *
5459  * @return
5460  *   0 if data was found and retrieved, -1 otherwise.
5461  */
5462 static int
5463 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5464                                  uint16_t rta_type[], int idx, void *data)
5465 {
5466         struct nlmsghdr *nlh = cnlh;
5467         struct tcmsg *t = NLMSG_DATA(nlh);
5468         int len = nlh->nlmsg_len;
5469         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5470         struct rtattr *tb[tca_max + 1];
5471
5472         if (idx < 0)
5473                 return -1;
5474         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5475             nlh->nlmsg_type != RTM_GETTFILTER &&
5476             nlh->nlmsg_type != RTM_DELTFILTER)
5477                 return -1;
5478         len -= NLMSG_LENGTH(sizeof(*t));
5479         if (len < 0)
5480                 return -1;
5481         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5482         /* Not a TC flower flow - bail out */
5483         if (!tb[TCA_KIND] ||
5484             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5485                 return -1;
5486         switch (rta_type[idx]) {
5487         case TCA_OPTIONS:
5488                 if (tb[TCA_OPTIONS])
5489                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5490                                                               rta_type,
5491                                                               --idx, data);
5492                 break;
5493         default:
5494                 break;
5495         }
5496         return -1;
5497 }
5498
5499 /**
5500  * A callback to parse Netlink reply on TC flower query.
5501  *
5502  * @param nlh
5503  *   Message received from Netlink.
5504  * @param[out] data
5505  *   Pointer to data area to be filled by the parsing routine.
5506  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
5507  *
5508  * @return
5509  *   MNL_CB_OK value.
5510  */
5511 static int
5512 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5513 {
5514         /*
5515          * The backward sequence of rta_types to pass in order to get
5516          *  to the counters.
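         * The kernel reply nesting is (outermost first): TCA_OPTIONS >
         * TCA_FLOWER_ACT > <action prio> > TCA_ACT_STATS >
         * TCA_STATS_BASIC, hence the reversed order of the array.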
5517          */
5518         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5519                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5520         struct flow_tcf_stats_basic *sb_data = data;
5521         union {
5522                 const struct nlmsghdr *c;
5523                 struct nlmsghdr *nc;
5524         } tnlh = { .c = nlh };
5525
5526         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5527                                               RTE_DIM(rta_type) - 1,
5528                                               (void *)&sb_data->counters))
5529                 sb_data->valid = true;
5530         return MNL_CB_OK;
5531 }
5532
5533 /**
5534  * Query a TC flower rule for its statistics via netlink.
5535  *
5536  * @param[in] dev
5537  *   Pointer to Ethernet device.
5538  * @param[in] flow
5539  *   Pointer to the sub flow.
5540  * @param[out] data
5541  *   data retrieved by the query.
5542  * @param[out] error
5543  *   Perform verbose error reporting if not NULL.
5544  *
5545  * @return
5546  *   0 on success, a negative errno value otherwise and rte_errno is set.
5547  */
5548 static int
5549 flow_tcf_query_count(struct rte_eth_dev *dev,
5550                           struct rte_flow *flow,
5551                           void *data,
5552                           struct rte_flow_error *error)
5553 {
5554         struct flow_tcf_stats_basic sb_data = { 0 };
5555         struct rte_flow_query_count *qc = data;
5556         struct priv *priv = dev->data->dev_private;
5557         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5558         struct mnl_socket *nl = ctx->nl;
5559         struct mlx5_flow *dev_flow;
5560         struct nlmsghdr *nlh;
5561         uint32_t seq = ctx->seq++;
5562         ssize_t ret;
5563         assert(qc);
5564
5565         dev_flow = LIST_FIRST(&flow->dev_flows);
5566         /* E-Switch flow can't be expanded. */
5567         assert(!LIST_NEXT(dev_flow, next));
5568         if (!dev_flow->flow->counter)
5569                 goto notsup_exit;
5570         nlh = dev_flow->tcf.nlh;
5571         nlh->nlmsg_type = RTM_GETTFILTER;
5572         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5573         nlh->nlmsg_seq = seq;
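        /*
         * This query is roughly what "tc -s filter show dev <ifname>
         * ingress" performs: with NLM_F_ECHO the kernel echoes the
         * filter back with statistics attached, reusing the rule
         * message prepared at translation time.
         */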
5574         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5575                 goto error_exit;
5576         do {
5577                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5578                 if (ret <= 0)
5579                         break;
5580                 ret = mnl_cb_run(ctx->buf, ret, seq,
5581                                  mnl_socket_get_portid(nl),
5582                                  flow_tcf_nl_message_get_stats_basic,
5583                                  (void *)&sb_data);
5584         } while (ret > 0);
5586         if (sb_data.valid) {
5587                 /* Return the delta from last reset. */
5588                 qc->hits_set = 1;
5589                 qc->bytes_set = 1;
5590                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5591                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5592                 if (qc->reset) {
5593                         flow->counter->hits = sb_data.counters.packets;
5594                         flow->counter->bytes = sb_data.counters.bytes;
5595                 }
5596                 return 0;
5597         }
5598         return rte_flow_error_set(error, EINVAL,
5599                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5600                                   NULL,
5601                                   "flow does not have counter");
5602 error_exit:
5603         return rte_flow_error_set
5604                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5605                          NULL, "netlink: failed to read flow rule counters");
5606 notsup_exit:
5607         return rte_flow_error_set
5608                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5609                          NULL, "counters are not available");
5610 }
5611
5612 /**
5613  * Query a flow.
5614  *
5615  * @see rte_flow_query()
5616  * @see rte_flow_ops
5617  */
5618 static int
5619 flow_tcf_query(struct rte_eth_dev *dev,
5620                struct rte_flow *flow,
5621                const struct rte_flow_action *actions,
5622                void *data,
5623                struct rte_flow_error *error)
5624 {
5625         int ret = -EINVAL;
5626
5627         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5628                 switch (actions->type) {
5629                 case RTE_FLOW_ACTION_TYPE_VOID:
5630                         break;
5631                 case RTE_FLOW_ACTION_TYPE_COUNT:
5632                         ret = flow_tcf_query_count(dev, flow, data, error);
5633                         break;
5634                 default:
5635                         return rte_flow_error_set(error, ENOTSUP,
5636                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5637                                                   actions,
5638                                                   "action not supported");
5639                 }
5640         }
5641         return ret;
5642 }
5643
5644 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5645         .validate = flow_tcf_validate,
5646         .prepare = flow_tcf_prepare,
5647         .translate = flow_tcf_translate,
5648         .apply = flow_tcf_apply,
5649         .remove = flow_tcf_remove,
5650         .destroy = flow_tcf_destroy,
5651         .query = flow_tcf_query,
5652 };
5653
5654 /**
5655  * Create and configure a libmnl socket for Netlink flow rules.
5656  *
5657  * @return
5658  *   A valid libmnl socket object pointer on success, NULL otherwise and
5659  *   rte_errno is set.
5660  */
5661 static struct mnl_socket *
5662 flow_tcf_mnl_socket_create(void)
5663 {
5664         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
5665
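        /*
         * NETLINK_CAP_ACK asks the kernel to omit the original
         * request payload from ACK messages, keeping ACKs small and
         * avoiding allocation failures for large requests; the
         * setsockopt result is deliberately ignored as best-effort.
         */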
5666         if (nl) {
5667                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
5668                                       sizeof(int));
5669                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
5670                         return nl;
5671         }
5672         rte_errno = errno;
5673         if (nl)
5674                 mnl_socket_close(nl);
5675         return NULL;
5676 }
5677
5678 /**
5679  * Destroy a libmnl socket.
5680  *
5681  * @param nl
5682  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5683  */
5684 static void
5685 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
5686 {
5687         if (nl)
5688                 mnl_socket_close(nl);
5689 }
5690
5691 /**
5692  * Initialize ingress qdisc of a given network interface.
5693  *
5694  * @param ctx
5695  *   Pointer to tc-flower context to use.
5696  * @param ifindex
5697  *   Index of network interface to initialize.
5698  * @param[out] error
5699  *   Perform verbose error reporting if not NULL.
5700  *
5701  * @return
5702  *   0 on success, a negative errno value otherwise and rte_errno is set.
5703  */
5704 int
5705 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
5706                    unsigned int ifindex, struct rte_flow_error *error)
5707 {
5708         struct nlmsghdr *nlh;
5709         struct tcmsg *tcm;
5710         alignas(struct nlmsghdr)
5711         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
5712                     SZ_NLATTR_STRZ_OF("ingress") +
5713                     MNL_BUF_EXTRA_SPACE];
5714
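        /*
         * The two requests below mirror "tc qdisc del dev <ifname>
         * ingress" followed by "tc qdisc add dev <ifname> ingress".
         */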
5715         /* Destroy existing ingress qdisc and everything attached to it. */
5716         nlh = mnl_nlmsg_put_header(buf);
5717         nlh->nlmsg_type = RTM_DELQDISC;
5718         nlh->nlmsg_flags = NLM_F_REQUEST;
5719         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5720         tcm->tcm_family = AF_UNSPEC;
5721         tcm->tcm_ifindex = ifindex;
5722         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5723         tcm->tcm_parent = TC_H_INGRESS;
5724         assert(sizeof(buf) >= nlh->nlmsg_len);
5725         /* Ignore errors when qdisc is already absent. */
5726         if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL) &&
5727             rte_errno != EINVAL && rte_errno != ENOENT)
5728                 return rte_flow_error_set(error, rte_errno,
5729                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5730                                           "netlink: failed to remove ingress"
5731                                           " qdisc");
5732         /* Create fresh ingress qdisc. */
5733         nlh = mnl_nlmsg_put_header(buf);
5734         nlh->nlmsg_type = RTM_NEWQDISC;
5735         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5736         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5737         tcm->tcm_family = AF_UNSPEC;
5738         tcm->tcm_ifindex = ifindex;
5739         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5740         tcm->tcm_parent = TC_H_INGRESS;
5741         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
5742         assert(sizeof(buf) >= nlh->nlmsg_len);
5743         if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL))
5744                 return rte_flow_error_set(error, rte_errno,
5745                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5746                                           "netlink: failed to create ingress"
5747                                           " qdisc");
5748         return 0;
5749 }
5750
5751 /**
5752  * Create libmnl context for Netlink flow rules.
5753  *
5754  * @return
5755  *   A valid libmnl socket object pointer on success, NULL otherwise and
5756  *   rte_errno is set.
5757  */
5758 struct mlx5_flow_tcf_context *
5759 mlx5_flow_tcf_context_create(void)
5760 {
5761         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
5762                                                         sizeof(*ctx),
5763                                                         sizeof(uint32_t));
5764         if (!ctx)
5765                 goto error;
5766         ctx->nl = flow_tcf_mnl_socket_create();
5767         if (!ctx->nl)
5768                 goto error;
5769         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
5770         ctx->buf = rte_zmalloc(__func__,
5771                                ctx->buf_size, sizeof(uint32_t));
5772         if (!ctx->buf)
5773                 goto error;
5774         ctx->seq = random();
5775         return ctx;
5776 error:
5777         mlx5_flow_tcf_context_destroy(ctx);
5778         return NULL;
5779 }
5780
5781 /**
5782  * Destroy a libmnl context.
5783  *
5784  * @param ctx
5785  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5786  */
5787 void
5788 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
5789 {
5790         if (!ctx)
5791                 return;
5792         flow_tcf_mnl_socket_destroy(ctx->nl);
5793         rte_free(ctx->buf);
5794         rte_free(ctx);
5795 }