net/mlx5: validate tunnel inner items on E-Switch
dpdk.git: drivers/net/mlx5/mlx5_flow_tcf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* offset */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer, which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8KB) buffer size ensures that netlink messages will never be
 * truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * the outer tunnel iface in order to provide the destination MAC
 * address for the VXLAN encapsulation. The neigh rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * the outer tunnel iface in order to provide the local and peer IP
 * addresses of the VXLAN tunnel for encapsulation. The local rule is
 * implicitly related to the Flow itself and can be shared by multiple
 * Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port;
        uint8_t created;
};
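
/*
 * A VTEP netdev is shared: @p refcnt tracks how many flows use it, and the
 * neigh/local rule lists above hang off it so that encapsulation addresses
 * can likewise be shared by multiple flows, as the rule structures describe.
 */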

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};
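
/*
 * The @p mask field above uses the FLOW_TCF_ENCAP_* flags to record which
 * optional encapsulation fields were supplied by the application; the VNI
 * is kept as the 3-byte network-order value carried in the VXLAN header.
 */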

/** Structure used when extracting the values of a flow counter
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty;

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};
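
/*
 * The entries above are full-field masks: flow_tcf_item_mask() below rejects
 * any item mask with a bit set outside flow_tcf_mask_supported, so matching
 * is effectively exact-match on the listed fields (VLAN TCI being limited to
 * PCP and VID, DEI excluded).
 */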

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
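
/*
 * The SZ_NLATTR_* helpers pre-compute netlink attribute sizes so message
 * buffers can be sized before the message is built. An attribute is a
 * 4-byte header (struct nlattr) followed by a 4-byte aligned payload; for
 * example, SZ_NLATTR_TYPE_OF(uint16_t) is MNL_ALIGN(4 + 2) = 8 bytes, and
 * a nest costs only the aligned header, since its payload is accounted for
 * by the nested attributes themselves.
 */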

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
 * Priority in the rte_flow attribute starts from 0 and is incremented by 1
 * during translation. This may be changed to determine the maximum priority
 * by trial and error, as the Verbs driver does, once the restriction is
 * lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
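
/*
 * NUM_OF_PEDIT_KEYS() is a ceiling division by the 4-byte pedit key size:
 * IPV6_ADDR_LEN (16) needs 4 keys, ETHER_ADDR_LEN (6) needs 2, and
 * TP_PORT_LEN (2) or TTL_LEN (1) each fit in a single key.
 */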

struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};
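
/*
 * The kernel's pedit action applies each key to the 32-bit word at the
 * given offset as new = (old & mask) ^ val. A full-word overwrite therefore
 * uses mask = ~UINT32_MAX (all bits cleared), while a partial overwrite,
 * such as the 2-byte tail of a MAC address, uses a mask like 0xFFFF0000 to
 * preserve the adjacent bytes.
 */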

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-Switch counters cannot be shared and their IDs are unknown,
         * so all are currently returned with ID 0. Switching to unique
         * numbers may be better in the future.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

        /* First key overwrites the leading 4 bytes of the MAC address. */
        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        /* Second key sets the remaining 2 bytes; the mask keeps the rest. */
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of decrease/set TTL
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* The offset of src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of IPv6 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of IPv4 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's netlink attributes in a netlink message
 * using the pre-allocated message buffer
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification.
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify header actions should be in one tc-pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
        /*
         * Rewind one so the caller's loop increment lands on the first
         * unhandled action.
         */
        (*actions)--;
}

/**
 * Calculate the maximum memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action)
 *
 * @param[in,out] actions
 *   actions specification.
 * @param[in,out] action_flags
 *   actions flags
 * @return
 *   Maximum memory size of one TC-pedit action
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        (*actions)--;
        return pedit_size;
}

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}
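
/*
 * Typical use in the validation code below is to fetch the effective mask
 * before reading item fields, e.g.:
 *
 *   mask.eth = flow_tcf_item_mask(items, &rte_flow_item_eth_mask,
 *                                 &flow_tcf_mask_supported.eth,
 *                                 &flow_tcf_mask_empty.eth,
 *                                 sizeof(flow_tcf_mask_empty.eth), error);
 *
 * A NULL return means the item failed validation and the error structure
 * has been filled in.
 */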

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}
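
/*
 * The resulting table always starts with the caller's own port and ends
 * with a zero-ifindex terminator; for two switch ports it would look like
 * { {own_port_id, own_ifindex}, {peer_port_id, peer_ifindex}, {0, 0} }.
 * PTOI_TABLE_SZ_MAX() allows two entries beyond the reported port count to
 * cover the terminator and the at-least-one-port fallback.
 */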

/**
 * Verify the @p attr will be correctly understood by the E-Switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * Group is supported only if the kernel supports chains. Transfer
         * is not checked here as it is the caller's problem.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
                                          attr, "egress is not supported");
        return 0;
}
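
/*
 * The rte_flow group maps to a tc "chain" and the rte_flow priority to the
 * tc rule priority, incremented by one during translation (see the comment
 * at MLX5_TCF_GROUP_PRIORITY_MAX above).
 */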
1122
1123 /**
1124  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1125  * The routine checks the L2 fields to be used in encapsulation header.
1126  *
1127  * @param[in] item
1128  *   Pointer to the item structure.
1129  * @param[out] error
1130  *   Pointer to the error structure.
1131  *
1132  * @return
1133  *   0 on success, a negative errno value otherwise and rte_errno is set.
1134  **/
1135 static int
1136 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1137                                   struct rte_flow_error *error)
1138 {
1139         const struct rte_flow_item_eth *spec = item->spec;
1140         const struct rte_flow_item_eth *mask = item->mask;
1141
1142         if (!spec) {
1143                 /*
1144                  * Specification for L2 addresses can be empty
1145                  * because these ones are optional and not
1146                  * required directly by tc rule. Kernel tries
1147                  * to resolve these ones on its own
1148                  */
1149                 return 0;
1150         }
1151         if (!mask) {
1152                 /* If mask is not specified use the default one. */
1153                 mask = &rte_flow_item_eth_mask;
1154         }
1155         if (memcmp(&mask->dst,
1156                    &flow_tcf_mask_empty.eth.dst,
1157                    sizeof(flow_tcf_mask_empty.eth.dst))) {
1158                 if (memcmp(&mask->dst,
1159                            &rte_flow_item_eth_mask.dst,
1160                            sizeof(rte_flow_item_eth_mask.dst)))
1161                         return rte_flow_error_set
1162                                 (error, ENOTSUP,
1163                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1164                                  "no support for partial mask on"
1165                                  " \"eth.dst\" field");
1166         }
1167         if (memcmp(&mask->src,
1168                    &flow_tcf_mask_empty.eth.src,
1169                    sizeof(flow_tcf_mask_empty.eth.src))) {
1170                 if (memcmp(&mask->src,
1171                            &rte_flow_item_eth_mask.src,
1172                            sizeof(rte_flow_item_eth_mask.src)))
1173                         return rte_flow_error_set
1174                                 (error, ENOTSUP,
1175                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1176                                  "no support for partial mask on"
1177                                  " \"eth.src\" field");
1178         }
1179         if (mask->type != RTE_BE16(0x0000)) {
1180                 if (mask->type != RTE_BE16(0xffff))
1181                         return rte_flow_error_set
1182                                 (error, ENOTSUP,
1183                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1184                                  "no support for partial mask on"
1185                                  " \"eth.type\" field");
1186                 DRV_LOG(WARNING,
1187                         "outer ethernet type field"
1188                         " cannot be forced for vxlan"
1189                         " encapsulation, parameter ignored");
1190         }
1191         return 0;
1192 }
1193
1194 /**
1195  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1196  * The routine checks the IPv4 fields to be used in encapsulation header.
1197  *
1198  * @param[in] item
1199  *   Pointer to the item structure.
1200  * @param[out] error
1201  *   Pointer to the error structure.
1202  *
1203  * @return
1204  *   0 on success, a negative errno value otherwise and rte_errno is set.
1205  **/
1206 static int
1207 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1208                                    struct rte_flow_error *error)
1209 {
1210         const struct rte_flow_item_ipv4 *spec = item->spec;
1211         const struct rte_flow_item_ipv4 *mask = item->mask;
1212
1213         if (!spec) {
1214                 /*
1215                  * Specification for IP addresses cannot be empty
1216                  * because it is required by tunnel_key parameter.
1217                  */
1218                 return rte_flow_error_set(error, EINVAL,
1219                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1220                                           "NULL outer ipv4 address"
1221                                           " specification for vxlan"
1222                                           " encapsulation");
1223         }
1224         if (!mask)
1225                 mask = &rte_flow_item_ipv4_mask;
1226         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1227                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1228                         return rte_flow_error_set
1229                                 (error, ENOTSUP,
1230                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1231                                  "no support for partial mask on"
1232                                  " \"ipv4.hdr.dst_addr\" field"
1233                                  " for vxlan encapsulation");
1234                 /* More IPv4 address validations can be put here. */
1235         } else {
1236                 /*
1237                  * Kernel uses the destination IP address to determine
1238                  * the routing path and obtain the MAC destination
1239                  * address, so IP destination address must be
1240                  * specified in the tc rule.
1241                  */
1242                 return rte_flow_error_set(error, EINVAL,
1243                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1244                                           "outer ipv4 destination address"
1245                                           " must be specified for"
1246                                           " vxlan encapsulation");
1247         }
1248         if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1249                 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1250                         return rte_flow_error_set
1251                                 (error, ENOTSUP,
1252                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1253                                  "no support for partial mask on"
1254                                  " \"ipv4.hdr.src_addr\" field"
1255                                  " for vxlan encapsulation");
1256                 /* More IPv4 address validations can be put here. */
1257         } else {
1258                 /*
1259                  * Kernel uses the source IP address to select the
1260                  * interface for egress encapsulated traffic, so
1261                  * it must be specified in the tc rule.
1262                  */
1263                 return rte_flow_error_set(error, EINVAL,
1264                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1265                                           "outer ipv4 source address"
1266                                           " must be specified for"
1267                                           " vxlan encapsulation");
1268         }
1269         return 0;
1270 }
1271
1272 /**
1273  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1274  * The routine checks the IPv6 fields to be used in encapsulation header.
1275  *
1276  * @param[in] item
1277  *   Pointer to the item structure.
1278  * @param[out] error
1279  *   Pointer to the error structure.
1280  *
1281  * @return
1282  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1283  **/
1284 static int
1285 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1286                                    struct rte_flow_error *error)
1287 {
1288         const struct rte_flow_item_ipv6 *spec = item->spec;
1289         const struct rte_flow_item_ipv6 *mask = item->mask;
1290
1291         if (!spec) {
1292                 /*
1293                  * Specification for IP addresses cannot be empty
1294                  * because it is required by tunnel_key parameter.
1295                  */
1296                 return rte_flow_error_set(error, EINVAL,
1297                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1298                                           "NULL outer ipv6 address"
1299                                           " specification for"
1300                                           " vxlan encapsulation");
1301         }
1302         if (!mask)
1303                 mask = &rte_flow_item_ipv6_mask;
1304         if (memcmp(&mask->hdr.dst_addr,
1305                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1306                    IPV6_ADDR_LEN)) {
1307                 if (memcmp(&mask->hdr.dst_addr,
1308                            &rte_flow_item_ipv6_mask.hdr.dst_addr,
1309                            IPV6_ADDR_LEN))
1310                         return rte_flow_error_set
1311                                         (error, ENOTSUP,
1312                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1313                                          "no support for partial mask on"
1314                                          " \"ipv6.hdr.dst_addr\" field"
1315                                          " for vxlan encapsulation");
1316                 /* More IPv6 address validations can be put here. */
1317         } else {
1318                 /*
1319                  * Kernel uses the destination IP address to determine
1320                  * the routing path and obtain the MAC destination
1321                  * address (heigh or gate), so IP destination address
1322                  * must be specified within the tc rule.
1323                  */
1324                 return rte_flow_error_set(error, EINVAL,
1325                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1326                                           "outer ipv6 destination address"
1327                                           " must be specified for"
1328                                           " vxlan encapsulation");
1329         }
1330         if (memcmp(&mask->hdr.src_addr,
1331                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1332                    IPV6_ADDR_LEN)) {
1333                 if (memcmp(&mask->hdr.src_addr,
1334                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1335                            IPV6_ADDR_LEN))
1336                         return rte_flow_error_set
1337                                         (error, ENOTSUP,
1338                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1339                                          "no support for partial mask on"
1340                                          " \"ipv6.hdr.src_addr\" field"
1341                                          " for vxlan encapsulation");
1342                 /* More L3 address validation can be put here. */
1343         } else {
1344                 /*
1345                  * Kernel uses the source IP address to select the
1346                  * interface for egress encapsulated traffic, so
1347                  * it must be specified in the tc rule.
1348                  */
1349                 return rte_flow_error_set(error, EINVAL,
1350                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1351                                           "outer L3 source address"
1352                                           " must be specified for"
1353                                           " vxlan encapsulation");
1354         }
1355         return 0;
1356 }
1357
1358 /**
1359  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1360  * The routine checks the UDP fields to be used in encapsulation header.
1361  *
1362  * @param[in] item
1363  *   Pointer to the item structure.
1364  * @param[out] error
1365  *   Pointer to the error structure.
1366  *
1367  * @return
1368  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1369  **/
1370 static int
1371 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1372                                   struct rte_flow_error *error)
1373 {
1374         const struct rte_flow_item_udp *spec = item->spec;
1375         const struct rte_flow_item_udp *mask = item->mask;
1376
1377         if (!spec) {
1378                 /*
1379                  * Specification for UDP ports cannot be empty
1380                  * because it is required by tunnel_key parameter.
1381                  */
1382                 return rte_flow_error_set(error, EINVAL,
1383                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1384                                           "NULL UDP port specification "
1385                                           " for vxlan encapsulation");
1386         }
1387         if (!mask)
1388                 mask = &rte_flow_item_udp_mask;
1389         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1390                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1391                         return rte_flow_error_set
1392                                         (error, ENOTSUP,
1393                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1394                                          "no support for partial mask on"
1395                                          " \"udp.hdr.dst_port\" field"
1396                                          " for vxlan encapsulation");
1397                 if (!spec->hdr.dst_port)
1398                         return rte_flow_error_set
1399                                         (error, EINVAL,
1400                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1401                                          "outer UDP remote port cannot be"
1402                                          " 0 for vxlan encapsulation");
1403         } else {
1404                 return rte_flow_error_set(error, EINVAL,
1405                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1406                                           "outer UDP remote port"
1407                                           " must be specified for"
1408                                           " vxlan encapsulation");
1409         }
1410         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1411                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1412                         return rte_flow_error_set
1413                                         (error, ENOTSUP,
1414                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1415                                          "no support for partial mask on"
1416                                          " \"udp.hdr.src_port\" field"
1417                                          " for vxlan encapsulation");
1418                 DRV_LOG(WARNING,
1419                         "outer UDP source port cannot be"
1420                         " forced for vxlan encapsulation,"
1421                         " parameter ignored");
1422         }
1423         return 0;
1424 }
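/*
 * Illustrative only (not part of the driver): a minimal UDP item that
 * satisfies the checks above, assuming the IANA-assigned VXLAN port 4789.
 * The destination port is fully masked and non-zero, the source port is
 * left wildcarded (a forced source port would be ignored anyway):
 *
 *	static const struct rte_flow_item_udp encap_udp_spec = {
 *		.hdr = { .dst_port = RTE_BE16(4789) },
 *	};
 *	static const struct rte_flow_item_udp encap_udp_mask = {
 *		.hdr = { .dst_port = RTE_BE16(0xffff) },
 *	};
 */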
1425
1426 /**
1427  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1428  * The routine checks the VNI field to be used in the encapsulation header.
1429  *
1430  * @param[in] item
1431  *   Pointer to the item structure.
1432  * @param[out] error
1433  *   Pointer to the error structure.
1434  *
1435  * @return
1436  *   0 on success, a negative errno value otherwise and rte_errno is set.
1437  **/
1438 static int
1439 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1440                                   struct rte_flow_error *error)
1441 {
1442         const struct rte_flow_item_vxlan *spec = item->spec;
1443         const struct rte_flow_item_vxlan *mask = item->mask;
1444
1445         if (!spec) {
1446                 /* Outer VNI is required by tunnel_key parameter. */
1447                 return rte_flow_error_set(error, EINVAL,
1448                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1449                                           "NULL VNI specification"
1450                                           " for vxlan encapsulation");
1451         }
1452         if (!mask)
1453                 mask = &rte_flow_item_vxlan_mask;
1454         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1455                 return rte_flow_error_set(error, EINVAL,
1456                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1457                                           "outer VNI must be specified "
1458                                           "for vxlan encapsulation");
1459         if (mask->vni[0] != 0xff ||
1460             mask->vni[1] != 0xff ||
1461             mask->vni[2] != 0xff)
1462                 return rte_flow_error_set(error, ENOTSUP,
1463                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1464                                           "no support for partial mask on"
1465                                           " \"vxlan.vni\" field");
1466
1467         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1468                 return rte_flow_error_set(error, EINVAL,
1469                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1470                                           "vxlan vni cannot be 0");
1471         return 0;
1472 }
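/*
 * Illustrative only: a VNI item shape accepted by the check above, with a
 * placeholder VNI value of 42 and the mandatory full mask:
 *
 *	static const struct rte_flow_item_vxlan encap_vxlan_spec = {
 *		.vni = { 0x00, 0x00, 0x2a },
 *	};
 *	static const struct rte_flow_item_vxlan encap_vxlan_mask = {
 *		.vni = { 0xff, 0xff, 0xff },
 *	};
 */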
1473
1474 /**
1475  * Validate VXLAN_ENCAP action item list for E-Switch.
1476  * The routine checks the items to be used in the encapsulation header.
1477  *
1478  * @param[in] action
1479  *   Pointer to the VXLAN_ENCAP action structure.
1480  * @param[out] error
1481  *   Pointer to the error structure.
1482  *
1483  * @return
1484  *   0 on success, a negative errno value otherwise and rte_errno is set.
1485  **/
1486 static int
1487 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1488                               struct rte_flow_error *error)
1489 {
1490         const struct rte_flow_item *items;
1491         int ret;
1492         uint32_t item_flags = 0;
1493
1494         if (!action->conf)
1495                 return rte_flow_error_set(error, EINVAL,
1496                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1497                                           "Missing vxlan tunnel"
1498                                           " action configuration");
1499         items = ((const struct rte_flow_action_vxlan_encap *)
1500                                         action->conf)->definition;
1501         if (!items)
1502                 return rte_flow_error_set(error, EINVAL,
1503                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1504                                           "Missing vxlan tunnel"
1505                                           " encapsulation parameters");
1506         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1507                 switch (items->type) {
1508                 case RTE_FLOW_ITEM_TYPE_VOID:
1509                         break;
1510                 case RTE_FLOW_ITEM_TYPE_ETH:
1511                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1512                                                           error);
1513                         if (ret < 0)
1514                                 return ret;
1515                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1516                         if (ret < 0)
1517                                 return ret;
1518                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1519                         break;
1521                 case RTE_FLOW_ITEM_TYPE_IPV4:
1522                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1523                                                            error);
1524                         if (ret < 0)
1525                                 return ret;
1526                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1527                         if (ret < 0)
1528                                 return ret;
1529                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1530                         break;
1531                 case RTE_FLOW_ITEM_TYPE_IPV6:
1532                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1533                                                            error);
1534                         if (ret < 0)
1535                                 return ret;
1536                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1537                         if (ret < 0)
1538                                 return ret;
1539                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1540                         break;
1541                 case RTE_FLOW_ITEM_TYPE_UDP:
1542                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1543                                                            0xFF, error);
1544                         if (ret < 0)
1545                                 return ret;
1546                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1547                         if (ret < 0)
1548                                 return ret;
1549                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1550                         break;
1551                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1552                         ret = mlx5_flow_validate_item_vxlan(items,
1553                                                             item_flags, error);
1554                         if (ret < 0)
1555                                 return ret;
1556                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1557                         if (ret < 0)
1558                                 return ret;
1559                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1560                         break;
1561                 default:
1562                         return rte_flow_error_set
1563                                         (error, ENOTSUP,
1564                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1565                                          "vxlan encap item not supported");
1566                 }
1567         }
1568         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1569                 return rte_flow_error_set(error, EINVAL,
1570                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1571                                           "no outer IP layer found"
1572                                           " for vxlan encapsulation");
1573         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1574                 return rte_flow_error_set(error, EINVAL,
1575                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1576                                           "no outer UDP layer found"
1577                                           " for vxlan encapsulation");
1578         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1579                 return rte_flow_error_set(error, EINVAL,
1580                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1581                                           "no VXLAN VNI found"
1582                                           " for vxlan encapsulation");
1583         return 0;
1584 }
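/*
 * Illustrative only (hypothetical variable names): a VXLAN_ENCAP action
 * configuration shape that passes the validation above. The eth, ipv4, udp
 * and vxlan specs are placeholders for fully populated item structures as
 * required by the per-item checks:
 *
 *	struct rte_flow_item definition[] = {
 *		{ .type = RTE_FLOW_ITEM_TYPE_ETH,   .spec = &eth },
 *		{ .type = RTE_FLOW_ITEM_TYPE_IPV4,  .spec = &ipv4 },
 *		{ .type = RTE_FLOW_ITEM_TYPE_UDP,   .spec = &udp },
 *		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
 *		{ .type = RTE_FLOW_ITEM_TYPE_END },
 *	};
 *	struct rte_flow_action_vxlan_encap encap_conf = {
 *		.definition = definition,
 *	};
 */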
1585
1586 /**
1587  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1588  * RTE_FLOW_ITEM_TYPE_VXLAN is present in the item list.
1589  *
1590  * @param[in] udp
1591  *   Outer UDP layer item (if any, NULL otherwise).
1592  * @param[out] error
1593  *   Pointer to the error structure.
1594  *
1595  * @return
1596  *   0 on success, a negative errno value otherwise and rte_errno is set.
1597  **/
1598 static int
1599 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1600                                   struct rte_flow_error *error)
1601 {
1602         const struct rte_flow_item_udp *spec = udp->spec;
1603         const struct rte_flow_item_udp *mask = udp->mask;
1604
1605         if (!spec)
1606                 /*
1607                  * Specification for UDP ports cannot be empty
1608                  * because it is required as decap parameter.
1609                  */
1610                 return rte_flow_error_set(error, EINVAL,
1611                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1612                                           "NULL UDP port specification"
1613                                           " for VXLAN decapsulation");
1614         if (!mask)
1615                 mask = &rte_flow_item_udp_mask;
1616         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1617                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1618                         return rte_flow_error_set
1619                                         (error, ENOTSUP,
1620                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1621                                          "no support for partial mask on"
1622                                          " \"udp.hdr.dst_port\" field");
1623                 if (!spec->hdr.dst_port)
1624                         return rte_flow_error_set
1625                                         (error, EINVAL,
1626                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1627                                          "zero decap local UDP port");
1628         } else {
1629                 return rte_flow_error_set(error, EINVAL,
1630                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1631                                           "outer UDP destination port must be "
1632                                           "specified for vxlan decapsulation");
1633         }
1634         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1635                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1636                         return rte_flow_error_set
1637                                         (error, ENOTSUP,
1638                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1639                                          "no support for partial mask on"
1640                                          " \"udp.hdr.src_port\" field");
1641                 DRV_LOG(WARNING,
1642                         "outer UDP local port cannot be "
1643                         "forced for VXLAN decapsulation, "
1644                         "parameter ignored");
1645         }
1646         return 0;
1647 }
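/*
 * Illustrative only: a minimal outer UDP item accepted by the decap check
 * above, again assuming VXLAN port 4789. Only the destination (local) port
 * needs an exact match; the source port stays wildcarded:
 *
 *	static const struct rte_flow_item_udp decap_udp_spec = {
 *		.hdr = { .dst_port = RTE_BE16(4789) },
 *	};
 *	static const struct rte_flow_item_udp decap_udp_mask = {
 *		.hdr = { .dst_port = RTE_BE16(0xffff) },
 *	};
 */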
1648
1649 /**
1650  * Validate flow for E-Switch.
1651  *
1652  * @param[in] dev
1653  *   Pointer to the Ethernet device structure.
1654  * @param[in] attr
1655  *   Pointer to the flow attributes.
1656  * @param[in] items
1657  *   Pointer to the list of items.
1658  * @param[in] actions
1659  *   Pointer to the list of actions.
1660  * @param[out] error
1661  *   Pointer to the error structure.
1662  *
1663  * @return
1664  *   0 on success, a negative errno value otherwise and rte_errno is set.
1665  */
1666 static int
1667 flow_tcf_validate(struct rte_eth_dev *dev,
1668                   const struct rte_flow_attr *attr,
1669                   const struct rte_flow_item items[],
1670                   const struct rte_flow_action actions[],
1671                   struct rte_flow_error *error)
1672 {
1673         union {
1674                 const struct rte_flow_item_port_id *port_id;
1675                 const struct rte_flow_item_eth *eth;
1676                 const struct rte_flow_item_vlan *vlan;
1677                 const struct rte_flow_item_ipv4 *ipv4;
1678                 const struct rte_flow_item_ipv6 *ipv6;
1679                 const struct rte_flow_item_tcp *tcp;
1680                 const struct rte_flow_item_udp *udp;
1681                 const struct rte_flow_item_vxlan *vxlan;
1682         } spec, mask;
1683         union {
1684                 const struct rte_flow_action_port_id *port_id;
1685                 const struct rte_flow_action_jump *jump;
1686                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1687                 const struct rte_flow_action_of_set_vlan_vid *
1688                         of_set_vlan_vid;
1689                 const struct rte_flow_action_of_set_vlan_pcp *
1690                         of_set_vlan_pcp;
1691                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1692                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1693                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1694         } conf;
1695         const struct rte_flow_item *outer_udp = NULL;
1696         uint64_t item_flags = 0;
1697         uint64_t action_flags = 0;
1698         uint8_t next_protocol = -1;
1699         unsigned int tcm_ifindex = 0;
1700         uint8_t pedit_validated = 0;
1701         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1702         struct rte_eth_dev *port_id_dev = NULL;
1703         bool in_port_id_set = false;
1704         int ret;
1705
1706         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1707                                                 PTOI_TABLE_SZ_MAX(dev)));
1708         ret = flow_tcf_validate_attributes(attr, error);
1709         if (ret < 0)
1710                 return ret;
1711         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1712                 unsigned int i;
1713                 uint64_t current_action_flag = 0;
1714
1715                 switch (actions->type) {
1716                 case RTE_FLOW_ACTION_TYPE_VOID:
1717                         break;
1718                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1719                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1720                         if (!actions->conf)
1721                                 break;
1722                         conf.port_id = actions->conf;
1723                         if (conf.port_id->original)
1724                                 i = 0;
1725                         else
1726                                 for (i = 0; ptoi[i].ifindex; ++i)
1727                                         if (ptoi[i].port_id == conf.port_id->id)
1728                                                 break;
1729                         if (!ptoi[i].ifindex)
1730                                 return rte_flow_error_set
1731                                         (error, ENODEV,
1732                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1733                                          conf.port_id,
1734                                          "missing data to convert port ID to"
1735                                          " ifindex");
1736                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1737                         break;
1738                 case RTE_FLOW_ACTION_TYPE_JUMP:
1739                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1740                         if (!actions->conf)
1741                                 break;
1742                         conf.jump = actions->conf;
1743                         if (attr->group >= conf.jump->group)
1744                                 return rte_flow_error_set
1745                                         (error, ENOTSUP,
1746                                          RTE_FLOW_ERROR_TYPE_ACTION,
1747                                          actions,
1748                                          "can only jump forward to a higher group");
1749                         break;
1750                 case RTE_FLOW_ACTION_TYPE_DROP:
1751                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1752                         break;
1753                 case RTE_FLOW_ACTION_TYPE_COUNT:
1754                         break;
1755                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1756                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1757                         break;
1758                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1759                         rte_be16_t ethertype;
1760
1761                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1762                         if (!actions->conf)
1763                                 break;
1764                         conf.of_push_vlan = actions->conf;
1765                         ethertype = conf.of_push_vlan->ethertype;
1766                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1767                             ethertype != RTE_BE16(ETH_P_8021AD))
1768                                 return rte_flow_error_set
1769                                         (error, EINVAL,
1770                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1771                                          "vlan push TPID must be "
1772                                          "802.1Q or 802.1AD");
1773                         break;
1774                 }
1775                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1776                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1777                                 return rte_flow_error_set
1778                                         (error, ENOTSUP,
1779                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1780                                          "vlan modify is not supported,"
1781                                          " set action must follow push action");
1782                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1783                         break;
1784                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1785                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1786                                 return rte_flow_error_set
1787                                         (error, ENOTSUP,
1788                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1789                                          "vlan modify is not supported,"
1790                                          " set action must follow push action");
1791                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1792                         break;
1793                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1794                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1795                         break;
1796                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1797                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1798                         if (ret < 0)
1799                                 return ret;
1800                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1801                         break;
1802                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1803                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1804                         break;
1805                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1806                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1807                         break;
1808                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1809                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1810                         break;
1811                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1812                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1813                         break;
1814                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1815                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1816                         break;
1817                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1818                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1819                         break;
1820                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1821                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1822                         break;
1823                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1824                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1825                         break;
1826                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1827                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1828                         break;
1829                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1830                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1831                         break;
1832                 default:
1833                         return rte_flow_error_set(error, ENOTSUP,
1834                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1835                                                   actions,
1836                                                   "action not supported");
1837                 }
1838                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1839                         if (!actions->conf)
1840                                 return rte_flow_error_set
1841                                         (error, EINVAL,
1842                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1843                                          actions,
1844                                          "action configuration not set");
1845                 }
1846                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1847                     pedit_validated)
1848                         return rte_flow_error_set(error, ENOTSUP,
1849                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1850                                                   actions,
1851                                                   "set actions must be "
1852                                                   "listed consecutively");
1853                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1854                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1855                         pedit_validated = 1;
1856                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1857                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1858                         return rte_flow_error_set(error, EINVAL,
1859                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1860                                                   actions,
1861                                                   "can't have multiple fate"
1862                                                   " actions");
1863                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1864                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1865                         return rte_flow_error_set(error, EINVAL,
1866                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1867                                                   actions,
1868                                                   "can't have multiple vxlan"
1869                                                   " actions");
1870                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1871                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1872                         return rte_flow_error_set(error, ENOTSUP,
1873                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1874                                                   actions,
1875                                                   "can't have vxlan and vlan"
1876                                                   " actions in the same rule");
1877                 action_flags |= current_action_flag;
1878         }
1879         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1880                 unsigned int i;
1881
1882                 switch (items->type) {
1883                 case RTE_FLOW_ITEM_TYPE_VOID:
1884                         break;
1885                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1886                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1887                                 return rte_flow_error_set
1888                                         (error, ENOTSUP,
1889                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1890                                          "inner tunnel port id"
1891                                          " item is not supported");
1892                         mask.port_id = flow_tcf_item_mask
1893                                 (items, &rte_flow_item_port_id_mask,
1894                                  &flow_tcf_mask_supported.port_id,
1895                                  &flow_tcf_mask_empty.port_id,
1896                                  sizeof(flow_tcf_mask_supported.port_id),
1897                                  error);
1898                         if (!mask.port_id)
1899                                 return -rte_errno;
1900                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1901                                 in_port_id_set = 1;
1902                                 break;
1903                         }
1904                         spec.port_id = items->spec;
1905                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1906                                 return rte_flow_error_set
1907                                         (error, ENOTSUP,
1908                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1909                                          mask.port_id,
1910                                          "no support for partial mask on"
1911                                          " \"id\" field");
1912                         if (!mask.port_id->id)
1913                                 i = 0;
1914                         else
1915                                 for (i = 0; ptoi[i].ifindex; ++i)
1916                                         if (ptoi[i].port_id == spec.port_id->id)
1917                                                 break;
1918                         if (!ptoi[i].ifindex)
1919                                 return rte_flow_error_set
1920                                         (error, ENODEV,
1921                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1922                                          spec.port_id,
1923                                          "missing data to convert port ID to"
1924                                          " ifindex");
1925                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1926                                 return rte_flow_error_set
1927                                         (error, ENOTSUP,
1928                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1929                                          spec.port_id,
1930                                          "cannot match traffic for"
1931                                          " several port IDs through"
1932                                          " a single flow rule");
1933                         tcm_ifindex = ptoi[i].ifindex;
1934                         in_port_id_set = 1;
1935                         break;
1936                 case RTE_FLOW_ITEM_TYPE_ETH:
1937                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1938                                                           error);
1939                         if (ret < 0)
1940                                 return ret;
1941                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1942                                       MLX5_FLOW_LAYER_INNER_L2 :
1943                                       MLX5_FLOW_LAYER_OUTER_L2;
1944                         /* TODO:
1945                          * Redundant check due to different supported mask.
1946                          * Same for the rest of items.
1947                          */
1948                         mask.eth = flow_tcf_item_mask
1949                                 (items, &rte_flow_item_eth_mask,
1950                                  &flow_tcf_mask_supported.eth,
1951                                  &flow_tcf_mask_empty.eth,
1952                                  sizeof(flow_tcf_mask_supported.eth),
1953                                  error);
1954                         if (!mask.eth)
1955                                 return -rte_errno;
1956                         if (mask.eth->type && mask.eth->type !=
1957                             RTE_BE16(0xffff))
1958                                 return rte_flow_error_set
1959                                         (error, ENOTSUP,
1960                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1961                                          mask.eth,
1962                                          "no support for partial mask on"
1963                                          " \"type\" field");
1964                         break;
1965                 case RTE_FLOW_ITEM_TYPE_VLAN:
1966                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1967                                 return rte_flow_error_set
1968                                         (error, ENOTSUP,
1969                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1970                                          "inner tunnel VLAN"
1971                                          " is not supported");
1972                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
1973                                                            error);
1974                         if (ret < 0)
1975                                 return ret;
1976                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
1977                         mask.vlan = flow_tcf_item_mask
1978                                 (items, &rte_flow_item_vlan_mask,
1979                                  &flow_tcf_mask_supported.vlan,
1980                                  &flow_tcf_mask_empty.vlan,
1981                                  sizeof(flow_tcf_mask_supported.vlan),
1982                                  error);
1983                         if (!mask.vlan)
1984                                 return -rte_errno;
1985                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
1986                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
1987                               RTE_BE16(0xe000)) ||
1988                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
1989                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
1990                               RTE_BE16(0x0fff)) ||
1991                             (mask.vlan->inner_type &&
1992                              mask.vlan->inner_type != RTE_BE16(0xffff)))
1993                                 return rte_flow_error_set
1994                                         (error, ENOTSUP,
1995                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1996                                          mask.vlan,
1997                                          "no support for partial masks on"
1998                                          " \"tci\" (PCP and VID parts) and"
1999                                          " \"inner_type\" fields");
2000                         break;
2001                 case RTE_FLOW_ITEM_TYPE_IPV4:
2002                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2003                                                            error);
2004                         if (ret < 0)
2005                                 return ret;
2006                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2007                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2008                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2009                         mask.ipv4 = flow_tcf_item_mask
2010                                 (items, &rte_flow_item_ipv4_mask,
2011                                  &flow_tcf_mask_supported.ipv4,
2012                                  &flow_tcf_mask_empty.ipv4,
2013                                  sizeof(flow_tcf_mask_supported.ipv4),
2014                                  error);
2015                         if (!mask.ipv4)
2016                                 return -rte_errno;
2017                         if (mask.ipv4->hdr.next_proto_id &&
2018                             mask.ipv4->hdr.next_proto_id != 0xff)
2019                                 return rte_flow_error_set
2020                                         (error, ENOTSUP,
2021                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2022                                          mask.ipv4,
2023                                          "no support for partial mask on"
2024                                          " \"hdr.next_proto_id\" field");
2025                         else if (mask.ipv4->hdr.next_proto_id)
2026                                 next_protocol =
2027                                         ((const struct rte_flow_item_ipv4 *)
2028                                          (items->spec))->hdr.next_proto_id;
2029                         break;
2030                 case RTE_FLOW_ITEM_TYPE_IPV6:
2031                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2032                                                            error);
2033                         if (ret < 0)
2034                                 return ret;
2035                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2036                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2037                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2038                         mask.ipv6 = flow_tcf_item_mask
2039                                 (items, &rte_flow_item_ipv6_mask,
2040                                  &flow_tcf_mask_supported.ipv6,
2041                                  &flow_tcf_mask_empty.ipv6,
2042                                  sizeof(flow_tcf_mask_supported.ipv6),
2043                                  error);
2044                         if (!mask.ipv6)
2045                                 return -rte_errno;
2046                         if (mask.ipv6->hdr.proto &&
2047                             mask.ipv6->hdr.proto != 0xff)
2048                                 return rte_flow_error_set
2049                                         (error, ENOTSUP,
2050                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2051                                          mask.ipv6,
2052                                          "no support for partial mask on"
2053                                          " \"hdr.proto\" field");
2054                         else if (mask.ipv6->hdr.proto)
2055                                 next_protocol =
2056                                         ((const struct rte_flow_item_ipv6 *)
2057                                          (items->spec))->hdr.proto;
2058                         break;
2059                 case RTE_FLOW_ITEM_TYPE_UDP:
2060                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2061                                                           next_protocol, error);
2062                         if (ret < 0)
2063                                 return ret;
2064                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2065                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2066                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2067                         mask.udp = flow_tcf_item_mask
2068                                 (items, &rte_flow_item_udp_mask,
2069                                  &flow_tcf_mask_supported.udp,
2070                                  &flow_tcf_mask_empty.udp,
2071                                  sizeof(flow_tcf_mask_supported.udp),
2072                                  error);
2073                         if (!mask.udp)
2074                                 return -rte_errno;
2075                         /*
2076                          * Save the presumed outer UDP item for extra check
2077                          * if the tunnel item will be found later in the list.
2078                          */
2079                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2080                                 outer_udp = items;
2081                         break;
2082                 case RTE_FLOW_ITEM_TYPE_TCP:
2083                         ret = mlx5_flow_validate_item_tcp
2084                                              (items, item_flags,
2085                                               next_protocol,
2086                                               &flow_tcf_mask_supported.tcp,
2087                                               error);
2088                         if (ret < 0)
2089                                 return ret;
2090                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2091                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2092                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2093                         mask.tcp = flow_tcf_item_mask
2094                                 (items, &rte_flow_item_tcp_mask,
2095                                  &flow_tcf_mask_supported.tcp,
2096                                  &flow_tcf_mask_empty.tcp,
2097                                  sizeof(flow_tcf_mask_supported.tcp),
2098                                  error);
2099                         if (!mask.tcp)
2100                                 return -rte_errno;
2101                         break;
2102                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2103                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2104                                 return rte_flow_error_set
2105                                         (error, ENOTSUP,
2106                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2107                                          "vxlan tunnel over vlan"
2108                                          " is not supported");
2109                         ret = mlx5_flow_validate_item_vxlan(items,
2110                                                             item_flags, error);
2111                         if (ret < 0)
2112                                 return ret;
2113                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2114                         mask.vxlan = flow_tcf_item_mask
2115                                 (items, &rte_flow_item_vxlan_mask,
2116                                  &flow_tcf_mask_supported.vxlan,
2117                                  &flow_tcf_mask_empty.vxlan,
2118                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2119                         if (!mask.vxlan)
2120                                 return -rte_errno;
2121                         if (mask.vxlan->vni[0] != 0xff ||
2122                             mask.vxlan->vni[1] != 0xff ||
2123                             mask.vxlan->vni[2] != 0xff)
2124                                 return rte_flow_error_set
2125                                         (error, ENOTSUP,
2126                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2127                                          mask.vxlan,
2128                                          "no support for partial or "
2129                                          "empty mask on \"vxlan.vni\" field");
2130                         /*
2131                          * The VNI item assumes a VXLAN tunnel. At least the
2132                          * outer destination UDP port must be specified
2133                          * without wildcards, so the kernel can select the
2134                          * virtual VXLAN device by port. An outer IPv4 or
2135                          * IPv6 item must also be present (wildcard or even
2136                          * zero masks are allowed) so the driver knows the
2137                          * tunnel IP version and processes UDP correctly.
2138                          */
2139                         if (!(item_flags &
2140                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2141                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2142                                 return rte_flow_error_set
2143                                                  (error, EINVAL,
2144                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2145                                                   items,
2146                                                   "no outer IP pattern found"
2147                                                   " for vxlan tunnel");
2148                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2149                                 return rte_flow_error_set
2150                                                  (error, EINVAL,
2151                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2152                                                   items,
2153                                                   "no outer UDP pattern found"
2154                                                   " for vxlan tunnel");
2155                         /*
2156                          * All items preceding the tunnel item become outer
2157                          * ones and need extra validation due to tc
2158                          * limitations on tunnel outer parameters. Currently
2159                          * only the outer UDP item requires an extra check;
2160                          * use the saved pointer instead of a list rescan.
2161                          */
2162                         assert(outer_udp);
2163                         ret = flow_tcf_validate_vxlan_decap_udp
2164                                                 (outer_udp, error);
2165                         if (ret < 0)
2166                                 return ret;
2167                         /* Reset L4 protocol for inner parameters. */
2168                         next_protocol = 0xff;
2169                         break;
2170                 default:
2171                         return rte_flow_error_set(error, ENOTSUP,
2172                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2173                                                   items, "item not supported");
2174                 }
2175         }
2176         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2177             (action_flags & MLX5_FLOW_ACTION_DROP))
2178                 return rte_flow_error_set(error, ENOTSUP,
2179                                           RTE_FLOW_ERROR_TYPE_ACTION,
2180                                           actions,
2181                                           "set action is not compatible with "
2182                                           "drop action");
2183         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2184             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2185                 return rte_flow_error_set(error, ENOTSUP,
2186                                           RTE_FLOW_ERROR_TYPE_ACTION,
2187                                           actions,
2188                                           "set action must be followed by "
2189                                           "port_id action");
2190         if (action_flags &
2191            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2192                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2193                         return rte_flow_error_set(error, EINVAL,
2194                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2195                                                   actions,
2196                                                   "no ipv4 item found in"
2197                                                   " pattern");
2198         }
2199         if (action_flags &
2200            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2201                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2202                         return rte_flow_error_set(error, EINVAL,
2203                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2204                                                   actions,
2205                                                   "no ipv6 item found in"
2206                                                   " pattern");
2207         }
2208         if (action_flags &
2209            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2210                 if (!(item_flags &
2211                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2212                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2213                         return rte_flow_error_set(error, EINVAL,
2214                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2215                                                   actions,
2216                                                   "no TCP/UDP item found in"
2217                                                   " pattern");
2218         }
2219         /*
2220          * FW syndrome (0xA9C090):
2221          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2222          *     forward to the uplink.
2223          */
2224         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2225             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2226             ((struct priv *)port_id_dev->data->dev_private)->representor)
2227                 return rte_flow_error_set(error, ENOTSUP,
2228                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2229                                           "vlan push can only be applied"
2230                                           " when forwarding to uplink port");
2231         /*
2232          * FW syndrome (0x294609):
2233          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2234          *     are supported only while forwarding to vport.
2235          */
2236         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2237             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2238                 return rte_flow_error_set(error, ENOTSUP,
2239                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2240                                           "vlan actions are supported"
2241                                           " only with port_id action");
2242         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2243             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2244                 return rte_flow_error_set(error, ENOTSUP,
2245                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2246                                           "vxlan actions are supported"
2247                                           " only with port_id action");
2248         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2249                 return rte_flow_error_set(error, EINVAL,
2250                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2251                                           "no fate action is found");
2252         if (action_flags &
2253            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2254                 if (!(item_flags &
2255                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2256                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2257                         return rte_flow_error_set(error, EINVAL,
2258                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2259                                                   actions,
2260                                                   "no IP found in pattern");
2261         }
2262         if (action_flags &
2263             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2264                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2265                         return rte_flow_error_set(error, ENOTSUP,
2266                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2267                                                   actions,
2268                                                   "no ethernet found in"
2269                                                   " pattern");
2270         }
2271         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2272             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2273                 return rte_flow_error_set(error, EINVAL,
2274                                           RTE_FLOW_ERROR_TYPE_ACTION,
2275                                           NULL,
2276                                           "no VNI pattern found"
2277                                           " for vxlan decap action");
2278         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2279             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2280                 return rte_flow_error_set(error, EINVAL,
2281                                           RTE_FLOW_ERROR_TYPE_ACTION,
2282                                           NULL,
2283                                           "vxlan encap not supported"
2284                                           " for tunneled traffic");
2285         return 0;
2286 }
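/*
 * Illustrative only (testpmd flow syntax, placeholder port and VNI values):
 * rule shapes that satisfy the validation above.
 *
 * VXLAN decapsulation:
 *	pattern eth / ipv4 / udp dst is 4789 / vxlan vni is 42 / end
 *	actions vxlan_decap / port_id id 1 / end
 *
 * VLAN push (allowed only when forwarding to the uplink port):
 *	pattern eth / end
 *	actions of_push_vlan ethertype 0x8100 / of_set_vlan_vid vlan_vid 5 /
 *		port_id id 0 / end
 */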
2287
2288 /**
2289  * Calculate maximum size of memory for flow items of Linux TC flower.
2290  *
2291  * @param[in] attr
2292  *   Pointer to the flow attributes.
2293  * @param[in] items
2294  *   Pointer to the list of items.
2295  *
2296  * @return
2297  *   Maximum size of memory for items.
2298  */
2299 static int
2300 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2301                         const struct rte_flow_item items[])
2302 {
2303         int size = 0;
2304
2305         size += SZ_NLATTR_STRZ_OF("flower") +
2306                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2307                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2308         if (attr->group > 0)
2309                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2310         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2311                 switch (items->type) {
2312                 case RTE_FLOW_ITEM_TYPE_VOID:
2313                         break;
2314                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2315                         break;
2316                 case RTE_FLOW_ITEM_TYPE_ETH:
2317                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2318                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2319                                 /* dst/src MAC addr and mask. */
2320                         break;
2321                 case RTE_FLOW_ITEM_TYPE_VLAN:
2322                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2323                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2324                                 /* VLAN Ether type. */
2325                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2326                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2327                         break;
2328                 case RTE_FLOW_ITEM_TYPE_IPV4:
2329                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2330                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2331                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2332                                 /* dst/src IP addr and mask. */
2333                         break;
2334                 case RTE_FLOW_ITEM_TYPE_IPV6:
2335                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2336                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2337                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2338                                 /* dst/src IP addr and mask. */
2339                         break;
2340                 case RTE_FLOW_ITEM_TYPE_UDP:
2341                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2342                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2343                                 /* dst/src port and mask. */
2344                         break;
2345                 case RTE_FLOW_ITEM_TYPE_TCP:
2346                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2347                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2348                                 /* dst/src port and mask. */
2349                         break;
2350                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2351                         size += SZ_NLATTR_TYPE_OF(uint32_t); /* VNI. */
2352                         break;
2353                 default:
2354                         DRV_LOG(WARNING,
2355                                 "unsupported item %p type %d,"
2356                                 " items must be validated before flow creation",
2357                                 (const void *)items, items->type);
2358                         break;
2359                 }
2360         }
2361         return size;
2362 }
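/*
 * Note: the value computed above is an upper bound, not an exact size.
 * Each item reserves room as if every supported attribute were emitted;
 * e.g. an ETH item accounts for one uint16_t ether-type attribute plus
 * four ETHER_ADDR_LEN attributes (dst/src address, each with a mask) even
 * when the flow specifies only some of those fields.
 */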
2363
2364 /**
2365  * Calculate the size of memory needed to store the VXLAN encapsulation
2366  * related items in the Netlink message buffer. The item list is
2367  * specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2368  * The item list should be validated.
2369  *
2370  * @param[in] action
2371  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2372  *   Holds the list of pattern items to compute the size from.
2373  *
2374  * @return
2375  *   The size of the part of the Netlink message buffer needed to
2376  *   store the VXLAN encapsulation item attributes.
2377  */
2378 static int
2379 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2380 {
2381         const struct rte_flow_item *items;
2382         int size = 0;
2383
2384         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2385         assert(action->conf);
2386
2387         items = ((const struct rte_flow_action_vxlan_encap *)
2388                                         action->conf)->definition;
2389         assert(items);
2390         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2391                 switch (items->type) {
2392                 case RTE_FLOW_ITEM_TYPE_VOID:
2393                         break;
2394                 case RTE_FLOW_ITEM_TYPE_ETH:
2395                         /* This item does not require message buffer. */
2396                         break;
2397                 case RTE_FLOW_ITEM_TYPE_IPV4:
2398                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2399                         break;
2400                 case RTE_FLOW_ITEM_TYPE_IPV6:
2401                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2402                         break;
2403                 case RTE_FLOW_ITEM_TYPE_UDP: {
2404                         const struct rte_flow_item_udp *udp = items->mask;
2405
2406                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2407                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2408                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2409                         break;
2410                 }
2411                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2412                         size += SZ_NLATTR_TYPE_OF(uint32_t); /* VNI. */
2413                         break;
2414                 default:
2415                         assert(false);
2416                         DRV_LOG(WARNING,
2417                                 "unsupported item %p type %d,"
2418                                 " items must be validated"
2419                                 " before flow creation",
2420                                 (const void *)items, items->type);
2421                         return 0;
2422                 }
2423         }
2424         return size;
2425 }
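
/*
 * Illustrative sketch, not part of the driver proper: the SZ_NLATTR_*
 * accounting used above assumes every Netlink attribute occupies an
 * aligned header plus an aligned payload. The standalone check below
 * (hypothetical name and arbitrary attribute type) demonstrates that
 * layout with plain libmnl calls.
 */
static void __rte_unused
flow_tcf_nlattr_size_example(void)
{
        char buf[MNL_SOCKET_BUFFER_SIZE];
        struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
        uint32_t before = nlh->nlmsg_len;

        mnl_attr_put_u16(nlh, 1 /* arbitrary attribute type. */, 0x1234);
        /* 4-byte attribute header + 2-byte payload, padded to 8 total. */
        assert(nlh->nlmsg_len - before == 8);
}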
2426
2427 /**
2428  * Calculate maximum size of memory for flow actions of Linux TC flower and
2429  * extract specified actions.
2430  *
2431  * @param[in] actions
2432  *   Pointer to the list of actions.
2433  * @param[out] action_flags
2434  *   Pointer to the detected actions.
2435  *
2436  * @return
2437  *   Maximum size of memory for actions.
2438  */
2439 static int
2440 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2441                               uint64_t *action_flags)
2442 {
2443         int size = 0;
2444         uint64_t flags = 0;
2445
2446         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2447         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2448                 switch (actions->type) {
2449                 case RTE_FLOW_ACTION_TYPE_VOID:
2450                         break;
2451                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2452                         size += SZ_NLATTR_NEST + /* na_act_index. */
2453                                 SZ_NLATTR_STRZ_OF("mirred") +
2454                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2455                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2456                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2457                         break;
2458                 case RTE_FLOW_ACTION_TYPE_JUMP:
2459                         size += SZ_NLATTR_NEST + /* na_act_index. */
2460                                 SZ_NLATTR_STRZ_OF("gact") +
2461                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2462                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2463                         flags |= MLX5_FLOW_ACTION_JUMP;
2464                         break;
2465                 case RTE_FLOW_ACTION_TYPE_DROP:
2466                         size += SZ_NLATTR_NEST + /* na_act_index. */
2467                                 SZ_NLATTR_STRZ_OF("gact") +
2468                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2469                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2470                         flags |= MLX5_FLOW_ACTION_DROP;
2471                         break;
2472                 case RTE_FLOW_ACTION_TYPE_COUNT:
2473                         break;
2474                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2475                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2476                         goto action_of_vlan;
2477                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2478                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2479                         goto action_of_vlan;
2480                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2481                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2482                         goto action_of_vlan;
2483                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2484                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2485                         goto action_of_vlan;
2486 action_of_vlan:
2487                         size += SZ_NLATTR_NEST + /* na_act_index. */
2488                                 SZ_NLATTR_STRZ_OF("vlan") +
2489                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2490                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2491                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2492                                 /* VLAN protocol. */
2493                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2494                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2495                         break;
2496                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2497                         size += SZ_NLATTR_NEST + /* na_act_index. */
2498                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2499                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2500                                 SZ_NLATTR_TYPE_OF(uint8_t);
2501                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2502                         size += flow_tcf_vxlan_encap_size(actions) +
2503                                 RTE_ALIGN_CEIL /* preceding encap params. */
2504                                 (sizeof(struct flow_tcf_vxlan_encap),
2505                                 MNL_ALIGNTO);
2506                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2507                         break;
2508                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2509                         size += SZ_NLATTR_NEST + /* na_act_index. */
2510                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2511                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2512                                 SZ_NLATTR_TYPE_OF(uint8_t);
2513                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2514                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2515                                 (sizeof(struct flow_tcf_vxlan_decap),
2516                                 MNL_ALIGNTO);
2517                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2518                         break;
2519                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2520                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2521                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2522                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2523                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2524                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2525                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2526                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2527                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2528                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2529                         size += flow_tcf_get_pedit_actions_size(&actions,
2530                                                                 &flags);
2531                         break;
2532                 default:
2533                         DRV_LOG(WARNING,
2534                                 "unsupported action %p type %d,"
2535                                 " actions must be validated before flow creation",
2536                                 (const void *)actions, actions->type);
2537                         break;
2538                 }
2539         }
2540         *action_flags = flags;
2541         return size;
2542 }
2543
2544 /**
2545  * Brand rtnetlink buffer with unique handle.
2546  *
2547  * This handle should be unique for a given network interface to avoid
2548  * collisions.
2549  *
2550  * @param nlh
2551  *   Pointer to Netlink message.
2552  * @param handle
2553  *   Unique 32-bit handle to use.
2554  */
2555 static void
2556 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2557 {
2558         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2559
2560         tcm->tcm_handle = handle;
2561         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2562                 (void *)nlh, handle);
2563 }
2564
2565 /**
2566  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2567  * memory required, allocates the memory, initializes Netlink message headers
2568  * and sets a unique TC message handle.
2569  *
2570  * @param[in] attr
2571  *   Pointer to the flow attributes.
2572  * @param[in] items
2573  *   Pointer to the list of items.
2574  * @param[in] actions
2575  *   Pointer to the list of actions.
2576  * @param[out] error
2577  *   Pointer to the error structure.
2578  *
2579  * @return
2580  *   Pointer to mlx5_flow object on success,
2581  *   otherwise NULL and rte_errno is set.
2582  */
2583 static struct mlx5_flow *
2584 flow_tcf_prepare(const struct rte_flow_attr *attr,
2585                  const struct rte_flow_item items[],
2586                  const struct rte_flow_action actions[],
2587                  struct rte_flow_error *error)
2588 {
2589         size_t size = RTE_ALIGN_CEIL
2590                         (sizeof(struct mlx5_flow),
2591                          alignof(struct flow_tcf_tunnel_hdr)) +
2592                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2593                       MNL_ALIGN(sizeof(struct tcmsg));
2594         struct mlx5_flow *dev_flow;
2595         uint64_t action_flags = 0;
2596         struct nlmsghdr *nlh;
2597         struct tcmsg *tcm;
2598         uint8_t *sp, *tun = NULL;
2599
2600         size += flow_tcf_get_items_size(attr, items);
2601         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2602         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2603         if (!dev_flow) {
2604                 rte_flow_error_set(error, ENOMEM,
2605                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2606                                    "not enough memory to create E-Switch flow");
2607                 return NULL;
2608         }
2609         sp = (uint8_t *)(dev_flow + 1);
2610         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2611                 sp = RTE_PTR_ALIGN
2612                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2613                 tun = sp;
2614                 sp += RTE_ALIGN_CEIL
2615                         (sizeof(struct flow_tcf_vxlan_encap),
2616                         MNL_ALIGNTO);
2617 #ifndef NDEBUG
2618                 size -= RTE_ALIGN_CEIL
2619                         (sizeof(struct flow_tcf_vxlan_encap),
2620                         MNL_ALIGNTO);
2621 #endif
2622         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2623                 sp = RTE_PTR_ALIGN
2624                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2625                 tun = sp;
2626                 sp += RTE_ALIGN_CEIL
2627                         (sizeof(struct flow_tcf_vxlan_decap),
2628                         MNL_ALIGNTO);
2629 #ifndef NDEBUG
2630                 size -= RTE_ALIGN_CEIL
2631                         (sizeof(struct flow_tcf_vxlan_decap),
2632                         MNL_ALIGNTO);
2633 #endif
2634         } else {
2635                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2636         }
2637         nlh = mnl_nlmsg_put_header(sp);
2638         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2639         *dev_flow = (struct mlx5_flow){
2640                 .tcf = (struct mlx5_flow_tcf){
2641 #ifndef NDEBUG
2642                         .nlsize = size - RTE_ALIGN_CEIL
2643                                 (sizeof(struct mlx5_flow),
2644                                  alignof(struct flow_tcf_tunnel_hdr)),
2645 #endif
2646                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2647                         .nlh = nlh,
2648                         .tcm = tcm,
2649                 },
2650         };
2651         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2652                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2653         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2654                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2655         /*
2656          * Generate a reasonably unique handle based on the address of the
2657          * target buffer.
2658          *
2659          * This is straightforward on 32-bit systems where the flow pointer can
2660          * be used directly. Otherwise, the pointer is shifted right by the
2661          * base-2 logarithm of the previous power of two of the buffer size
2662          * and its least significant 32 bits are used.
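          * For example (hypothetical numbers): with size = 392 the
          * previous power of two is 256, so on a 64-bit system the
          * pointer is shifted right by log2(256) = 8 bits before its
          * low 32 bits are used as the handle.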
2663          */
2664         if (sizeof(dev_flow) <= 4)
2665                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2666         else
2667                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2668                                        rte_log2_u32(rte_align32prevpow2(size)));
2669         return dev_flow;
2670 }
2671
2672 /**
2673  * Make adjustments for supporting count actions.
2674  *
2675  * @param[in] dev
2676  *   Pointer to the Ethernet device structure.
2677  * @param[in] dev_flow
2678  *   Pointer to mlx5_flow.
2679  * @param[out] error
2680  *   Pointer to error structure.
2681  *
2682  * @return
2683  *   0 on success, a negative errno value otherwise and rte_errno is set.
2684  */
2685 static int
2686 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2687                                   struct mlx5_flow *dev_flow,
2688                                   struct rte_flow_error *error)
2689 {
2690         struct rte_flow *flow = dev_flow->flow;
2691
2692         if (!flow->counter) {
2693                 flow->counter = flow_tcf_counter_new();
2694                 if (!flow->counter)
2695                         return rte_flow_error_set(error, rte_errno,
2696                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2697                                                   NULL,
2698                                                   "cannot get counter"
2699                                                   " context.");
2700         }
2701         return 0;
2702 }
2703
2704 /**
2705  * Convert VXLAN VNI to 32-bit integer.
2706  *
2707  * @param[in] vni
2708  *   VXLAN VNI in 24-bit wire format.
2709  *
2710  * @return
2711  *   VXLAN VNI as a 32-bit integer value in network endian.
2712  */
2713 static inline rte_be32_t
2714 vxlan_vni_as_be32(const uint8_t vni[3])
2715 {
2716         union {
2717                 uint8_t vni[4];
2718                 rte_be32_t dword;
2719         } ret = {
2720                 .vni = { 0, vni[0], vni[1], vni[2] },
2721         };
2722         return ret.dword;
2723 }
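
/*
 * Usage sketch (hypothetical function name and VNI value): the wire
 * order bytes 0x12:0x34:0x56 must yield the network-endian 32-bit
 * value 0x00123456.
 */
static void __rte_unused
flow_tcf_vni_example(void)
{
        static const uint8_t vni[3] = { 0x12, 0x34, 0x56 };

        assert(rte_be_to_cpu_32(vxlan_vni_as_be32(vni)) == 0x123456);
}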
2724
2725 /**
2726  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2727  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2728  * in the encapsulation parameters structure. The item must be prevalidated;
2729  * no validation checks are performed by this function.
2730  *
2731  * @param[in] spec
2732  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2733  * @param[in] mask
2734  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2735  * @param[out] encap
2736  *   Structure to fill the gathered MAC address data.
2737  */
2738 static void
2739 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2740                                const struct rte_flow_item_eth *mask,
2741                                struct flow_tcf_vxlan_encap *encap)
2742 {
2743         /* Item must be validated before. No redundant checks. */
2744         assert(spec);
2745         if (!mask || !memcmp(&mask->dst,
2746                              &rte_flow_item_eth_mask.dst,
2747                              sizeof(rte_flow_item_eth_mask.dst))) {
2748                 /*
2749                  * Ethernet addresses are not supported by
2750                  * tc as tunnel_key parameters. Destination
2751                  * address is needed to form encap packet
2752                  * header and retrieved by kernel from
2753                  * implicit sources (ARP table, etc),
2754                  * address masks are not supported at all.
2755                  */
2756                 encap->eth.dst = spec->dst;
2757                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2758         }
2759         if (!mask || !memcmp(&mask->src,
2760                              &rte_flow_item_eth_mask.src,
2761                              sizeof(rte_flow_item_eth_mask.src))) {
2762                 /*
2763                  * Ethernet addresses are not supported by
2764                  * tc as tunnel_key parameters. Source ethernet
2765                  * address is ignored anyway.
2766                  */
2767                 encap->eth.src = spec->src;
2768                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2769         }
2770 }
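
/*
 * Minimal usage sketch (hypothetical MAC addresses): a NULL mask means
 * "fully masked" for this helper, so both addresses are gathered and
 * the corresponding FLOW_TCF_ENCAP_ETH_* bits get set.
 */
static void __rte_unused
flow_tcf_encap_eth_example(struct flow_tcf_vxlan_encap *encap)
{
        static const struct rte_flow_item_eth spec = {
                .dst.addr_bytes = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 },
                .src.addr_bytes = { 0x00, 0xaa, 0xbb, 0xcc, 0xdd, 0xee },
        };

        flow_tcf_parse_vxlan_encap_eth(&spec, NULL, encap);
        assert(encap->mask & FLOW_TCF_ENCAP_ETH_DST);
        assert(encap->mask & FLOW_TCF_ENCAP_ETH_SRC);
}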
2771
2772 /**
2773  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2774  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2775  * in the encapsulation parameters structure. The item must be prevalidated;
2776  * no validation checks are performed by this function.
2777  *
2778  * @param[in] spec
2779  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2780  * @param[out] encap
2781  *   Structure to fill the gathered IPV4 address data.
2782  */
2783 static void
2784 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2785                                 struct flow_tcf_vxlan_encap *encap)
2786 {
2787         /* Item must be validated before. No redundant checks. */
2788         assert(spec);
2789         encap->ipv4.dst = spec->hdr.dst_addr;
2790         encap->ipv4.src = spec->hdr.src_addr;
2791         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2792                        FLOW_TCF_ENCAP_IPV4_DST;
2793 }
2794
2795 /**
2796  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2797  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2798  * in the encapsulation parameters structure. The item must be prevalidated;
2799  * no validation checks are performed by this function.
2800  *
2801  * @param[in] spec
2802  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2803  * @param[out] encap
2804  *   Structure to fill the gathered IPV6 address data.
2805  */
2806 static void
2807 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2808                                 struct flow_tcf_vxlan_encap *encap)
2809 {
2810         /* Item must be validated before. No redundant checks. */
2811         assert(spec);
2812         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2813         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2814         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2815                        FLOW_TCF_ENCAP_IPV6_DST;
2816 }
2817
2818 /**
2819  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2820  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2821  * in the encapsulation parameters structure. The item must be prevalidated;
2822  * no validation checks are performed by this function.
2823  *
2824  * @param[in] spec
2825  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2826  * @param[in] mask
2827  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2828  * @param[out] encap
2829  *   Structure to fill the gathered UDP port data.
2830  */
2831 static void
2832 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2833                                const struct rte_flow_item_udp *mask,
2834                                struct flow_tcf_vxlan_encap *encap)
2835 {
2836         assert(spec);
2837         encap->udp.dst = spec->hdr.dst_port;
2838         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2839         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2840                 encap->udp.src = spec->hdr.src_port;
2841                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2842         }
2843 }
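
/*
 * Usage sketch (hypothetical port values): with a zero source port in
 * the mask only the destination port is gathered, which matches the
 * conditional size accounting in flow_tcf_vxlan_encap_size().
 */
static void __rte_unused
flow_tcf_encap_udp_example(struct flow_tcf_vxlan_encap *encap)
{
        static const struct rte_flow_item_udp spec = {
                .hdr.dst_port = RTE_BE16(4789), /* IANA-assigned VXLAN port. */
        };
        static const struct rte_flow_item_udp mask = {
                .hdr.dst_port = RTE_BE16(0xffff),
        };

        flow_tcf_parse_vxlan_encap_udp(&spec, &mask, encap);
        assert(encap->mask & FLOW_TCF_ENCAP_UDP_DST);
        assert(!(encap->mask & FLOW_TCF_ENCAP_UDP_SRC));
}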
2844
2845 /**
2846  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2847  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2848  * in the encapsulation parameters structure. The item must be prevalidated;
2849  * no validation checks are performed by this function.
2850  *
2851  * @param[in] spec
2852  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2853  * @param[out] encap
2854  *   Structure to fill the gathered VNI address data.
2855  */
2856 static void
2857 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2858                                struct flow_tcf_vxlan_encap *encap)
2859 {
2860         /* Item must be validated before. No redundant checks. */
2861         assert(spec);
2862         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2863         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2864 }
2865
2866 /**
2867  * Populate consolidated encapsulation object from list of pattern items.
2868  *
2869  * Helper function to process configuration of action such as
2870  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list must be
2871  * validated beforehand, since there is no way to return a meaningful error.
2872  *
2873  * @param[in] action
2874  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2875  *   List of pattern items to gather data from.
2876  * @param[out] encap
2877  *   Structure to fill with the gathered data.
2878  */
2879 static void
2880 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
2881                            struct flow_tcf_vxlan_encap *encap)
2882 {
2883         union {
2884                 const struct rte_flow_item_eth *eth;
2885                 const struct rte_flow_item_ipv4 *ipv4;
2886                 const struct rte_flow_item_ipv6 *ipv6;
2887                 const struct rte_flow_item_udp *udp;
2888                 const struct rte_flow_item_vxlan *vxlan;
2889         } spec, mask;
2890         const struct rte_flow_item *items;
2891
2892         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2893         assert(action->conf);
2894
2895         items = ((const struct rte_flow_action_vxlan_encap *)
2896                                         action->conf)->definition;
2897         assert(items);
2898         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2899                 switch (items->type) {
2900                 case RTE_FLOW_ITEM_TYPE_VOID:
2901                         break;
2902                 case RTE_FLOW_ITEM_TYPE_ETH:
2903                         mask.eth = items->mask;
2904                         spec.eth = items->spec;
2905                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
2906                                                        encap);
2907                         break;
2908                 case RTE_FLOW_ITEM_TYPE_IPV4:
2909                         spec.ipv4 = items->spec;
2910                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
2911                         break;
2912                 case RTE_FLOW_ITEM_TYPE_IPV6:
2913                         spec.ipv6 = items->spec;
2914                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
2915                         break;
2916                 case RTE_FLOW_ITEM_TYPE_UDP:
2917                         mask.udp = items->mask;
2918                         spec.udp = items->spec;
2919                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
2920                                                        encap);
2921                         break;
2922                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2923                         spec.vxlan = items->spec;
2924                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
2925                         break;
2926                 default:
2927                         assert(false);
2928                         DRV_LOG(WARNING,
2929                                 "unsupported item %p type %d,"
2930                                 " items must be validated"
2931                                 " before flow creation",
2932                                 (const void *)items, items->type);
2933                         encap->mask = 0;
2934                         return;
2935                 }
2936         }
2937 }
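
/*
 * End-to-end sketch (hypothetical addresses, port and VNI): a
 * definition list as carried by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
 * which the parser above walks to consolidate the encapsulation
 * header data.
 */
static void __rte_unused
flow_tcf_vxlan_encap_parse_example(struct flow_tcf_vxlan_encap *encap)
{
        static const struct rte_flow_item_ipv4 ipv4 = {
                .hdr.src_addr = RTE_BE32(0xc0a80001), /* 192.168.0.1 */
                .hdr.dst_addr = RTE_BE32(0xc0a80002), /* 192.168.0.2 */
        };
        static const struct rte_flow_item_udp udp = {
                .hdr.dst_port = RTE_BE16(4789),
        };
        static const struct rte_flow_item_vxlan vxlan = {
                .vni = { 0x00, 0x00, 0x64 }, /* VNI 100. */
        };
        struct rte_flow_item items[] = {
                { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
                { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
                { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        const struct rte_flow_action_vxlan_encap conf = {
                .definition = items,
        };
        const struct rte_flow_action action = {
                .type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
                .conf = &conf,
        };

        flow_tcf_vxlan_encap_parse(&action, encap);
        assert(encap->mask & FLOW_TCF_ENCAP_VXLAN_VNI);
}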
2938
2939 /**
2940  * Translate flow for Linux TC flower and construct Netlink message.
2941  *
2942  * @param[in] dev
2943  *   Pointer to the Ethernet device structure.
2944  * @param[in, out] dev_flow
2945  *   Pointer to the mlx5 sub flow.
2946  * @param[in] attr
2947  *   Pointer to the flow attributes.
2948  * @param[in] items
2949  *   Pointer to the list of items.
2950  * @param[in] actions
2951  *   Pointer to the list of actions.
2952  * @param[out] error
2953  *   Pointer to the error structure.
2954  *
2955  * @return
2956  *   0 on success, a negative errno value otherwise and rte_errno is set.
2957  */
2958 static int
2959 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
2960                    const struct rte_flow_attr *attr,
2961                    const struct rte_flow_item items[],
2962                    const struct rte_flow_action actions[],
2963                    struct rte_flow_error *error)
2964 {
2965         union {
2966                 const struct rte_flow_item_port_id *port_id;
2967                 const struct rte_flow_item_eth *eth;
2968                 const struct rte_flow_item_vlan *vlan;
2969                 const struct rte_flow_item_ipv4 *ipv4;
2970                 const struct rte_flow_item_ipv6 *ipv6;
2971                 const struct rte_flow_item_tcp *tcp;
2972                 const struct rte_flow_item_udp *udp;
2973                 const struct rte_flow_item_vxlan *vxlan;
2974         } spec, mask;
2975         union {
2976                 const struct rte_flow_action_port_id *port_id;
2977                 const struct rte_flow_action_jump *jump;
2978                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
2979                 const struct rte_flow_action_of_set_vlan_vid *
2980                         of_set_vlan_vid;
2981                 const struct rte_flow_action_of_set_vlan_pcp *
2982                         of_set_vlan_pcp;
2983         } conf;
2984         union {
2985                 struct flow_tcf_tunnel_hdr *hdr;
2986                 struct flow_tcf_vxlan_decap *vxlan;
2987         } decap = {
2988                 .hdr = NULL,
2989         };
2990         union {
2991                 struct flow_tcf_tunnel_hdr *hdr;
2992                 struct flow_tcf_vxlan_encap *vxlan;
2993         } encap = {
2994                 .hdr = NULL,
2995         };
2996         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
2997         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
2998         struct tcmsg *tcm = dev_flow->tcf.tcm;
2999         uint32_t na_act_index_cur;
3000         bool eth_type_set = 0;
3001         bool vlan_present = 0;
3002         bool vlan_eth_type_set = 0;
3003         bool ip_proto_set = 0;
3004         struct nlattr *na_flower;
3005         struct nlattr *na_flower_act;
3006         struct nlattr *na_vlan_id = NULL;
3007         struct nlattr *na_vlan_priority = NULL;
3008         uint64_t item_flags = 0;
3009         int ret;
3010
3011         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3012                                                 PTOI_TABLE_SZ_MAX(dev)));
3013         if (dev_flow->tcf.tunnel) {
3014                 switch (dev_flow->tcf.tunnel->type) {
3015                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3016                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3017                         break;
3018                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3019                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3020                         break;
3021                 /* New tunnel actions can be added here. */
3022                 default:
3023                         assert(false);
3024                         break;
3025                 }
3026         }
3027         nlh = dev_flow->tcf.nlh;
3028         tcm = dev_flow->tcf.tcm;
3029         /* Prepare API must have been called beforehand. */
3030         assert(nlh != NULL && tcm != NULL);
3031         tcm->tcm_family = AF_UNSPEC;
3032         tcm->tcm_ifindex = ptoi[0].ifindex;
3033         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3034         /*
3035          * Priority cannot be zero to prevent the kernel from picking one
3036          * automatically.
3037          */
3038         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3039                                   RTE_BE16(ETH_P_ALL));
3040         if (attr->group > 0)
3041                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3042         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3043         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3044         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3045                 unsigned int i;
3046
3047                 switch (items->type) {
3048                 case RTE_FLOW_ITEM_TYPE_VOID:
3049                         break;
3050                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3051                         mask.port_id = flow_tcf_item_mask
3052                                 (items, &rte_flow_item_port_id_mask,
3053                                  &flow_tcf_mask_supported.port_id,
3054                                  &flow_tcf_mask_empty.port_id,
3055                                  sizeof(flow_tcf_mask_supported.port_id),
3056                                  error);
3057                         assert(mask.port_id);
3058                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3059                                 break;
3060                         spec.port_id = items->spec;
3061                         if (!mask.port_id->id)
3062                                 i = 0;
3063                         else
3064                                 for (i = 0; ptoi[i].ifindex; ++i)
3065                                         if (ptoi[i].port_id == spec.port_id->id)
3066                                                 break;
3067                         assert(ptoi[i].ifindex);
3068                         tcm->tcm_ifindex = ptoi[i].ifindex;
3069                         break;
3070                 case RTE_FLOW_ITEM_TYPE_ETH:
3071                         item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ?
3072                                       MLX5_FLOW_LAYER_INNER_L2 :
3073                                       MLX5_FLOW_LAYER_OUTER_L2;
3074                         mask.eth = flow_tcf_item_mask
3075                                 (items, &rte_flow_item_eth_mask,
3076                                  &flow_tcf_mask_supported.eth,
3077                                  &flow_tcf_mask_empty.eth,
3078                                  sizeof(flow_tcf_mask_supported.eth),
3079                                  error);
3080                         assert(mask.eth);
3081                         if (mask.eth == &flow_tcf_mask_empty.eth)
3082                                 break;
3083                         spec.eth = items->spec;
3084                         if (decap.vxlan &&
3085                             !(item_flags & MLX5_FLOW_LAYER_VXLAN)) {
3086                                 DRV_LOG(WARNING,
3087                                         "outer L2 addresses cannot be forced"
3088                                         " for vxlan decapsulation, parameter"
3089                                         " ignored");
3090                                 break;
3091                         }
3092                         if (mask.eth->type) {
3093                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3094                                                  spec.eth->type);
3095                                 eth_type_set = 1;
3096                         }
3097                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3098                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3099                                              ETHER_ADDR_LEN,
3100                                              spec.eth->dst.addr_bytes);
3101                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3102                                              ETHER_ADDR_LEN,
3103                                              mask.eth->dst.addr_bytes);
3104                         }
3105                         if (!is_zero_ether_addr(&mask.eth->src)) {
3106                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3107                                              ETHER_ADDR_LEN,
3108                                              spec.eth->src.addr_bytes);
3109                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3110                                              ETHER_ADDR_LEN,
3111                                              mask.eth->src.addr_bytes);
3112                         }
3113                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3114                         break;
3115                 case RTE_FLOW_ITEM_TYPE_VLAN:
3116                         assert(!encap.hdr);
3117                         assert(!decap.hdr);
3118                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
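                        /*
                         * Flower matches VLAN frames via ETH_TYPE 0x8100
                         * plus the TCA_FLOWER_KEY_VLAN_* attributes; the
                         * inner ethertype, when masked, is conveyed in
                         * TCA_FLOWER_KEY_VLAN_ETH_TYPE below.
                         */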
3119                         mask.vlan = flow_tcf_item_mask
3120                                 (items, &rte_flow_item_vlan_mask,
3121                                  &flow_tcf_mask_supported.vlan,
3122                                  &flow_tcf_mask_empty.vlan,
3123                                  sizeof(flow_tcf_mask_supported.vlan),
3124                                  error);
3125                         assert(mask.vlan);
3126                         if (!eth_type_set)
3127                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3128                                                  RTE_BE16(ETH_P_8021Q));
3129                         eth_type_set = 1;
3130                         vlan_present = 1;
3131                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3132                                 break;
3133                         spec.vlan = items->spec;
3134                         if (mask.vlan->inner_type) {
3135                                 mnl_attr_put_u16(nlh,
3136                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3137                                                  spec.vlan->inner_type);
3138                                 vlan_eth_type_set = 1;
3139                         }
3140                         if (mask.vlan->tci & RTE_BE16(0xe000))
3141                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3142                                                 (rte_be_to_cpu_16
3143                                                  (spec.vlan->tci) >> 13) & 0x7);
3144                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3145                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3146                                                  rte_be_to_cpu_16
3147                                                  (spec.vlan->tci &
3148                                                   RTE_BE16(0x0fff)));
3149                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3150                         break;
3151                 case RTE_FLOW_ITEM_TYPE_IPV4:
3152                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3153                         mask.ipv4 = flow_tcf_item_mask
3154                                 (items, &rte_flow_item_ipv4_mask,
3155                                  &flow_tcf_mask_supported.ipv4,
3156                                  &flow_tcf_mask_empty.ipv4,
3157                                  sizeof(flow_tcf_mask_supported.ipv4),
3158                                  error);
3159                         assert(mask.ipv4);
3160                         spec.ipv4 = items->spec;
3161                         if (!decap.vxlan) {
3162                                 if (!eth_type_set ||
3163                                     (!vlan_eth_type_set && vlan_present))
3164                                         mnl_attr_put_u16
3165                                                 (nlh,
3166                                                  vlan_present ?
3167                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3168                                                  TCA_FLOWER_KEY_ETH_TYPE,
3169                                                  RTE_BE16(ETH_P_IP));
3170                                 eth_type_set = 1;
3171                                 vlan_eth_type_set = 1;
3172                                 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
3173                                         break;
3174                                 if (mask.ipv4->hdr.next_proto_id) {
3175                                         mnl_attr_put_u8
3176                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3177                                                  spec.ipv4->hdr.next_proto_id);
3178                                         ip_proto_set = 1;
3179                                 }
3180                         } else {
3181                                 assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
3182                         }
3183                         if (mask.ipv4->hdr.src_addr) {
3184                                 mnl_attr_put_u32
3185                                         (nlh, decap.vxlan ?
3186                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3187                                          TCA_FLOWER_KEY_IPV4_SRC,
3188                                          spec.ipv4->hdr.src_addr);
3189                                 mnl_attr_put_u32
3190                                         (nlh, decap.vxlan ?
3191                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3192                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3193                                          mask.ipv4->hdr.src_addr);
3194                         }
3195                         if (mask.ipv4->hdr.dst_addr) {
3196                                 mnl_attr_put_u32
3197                                         (nlh, decap.vxlan ?
3198                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3199                                          TCA_FLOWER_KEY_IPV4_DST,
3200                                          spec.ipv4->hdr.dst_addr);
3201                                 mnl_attr_put_u32
3202                                         (nlh, decap.vxlan ?
3203                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3204                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3205                                          mask.ipv4->hdr.dst_addr);
3206                         }
3207                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3208                         break;
3209                 case RTE_FLOW_ITEM_TYPE_IPV6:
3210                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3211                         mask.ipv6 = flow_tcf_item_mask
3212                                 (items, &rte_flow_item_ipv6_mask,
3213                                  &flow_tcf_mask_supported.ipv6,
3214                                  &flow_tcf_mask_empty.ipv6,
3215                                  sizeof(flow_tcf_mask_supported.ipv6),
3216                                  error);
3217                         assert(mask.ipv6);
3218                         spec.ipv6 = items->spec;
3219                         if (!decap.vxlan) {
3220                                 if (!eth_type_set ||
3221                                     (!vlan_eth_type_set && vlan_present))
3222                                         mnl_attr_put_u16
3223                                                 (nlh,
3224                                                  vlan_present ?
3225                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3226                                                  TCA_FLOWER_KEY_ETH_TYPE,
3227                                                  RTE_BE16(ETH_P_IPV6));
3228                                 eth_type_set = 1;
3229                                 vlan_eth_type_set = 1;
3230                                 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
3231                                         break;
3232                                 if (mask.ipv6->hdr.proto) {
3233                                         mnl_attr_put_u8
3234                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3235                                                  spec.ipv6->hdr.proto);
3236                                         ip_proto_set = 1;
3237                                 }
3238                         } else {
3239                                 assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
3240                         }
3241                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
3242                                 mnl_attr_put(nlh, decap.vxlan ?
3243                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3244                                              TCA_FLOWER_KEY_IPV6_SRC,
3245                                              IPV6_ADDR_LEN,
3246                                              spec.ipv6->hdr.src_addr);
3247                                 mnl_attr_put(nlh, decap.vxlan ?
3248                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3249                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3250                                              IPV6_ADDR_LEN,
3251                                              mask.ipv6->hdr.src_addr);
3252                         }
3253                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
3254                                 mnl_attr_put(nlh, decap.vxlan ?
3255                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3256                                              TCA_FLOWER_KEY_IPV6_DST,
3257                                              IPV6_ADDR_LEN,
3258                                              spec.ipv6->hdr.dst_addr);
3259                                 mnl_attr_put(nlh, decap.vxlan ?
3260                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3261                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3262                                              IPV6_ADDR_LEN,
3263                                              mask.ipv6->hdr.dst_addr);
3264                         }
3265                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3266                         break;
3267                 case RTE_FLOW_ITEM_TYPE_UDP:
3268                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
3269                         mask.udp = flow_tcf_item_mask
3270                                 (items, &rte_flow_item_udp_mask,
3271                                  &flow_tcf_mask_supported.udp,
3272                                  &flow_tcf_mask_empty.udp,
3273                                  sizeof(flow_tcf_mask_supported.udp),
3274                                  error);
3275                         assert(mask.udp);
3276                         spec.udp = items->spec;
3277                         if (!decap.vxlan) {
3278                                 if (!ip_proto_set)
3279                                         mnl_attr_put_u8
3280                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3281                                                 IPPROTO_UDP);
3282                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3283                                         break;
3284                         } else {
3285                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3286                                 decap.vxlan->udp_port =
3287                                         rte_be_to_cpu_16
3288                                                 (spec.udp->hdr.dst_port);
3289                         }
3290                         if (mask.udp->hdr.src_port) {
3291                                 mnl_attr_put_u16
3292                                         (nlh, decap.vxlan ?
3293                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3294                                          TCA_FLOWER_KEY_UDP_SRC,
3295                                          spec.udp->hdr.src_port);
3296                                 mnl_attr_put_u16
3297                                         (nlh, decap.vxlan ?
3298                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3299                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3300                                          mask.udp->hdr.src_port);
3301                         }
3302                         if (mask.udp->hdr.dst_port) {
3303                                 mnl_attr_put_u16
3304                                         (nlh, decap.vxlan ?
3305                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3306                                          TCA_FLOWER_KEY_UDP_DST,
3307                                          spec.udp->hdr.dst_port);
3308                                 mnl_attr_put_u16
3309                                         (nlh, decap.vxlan ?
3310                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3311                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3312                                          mask.udp->hdr.dst_port);
3313                         }
3314                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3315                         break;
3316                 case RTE_FLOW_ITEM_TYPE_TCP:
3317                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
3318                         mask.tcp = flow_tcf_item_mask
3319                                 (items, &rte_flow_item_tcp_mask,
3320                                  &flow_tcf_mask_supported.tcp,
3321                                  &flow_tcf_mask_empty.tcp,
3322                                  sizeof(flow_tcf_mask_supported.tcp),
3323                                  error);
3324                         assert(mask.tcp);
3325                         if (!ip_proto_set)
3326                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3327                                                 IPPROTO_TCP);
3328                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3329                                 break;
3330                         spec.tcp = items->spec;
3331                         if (mask.tcp->hdr.src_port) {
3332                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3333                                                  spec.tcp->hdr.src_port);
3334                                 mnl_attr_put_u16(nlh,
3335                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3336                                                  mask.tcp->hdr.src_port);
3337                         }
3338                         if (mask.tcp->hdr.dst_port) {
3339                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3340                                                  spec.tcp->hdr.dst_port);
3341                                 mnl_attr_put_u16(nlh,
3342                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3343                                                  mask.tcp->hdr.dst_port);
3344                         }
3345                         if (mask.tcp->hdr.tcp_flags) {
3346                                 mnl_attr_put_u16
3347                                         (nlh,
3348                                          TCA_FLOWER_KEY_TCP_FLAGS,
3349                                          rte_cpu_to_be_16
3350                                                 (spec.tcp->hdr.tcp_flags));
3351                                 mnl_attr_put_u16
3352                                         (nlh,
3353                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3354                                          rte_cpu_to_be_16
3355                                                 (mask.tcp->hdr.tcp_flags));
3356                         }
3357                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3358                         break;
3359                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3360                         assert(decap.vxlan);
3361                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3362                         spec.vxlan = items->spec;
3363                         mnl_attr_put_u32(nlh,
3364                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3365                                          vxlan_vni_as_be32(spec.vxlan->vni));
3366                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3367                         break;
3368                 default:
3369                         return rte_flow_error_set(error, ENOTSUP,
3370                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3371                                                   NULL, "item not supported");
3372                 }
3373         }
3374         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3375         na_act_index_cur = 1;
3376         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3377                 struct nlattr *na_act_index;
3378                 struct nlattr *na_act;
3379                 unsigned int vlan_act;
3380                 unsigned int i;
3381
3382                 switch (actions->type) {
3383                 case RTE_FLOW_ACTION_TYPE_VOID:
3384                         break;
3385                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3386                         conf.port_id = actions->conf;
3387                         if (conf.port_id->original)
3388                                 i = 0;
3389                         else
3390                                 for (i = 0; ptoi[i].ifindex; ++i)
3391                                         if (ptoi[i].port_id == conf.port_id->id)
3392                                                 break;
3393                         assert(ptoi[i].ifindex);
3394                         na_act_index =
3395                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3396                         assert(na_act_index);
3397                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3398                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3399                         assert(na_act);
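                        /*
                         * For VXLAN encap the mirred target is the VTEP
                         * netdev, which may not exist yet; remember where
                         * the ifindex lands in the message so the apply
                         * stage can patch it once the VTEP is created.
                         */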
3400                         if (encap.hdr) {
3401                                 assert(dev_flow->tcf.tunnel);
3402                                 dev_flow->tcf.tunnel->ifindex_ptr =
3403                                         &((struct tc_mirred *)
3404                                         mnl_attr_get_payload
3405                                         (mnl_nlmsg_get_payload_tail
3406                                                 (nlh)))->ifindex;
3407                         }
3408                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3409                                      sizeof(struct tc_mirred),
3410                                      &(struct tc_mirred){
3411                                         .action = TC_ACT_STOLEN,
3412                                         .eaction = TCA_EGRESS_REDIR,
3413                                         .ifindex = ptoi[i].ifindex,
3414                                      });
3415                         mnl_attr_nest_end(nlh, na_act);
3416                         mnl_attr_nest_end(nlh, na_act_index);
3417                         break;
3418                 case RTE_FLOW_ACTION_TYPE_JUMP:
3419                         conf.jump = actions->conf;
3420                         na_act_index =
3421                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3422                         assert(na_act_index);
3423                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3424                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3425                         assert(na_act);
3426                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3427                                      sizeof(struct tc_gact),
3428                                      &(struct tc_gact){
3429                                         .action = TC_ACT_GOTO_CHAIN |
3430                                                   conf.jump->group,
3431                                      });
3432                         mnl_attr_nest_end(nlh, na_act);
3433                         mnl_attr_nest_end(nlh, na_act_index);
3434                         break;
3435                 case RTE_FLOW_ACTION_TYPE_DROP:
3436                         na_act_index =
3437                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3438                         assert(na_act_index);
3439                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3440                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3441                         assert(na_act);
3442                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3443                                      sizeof(struct tc_gact),
3444                                      &(struct tc_gact){
3445                                         .action = TC_ACT_SHOT,
3446                                      });
3447                         mnl_attr_nest_end(nlh, na_act);
3448                         mnl_attr_nest_end(nlh, na_act_index);
3449                         break;
3450                 case RTE_FLOW_ACTION_TYPE_COUNT:
3451                         /*
3452                          * Driver adds the count action implicitly for
3453                          * each rule it creates.
3454                          */
3455                         ret = flow_tcf_translate_action_count(dev,
3456                                                               dev_flow, error);
3457                         if (ret < 0)
3458                                 return ret;
3459                         break;
3460                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3461                         conf.of_push_vlan = NULL;
3462                         vlan_act = TCA_VLAN_ACT_POP;
3463                         goto action_of_vlan;
3464                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3465                         conf.of_push_vlan = actions->conf;
3466                         vlan_act = TCA_VLAN_ACT_PUSH;
3467                         goto action_of_vlan;
3468                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3469                         conf.of_set_vlan_vid = actions->conf;
3470                         if (na_vlan_id)
3471                                 goto override_na_vlan_id;
3472                         vlan_act = TCA_VLAN_ACT_MODIFY;
3473                         goto action_of_vlan;
3474                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3475                         conf.of_set_vlan_pcp = actions->conf;
3476                         if (na_vlan_priority)
3477                                 goto override_na_vlan_priority;
3478                         vlan_act = TCA_VLAN_ACT_MODIFY;
3479                         goto action_of_vlan;
3480 action_of_vlan:
3481                         na_act_index =
3482                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3483                         assert(na_act_index);
3484                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3485                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3486                         assert(na_act);
3487                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3488                                      sizeof(struct tc_vlan),
3489                                      &(struct tc_vlan){
3490                                         .action = TC_ACT_PIPE,
3491                                         .v_action = vlan_act,
3492                                      });
3493                         if (vlan_act == TCA_VLAN_ACT_POP) {
3494                                 mnl_attr_nest_end(nlh, na_act);
3495                                 mnl_attr_nest_end(nlh, na_act_index);
3496                                 break;
3497                         }
3498                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3499                                 mnl_attr_put_u16(nlh,
3500                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3501                                                  conf.of_push_vlan->ethertype);
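                        /*
                         * Emit TCA_VLAN_PAD placeholders and remember
                         * their positions; either this action or a later
                         * SET_VLAN_VID/PCP one rewrites them in place
                         * through the override labels below.
                         */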
3502                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3503                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3504                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3505                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3506                         mnl_attr_nest_end(nlh, na_act);
3507                         mnl_attr_nest_end(nlh, na_act_index);
3508                         if (actions->type ==
3509                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3510 override_na_vlan_id:
3511                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3512                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3513                                         rte_be_to_cpu_16
3514                                         (conf.of_set_vlan_vid->vlan_vid);
3515                         } else if (actions->type ==
3516                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3517 override_na_vlan_priority:
3518                                 na_vlan_priority->nla_type =
3519                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3520                                 *(uint8_t *)mnl_attr_get_payload
3521                                         (na_vlan_priority) =
3522                                         conf.of_set_vlan_pcp->vlan_pcp;
3523                         }
3524                         break;
3525                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3526                         assert(decap.vxlan);
3527                         assert(dev_flow->tcf.tunnel);
3528                         dev_flow->tcf.tunnel->ifindex_ptr =
3529                                 (unsigned int *)&tcm->tcm_ifindex;
3530                         na_act_index =
3531                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3532                         assert(na_act_index);
3533                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3534                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3535                         assert(na_act);
3536                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3537                                 sizeof(struct tc_tunnel_key),
3538                                 &(struct tc_tunnel_key){
3539                                         .action = TC_ACT_PIPE,
3540                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3541                                         });
3542                         mnl_attr_nest_end(nlh, na_act);
3543                         mnl_attr_nest_end(nlh, na_act_index);
3544                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3545                         break;
3546                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3547                         assert(encap.vxlan);
3548                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3549                         na_act_index =
3550                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3551                         assert(na_act_index);
3552                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3553                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3554                         assert(na_act);
3555                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3556                                 sizeof(struct tc_tunnel_key),
3557                                 &(struct tc_tunnel_key){
3558                                         .action = TC_ACT_PIPE,
3559                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3560                                         });
3561                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3562                                 mnl_attr_put_u16(nlh,
3563                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3564                                          encap.vxlan->udp.dst);
3565                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3566                                 mnl_attr_put_u32(nlh,
3567                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3568                                          encap.vxlan->ipv4.src);
3569                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3570                                 mnl_attr_put_u32(nlh,
3571                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3572                                          encap.vxlan->ipv4.dst);
3573                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3574                                 mnl_attr_put(nlh,
3575                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3576                                          sizeof(encap.vxlan->ipv6.src),
3577                                          &encap.vxlan->ipv6.src);
3578                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3579                                 mnl_attr_put(nlh,
3580                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3581                                          sizeof(encap.vxlan->ipv6.dst),
3582                                          &encap.vxlan->ipv6.dst);
3583                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3584                                 mnl_attr_put_u32(nlh,
3585                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3586                                          vxlan_vni_as_be32
3587                                                 (encap.vxlan->vxlan.vni));
3588                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3589                         mnl_attr_nest_end(nlh, na_act);
3590                         mnl_attr_nest_end(nlh, na_act_index);
3591                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3592                         break;
3593                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3594                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3595                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3596                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3597                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3598                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3599                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3600                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3601                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3602                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3603                         na_act_index =
3604                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3605                         flow_tcf_create_pedit_mnl_msg(nlh,
3606                                                       &actions, item_flags);
3607                         mnl_attr_nest_end(nlh, na_act_index);
3608                         break;
3609                 default:
3610                         return rte_flow_error_set(error, ENOTSUP,
3611                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3612                                                   actions,
3613                                                   "action not supported");
3614                 }
3615         }
3616         assert(na_flower);
3617         assert(na_flower_act);
3618         mnl_attr_nest_end(nlh, na_flower_act);
3619         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3620                                         (mnl_nlmsg_get_payload_tail(nlh));
3621         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3622                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3623         mnl_attr_nest_end(nlh, na_flower);
3624         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3625                 dev_flow->tcf.tunnel->ifindex_org =
3626                         *dev_flow->tcf.tunnel->ifindex_ptr;
3627         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3628         return 0;
3629 }
3630
3631 /**
3632  * Send Netlink message with acknowledgment.
3633  *
3634  * @param tcf
3635  *   Flow context to use.
3636  * @param nlh
3637  *   Message to send. This function always raises the NLM_F_ACK flag before
3638  *   sending.
3639  * @param[in] cb
3640  *   Callback handler for received message.
3641  * @param[in] arg
3642  *   Context pointer for callback handler.
3643  *
3644  * @return
3645  *   0 on success, a negative errno value otherwise and rte_errno is set.
3646  */
3647 static int
3648 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3649                 struct nlmsghdr *nlh,
3650                 mnl_cb_t cb, void *arg)
3651 {
3652         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3653         uint32_t seq = tcf->seq++;
3654         int ret, err = 0;
3655
3656         assert(tcf->nl);
3657         assert(tcf->buf);
3658         if (!seq) {
3659                 /* seq 0 is reserved for kernel event-driven notifications. */
3660                 seq = tcf->seq++;
3661         }
3662         nlh->nlmsg_seq = seq;
3663         nlh->nlmsg_flags |= NLM_F_ACK;
3664         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3665         if (ret <= 0) {
3666                 /* Message send error occurred. */
3667                 rte_errno = errno;
3668                 return -rte_errno;
3669         }
3670         nlh = (struct nlmsghdr *)(tcf->buf);
3671         /*
3672          * The following loop postpones non-fatal errors until multipart
3673          * messages are complete.
3674          */
3675         while (true) {
3676                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3677                 if (ret < 0) {
3678                         err = errno;
3679                         /*
3680                          * In case of overflow we keep receiving until
3681                          * the end of the multipart message. Part of the
3682                          * reply may be lost, so mark and return an error.
3683                          */
3684                         if (err != ENOSPC ||
3685                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3686                             nlh->nlmsg_type == NLMSG_DONE)
3687                                 break;
3688                 } else {
3689                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3690                         if (!ret) {
3691                                 /*
3692                                  * libmnl returns 0 if DONE or
3693                                  * success ACK message found.
3694                                  */
3695                                 break;
3696                         }
3697                         if (ret < 0) {
3698                                 /*
3699                                  * ACK message with error found
3700                                  * or some error occurred.
3701                                  */
3702                                 err = errno;
3703                                 break;
3704                         }
3705                         /* We should continue receiving. */
3706                 }
3707         }
3708         if (!err)
3709                 return 0;
3710         rte_errno = err;
3711         return -err;
3712 }
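
/*
 * Illustrative usage sketch, not part of the driver: callers build the
 * request in tcf->buf and wait synchronously for the kernel reply. The
 * callback "query_cb" and its context "ctx" are hypothetical names here.
 *
 *     struct nlmsghdr *nlh = mnl_nlmsg_put_header(tcf->buf);
 *
 *     nlh->nlmsg_type = RTM_GETADDR;
 *     nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *     if (flow_tcf_nl_ack(tcf, nlh, query_cb, &ctx))
 *             DRV_LOG(WARNING, "netlink: query error %d", rte_errno);
 */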
3713
3714 #define MNL_BUF_EXTRA_SPACE 16
3715 #define MNL_REQUEST_SIZE_MIN 256
3716 #define MNL_REQUEST_SIZE_MAX 2048
3717 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3718                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
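
/*
 * For example (illustrative): with the common 4096-byte page size
 * MNL_REQUEST_SIZE evaluates to MNL_REQUEST_SIZE_MAX (2048), while a
 * hypothetical 128-byte page would be raised to MNL_REQUEST_SIZE_MIN (256).
 */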
3719
3720 /* Data structures used by flow_tcf_xxx_cb() routines. */
3721 struct tcf_nlcb_buf {
3722         LIST_ENTRY(tcf_nlcb_buf) next;
3723         uint32_t size;
3724         alignas(struct nlmsghdr)
3725         uint8_t msg[]; /**< Netlink message data. */
3726 };
3727
3728 struct tcf_nlcb_context {
3729         unsigned int ifindex; /**< Base interface index. */
3730         uint32_t bufsize;
3731         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3732 };
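
/*
 * Typical on-stack initialization, mirroring the cleanup routines below
 * (shown for reference only):
 *
 *     struct tcf_nlcb_context ctx = {
 *             .ifindex = ifindex,
 *             .bufsize = MNL_REQUEST_SIZE,
 *             .nlbuf = LIST_HEAD_INITIALIZER(),
 *     };
 */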
3733
3734 /**
3735  * Allocate space for a netlink command in the buffer list.
3736  *
3737  * @param[in, out] ctx
3738  *   Pointer to callback context with command buffers list.
3739  * @param[in] size
3740  *   Required size of data buffer to be allocated.
3741  *
3742  * @return
3743  *   Pointer to allocated memory, aligned as message header.
3744  *   NULL if some error occurred.
3745  */
3746 static struct nlmsghdr *
3747 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3748 {
3749         struct tcf_nlcb_buf *buf;
3750         struct nlmsghdr *nlh;
3751
3752         size = NLMSG_ALIGN(size);
3753         buf = LIST_FIRST(&ctx->nlbuf);
3754         if (buf && (buf->size + size) <= ctx->bufsize) {
3755                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3756                 buf->size += size;
3757                 return nlh;
3758         }
3759         if (size > ctx->bufsize) {
3760                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3761                 return NULL;
3762         }
3763         buf = rte_malloc(__func__,
3764                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3765                         alignof(struct tcf_nlcb_buf));
3766         if (!buf) {
3767                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3768                 return NULL;
3769         }
3770         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3771         buf->size = size;
3772         nlh = (struct nlmsghdr *)&buf->msg[0];
3773         return nlh;
3774 }
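
/*
 * Illustrative sketch, not part of the driver: collector callbacks
 * reserve exactly "size" bytes and fill a complete Netlink message in
 * place, e.g.:
 *
 *     struct nlmsghdr *cmd = flow_tcf_alloc_nlcmd(ctx, size);
 *
 *     if (!cmd) {
 *             rte_errno = ENOMEM;
 *             return -rte_errno;
 *     }
 *     cmd = mnl_nlmsg_put_header(cmd);
 *     cmd->nlmsg_type = RTM_DELADDR;
 *
 * The finished message must end up with nlmsg_len equal to the requested
 * size, otherwise the message walk in flow_tcf_send_nlcmd() breaks.
 */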
3775
3776 /**
3777  * Send the buffers with prepared netlink commands. Scans the list and
3778  * sends all found buffers. Buffers are sent and freed anyway in order
3779  * to prevent memory leakage if any send error occurs.
3780  *
3781  * @param[in] tcf
3782  *   Context object initialized by mlx5_flow_tcf_context_create().
3783  * @param[in, out] ctx
3784  *   Pointer to callback context with command buffers list.
3785  *
3786  * @return
3787  *   Zero value on success, negative errno value otherwise
3788  *   and rte_errno is set.
3789  */
3790 static int
3791 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
3792                     struct tcf_nlcb_context *ctx)
3793 {
3794         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
3795         int ret = 0;
3796
3797         while (bc) {
3798                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
3799                 struct nlmsghdr *nlh;
3800                 uint32_t msg = 0;
3801                 int rc;
3802
3803                 while (msg < bc->size) {
3804                         /*
3805                          * Send Netlink commands from the buffer one by
3806                          * one. If we sent multiple rule deletion commands
3807                          * in one Netlink message and an error occurred,
3808                          * it could cause multiple ACK error messages and
3809                          * break the sequence numbers of the Netlink
3810                          * communication, because we expect only one ACK.
3811                          */
3812                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
3813                         nlh = (struct nlmsghdr *)&bc->msg[msg];
3814                         assert((bc->size - msg) >= nlh->nlmsg_len);
3815                         msg += nlh->nlmsg_len;
3816                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
3817                         if (rc) {
3818                                 DRV_LOG(WARNING,
3819                                         "netlink: cleanup error %d", rc);
3820                                 if (!ret)
3821                                         ret = rc;
3822                         }
3823                 }
3824                 rte_free(bc);
3825                 bc = bn;
3826         }
3827         LIST_INIT(&ctx->nlbuf);
3828         return ret;
3829 }
3830
3831 /**
3832  * Collect local IP address rules with scope link attribute on the
3833  * specified network device. This is a callback routine called by libmnl
3834  * mnl_cb_run() in a loop for every message in a received packet.
3835  *
3836  * @param[in] nlh
3837  *   Pointer to reply header.
3838  * @param[in, out] arg
3839  *   Opaque data pointer for this callback.
3840  *
3841  * @return
3842  *   A positive, nonzero value on success, negative errno value otherwise
3843  *   and rte_errno is set.
3844  */
3845 static int
3846 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
3847 {
3848         struct tcf_nlcb_context *ctx = arg;
3849         struct nlmsghdr *cmd;
3850         struct ifaddrmsg *ifa;
3851         struct nlattr *na;
3852         struct nlattr *na_local = NULL;
3853         struct nlattr *na_peer = NULL;
3854         unsigned char family;
3855         uint32_t size;
3856
3857         if (nlh->nlmsg_type != RTM_NEWADDR) {
3858                 rte_errno = EINVAL;
3859                 return -rte_errno;
3860         }
3861         ifa = mnl_nlmsg_get_payload(nlh);
3862         family = ifa->ifa_family;
3863         if (ifa->ifa_index != ctx->ifindex ||
3864             ifa->ifa_scope != RT_SCOPE_LINK ||
3865             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
3866             (family != AF_INET && family != AF_INET6))
3867                 return 1;
3868         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
3869                 switch (mnl_attr_get_type(na)) {
3870                 case IFA_LOCAL:
3871                         na_local = na;
3872                         break;
3873                 case IFA_ADDRESS:
3874                         na_peer = na;
3875                         break;
3876                 }
3877                 if (na_local && na_peer)
3878                         break;
3879         }
3880         if (!na_local || !na_peer)
3881                 return 1;
3882         /* Local rule found with scope link, permanent and assigned peer. */
3883         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
3884                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
3885                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
3886                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
3887         cmd = flow_tcf_alloc_nlcmd(ctx, size);
3888         if (!cmd) {
3889                 rte_errno = ENOMEM;
3890                 return -rte_errno;
3891         }
3892         cmd = mnl_nlmsg_put_header(cmd);
3893         cmd->nlmsg_type = RTM_DELADDR;
3894         cmd->nlmsg_flags = NLM_F_REQUEST;
3895         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
3896         ifa->ifa_flags = IFA_F_PERMANENT;
3897         ifa->ifa_scope = RT_SCOPE_LINK;
3898         ifa->ifa_index = ctx->ifindex;
3899         if (family == AF_INET) {
3900                 ifa->ifa_family = AF_INET;
3901                 ifa->ifa_prefixlen = 32;
3902                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
3903                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
3904         } else {
3905                 ifa->ifa_family = AF_INET6;
3906                 ifa->ifa_prefixlen = 128;
3907                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
3908                         mnl_attr_get_payload(na_local));
3909                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
3910                         mnl_attr_get_payload(na_peer));
3911         }
3912         assert(size == cmd->nlmsg_len);
3913         return 1;
3914 }
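
/*
 * The stored command is roughly equivalent to the following shell
 * invocation (illustrative only):
 *     ip addr del <local>/<plen> peer <peer> dev <ifindex> scope link
 */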
3915
3916 /**
3917  * Cleanup the local IP addresses on outer interface.
3918  *
3919  * @param[in] tcf
3920  *   Context object initialized by mlx5_flow_tcf_context_create().
3921  * @param[in] ifindex
3922  *   Network interface index to perform cleanup.
3923  */
3924 static void
3925 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
3926                             unsigned int ifindex)
3927 {
3928         struct nlmsghdr *nlh;
3929         struct ifaddrmsg *ifa;
3930         struct tcf_nlcb_context ctx = {
3931                 .ifindex = ifindex,
3932                 .bufsize = MNL_REQUEST_SIZE,
3933                 .nlbuf = LIST_HEAD_INITIALIZER(),
3934         };
3935         int ret;
3936
3937         assert(ifindex);
3938         /*
3939          * Seek and destroy leftover local IP addresses with
3940          * matching properties ("scope link").
3941          */
3942         nlh = mnl_nlmsg_put_header(tcf->buf);
3943         nlh->nlmsg_type = RTM_GETADDR;
3944         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
3945         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
3946         ifa->ifa_family = AF_UNSPEC;
3947         ifa->ifa_index = ifindex;
3948         ifa->ifa_scope = RT_SCOPE_LINK;
3949         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
3950         if (ret)
3951                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
3952         ret = flow_tcf_send_nlcmd(tcf, &ctx);
3953         if (ret)
3954                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
3955 }
3956
3957 /**
3958  * Collect permanent neigh rules on the specified network device.
3959  * This is a callback routine called by libmnl mnl_cb_run() in a loop
3960  * for every message in a received packet.
3961  *
3962  * @param[in] nlh
3963  *   Pointer to reply header.
3964  * @param[in, out] arg
3965  *   Opaque data pointer for this callback.
3966  *
3967  * @return
3968  *   A positive, nonzero value on success, negative errno value otherwise
3969  *   and rte_errno is set.
3970  */
3971 static int
3972 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
3973 {
3974         struct tcf_nlcb_context *ctx = arg;
3975         struct nlmsghdr *cmd;
3976         struct ndmsg *ndm;
3977         struct nlattr *na;
3978         struct nlattr *na_ip = NULL;
3979         struct nlattr *na_mac = NULL;
3980         unsigned char family;
3981         uint32_t size;
3982
3983         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
3984                 rte_errno = EINVAL;
3985                 return -rte_errno;
3986         }
3987         ndm = mnl_nlmsg_get_payload(nlh);
3988         family = ndm->ndm_family;
3989         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
3990            !(ndm->ndm_state & NUD_PERMANENT) ||
3991            (family != AF_INET && family != AF_INET6))
3992                 return 1;
3993         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
3994                 switch (mnl_attr_get_type(na)) {
3995                 case NDA_DST:
3996                         na_ip = na;
3997                         break;
3998                 case NDA_LLADDR:
3999                         na_mac = na;
4000                         break;
4001                 }
4002                 if (na_mac && na_ip)
4003                         break;
4004         }
4005         if (!na_mac || !na_ip)
4006                 return 1;
4007         /* Neigh rule with permanent attribute found. */
4008         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4009                MNL_ALIGN(sizeof(struct ndmsg)) +
4010                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4011                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4012                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4013         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4014         if (!cmd) {
4015                 rte_errno = ENOMEM;
4016                 return -rte_errno;
4017         }
4018         cmd = mnl_nlmsg_put_header(cmd);
4019         cmd->nlmsg_type = RTM_DELNEIGH;
4020         cmd->nlmsg_flags = NLM_F_REQUEST;
4021         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4022         ndm->ndm_ifindex = ctx->ifindex;
4023         ndm->ndm_state = NUD_PERMANENT;
4024         ndm->ndm_flags = 0;
4025         ndm->ndm_type = 0;
4026         if (family == AF_INET) {
4027                 ndm->ndm_family = AF_INET;
4028                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4029         } else {
4030                 ndm->ndm_family = AF_INET6;
4031                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4032                              mnl_attr_get_payload(na_ip));
4033         }
4034         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4035                      mnl_attr_get_payload(na_mac));
4036         assert(size == cmd->nlmsg_len);
4037         return 1;
4038 }
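
/*
 * The stored command is roughly equivalent to the following shell
 * invocation (illustrative only):
 *     ip neigh del <dst_ip> lladdr <dst_mac> dev <ifindex> nud permanent
 */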
4039
4040 /**
4041  * Cleanup the neigh rules on outer interface.
4042  *
4043  * @param[in] tcf
4044  *   Context object initialized by mlx5_flow_tcf_context_create().
4045  * @param[in] ifindex
4046  *   Network interface index to perform cleanup.
4047  */
4048 static void
4049 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4050                             unsigned int ifindex)
4051 {
4052         struct nlmsghdr *nlh;
4053         struct ndmsg *ndm;
4054         struct tcf_nlcb_context ctx = {
4055                 .ifindex = ifindex,
4056                 .bufsize = MNL_REQUEST_SIZE,
4057                 .nlbuf = LIST_HEAD_INITIALIZER(),
4058         };
4059         int ret;
4060
4061         assert(ifindex);
4062         /* Seek and destroy leftover neigh rules. */
4063         nlh = mnl_nlmsg_put_header(tcf->buf);
4064         nlh->nlmsg_type = RTM_GETNEIGH;
4065         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4066         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4067         ndm->ndm_family = AF_UNSPEC;
4068         ndm->ndm_ifindex = ifindex;
4069         ndm->ndm_state = NUD_PERMANENT;
4070         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4071         if (ret)
4072                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4073         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4074         if (ret)
4075                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4076 }
4077
4078 /**
4079  * Collect indices of VXLAN encap/decap interfaces associated with device.
4080  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4081  * for every message in a received packet.
4082  *
4083  * @param[in] nlh
4084  *   Pointer to reply header.
4085  * @param[in, out] arg
4086  *   Opaque data pointer for this callback.
4087  *
4088  * @return
4089  *   A positive, nonzero value on success, negative errno value otherwise
4090  *   and rte_errno is set.
4091  */
4092 static int
4093 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4094 {
4095         struct tcf_nlcb_context *ctx = arg;
4096         struct nlmsghdr *cmd;
4097         struct ifinfomsg *ifm;
4098         struct nlattr *na;
4099         struct nlattr *na_info = NULL;
4100         struct nlattr *na_vxlan = NULL;
4101         bool found = false;
4102         unsigned int vxindex;
4103         uint32_t size;
4104
4105         if (nlh->nlmsg_type != RTM_NEWLINK) {
4106                 rte_errno = EINVAL;
4107                 return -rte_errno;
4108         }
4109         ifm = mnl_nlmsg_get_payload(nlh);
4110         if (!ifm->ifi_index) {
4111                 rte_errno = EINVAL;
4112                 return -rte_errno;
4113         }
4114         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4115                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4116                         na_info = na;
4117                         break;
4118                 }
4119         if (!na_info)
4120                 return 1;
4121         mnl_attr_for_each_nested(na, na_info) {
4122                 switch (mnl_attr_get_type(na)) {
4123                 case IFLA_INFO_KIND:
4124                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4125                                      mnl_attr_get_len(na)))
4126                                 found = true;
4127                         break;
4128                 case IFLA_INFO_DATA:
4129                         na_vxlan = na;
4130                         break;
4131                 }
4132                 if (found && na_vxlan)
4133                         break;
4134         }
4135         if (!found || !na_vxlan)
4136                 return 1;
4137         found = false;
4138         mnl_attr_for_each_nested(na, na_vxlan) {
4139                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4140                     mnl_attr_get_u32(na) == ctx->ifindex) {
4141                         found = true;
4142                         break;
4143                 }
4144         }
4145         if (!found)
4146                 return 1;
4147         /* Attached VXLAN device found, store the command to delete. */
4148         vxindex = ifm->ifi_index;
4149         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4150                MNL_ALIGN(sizeof(struct ifinfomsg));
4151         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4152         if (!cmd) {
4153                 rte_errno = ENOMEM;
4154                 return -rte_errno;
4155         }
4156         cmd = mnl_nlmsg_put_header(cmd);
4157         cmd->nlmsg_type = RTM_DELLINK;
4158         cmd->nlmsg_flags = NLM_F_REQUEST;
4159         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4160         ifm->ifi_family = AF_UNSPEC;
4161         ifm->ifi_index = vxindex;
4162         assert(size == cmd->nlmsg_len);
4163         return 1;
4164 }
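
/*
 * The stored command is roughly equivalent to the following shell
 * invocation (illustrative only):
 *     ip link del <vxlan_ifindex>
 */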
4165
4166 /**
4167  * Cleanup the outer interface. Removes all found VXLAN devices
4168  * attached to the specified index, flushes the neigh rules and
4169  * the local IP address database.
4170  *
4171  * @param[in] tcf
4172  *   Context object initialized by mlx5_flow_tcf_context_create().
4173  * @param[in] ifindex
4174  *   Network interface index to perform cleanup.
4175  */
4176 static void
4177 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4178                             unsigned int ifindex)
4179 {
4180         struct nlmsghdr *nlh;
4181         struct ifinfomsg *ifm;
4182         struct tcf_nlcb_context ctx = {
4183                 .ifindex = ifindex,
4184                 .bufsize = MNL_REQUEST_SIZE,
4185                 .nlbuf = LIST_HEAD_INITIALIZER(),
4186         };
4187         int ret;
4188
4189         assert(ifindex);
4190         /*
4191          * Seek and destroy leftover VXLAN encap/decap interfaces with
4192          * matching properties.
4193          */
4194         nlh = mnl_nlmsg_put_header(tcf->buf);
4195         nlh->nlmsg_type = RTM_GETLINK;
4196         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4197         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4198         ifm->ifi_family = AF_UNSPEC;
4199         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4200         if (ret)
4201                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4202         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4203         if (ret)
4204                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4205 }
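
/*
 * Net effect (illustrative): enumerate all links, select the VXLAN
 * devices whose IFLA_VXLAN_LINK attribute equals the given ifindex and
 * delete them one by one, similar to scripting "ip link show" plus
 * "ip link del" per matching device.
 */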
4206
4207 /**
4208  * Emit Netlink message to add/remove local address to the outer device.
4209  * The address being added is visible within the link only (scope link).
4210  *
4211  * Note that an implicit route is maintained by the kernel due to the
4212  * presence of a peer address (IFA_ADDRESS).
4213  *
4214  * These rules are used for encapsulation only and allow assigning
4215  * the outer tunnel source IP address.
4216  *
4217  * @param[in] tcf
4218  *   Libmnl socket context object.
4219  * @param[in] encap
4220  *   Encapsulation properties (source address and its peer).
4221  * @param[in] ifindex
4222  *   Network interface to apply rule.
4223  * @param[in] enable
4224  *   Toggle between add and remove.
4225  * @param[out] error
4226  *   Perform verbose error reporting if not NULL.
4227  *
4228  * @return
4229  *   0 on success, a negative errno value otherwise and rte_errno is set.
4230  */
4231 static int
4232 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4233                     const struct flow_tcf_vxlan_encap *encap,
4234                     unsigned int ifindex,
4235                     bool enable,
4236                     struct rte_flow_error *error)
4237 {
4238         struct nlmsghdr *nlh;
4239         struct ifaddrmsg *ifa;
4240         alignas(struct nlmsghdr)
4241         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4242
4243         nlh = mnl_nlmsg_put_header(buf);
4244         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4245         nlh->nlmsg_flags =
4246                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4247         nlh->nlmsg_seq = 0;
4248         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4249         ifa->ifa_flags = IFA_F_PERMANENT;
4250         ifa->ifa_scope = RT_SCOPE_LINK;
4251         ifa->ifa_index = ifindex;
4252         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4253                 ifa->ifa_family = AF_INET;
4254                 ifa->ifa_prefixlen = 32;
4255                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4256                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4257                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4258                                               encap->ipv4.dst);
4259         } else {
4260                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4261                 ifa->ifa_family = AF_INET6;
4262                 ifa->ifa_prefixlen = 128;
4263                 mnl_attr_put(nlh, IFA_LOCAL,
4264                                   sizeof(encap->ipv6.src),
4265                                   &encap->ipv6.src);
4266                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4267                         mnl_attr_put(nlh, IFA_ADDRESS,
4268                                           sizeof(encap->ipv6.dst),
4269                                           &encap->ipv6.dst);
4270         }
4271         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4272                 return 0;
4273         return rte_flow_error_set(error, rte_errno,
4274                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4275                                   "netlink: cannot complete IFA request"
4276                                   " (ip addr add)");
4277 }
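
/*
 * For reference (illustrative), the enable path corresponds to:
 *     ip addr add <src_ip>/32 peer <dst_ip> dev <ifindex> scope link
 * (prefix length 128 for IPv6), the disable path to the matching
 * "ip addr del" command.
 */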
4278
4279 /**
4280  * Emit Netlink message to add/remove neighbor.
4281  *
4282  * @param[in] tcf
4283  *   Libmnl socket context object.
4284  * @param[in] encap
4285  *   Encapsulation properties (destination address).
4286  * @param[in] ifindex
4287  *   Network interface.
4288  * @param[in] enable
4289  *   Toggle between add and remove.
4290  * @param[out] error
4291  *   Perform verbose error reporting if not NULL.
4292  *
4293  * @return
4294  *   0 on success, a negative errno value otherwise and rte_errno is set.
4295  */
4296 static int
4297 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4298                      const struct flow_tcf_vxlan_encap *encap,
4299                      unsigned int ifindex,
4300                      bool enable,
4301                      struct rte_flow_error *error)
4302 {
4303         struct nlmsghdr *nlh;
4304         struct ndmsg *ndm;
4305         alignas(struct nlmsghdr)
4306         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4307
4308         nlh = mnl_nlmsg_put_header(buf);
4309         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4310         nlh->nlmsg_flags =
4311                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4312         nlh->nlmsg_seq = 0;
4313         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4314         ndm->ndm_ifindex = ifindex;
4315         ndm->ndm_state = NUD_PERMANENT;
4316         ndm->ndm_flags = 0;
4317         ndm->ndm_type = 0;
4318         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4319                 ndm->ndm_family = AF_INET;
4320                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4321         } else {
4322                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4323                 ndm->ndm_family = AF_INET6;
4324                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4325                                                  &encap->ipv6.dst);
4326         }
4327         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4328                 DRV_LOG(WARNING,
4329                         "outer ethernet source address cannot be "
4330                         "forced for VXLAN encapsulation");
4331         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4332                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4333                                                     &encap->eth.dst);
4334         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4335                 return 0;
4336         return rte_flow_error_set(error, rte_errno,
4337                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4338                                   "netlink: cannot complete ND request"
4339                                   " (ip neigh)");
4340 }
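
/*
 * For reference (illustrative), the enable path corresponds to:
 *     ip neigh add dev <ifindex> lladdr <dst_mac> to <dst_ip> nud permanent
 * and the disable path to the matching "ip neigh del" command.
 */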
4341
4342 /**
4343  * Manage the local IP addresses and their peer IP addresses on the
4344  * outer interface for encapsulation purposes. The kernel searches the
4345  * appropriate device for tunnel egress traffic using the outer source
4346  * IP; this IP should be assigned to the outer network device, otherwise
4347  * the kernel rejects the rule.
4348  *
4349  * Adds or removes the addresses using the Netlink command like this:
4350  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4351  *
4352  * The addresses are local to the netdev ("scope link"), this reduces
4353  * the risk of conflicts. Note that an implicit route is maintained by
4354  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4355  *
4356  * @param[in] tcf
4357  *   Libmnl socket context object.
4358  * @param[in] vtep
4359  *   VTEP object, contains rule database and ifouter index.
4360  * @param[in] dev_flow
4361  *   Flow object, contains the tunnel parameters (for encap only).
4362  * @param[in] enable
4363  *   Toggle between add and remove.
4364  * @param[out] error
4365  *   Perform verbose error reporting if not NULL.
4366  *
4367  * @return
4368  *   0 on success, a negative errno value otherwise and rte_errno is set.
4369  */
4370 static int
4371 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4372                      struct tcf_vtep *vtep,
4373                      struct mlx5_flow *dev_flow,
4374                      bool enable,
4375                      struct rte_flow_error *error)
4376 {
4377         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4378         struct tcf_local_rule *rule;
4379         bool found = false;
4380         int ret;
4381
4382         assert(encap);
4383         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4384         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4385                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4386                 LIST_FOREACH(rule, &vtep->local, next) {
4387                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4388                             encap->ipv4.src == rule->ipv4.src &&
4389                             encap->ipv4.dst == rule->ipv4.dst) {
4390                                 found = true;
4391                                 break;
4392                         }
4393                 }
4394         } else {
4395                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4396                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4397                 LIST_FOREACH(rule, &vtep->local, next) {
4398                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4399                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4400                                             sizeof(encap->ipv6.src)) &&
4401                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4402                                             sizeof(encap->ipv6.dst))) {
4403                                 found = true;
4404                                 break;
4405                         }
4406                 }
4407         }
4408         if (found) {
4409                 if (enable) {
4410                         rule->refcnt++;
4411                         return 0;
4412                 }
4413                 if (!rule->refcnt || !--rule->refcnt) {
4414                         LIST_REMOVE(rule, next);
4415                         return flow_tcf_rule_local(tcf, encap,
4416                                         vtep->ifouter, false, error);
4417                 }
4418                 return 0;
4419         }
4420         if (!enable) {
4421                 DRV_LOG(WARNING, "disabling non-existent local rule");
4422                 rte_flow_error_set(error, ENOENT,
4423                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4424                                    "disabling non-existent local rule");
4425                 return -ENOENT;
4426         }
4427         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4428                                 alignof(struct tcf_local_rule));
4429         if (!rule) {
4430                 rte_flow_error_set(error, ENOMEM,
4431                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4432                                    "unable to allocate memory for local rule");
4433                 return -rte_errno;
4434         }
4435         *rule = (struct tcf_local_rule){.refcnt = 0,
4436                                         .mask = 0,
4437                                         };
4438         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4439                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4440                            | FLOW_TCF_ENCAP_IPV4_DST;
4441                 rule->ipv4.src = encap->ipv4.src;
4442                 rule->ipv4.dst = encap->ipv4.dst;
4443         } else {
4444                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4445                            | FLOW_TCF_ENCAP_IPV6_DST;
4446                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4447                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4448         }
4449         ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
4450         if (ret) {
4451                 rte_free(rule);
4452                 return ret;
4453         }
4454         rule->refcnt++;
4455         LIST_INSERT_HEAD(&vtep->local, rule, next);
4456         return 0;
4457 }
4458
4459 /**
4460  * Manage the destination MAC/IP address neigh database; the kernel uses
4461  * it to determine the destination MAC address for the encapsulation
4462  * header. Adds or removes the entries using a Netlink command like this:
4463  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4464  *
4465  * @param[in] tcf
4466  *   Libmnl socket context object.
4467  * @param[in] vtep
4468  *   VTEP object, contains rule database and ifouter index.
4469  * @param[in] dev_flow
4470  *   Flow object, contains the tunnel parameters (for encap only).
4471  * @param[in] enable
4472  *   Toggle between add and remove.
4473  * @param[out] error
4474  *   Perform verbose error reporting if not NULL.
4475  *
4476  * @return
4477  *   0 on success, a negative errno value otherwise and rte_errno is set.
4478  */
4479 static int
4480 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4481                      struct tcf_vtep *vtep,
4482                      struct mlx5_flow *dev_flow,
4483                      bool enable,
4484                      struct rte_flow_error *error)
4485 {
4486         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4487         struct tcf_neigh_rule *rule;
4488         bool found = false;
4489         int ret;
4490
4491         assert(encap);
4492         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4493         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4494                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4495                 LIST_FOREACH(rule, &vtep->neigh, next) {
4496                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4497                             encap->ipv4.dst == rule->ipv4.dst) {
4498                                 found = true;
4499                                 break;
4500                         }
4501                 }
4502         } else {
4503                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4504                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4505                 LIST_FOREACH(rule, &vtep->neigh, next) {
4506                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4507                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4508                                                 sizeof(encap->ipv6.dst))) {
4509                                 found = true;
4510                                 break;
4511                         }
4512                 }
4513         }
4514         if (found) {
4515                 if (memcmp(&encap->eth.dst, &rule->eth,
4516                            sizeof(encap->eth.dst))) {
4517                         DRV_LOG(WARNING, "Destination MAC differs"
4518                                          " in neigh rule");
4519                         rte_flow_error_set(error, EEXIST,
4520                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4521                                            NULL, "Different MAC address"
4522                                            " neigh rule for the same"
4523                                            " destination IP");
4524                         return -EEXIST;
4525                 }
4526                 if (enable) {
4527                         rule->refcnt++;
4528                         return 0;
4529                 }
4530                 if (!rule->refcnt || !--rule->refcnt) {
4531                         LIST_REMOVE(rule, next);
4532                         return flow_tcf_rule_neigh(tcf, encap,
4533                                                    vtep->ifouter,
4534                                                    false, error);
4535                 }
4536                 return 0;
4537         }
4538         if (!enable) {
4539                 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4540                 rte_flow_error_set(error, ENOENT,
4541                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4542                                    "disabling non-existent neigh rule");
4543                 return -ENOENT;
4544         }
4545         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4546                                 alignof(struct tcf_neigh_rule));
4547         if (!rule) {
4548                 rte_flow_error_set(error, ENOMEM,
4549                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4550                                    "unable to allocate memory for neigh rule");
4551                 return -rte_errno;
4552         }
4553         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4554                                         .mask = 0,
4555                                         };
4556         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4557                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4558                 rule->ipv4.dst = encap->ipv4.dst;
4559         } else {
4560                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4561                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4562         }
4563         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4564         ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
4565         if (ret) {
4566                 rte_free(rule);
4567                 return ret;
4568         }
4569         rule->refcnt++;
4570         LIST_INSERT_HEAD(&vtep->neigh, rule, next);
4571         return 0;
4572 }
4573
4574 /* VTEP device list is shared between PMD port instances. */
4575 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4576 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4577
4578 /**
4579  * Deletes VTEP network device.
4580  *
4581  * @param[in] tcf
4582  *   Context object initialized by mlx5_flow_tcf_context_create().
4583  * @param[in] vtep
4584  *   Object representing the network device to delete. Memory
4585  *   allocated for this object is freed by this routine.
4586  */
4587 static void
4588 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4589                      struct tcf_vtep *vtep)
4590 {
4591         struct nlmsghdr *nlh;
4592         struct ifinfomsg *ifm;
4593         alignas(struct nlmsghdr)
4594         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4595                     MNL_BUF_EXTRA_SPACE];
4596         int ret;
4597
4598         assert(!vtep->refcnt);
4599         /* Delete only interfaces that we actually created. */
4600         if (vtep->created && vtep->ifindex) {
4601                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4602                 nlh = mnl_nlmsg_put_header(buf);
4603                 nlh->nlmsg_type = RTM_DELLINK;
4604                 nlh->nlmsg_flags = NLM_F_REQUEST;
4605                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4606                 ifm->ifi_family = AF_UNSPEC;
4607                 ifm->ifi_index = vtep->ifindex;
4608                 assert(sizeof(buf) >= nlh->nlmsg_len);
4609                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4610                 if (ret)
4611                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4612                                          " encap/decap ifindex %u",
4613                                          ifm->ifi_index);
4614         }
4615         rte_free(vtep);
4616 }
4617
4618 /**
4619  * Creates VTEP network device.
4620  *
4621  * @param[in] tcf
4622  *   Context object initialized by mlx5_flow_tcf_context_create().
4623  * @param[in] ifouter
4624  *   Outer interface to attach the newly created VXLAN device to.
4625  *   If zero, the VXLAN device will not be attached to any device.
4626  *   These VTEPs are used for decapsulation and can be pre-created
4627  *   and shared between processes.
4628  * @param[in] port
4629  *   UDP port of created VTEP device.
4630  * @param[out] error
4631  *   Perform verbose error reporting if not NULL.
4632  *
4633  * @return
4634  * Pointer to created device structure on success,
4635  * NULL otherwise and rte_errno is set.
4636  */
4637 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4638 static struct tcf_vtep*
4639 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4640                      unsigned int ifouter,
4641                      uint16_t port, struct rte_flow_error *error)
4642 {
4643         struct tcf_vtep *vtep;
4644         struct nlmsghdr *nlh;
4645         struct ifinfomsg *ifm;
4646         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4647         alignas(struct nlmsghdr)
4648         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4649                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4650                     SZ_NLATTR_NEST * 2 +
4651                     SZ_NLATTR_STRZ_OF("vxlan") +
4652                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4653                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4654                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4655                     MNL_BUF_EXTRA_SPACE];
4656         struct nlattr *na_info;
4657         struct nlattr *na_vxlan;
4658         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4659         int ret;
4660
4661         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4662         if (!vtep) {
4663                 rte_flow_error_set(error, ENOMEM,
4664                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4665                                    "unable to allocate memory for VTEP");
4666                 return NULL;
4667         }
4668         *vtep = (struct tcf_vtep){
4669                         .port = port,
4670                         .local = LIST_HEAD_INITIALIZER(),
4671                         .neigh = LIST_HEAD_INITIALIZER(),
4672         };
4673         memset(buf, 0, sizeof(buf));
4674         nlh = mnl_nlmsg_put_header(buf);
4675         nlh->nlmsg_type = RTM_NEWLINK;
4676         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4677         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4678         ifm->ifi_family = AF_UNSPEC;
4679         ifm->ifi_type = 0;
4680         ifm->ifi_index = 0;
4681         ifm->ifi_flags = IFF_UP;
4682         ifm->ifi_change = 0xffffffff;
4683         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4684         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4685         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4686         assert(na_info);
4687         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4688         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4689         if (ifouter)
4690                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4691         assert(na_vxlan);
4692         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4693         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4694         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4695         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4696         mnl_attr_nest_end(nlh, na_vxlan);
4697         mnl_attr_nest_end(nlh, na_info);
4698         assert(sizeof(buf) >= nlh->nlmsg_len);
4699         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4700         if (ret) {
4701                 DRV_LOG(WARNING,
4702                         "netlink: VTEP %s create failure (%d)",
4703                         name, rte_errno);
4704                 if (rte_errno != EEXIST || ifouter)
4705                         /*
4706                          * Some unhandled error occurred or device is
4707                          * for encapsulation and cannot be shared.
4708                          */
4709                         goto error;
4710         } else {
4711                 /*
4712                  * Mark the device as created by us.
4713                  * It must be explicitly deleted
4714                  * when we do not need it anymore.
4715                  */
4716                 vtep->created = 1;
4717         }
4718         /* Try to get ifindex of the created or pre-existing device. */
4719         ret = if_nametoindex(name);
4720         if (!ret) {
4721                 DRV_LOG(WARNING,
4722                         "VTEP %s failed to get index (%d)", name, errno);
4723                 rte_flow_error_set
4724                         (error, -errno,
4725                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4726                          "netlink: failed to retrieve VTEP ifindex");
4727                 goto error;
4728         }
4729         vtep->ifindex = ret;
4730         vtep->ifouter = ifouter;
4731         memset(buf, 0, sizeof(buf));
4732         nlh = mnl_nlmsg_put_header(buf);
4733         nlh->nlmsg_type = RTM_NEWLINK;
4734         nlh->nlmsg_flags = NLM_F_REQUEST;
4735         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4736         ifm->ifi_family = AF_UNSPEC;
4737         ifm->ifi_type = 0;
4738         ifm->ifi_index = vtep->ifindex;
4739         ifm->ifi_flags = IFF_UP;
4740         ifm->ifi_change = IFF_UP;
4741         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4742         if (ret) {
4743                 rte_flow_error_set(error, -errno,
4744                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4745                                    "netlink: failed to set VTEP link up");
4746                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
4747                         name, rte_errno);
4748                 goto clean;
4749         }
4750         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
4751         if (ret) {
4752                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
4753                 goto clean;
4754         }
4755         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
4756         vtep->refcnt = 1;
4757         return vtep;
4758 clean:
4759         flow_tcf_vtep_delete(tcf, vtep);
4760         return NULL;
4761 error:
4762         rte_free(vtep);
4763         return NULL;
4764 }
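
/*
 * For reference (illustrative), the device created by
 * flow_tcf_vtep_create() above is similar to:
 *     ip link add <name> type vxlan dstport <port> external \
 *             nolearning udp6zerocsumrx [dev <ifouter>]
 *     ip link set <name> up
 * where "external" is the iproute2 spelling of IFLA_VXLAN_COLLECT_METADATA.
 */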
4765 #else
4766 static struct tcf_vtep*
4767 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
4768                      unsigned int ifouter __rte_unused,
4769                      uint16_t port __rte_unused,
4770                      struct rte_flow_error *error)
4771 {
4772         rte_flow_error_set(error, ENOTSUP,
4773                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4774                            "netlink: failed to create VTEP, "
4775                            "vxlan metadata are not supported by kernel");
4776         return NULL;
4777 }
4778 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
4779
4780 /**
4781  * Acquire target interface index for VXLAN tunneling decapsulation.
4782  * In order to share the UDP port with the other interfaces, the VXLAN
4783  * device is created without being attached to any interface (if created).
4784  *
4785  * @param[in] tcf
4786  *   Context object initialized by mlx5_flow_tcf_context_create().
4787  * @param[in] dev_flow
4788  *   Flow tcf object with tunnel structure pointer set.
4789  * @param[out] error
4790  *   Perform verbose error reporting if not NULL.
4791  * @return
4792  *   Interface descriptor pointer on success,
4793  *   NULL otherwise and rte_errno is set.
4794  */
4795 static struct tcf_vtep*
4796 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4797                             struct mlx5_flow *dev_flow,
4798                             struct rte_flow_error *error)
4799 {
4800         struct tcf_vtep *vtep;
4801         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
4802
4803         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4804                 if (vtep->port == port)
4805                         break;
4806         }
4807         if (vtep && vtep->ifouter) {
4808                 rte_flow_error_set(error, EEXIST,
4809                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4810                                    "Failed to create decap VTEP with specified"
4811                                    " UDP port, attached device exists");
4812                 return NULL;
4813         }
4814         if (vtep) {
4815                 /* Device exists, just increment the reference counter. */
4816                 vtep->refcnt++;
4817                 assert(vtep->ifindex);
4818                 return vtep;
4819         }
4820         /* No decapsulation device exists, try to create the new one. */
4821         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
4822         if (vtep)
4823                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4824         return vtep;
4825 }
4826
4827 /**
4828  * Acquire target interface index for VXLAN tunneling encapsulation.
4829  *
4830  * @param[in] tcf
4831  *   Context object initialized by mlx5_flow_tcf_context_create().
4832  * @param[in] ifouter
4833  *   Network interface index to attach VXLAN encap device to.
4834  * @param[in] dev_flow
4835  *   Flow tcf object with tunnel structure pointer set.
4836  * @param[out] error
4837  *   Perform verbose error reporting if not NULL.
4838  * @return
4839  *   Interface descriptor pointer on success,
4840  *   NULL otherwise and rte_errno is set.
4841  */
4842 static struct tcf_vtep*
4843 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4844                             unsigned int ifouter,
4845                             struct mlx5_flow *dev_flow __rte_unused,
4846                             struct rte_flow_error *error)
4847 {
4848         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
4849         struct tcf_vtep *vtep;
4850         int ret;
4851
4852         assert(ifouter);
4853         /* Look whether the attached VTEP for encap is created. */
4854         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4855                 if (vtep->ifouter == ifouter)
4856                         break;
4857         }
4858         if (vtep) {
4859                 /* VTEP already exists, just increment the reference. */
4860                 vtep->refcnt++;
4861         } else {
4862                 uint16_t pcnt;
4863
4864                 /* Not found, we should create the new attached VTEP. */
4865                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
4866                 flow_tcf_encap_local_cleanup(tcf, ifouter);
4867                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
4868                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
4869                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
4870                         encap_port++;
4871                         /* Wrap around the UDP port index. */
4872                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
4873                             encap_port > MLX5_VXLAN_PORT_MAX)
4874                                 encap_port = MLX5_VXLAN_PORT_MIN;
4875                         /* Check whether UDP port is already in use. */
4876                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4877                                 if (vtep->port == encap_port)
4878                                         break;
4879                         }
4880                         if (vtep) {
4881                                 /* Port is in use, try the next one. */
4882                                 vtep = NULL;
4883                                 continue;
4884                         }
4885                         vtep = flow_tcf_vtep_create(tcf, ifouter,
4886                                                     encap_port, error);
4887                         if (vtep) {
4888                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4889                                 break;
4890                         }
4891                         if (rte_errno != EEXIST)
4892                                 break;
4893                 }
4894                 if (!vtep)
4895                         return NULL;
4896         }
4897         assert(vtep->ifouter == ifouter);
4898         assert(vtep->ifindex);
4899         /* Create local IP address with peer to specify the outer IPs. */
4900         ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
4901         if (!ret) {
4902                 /* Create neigh rule to specify outer destination MAC. */
4903                 ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
4904                 if (ret)
4905                         flow_tcf_encap_local(tcf, vtep,
4906                                              dev_flow, false, error);
4907         }
4908         if (ret) {
4909                 if (--vtep->refcnt == 0)
4910                         flow_tcf_vtep_delete(tcf, vtep);
4911                 return NULL;
4912         }
4913         return vtep;
4914 }
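
/*
 * Illustrative example of the encap UDP port allocation above
 * (numbers are hypothetical): assume the port range is 30000..30002
 * and the static cursor holds 30000 while port 30001 is already in
 * the VTEP list. The loop first probes 30001 (busy), then 30002
 * (free), so the new attached VTEP is created with UDP port 30002.
 */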
4915
4916 /**
4917  * Acquires target interface index for tunneling of any type.
4918  * Creates the new VTEP if needed.
4919  *
4920  * @param[in] tcf
4921  *   Context object initialized by mlx5_flow_tcf_context_create().
4922  * @param[in] ifouter
4923  *   Network interface index to attach VXLAN encap device to.
4924  * @param[in] dev_flow
4925  *   Flow tcf object with tunnel structure pointer set.
4926  * @param[out] error
4927  *   Perform verbose error reporting if not NULL.
4928  * @return
4929  *   Interface descriptor pointer on success,
4930  *   NULL otherwise and rte_errno is set.
4931  */
4932 static struct tcf_vtep*
4933 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4934                       unsigned int ifouter,
4935                       struct mlx5_flow *dev_flow,
4936                       struct rte_flow_error *error)
4937 {
4938         struct tcf_vtep *vtep = NULL;
4939
4940         assert(dev_flow->tcf.tunnel);
4941         pthread_mutex_lock(&vtep_list_mutex);
4942         switch (dev_flow->tcf.tunnel->type) {
4943         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
4944                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
4945                                                   dev_flow, error);
4946                 break;
4947         case FLOW_TCF_TUNACT_VXLAN_DECAP:
4948                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
4949                 break;
4950         default:
4951                 rte_flow_error_set(error, ENOTSUP,
4952                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4953                                    "unsupported tunnel type");
4954                 break;
4955         }
4956         pthread_mutex_unlock(&vtep_list_mutex);
4957         return vtep;
4958 }
4959
4960 /**
4961  * Release tunneling interface by ifindex. Decrements reference
4962  * counter and actually removes the device if counter is zero.
4963  *
4964  * @param[in] tcf
4965  *   Context object initialized by mlx5_flow_tcf_context_create().
4966  * @param[in] vtep
4967  *   VTEP device descriptor structure.
4968  * @param[in] dev_flow
4969  *   Flow tcf object with tunnel structure pointer set.
4970  */
4971 static void
4972 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
4973                       struct tcf_vtep *vtep,
4974                       struct mlx5_flow *dev_flow)
4975 {
4976         assert(dev_flow->tcf.tunnel);
4977         pthread_mutex_lock(&vtep_list_mutex);
4978         switch (dev_flow->tcf.tunnel->type) {
4979         case FLOW_TCF_TUNACT_VXLAN_DECAP:
4980                 break;
4981         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
4982                 /* Remove the encap ancillary rules first. */
4983                 flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
4984                 flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
4985                 break;
4986         default:
4987                 assert(false);
4988                 DRV_LOG(WARNING, "Unsupported tunnel type");
4989                 break;
4990         }
4991         assert(vtep->refcnt);
4992         if (--vtep->refcnt == 0) {
4993                 LIST_REMOVE(vtep, next);
4994                 flow_tcf_vtep_delete(tcf, vtep);
4995         }
4996         pthread_mutex_unlock(&vtep_list_mutex);
4997 }
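
/*
 * Usage sketch (not part of the driver): every successful
 * flow_tcf_vtep_acquire() must be paired with flow_tcf_vtep_release()
 * once the rule is removed, e.g.:
 *
 *     vtep = flow_tcf_vtep_acquire(tcf, ifouter, dev_flow, error);
 *     if (!vtep)
 *             return -rte_errno;
 *     ...apply the rule using vtep->ifindex...
 *     flow_tcf_vtep_release(tcf, vtep, dev_flow);
 */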
4998
4999 struct tcf_nlcb_query {
5000         uint32_t handle;
5001         uint32_t tc_flags;
5002         uint32_t flags_valid:1;
5003 };
5004
5005 /**
5006  * Collect queried rule attributes. This is callback routine called by
5007  * libmnl mnl_cb_run() in loop for every message in received packet.
5008  * Current implementation collects the flower flags only.
5009  *
5010  * @param[in] nlh
5011  *   Pointer to reply header.
5012  * @param[in, out] arg
5013  *   Context pointer for this callback.
5014  *
5015  * @return
5016  *   A positive, nonzero value on success (required by libmnl
5017  *   to continue message processing).
5018  */
5019 static int
5020 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5021 {
5022         struct tcf_nlcb_query *query = arg;
5023         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5024         struct nlattr *na, *na_opt;
5025         bool flower = false;
5026
5027         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5028             tcm->tcm_handle != query->handle)
5029                 return 1;
5030         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5031                 switch (mnl_attr_get_type(na)) {
5032                 case TCA_KIND:
5033                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5034                                 /* Not flower filter, drop entire message. */
5035                                 return 1;
5036                         }
5037                         flower = true;
5038                         break;
5039                 case TCA_OPTIONS:
5040                         if (!flower) {
5041                                 /* Not flower options, drop entire message. */
5042                                 return 1;
5043                         }
5044                         /* Check nested flower options. */
5045                         mnl_attr_for_each_nested(na_opt, na) {
5046                                 switch (mnl_attr_get_type(na_opt)) {
5047                                 case TCA_FLOWER_FLAGS:
5048                                         query->flags_valid = 1;
5049                                         query->tc_flags =
5050                                                 mnl_attr_get_u32(na_opt);
5051                                         break;
5052                                 }
5053                         }
5054                         break;
5055                 }
5056         }
5057         return 1;
5058 }
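
/*
 * A minimal sketch of how the callback above is driven (an assumption
 * about the caller; the reply is received inside flow_tcf_nl_ack()):
 *
 *     struct tcf_nlcb_query query = { .handle = handle };
 *     ssize_t len = mnl_socket_recvfrom(nl, buf, sizeof(buf));
 *
 *     if (len > 0)
 *             mnl_cb_run(buf, len, seq, mnl_socket_get_portid(nl),
 *                        flow_tcf_collect_query_cb, &query);
 */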
5059
5060 /**
5061  * Query a TC flower rule flags via netlink.
5062  *
5063  * @param[in] tcf
5064  *   Context object initialized by mlx5_flow_tcf_context_create().
5065  * @param[in] dev_flow
5066  *   Pointer to the flow.
5067  * @param[out] pflags
5068  *   pointer to the data retrieved by the query.
5069  *
5070  * @return
5071  *   0 on success, a negative errno value otherwise.
5072  */
5073 static int
5074 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5075                      struct mlx5_flow *dev_flow,
5076                      uint32_t *pflags)
5077 {
5078         struct nlmsghdr *nlh;
5079         struct tcmsg *tcm;
5080         struct tcf_nlcb_query query = {
5081                 .handle = dev_flow->tcf.tcm->tcm_handle,
5082         };
5083
5084         nlh = mnl_nlmsg_put_header(tcf->buf);
5085         nlh->nlmsg_type = RTM_GETTFILTER;
5086         nlh->nlmsg_flags = NLM_F_REQUEST;
5087         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5088         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5089         /*
5090          * Ignore Netlink error for filter query operations.
5091          * The reply length is sent by the kernel as errno.
5092          * Just check we got the flags option.
5093          */
5094         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5095         if (!query.flags_valid) {
5096                 *pflags = 0;
5097                 return -ENOENT;
5098         }
5099         *pflags = query.tc_flags;
5100         return 0;
5101 }
5102
5103 /**
5104  * Query and check the in_hw set for specified rule.
5105  *
5106  * @param[in] tcf
5107  *   Context object initialized by mlx5_flow_tcf_context_create().
5108  * @param[in] dev_flow
5109  *   Pointer to the flow to check.
5110  *
5111  * @return
5112  *   0 on success, a negative errno value otherwise.
5113  */
5114 static int
5115 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5116                     struct mlx5_flow *dev_flow)
5117 {
5118         uint32_t flags;
5119         int ret;
5120
5121         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5122         if (ret)
5123                 return ret;
5124         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5125 }
5126
5127 /**
5128  * Remove flow from E-Switch by sending Netlink message.
5129  *
5130  * @param[in] dev
5131  *   Pointer to Ethernet device.
5132  * @param[in, out] flow
5133  *   Pointer to the sub flow.
5134  */
5135 static void
5136 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5137 {
5138         struct priv *priv = dev->data->dev_private;
5139         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5140         struct mlx5_flow *dev_flow;
5141         struct nlmsghdr *nlh;
5142
5143         if (!flow)
5144                 return;
5145         dev_flow = LIST_FIRST(&flow->dev_flows);
5146         if (!dev_flow)
5147                 return;
5148         /* E-Switch flow can't be expanded. */
5149         assert(!LIST_NEXT(dev_flow, next));
5150         if (dev_flow->tcf.applied) {
5151                 nlh = dev_flow->tcf.nlh;
5152                 nlh->nlmsg_type = RTM_DELTFILTER;
5153                 nlh->nlmsg_flags = NLM_F_REQUEST;
5154                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5155                 if (dev_flow->tcf.tunnel) {
5156                         assert(dev_flow->tcf.tunnel->vtep);
5157                         flow_tcf_vtep_release(ctx,
5158                                 dev_flow->tcf.tunnel->vtep,
5159                                 dev_flow);
5160                         dev_flow->tcf.tunnel->vtep = NULL;
5161                 }
5162                 dev_flow->tcf.applied = 0;
5163         }
5164 }
5165
5166 /**
5167  * Apply flow to E-Switch by sending Netlink message.
5168  *
5169  * @param[in] dev
5170  *   Pointer to Ethernet device.
5171  * @param[in, out] flow
5172  *   Pointer to the sub flow.
5173  * @param[out] error
5174  *   Pointer to the error structure.
5175  *
5176  * @return
5177  *   0 on success, a negative errno value otherwise and rte_errno is set.
5178  */
5179 static int
5180 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5181                struct rte_flow_error *error)
5182 {
5183         struct priv *priv = dev->data->dev_private;
5184         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5185         struct mlx5_flow *dev_flow;
5186         struct nlmsghdr *nlh;
5187
5188         dev_flow = LIST_FIRST(&flow->dev_flows);
5189         /* E-Switch flow can't be expanded. */
5190         assert(!LIST_NEXT(dev_flow, next));
5191         if (dev_flow->tcf.applied)
5192                 return 0;
5193         nlh = dev_flow->tcf.nlh;
5194         nlh->nlmsg_type = RTM_NEWTFILTER;
5195         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5196         if (dev_flow->tcf.tunnel) {
5197                 /*
5198                  * Replace the interface index, target for
5199                  * encapsulation, source for decapsulation.
5200                  */
5201                 assert(!dev_flow->tcf.tunnel->vtep);
5202                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5203                 /* Acquire actual VTEP device when rule is being applied. */
5204                 dev_flow->tcf.tunnel->vtep =
5205                         flow_tcf_vtep_acquire(ctx,
5206                                         dev_flow->tcf.tunnel->ifindex_org,
5207                                         dev_flow, error);
5208                 if (!dev_flow->tcf.tunnel->vtep)
5209                         return -rte_errno;
5210                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5211                                 dev_flow->tcf.tunnel->vtep->ifindex,
5212                                 dev_flow->tcf.tunnel->ifindex_org);
5213                 *dev_flow->tcf.tunnel->ifindex_ptr =
5214                         dev_flow->tcf.tunnel->vtep->ifindex;
5215         }
5216         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5217                 dev_flow->tcf.applied = 1;
5218                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5219                         return 0;
5220                 /*
5221                  * Rule was applied without skip_sw flag set.
5222                  * We should check whether the rule was actually
5223                  * accepted by hardware (look at the in_hw flag).
5224                  */
5225                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5226                         flow_tcf_remove(dev, flow);
5227                         return rte_flow_error_set
5228                                 (error, ENOENT,
5229                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5230                                  "netlink: rule has no in_hw flag set");
5231                 }
5232                 return 0;
5233         }
5234         if (dev_flow->tcf.tunnel) {
5235                 /* Rollback the VTEP configuration if rule apply failed. */
5236                 assert(dev_flow->tcf.tunnel->vtep);
5237                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5238                                       dev_flow);
5239                 dev_flow->tcf.tunnel->vtep = NULL;
5240         }
5241         return rte_flow_error_set(error, rte_errno,
5242                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5243                                   "netlink: failed to create TC flow rule");
5244 }
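
/*
 * For illustration only (interface and matches are hypothetical), the
 * request built by flow_tcf_apply() corresponds to a command of the
 * form
 *
 *     tc filter add dev <ifname> ingress flower <matches> \
 *             skip_sw action <actions>
 *
 * and, when skip_sw cannot be requested, the in_hw flag is verified
 * afterwards via flow_tcf_check_inhw().
 */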
5245
5246 /**
5247  * Remove flow from E-Switch and release resources of the device flow.
5248  *
5249  * @param[in] dev
5250  *   Pointer to Ethernet device.
5251  * @param[in, out] flow
5252  *   Pointer to the sub flow.
5253  */
5254 static void
5255 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5256 {
5257         struct mlx5_flow *dev_flow;
5258
5259         if (!flow)
5260                 return;
5261         flow_tcf_remove(dev, flow);
5262         if (flow->counter) {
5263                 if (--flow->counter->ref_cnt == 0) {
5264                         rte_free(flow->counter);
5265                         flow->counter = NULL;
5266                 }
5267         }
5268         dev_flow = LIST_FIRST(&flow->dev_flows);
5269         if (!dev_flow)
5270                 return;
5271         /* E-Switch flow can't be expanded. */
5272         assert(!LIST_NEXT(dev_flow, next));
5273         LIST_REMOVE(dev_flow, next);
5274         rte_free(dev_flow);
5275 }
5276
5277 /**
5278  * Helper routine for figuring the space size required for a parse buffer.
5279  *
5280  * @param array
5281  *   array of values to use.
5282  * @param idx
5283  *   Current location in array.
5284  * @param value
5285  *   Value to compare with.
5286  *
5287  * @return
5288  *   The maximum between the given value and the array value on index.
5289  */
5290 static uint16_t
5291 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5292 {
5293         return idx < 0 ? value : RTE_MAX(array[idx], value);
5294 }
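
/*
 * Example with hypothetical values: given array[] = { 10, 20 },
 * flow_tcf_arr_val_max(array, 1, 15) returns RTE_MAX(20, 15) = 20,
 * while flow_tcf_arr_val_max(array, -1, 15) returns 15.
 */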
5295
5296 /**
5297  * Parse rtnetlink message attributes filling the attribute table with the info
5298  * retrieved.
5299  *
5300  * @param[out] tb
5301  *   Attribute table to be filled.
5302  * @param max
5303  *   Maximum entry in the attribute table.
5304  * @param rta
5305  *   The attributes section in the message to be parsed.
5306  * @param len
5307  *   The length of the attributes section in the message.
5308  */
5309 static void
5310 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5311                          struct rtattr *rta, int len)
5312 {
5313         unsigned short type;
5314         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5315         while (RTA_OK(rta, len)) {
5316                 type = rta->rta_type;
5317                 if (type <= max && !tb[type])
5318                         tb[type] = rta;
5319                 rta = RTA_NEXT(rta, len);
5320         }
5321 }
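
/*
 * A minimal usage sketch: after parsing, the table is indexed
 * directly by attribute type, e.g.:
 *
 *     struct rtattr *tb[TCA_MAX + 1];
 *     const char *kind;
 *
 *     flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
 *     if (tb[TCA_KIND])
 *             kind = RTA_DATA(tb[TCA_KIND]);
 */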
5322
5323 /**
5324  * Extract flow counters from flower action.
5325  *
5326  * @param rta
5327  *   flower action stats properties in the Netlink message received.
5328  * @param rta_type
5329  *   The backward sequence of rta_types, as written in the attribute table,
5330  *   we need to traverse in order to get to the requested object.
5331  * @param idx
5332  *   Current location in rta_type table.
5333  * @param[out] data
5334  *   data holding the count statistics of the rte_flow retrieved from
5335  *   the message.
5336  *
5337  * @return
5338  *   0 if data was found and retrieved, -1 otherwise.
5339  */
5340 static int
5341 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5342                                        uint16_t rta_type[], int idx,
5343                                        struct gnet_stats_basic *data)
5344 {
5345         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5346                                                  TCA_STATS_BASIC);
5347         struct rtattr *tbs[tca_stats_max + 1];
5348
5349         if (rta == NULL || idx < 0)
5350                 return -1;
5351         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5352                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5353         switch (rta_type[idx]) {
5354         case TCA_STATS_BASIC:
5355                 if (tbs[TCA_STATS_BASIC]) {
5356                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5357                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5358                                sizeof(*data)));
5359                         return 0;
5360                 }
5361                 break;
5362         default:
5363                 break;
5364         }
5365         return -1;
5366 }
5367
5368 /**
5369  * Parse flower single action retrieving the requested action attribute,
5370  * if found.
5371  *
5372  * @param arg
5373  *   flower action properties in the Netlink message received.
5374  * @param rta_type
5375  *   The backward sequence of rta_types, as written in the attribute table,
5376  *   we need to traverse in order to get to the requested object.
5377  * @param idx
5378  *   Current location in rta_type table.
5379  * @param[out] data
5380  *   Count statistics retrieved from the message query.
5381  *
5382  * @return
5383  *   0 if data was found and retrieved, -1 otherwise.
5384  */
5385 static int
5386 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5387                                      uint16_t rta_type[], int idx, void *data)
5388 {
5389         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5390         struct rtattr *tb[tca_act_max + 1];
5391
5392         if (arg == NULL || idx < 0)
5393                 return -1;
5394         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5395                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5396         if (tb[TCA_ACT_KIND] == NULL)
5397                 return -1;
5398         switch (rta_type[idx]) {
5399         case TCA_ACT_STATS:
5400                 if (tb[TCA_ACT_STATS])
5401                         return flow_tcf_nl_action_stats_parse_and_get
5402                                         (tb[TCA_ACT_STATS],
5403                                          rta_type, --idx,
5404                                          (struct gnet_stats_basic *)data);
5405                 break;
5406         default:
5407                 break;
5408         }
5409         return -1;
5410 }
5411
5412 /**
5413  * Parse flower action section in the message retrieving the requested
5414  * attribute from the first action that provides it.
5415  *
5416  * @param arg
5417  *   flower action section in the Netlink message received.
5418  * @param rta_type
5419  *   The backward sequence of rta_types, as written in the attribute table,
5420  *   we need to traverse in order to get to the requested object.
5421  * @param idx
5422  *   Current location in rta_type table.
5423  * @param[out] data
5424  *   data retrieved from the message query.
5425  *
5426  * @return
5427  *   0 if data was found and retrieved, -1 otherwise.
5428  */
5429 static int
5430 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5431                                  uint16_t rta_type[], int idx, void *data)
5432 {
5433         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5434         int i;
5435
5436         if (arg == NULL || idx < 0)
5437                 return -1;
5438         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5439                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5440         switch (rta_type[idx]) {
5441         /*
5442          * flow counters are stored in the actions defined by the flow
5443          * and not in the flow itself, therefore we need to traverse the
5444          * flower chain of actions in search for them.
5445          * flower chain of actions in search of them.
5446          * Note that the index is not decremented here.
5447          */
5448         case TCA_ACT_STATS:
5449                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5450                         if (tb[i] &&
5451                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5452                                                               rta_type,
5453                                                               idx, data))
5454                                 return 0;
5455                 }
5456                 break;
5457         default:
5458                 break;
5459         }
5460         return -1;
5461 }
5462
5463 /**
5464  * Parse flower classifier options in the message, retrieving the requested
5465  * attribute if found.
5466  *
5467  * @param opt
5468  *   flower section in the Netlink message received.
5469  * @param rta_type
5470  *   The backward sequence of rta_types, as written in the attribute table,
5471  *   we need to traverse in order to get to the requested object.
5472  * @param idx
5473  *   Current location in rta_type table.
5474  * @param[out] data
5475  *   data retrieved from the message query.
5476  *
5477  * @return
5478  *   0 if data was found and retrieved, -1 otherwise.
5479  */
5480 static int
5481 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5482                                uint16_t rta_type[], int idx, void *data)
5483 {
5484         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5485                                                   TCA_FLOWER_ACT);
5486         struct rtattr *tb[tca_flower_max + 1];
5487
5488         if (!opt || idx < 0)
5489                 return -1;
5490         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5491                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5492         switch (rta_type[idx]) {
5493         case TCA_FLOWER_ACT:
5494                 if (tb[TCA_FLOWER_ACT])
5495                         return flow_tcf_nl_action_parse_and_get
5496                                                         (tb[TCA_FLOWER_ACT],
5497                                                          rta_type, --idx, data);
5498                 break;
5499         default:
5500                 break;
5501         }
5502         return -1;
5503 }
5504
5505 /**
5506  * Parse Netlink reply on filter query, retrieving the flow counters.
5507  *
5508  * @param cnlh
5509  *   Message received from Netlink.
5510  * @param rta_type
5511  *   The backward sequence of rta_types, as written in the attribute table,
5512  *   we need to traverse in order to get to the requested object.
5513  * @param idx
5514  *   Current location in rta_type table.
5515  * @param[out] data
5516  *   data retrieved from the message query.
5517  *
5518  * @return
5519  *   0 if data was found and retrieved, -1 otherwise.
5520  */
5521 static int
5522 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5523                                  uint16_t rta_type[], int idx, void *data)
5524 {
5525         struct nlmsghdr *nlh = cnlh;
5526         struct tcmsg *t = NLMSG_DATA(nlh);
5527         int len = nlh->nlmsg_len;
5528         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5529         struct rtattr *tb[tca_max + 1];
5530
5531         if (idx < 0)
5532                 return -1;
5533         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5534             nlh->nlmsg_type != RTM_GETTFILTER &&
5535             nlh->nlmsg_type != RTM_DELTFILTER)
5536                 return -1;
5537         len -= NLMSG_LENGTH(sizeof(*t));
5538         if (len < 0)
5539                 return -1;
5540         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5541         /* Not a TC flower flow - bail out */
5542         if (!tb[TCA_KIND] ||
5543             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5544                 return -1;
5545         switch (rta_type[idx]) {
5546         case TCA_OPTIONS:
5547                 if (tb[TCA_OPTIONS])
5548                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5549                                                               rta_type,
5550                                                               --idx, data);
5551                 break;
5552         default:
5553                 break;
5554         }
5555         return -1;
5556 }
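
/*
 * Worked example of the backward rta_type sequence: for
 *
 *     uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
 *                             TCA_FLOWER_ACT, TCA_OPTIONS };
 *
 * the walk starts here with idx = 3 (TCA_OPTIONS), descends into the
 * flower options with idx = 2 (TCA_FLOWER_ACT), scans the action list
 * with idx = 1 (TCA_ACT_STATS, not decremented per priority) and ends
 * in the statistics parser with idx = 0 (TCA_STATS_BASIC).
 */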
5557
5558 /**
5559  * A callback to parse Netlink reply on TC flower query.
5560  *
5561  * @param nlh
5562  *   Message received from Netlink.
5563  * @param[out] data
5564  *   Pointer to data area to be filled by the parsing routine.
5565  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
5566  *
5567  * @return
5568  *   MNL_CB_OK value.
5569  */
5570 static int
5571 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5572 {
5573         /*
5574          * The backward sequence of rta_types to pass in order to get
5575          *  to the counters.
5576          */
5577         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5578                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5579         struct flow_tcf_stats_basic *sb_data = data;
5580         union {
5581                 const struct nlmsghdr *c;
5582                 struct nlmsghdr *nc;
5583         } tnlh = { .c = nlh };
5584
5585         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5586                                               RTE_DIM(rta_type) - 1,
5587                                               (void *)&sb_data->counters))
5588                 sb_data->valid = true;
5589         return MNL_CB_OK;
5590 }
5591
5592 /**
5593  * Query a TC flower rule for its statistics via netlink.
5594  *
5595  * @param[in] dev
5596  *   Pointer to Ethernet device.
5597  * @param[in] flow
5598  *   Pointer to the sub flow.
5599  * @param[out] data
5600  *   data retrieved by the query.
5601  * @param[out] error
5602  *   Perform verbose error reporting if not NULL.
5603  *
5604  * @return
5605  *   0 on success, a negative errno value otherwise and rte_errno is set.
5606  */
5607 static int
5608 flow_tcf_query_count(struct rte_eth_dev *dev,
5609                           struct rte_flow *flow,
5610                           void *data,
5611                           struct rte_flow_error *error)
5612 {
5613         struct flow_tcf_stats_basic sb_data;
5614         struct rte_flow_query_count *qc = data;
5615         struct priv *priv = dev->data->dev_private;
5616         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5617         struct mnl_socket *nl = ctx->nl;
5618         struct mlx5_flow *dev_flow;
5619         struct nlmsghdr *nlh;
5620         uint32_t seq = priv->tcf_context->seq++;
5621         ssize_t ret;
5622         assert(qc);
5623
5624         memset(&sb_data, 0, sizeof(sb_data));
5625         dev_flow = LIST_FIRST(&flow->dev_flows);
5626         /* E-Switch flow can't be expanded. */
5627         assert(!LIST_NEXT(dev_flow, next));
5628         if (!dev_flow->flow->counter)
5629                 goto notsup_exit;
5630         nlh = dev_flow->tcf.nlh;
5631         nlh->nlmsg_type = RTM_GETTFILTER;
5632         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5633         nlh->nlmsg_seq = seq;
5634         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5635                 goto error_exit;
5636         do {
5637                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5638                 if (ret <= 0)
5639                         break;
5640                 ret = mnl_cb_run(ctx->buf, ret, seq,
5641                                  mnl_socket_get_portid(nl),
5642                                  flow_tcf_nl_message_get_stats_basic,
5643                                  (void *)&sb_data);
5644         } while (ret > 0);
5646         if (sb_data.valid) {
5647                 /* Return the delta from last reset. */
5648                 qc->hits_set = 1;
5649                 qc->bytes_set = 1;
5650                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5651                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5652                 if (qc->reset) {
5653                         flow->counter->hits = sb_data.counters.packets;
5654                         flow->counter->bytes = sb_data.counters.bytes;
5655                 }
5656                 return 0;
5657         }
5658         return rte_flow_error_set(error, EINVAL,
5659                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5660                                   NULL,
5661                                   "flow does not have counter");
5662 error_exit:
5663         return rte_flow_error_set
5664                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5665                          NULL, "netlink: failed to read flow rule counters");
5666 notsup_exit:
5667         return rte_flow_error_set
5668                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5669                          NULL, "counters are not available");
5670 }
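
/*
 * Delta semantics example (hypothetical numbers): if the kernel now
 * reports 150 packets while flow->counter->hits holds a baseline of
 * 100, the query yields qc->hits = 50; with qc->reset set the
 * baseline advances to 150, so the next query counts from zero again.
 */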
5671
5672 /**
5673  * Query a flow.
5674  *
5675  * @see rte_flow_query()
5676  * @see rte_flow_ops
5677  */
5678 static int
5679 flow_tcf_query(struct rte_eth_dev *dev,
5680                struct rte_flow *flow,
5681                const struct rte_flow_action *actions,
5682                void *data,
5683                struct rte_flow_error *error)
5684 {
5685         int ret = -EINVAL;
5686
5687         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5688                 switch (actions->type) {
5689                 case RTE_FLOW_ACTION_TYPE_VOID:
5690                         break;
5691                 case RTE_FLOW_ACTION_TYPE_COUNT:
5692                         ret = flow_tcf_query_count(dev, flow, data, error);
5693                         break;
5694                 default:
5695                         return rte_flow_error_set(error, ENOTSUP,
5696                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5697                                                   actions,
5698                                                   "action not supported");
5699                 }
5700         }
5701         return ret;
5702 }
5703
5704 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5705         .validate = flow_tcf_validate,
5706         .prepare = flow_tcf_prepare,
5707         .translate = flow_tcf_translate,
5708         .apply = flow_tcf_apply,
5709         .remove = flow_tcf_remove,
5710         .destroy = flow_tcf_destroy,
5711         .query = flow_tcf_query,
5712 };
5713
5714 /**
5715  * Create and configure a libmnl socket for Netlink flow rules.
5716  *
5717  * @return
5718  *   A valid libmnl socket object pointer on success, NULL otherwise and
5719  *   rte_errno is set.
5720  */
5721 static struct mnl_socket *
5722 flow_tcf_mnl_socket_create(void)
5723 {
5724         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
5725
5726         if (nl) {
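                /*
                 * NETLINK_CAP_ACK caps ACK replies to the bare Netlink
                 * error header instead of echoing the whole request,
                 * keeping receive buffers small.
                 */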
5727                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
5728                                       sizeof(int));
5729                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
5730                         return nl;
5731         }
5732         rte_errno = errno;
5733         if (nl)
5734                 mnl_socket_close(nl);
5735         return NULL;
5736 }
5737
5738 /**
5739  * Destroy a libmnl socket.
5740  *
5741  * @param nl
5742  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5743  */
5744 static void
5745 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
5746 {
5747         if (nl)
5748                 mnl_socket_close(nl);
5749 }
5750
5751 /**
5752  * Initialize ingress qdisc of a given network interface.
5753  *
5754  * @param ctx
5755  *   Pointer to tc-flower context to use.
5756  * @param ifindex
5757  *   Index of network interface to initialize.
5758  * @param[out] error
5759  *   Perform verbose error reporting if not NULL.
5760  *
5761  * @return
5762  *   0 on success, a negative errno value otherwise and rte_errno is set.
5763  */
5764 int
5765 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
5766                    unsigned int ifindex, struct rte_flow_error *error)
5767 {
5768         struct nlmsghdr *nlh;
5769         struct tcmsg *tcm;
5770         alignas(struct nlmsghdr)
5771         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
5772                     SZ_NLATTR_STRZ_OF("ingress") +
5773                     MNL_BUF_EXTRA_SPACE];
5774
5775         /* Destroy existing ingress qdisc and everything attached to it. */
5776         nlh = mnl_nlmsg_put_header(buf);
5777         nlh->nlmsg_type = RTM_DELQDISC;
5778         nlh->nlmsg_flags = NLM_F_REQUEST;
5779         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5780         tcm->tcm_family = AF_UNSPEC;
5781         tcm->tcm_ifindex = ifindex;
5782         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5783         tcm->tcm_parent = TC_H_INGRESS;
5784         assert(sizeof(buf) >= nlh->nlmsg_len);
5785         /* Ignore errors when qdisc is already absent. */
5786         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
5787             rte_errno != EINVAL && rte_errno != ENOENT)
5788                 return rte_flow_error_set(error, rte_errno,
5789                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5790                                           "netlink: failed to remove ingress"
5791                                           " qdisc");
5792         /* Create fresh ingress qdisc. */
5793         nlh = mnl_nlmsg_put_header(buf);
5794         nlh->nlmsg_type = RTM_NEWQDISC;
5795         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5796         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5797         tcm->tcm_family = AF_UNSPEC;
5798         tcm->tcm_ifindex = ifindex;
5799         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5800         tcm->tcm_parent = TC_H_INGRESS;
5801         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
5802         assert(sizeof(buf) >= nlh->nlmsg_len);
5803         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
5804                 return rte_flow_error_set(error, rte_errno,
5805                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5806                                           "netlink: failed to create ingress"
5807                                           " qdisc");
5808         return 0;
5809 }
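
/*
 * For illustration, the two requests above are the Netlink
 * equivalents of (interface name hypothetical):
 *
 *     tc qdisc del dev <ifname> ingress
 *     tc qdisc add dev <ifname> ingress
 */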
5810
5811 /**
5812  * Create libmnl context for Netlink flow rules.
5813  *
5814  * @return
5815  *   A valid libmnl socket object pointer on success, NULL otherwise and
5816  *   rte_errno is set.
5817  */
5818 struct mlx5_flow_tcf_context *
5819 mlx5_flow_tcf_context_create(void)
5820 {
5821         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
5822                                                         sizeof(*ctx),
5823                                                         sizeof(uint32_t));
5824         if (!ctx)
5825                 goto error;
5826         ctx->nl = flow_tcf_mnl_socket_create();
5827         if (!ctx->nl)
5828                 goto error;
5829         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
5830         ctx->buf = rte_zmalloc(__func__,
5831                                ctx->buf_size, sizeof(uint32_t));
5832         if (!ctx->buf)
5833                 goto error;
5834         ctx->seq = random();
5835         return ctx;
5836 error:
5837         mlx5_flow_tcf_context_destroy(ctx);
5838         return NULL;
5839 }
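
/*
 * Usage sketch (a minimal outline, error handling elided):
 *
 *     struct mlx5_flow_tcf_context *ctx = mlx5_flow_tcf_context_create();
 *
 *     if (ctx && !mlx5_flow_tcf_init(ctx, ifindex, &error))
 *             ...issue flower rules through mlx5_flow_tcf_drv_ops...
 *     mlx5_flow_tcf_context_destroy(ctx);
 */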
5840
5841 /**
5842  * Destroy a libmnl context.
5843  *
5844  * @param ctx
5845  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5846  */
5847 void
5848 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
5849 {
5850         if (!ctx)
5851                 return;
5852         flow_tcf_mnl_socket_destroy(ctx->nl);
5853         rte_free(ctx->buf);
5854         rte_free(ctx);
5855 }