net/mlx5: switch encap rules to use container
drivers/net/mlx5/mlx5_flow_tcf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* offset */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by the driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"
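/* VTEP device names are presumably formed from this prefix plus the
 * UDP port, e.g. "vmlx_30000".
 */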

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
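/*
 * For illustration: a fully populated IPv4 encap descriptor would carry
 * a mask like FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST |
 * FLOW_TCF_ENCAP_UDP_DST | FLOW_TCF_ENCAP_VXLAN_VNI, plus the ETH_*
 * bits when L2 addresses are given explicitly.
 */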

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer, which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8KB) buffer size ensures that netlink messages will never
 * be truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * the outer tunnel iface in order to provide the destination MAC
 * address for the VXLAN encapsulation. The neigh rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** Outer interface VXLAN encapsulation rules container. */
struct tcf_irule {
        LIST_ENTRY(tcf_irule) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifouter; /**< Own interface index. */
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port;
        uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        struct tcf_irule *iface;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty = {
        {0},
};

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)

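/*
 * For example, with the usual 4-byte netlink alignment and a 4-byte
 * struct nlattr, SZ_NLATTR_TYPE_OF(uint32_t) is 8: one aligned
 * attribute header plus 4 bytes of payload.
 */
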
#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
 * Priority in rte_flow attribute starts from 0 and is incremented by 1 in
 * translation. This is subject to be changed to determine the max priority
 * based on trial-and-error like the Verbs driver once the restriction is
 * lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))

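/*
 * Each pedit key rewrites one 32-bit word, so NUM_OF_PEDIT_KEYS() rounds
 * up: a 6-byte MAC address takes 2 keys, a 16-byte IPv6 address takes 4.
 */
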
struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-Switch counters cannot be shared and their IDs are unknown,
         * so all are currently returned with ID 0. Switching to unique
         * numbers may be better in the future.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

        p_parser->keys[idx].off = off;
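        /*
         * In tc-pedit, mask bits that are set are preserved from the
         * packet and clear bits are taken from "val"; ~UINT32_MAX is 0,
         * i.e. rewrite the whole 32-bit word. A 6-byte MAC address thus
         * needs two keys: this one for bytes 0-3, the next for bytes 4-5.
         */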
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of decrease/set TTL
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
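                /* Adding 0xFF decrements the TTL byte by one (mod 256). */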
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* Offset of src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of ipv6 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of ipv4 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's na attribute in netlink message
 * on pre-allocated message buffer
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification.
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify header actions should be in one tc-pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
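        /* Pedit is not a terminal action: pipe the packet to the next action. */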
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
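        /* Step back so the caller's loop increment lands on the first non-pedit action. */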
        (*actions)--;
}

/**
 * Calculate max memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   actions specification.
 * @param[in,out] action_flags
 *   actions flags
 *
 * @return
 *   Max memory size of one TC-pedit action
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
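        /* Rewind one action; the caller's loop will advance past the pedit set. */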
        (*actions)--;
        return pedit_size;
}

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}

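/*
 * Usage sketch (illustrative): validate a UDP item against the masks
 * defined above.
 *
 *   const struct rte_flow_item_udp *m =
 *           flow_tcf_item_mask(item, &rte_flow_item_udp_mask,
 *                              &flow_tcf_mask_supported.udp,
 *                              &flow_tcf_mask_empty.udp,
 *                              sizeof(flow_tcf_mask_supported.udp),
 *                              error);
 *   if (!m)
 *           return -rte_errno;
 */
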
/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}

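/*
 * Usage sketch (illustrative): PTOI_TABLE_SZ_MAX() reserves room for the
 * current device and the zero-ifindex terminator, so a caller can do:
 *
 *   struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
 *   unsigned int n = flow_tcf_build_ptoi_table(dev, ptoi,
 *                                              PTOI_TABLE_SZ_MAX(dev));
 *
 * where n == 0 means the provided table was too small.
 */
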
/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * Group is supported only if kernel supports chain. Transfer is not
         * checked here as it is the caller's problem.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "egress is not supported");
        return 0;
}

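/*
 * For example, attributes { .group = 0..3, .priority = 0..15,
 * .ingress = 1, .egress = 0 } pass the checks above.
 */
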
/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for L2 addresses can be empty
                 * because they are optional and not required
                 * directly by the tc rule. The kernel tries
                 * to resolve them on its own.
                 */
                return 0;
        }
        if (!mask) {
                /* If mask is not specified use the default one. */
                mask = &rte_flow_item_eth_mask;
        }
        if (memcmp(&mask->dst,
                   &flow_tcf_mask_empty.eth.dst,
                   sizeof(flow_tcf_mask_empty.eth.dst))) {
                if (memcmp(&mask->dst,
                           &rte_flow_item_eth_mask.dst,
                           sizeof(rte_flow_item_eth_mask.dst)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.dst\" field");
        }
        if (memcmp(&mask->src,
                   &flow_tcf_mask_empty.eth.src,
                   sizeof(flow_tcf_mask_empty.eth.src))) {
                if (memcmp(&mask->src,
                           &rte_flow_item_eth_mask.src,
                           sizeof(rte_flow_item_eth_mask.src)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.src\" field");
        }
        if (mask->type != RTE_BE16(0x0000)) {
                if (mask->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.type\" field");
                DRV_LOG(WARNING,
                        "outer ethernet type field"
                        " cannot be forced for vxlan"
                        " encapsulation, parameter ignored");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv4 address"
                                          " specification for vxlan"
                                          " encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv4_mask;
        if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.dst_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address, so IP destination address must be
                 * specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.src_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv6 address"
                                          " specification for"
                                          " vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv6_mask;
        if (memcmp(&mask->hdr.dst_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.dst_addr,
                           &rte_flow_item_ipv6_mask.hdr.dst_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.dst_addr\" field"
                                         " for vxlan encapsulation");
                /* More IPv6 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address (neigh or gateway), so IP destination address
                 * must be specified within the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv6 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (memcmp(&mask->hdr.src_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.src_addr,
                           &rte_flow_item_ipv6_mask.hdr.src_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.src_addr\" field"
                                         " for vxlan encapsulation");
                /* More L3 address validation can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer L3 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

1370 /**
1371  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1372  * The routine checks the UDP fields to be used in encapsulation header.
1373  *
1374  * @param[in] item
1375  *   Pointer to the item structure.
1376  * @param[out] error
1377  *   Pointer to the error structure.
1378  *
1379  * @return
1380  *   0 on success, a negative errno value otherwise and rte_errno is set.
1381  **/
1382 static int
1383 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1384                                   struct rte_flow_error *error)
1385 {
1386         const struct rte_flow_item_udp *spec = item->spec;
1387         const struct rte_flow_item_udp *mask = item->mask;
1388
1389         if (!spec) {
1390                 /*
1391                  * Specification for UDP ports cannot be empty
1392                  * because it is required by tunnel_key parameter.
1393                  */
1394                 return rte_flow_error_set(error, EINVAL,
1395                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1396                                           "NULL UDP port specification "
1397                                           " for vxlan encapsulation");
1398         }
1399         if (!mask)
1400                 mask = &rte_flow_item_udp_mask;
1401         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1402                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1403                         return rte_flow_error_set
1404                                         (error, ENOTSUP,
1405                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1406                                          "no support for partial mask on"
1407                                          " \"udp.hdr.dst_port\" field"
1408                                          " for vxlan encapsulation");
1409                 if (!spec->hdr.dst_port)
1410                         return rte_flow_error_set
1411                                         (error, EINVAL,
1412                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1413                                          "outer UDP remote port cannot be"
1414                                          " 0 for vxlan encapsulation");
1415         } else {
1416                 return rte_flow_error_set(error, EINVAL,
1417                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1418                                           "outer UDP remote port"
1419                                           " must be specified for"
1420                                           " vxlan encapsulation");
1421         }
1422         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1423                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1424                         return rte_flow_error_set
1425                                         (error, ENOTSUP,
1426                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1427                                          "no support for partial mask on"
1428                                          " \"udp.hdr.src_port\" field"
1429                                          " for vxlan encapsulation");
1430                 DRV_LOG(WARNING,
1431                         "outer UDP source port cannot be"
1432                         " forced for vxlan encapsulation,"
1433                         " parameter ignored");
1434         }
1435         return 0;
1436 }
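/*
 * For illustration only (not part of the driver): a UDP item passing
 * the checks above could be built as follows, with the destination
 * port fully masked and the source port left wild so that the kernel
 * assigns it on egress:
 *
 *	struct rte_flow_item_udp udp_spec = {
 *		.hdr = { .dst_port = RTE_BE16(4789) },
 *	};
 *	struct rte_flow_item_udp udp_mask = {
 *		.hdr = { .dst_port = RTE_BE16(0xffff) },
 *	};
 */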
1437
1438 /**
1439  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1440  * The routine checks the VNI field to be used in the encapsulation header.
1441  *
1442  * @param[in] item
1443  *   Pointer to the item structure.
1444  * @param[out] error
1445  *   Pointer to the error structure.
1446  *
1447  * @return
1448  *   0 on success, a negative errno value otherwise and rte_errno is set.
1449  */
1450 static int
1451 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1452                                   struct rte_flow_error *error)
1453 {
1454         const struct rte_flow_item_vxlan *spec = item->spec;
1455         const struct rte_flow_item_vxlan *mask = item->mask;
1456
1457         if (!spec) {
1458                 /* Outer VNI is required by tunnel_key parameter. */
1459                 return rte_flow_error_set(error, EINVAL,
1460                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1461                                           "NULL VNI specification"
1462                                           " for vxlan encapsulation");
1463         }
1464         if (!mask)
1465                 mask = &rte_flow_item_vxlan_mask;
1466         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1467                 return rte_flow_error_set(error, EINVAL,
1468                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1469                                           "outer VNI must be specified "
1470                                           "for vxlan encapsulation");
1471         if (mask->vni[0] != 0xff ||
1472             mask->vni[1] != 0xff ||
1473             mask->vni[2] != 0xff)
1474                 return rte_flow_error_set(error, ENOTSUP,
1475                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1476                                           "no support for partial mask on"
1477                                           " \"vxlan.vni\" field");
1478
1479         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1480                 return rte_flow_error_set(error, EINVAL,
1481                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1482                                           "vxlan vni cannot be 0");
1483         return 0;
1484 }
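/*
 * For illustration only: the VNI is a 24-bit value stored as three
 * network-order bytes, e.g. spec.vni = {0x00, 0x01, 0x02} with
 * mask.vni = {0xff, 0xff, 0xff} requests an exact match on VNI
 * 0x000102.
 */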
1485
1486 /**
1487  * Validate VXLAN_ENCAP action item list for E-Switch.
1488  * The routine checks the items to be used in the encapsulation header.
1489  *
1490  * @param[in] action
1491  *   Pointer to the VXLAN_ENCAP action structure.
1492  * @param[out] error
1493  *   Pointer to the error structure.
1494  *
1495  * @return
1496  *   0 on success, a negative errno value otherwise and rte_errno is set.
1497  */
1498 static int
1499 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1500                               struct rte_flow_error *error)
1501 {
1502         const struct rte_flow_item *items;
1503         int ret;
1504         uint32_t item_flags = 0;
1505
1506         if (!action->conf)
1507                 return rte_flow_error_set(error, EINVAL,
1508                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1509                                           "Missing vxlan tunnel"
1510                                           " action configuration");
1511         items = ((const struct rte_flow_action_vxlan_encap *)
1512                                         action->conf)->definition;
1513         if (!items)
1514                 return rte_flow_error_set(error, EINVAL,
1515                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1516                                           "Missing vxlan tunnel"
1517                                           " encapsulation parameters");
1518         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1519                 switch (items->type) {
1520                 case RTE_FLOW_ITEM_TYPE_VOID:
1521                         break;
1522                 case RTE_FLOW_ITEM_TYPE_ETH:
1523                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1524                                                           error);
1525                         if (ret < 0)
1526                                 return ret;
1527                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1528                         if (ret < 0)
1529                                 return ret;
1530                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1531                         break;
1533                 case RTE_FLOW_ITEM_TYPE_IPV4:
1534                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1535                                                            error);
1536                         if (ret < 0)
1537                                 return ret;
1538                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1539                         if (ret < 0)
1540                                 return ret;
1541                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1542                         break;
1543                 case RTE_FLOW_ITEM_TYPE_IPV6:
1544                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1545                                                            error);
1546                         if (ret < 0)
1547                                 return ret;
1548                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1549                         if (ret < 0)
1550                                 return ret;
1551                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1552                         break;
1553                 case RTE_FLOW_ITEM_TYPE_UDP:
1554                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1555                                                            0xFF, error);
1556                         if (ret < 0)
1557                                 return ret;
1558                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1559                         if (ret < 0)
1560                                 return ret;
1561                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1562                         break;
1563                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1564                         ret = mlx5_flow_validate_item_vxlan(items,
1565                                                             item_flags, error);
1566                         if (ret < 0)
1567                                 return ret;
1568                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1569                         if (ret < 0)
1570                                 return ret;
1571                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1572                         break;
1573                 default:
1574                         return rte_flow_error_set
1575                                         (error, ENOTSUP,
1576                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1577                                          "vxlan encap item not supported");
1578                 }
1579         }
1580         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1581                 return rte_flow_error_set(error, EINVAL,
1582                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1583                                           "no outer IP layer found"
1584                                           " for vxlan encapsulation");
1585         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1586                 return rte_flow_error_set(error, EINVAL,
1587                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1588                                           "no outer UDP layer found"
1589                                           " for vxlan encapsulation");
1590         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1591                 return rte_flow_error_set(error, EINVAL,
1592                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1593                                           "no VXLAN VNI found"
1594                                           " for vxlan encapsulation");
1595         return 0;
1596 }
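/*
 * For illustration only: a minimal item list accepted by the encap
 * validator above is ETH / IPV4 (or IPV6) / UDP / VXLAN / END, e.g.:
 *
 *	struct rte_flow_item definition[] = {
 *		{ .type = RTE_FLOW_ITEM_TYPE_ETH,
 *		  .spec = &eth, .mask = &eth_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_IPV4,
 *		  .spec = &ipv4, .mask = &ipv4_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_UDP,
 *		  .spec = &udp, .mask = &udp_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *		  .spec = &vxlan, .mask = &vxlan_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_END },
 *	};
 *
 * where the source IP address, destination UDP port and VNI carry
 * exact (full) masks.
 */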
1597
1598 /**
1599  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1600  * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1601  *
1602  * @param[in] udp
1603  *   Outer UDP layer item (if any, NULL otherwise).
1604  * @param[out] error
1605  *   Pointer to the error structure.
1606  *
1607  * @return
1608  *   0 on success, a negative errno value otherwise and rte_errno is set.
1609  */
1610 static int
1611 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1612                                   struct rte_flow_error *error)
1613 {
1614         const struct rte_flow_item_udp *spec = udp->spec;
1615         const struct rte_flow_item_udp *mask = udp->mask;
1616
1617         if (!spec)
1618                 /*
1619                  * Specification for UDP ports cannot be empty
1620                  * because it is required as a decap parameter.
1621                  */
1622                 return rte_flow_error_set(error, EINVAL,
1623                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1624                                           "NULL UDP port specification"
1625                                           " for VXLAN decapsulation");
1626         if (!mask)
1627                 mask = &rte_flow_item_udp_mask;
1628         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1629                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1630                         return rte_flow_error_set
1631                                         (error, ENOTSUP,
1632                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1633                                          "no support for partial mask on"
1634                                          " \"udp.hdr.dst_port\" field");
1635                 if (!spec->hdr.dst_port)
1636                         return rte_flow_error_set
1637                                         (error, EINVAL,
1638                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1639                                          "zero decap local UDP port");
1640         } else {
1641                 return rte_flow_error_set(error, EINVAL,
1642                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1643                                           "outer UDP destination port must be "
1644                                           "specified for vxlan decapsulation");
1645         }
1646         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1647                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1648                         return rte_flow_error_set
1649                                         (error, ENOTSUP,
1650                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1651                                          "no support for partial mask on"
1652                                          " \"udp.hdr.src_port\" field");
1653                 DRV_LOG(WARNING,
1654                         "outer UDP local port cannot be "
1655                         "forced for VXLAN decapsulation, "
1656                         "parameter ignored");
1657         }
1658         return 0;
1659 }
1660
1661 /**
1662  * Validate flow for E-Switch.
1663  *
1664  * @param[in] dev
1665  *   Pointer to the Ethernet device structure.
1666  * @param[in] attr
1667  *   Pointer to the flow attributes.
1668  * @param[in] items
1669  *   Pointer to the list of items.
1670  * @param[in] actions
1671  *   Pointer to the list of actions.
1672  * @param[out] error
1673  *   Pointer to the error structure.
1674  *
1675  * @return
1676  *   0 on success, a negative errno value otherwise and rte_errno is set.
1677  */
1678 static int
1679 flow_tcf_validate(struct rte_eth_dev *dev,
1680                   const struct rte_flow_attr *attr,
1681                   const struct rte_flow_item items[],
1682                   const struct rte_flow_action actions[],
1683                   struct rte_flow_error *error)
1684 {
1685         union {
1686                 const struct rte_flow_item_port_id *port_id;
1687                 const struct rte_flow_item_eth *eth;
1688                 const struct rte_flow_item_vlan *vlan;
1689                 const struct rte_flow_item_ipv4 *ipv4;
1690                 const struct rte_flow_item_ipv6 *ipv6;
1691                 const struct rte_flow_item_tcp *tcp;
1692                 const struct rte_flow_item_udp *udp;
1693                 const struct rte_flow_item_vxlan *vxlan;
1694         } spec, mask;
1695         union {
1696                 const struct rte_flow_action_port_id *port_id;
1697                 const struct rte_flow_action_jump *jump;
1698                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1699                 const struct rte_flow_action_of_set_vlan_vid *
1700                         of_set_vlan_vid;
1701                 const struct rte_flow_action_of_set_vlan_pcp *
1702                         of_set_vlan_pcp;
1703                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1704                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1705                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1706         } conf;
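        /*
         * The unions above let the generic item->spec, item->mask and
         * actions->conf pointers be interpreted through the proper
         * per-type structures without a cast at every use.
         */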
1707         const struct rte_flow_item *outer_udp = NULL;
1708         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1709         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1710         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
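        /*
         * ETH_P_ALL means "not constrained yet": the ETH, VLAN and L3
         * items below narrow these ethertypes, and any disagreement
         * between them is reported as an eth_type conflict.
         */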
1711         uint64_t item_flags = 0;
1712         uint64_t action_flags = 0;
1713         uint8_t next_protocol = 0xff;
1714         unsigned int tcm_ifindex = 0;
1715         uint8_t pedit_validated = 0;
1716         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1717         struct rte_eth_dev *port_id_dev = NULL;
1718         bool in_port_id_set = false;
1719         int ret;
1720
1721         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1722                                                 PTOI_TABLE_SZ_MAX(dev)));
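        /*
         * ptoi[] maps DPDK port IDs to kernel interface indices and is
         * expected to be terminated by an entry with ifindex 0, which
         * the lookup loops below rely upon.
         */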
1723         ret = flow_tcf_validate_attributes(attr, error);
1724         if (ret < 0)
1725                 return ret;
1726         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1727                 unsigned int i;
1728                 uint64_t current_action_flag = 0;
1729
1730                 switch (actions->type) {
1731                 case RTE_FLOW_ACTION_TYPE_VOID:
1732                         break;
1733                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1734                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1735                         if (!actions->conf)
1736                                 break;
1737                         conf.port_id = actions->conf;
1738                         if (conf.port_id->original)
1739                                 i = 0;
1740                         else
1741                                 for (i = 0; ptoi[i].ifindex; ++i)
1742                                         if (ptoi[i].port_id == conf.port_id->id)
1743                                                 break;
1744                         if (!ptoi[i].ifindex)
1745                                 return rte_flow_error_set
1746                                         (error, ENODEV,
1747                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1748                                          conf.port_id,
1749                                          "missing data to convert port ID to"
1750                                          " ifindex");
1751                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1752                         break;
1753                 case RTE_FLOW_ACTION_TYPE_JUMP:
1754                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1755                         if (!actions->conf)
1756                                 break;
1757                         conf.jump = actions->conf;
1758                         if (attr->group >= conf.jump->group)
1759                                 return rte_flow_error_set
1760                                         (error, ENOTSUP,
1761                                          RTE_FLOW_ERROR_TYPE_ACTION,
1762                                          actions,
1763                                          "can jump only to a higher group");
1764                         break;
1765                 case RTE_FLOW_ACTION_TYPE_DROP:
1766                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1767                         break;
1768                 case RTE_FLOW_ACTION_TYPE_COUNT:
1769                         break;
1770                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1771                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1772                         break;
1773                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1774                         rte_be16_t ethertype;
1775
1776                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1777                         if (!actions->conf)
1778                                 break;
1779                         conf.of_push_vlan = actions->conf;
1780                         ethertype = conf.of_push_vlan->ethertype;
1781                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1782                             ethertype != RTE_BE16(ETH_P_8021AD))
1783                                 return rte_flow_error_set
1784                                         (error, EINVAL,
1785                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1786                                          "vlan push TPID must be "
1787                                          "802.1Q or 802.1AD");
1788                         break;
1789                 }
1790                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1791                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1792                                 return rte_flow_error_set
1793                                         (error, ENOTSUP,
1794                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1795                                          "vlan modify is not supported,"
1796                                          " set action must follow push action");
1797                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1798                         break;
1799                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1800                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1801                                 return rte_flow_error_set
1802                                         (error, ENOTSUP,
1803                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1804                                          "vlan modify is not supported,"
1805                                          " set action must follow push action");
1806                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1807                         break;
1808                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1809                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1810                         break;
1811                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1812                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1813                         if (ret < 0)
1814                                 return ret;
1815                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1816                         break;
1817                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1818                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1819                         break;
1820                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1821                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1822                         break;
1823                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1824                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1825                         break;
1826                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1827                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1828                         break;
1829                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1830                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1831                         break;
1832                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1833                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1834                         break;
1835                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1836                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1837                         break;
1838                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1839                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1840                         break;
1841                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1842                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1843                         break;
1844                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1845                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1846                         break;
1847                 default:
1848                         return rte_flow_error_set(error, ENOTSUP,
1849                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1850                                                   actions,
1851                                                   "action not supported");
1852                 }
1853                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1854                         if (!actions->conf)
1855                                 return rte_flow_error_set
1856                                         (error, EINVAL,
1857                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1858                                          actions,
1859                                          "action configuration not set");
1860                 }
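                /*
                 * "Set" (pedit) actions must be listed back to back:
                 * once a non-pedit action follows a pedit run, the
                 * pedit_validated flag is latched and any later pedit
                 * action is rejected.
                 */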
1861                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1862                     pedit_validated)
1863                         return rte_flow_error_set(error, ENOTSUP,
1864                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1865                                                   actions,
1866                                                   "set actions should be "
1867                                                   "listed successively");
1868                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1869                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1870                         pedit_validated = 1;
1871                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1872                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1873                         return rte_flow_error_set(error, EINVAL,
1874                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1875                                                   actions,
1876                                                   "can't have multiple fate"
1877                                                   " actions");
1878                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1879                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1880                         return rte_flow_error_set(error, EINVAL,
1881                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1882                                                   actions,
1883                                                   "can't have multiple vxlan"
1884                                                   " actions");
1885                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1886                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1887                         return rte_flow_error_set(error, ENOTSUP,
1888                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1889                                                   actions,
1890                                                   "can't have vxlan and vlan"
1891                                                   " actions in the same rule");
1892                 action_flags |= current_action_flag;
1893         }
1894         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1895                 unsigned int i;
1896
1897                 switch (items->type) {
1898                 case RTE_FLOW_ITEM_TYPE_VOID:
1899                         break;
1900                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1901                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1902                                 return rte_flow_error_set
1903                                         (error, ENOTSUP,
1904                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1905                                          "inner tunnel port id"
1906                                          " item is not supported");
1907                         mask.port_id = flow_tcf_item_mask
1908                                 (items, &rte_flow_item_port_id_mask,
1909                                  &flow_tcf_mask_supported.port_id,
1910                                  &flow_tcf_mask_empty.port_id,
1911                                  sizeof(flow_tcf_mask_supported.port_id),
1912                                  error);
1913                         if (!mask.port_id)
1914                                 return -rte_errno;
1915                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1916                                 in_port_id_set = 1;
1917                                 break;
1918                         }
1919                         spec.port_id = items->spec;
1920                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1921                                 return rte_flow_error_set
1922                                         (error, ENOTSUP,
1923                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1924                                          mask.port_id,
1925                                          "no support for partial mask on"
1926                                          " \"id\" field");
1927                         if (!mask.port_id->id)
1928                                 i = 0;
1929                         else
1930                                 for (i = 0; ptoi[i].ifindex; ++i)
1931                                         if (ptoi[i].port_id == spec.port_id->id)
1932                                                 break;
1933                         if (!ptoi[i].ifindex)
1934                                 return rte_flow_error_set
1935                                         (error, ENODEV,
1936                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1937                                          spec.port_id,
1938                                          "missing data to convert port ID to"
1939                                          " ifindex");
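                        /*
                         * A tc flower rule attaches to a single network
                         * device, hence all PORT_ID items must resolve
                         * to the same ifindex.
                         */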
1940                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1941                                 return rte_flow_error_set
1942                                         (error, ENOTSUP,
1943                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1944                                          spec.port_id,
1945                                          "cannot match traffic for"
1946                                          " several port IDs through"
1947                                          " a single flow rule");
1948                         tcm_ifindex = ptoi[i].ifindex;
1949                         in_port_id_set = 1;
1950                         break;
1951                 case RTE_FLOW_ITEM_TYPE_ETH:
1952                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1953                                                           error);
1954                         if (ret < 0)
1955                                 return ret;
1956                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1957                                       MLX5_FLOW_LAYER_INNER_L2 :
1958                                       MLX5_FLOW_LAYER_OUTER_L2;
1959                         /* TODO:
1960                          * Redundant check due to a different supported mask.
1961                          * The same applies to the rest of the items.
1962                          */
1963                         mask.eth = flow_tcf_item_mask
1964                                 (items, &rte_flow_item_eth_mask,
1965                                  &flow_tcf_mask_supported.eth,
1966                                  &flow_tcf_mask_empty.eth,
1967                                  sizeof(flow_tcf_mask_supported.eth),
1968                                  error);
1969                         if (!mask.eth)
1970                                 return -rte_errno;
1971                         if (mask.eth->type && mask.eth->type !=
1972                             RTE_BE16(0xffff))
1973                                 return rte_flow_error_set
1974                                         (error, ENOTSUP,
1975                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1976                                          mask.eth,
1977                                          "no support for partial mask on"
1978                                          " \"type\" field");
1979                         assert(items->spec);
1980                         spec.eth = items->spec;
1981                         if (mask.eth->type &&
1982                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1983                             inner_etype != RTE_BE16(ETH_P_ALL) &&
1984                             inner_etype != spec.eth->type)
1985                                 return rte_flow_error_set
1986                                         (error, EINVAL,
1987                                          RTE_FLOW_ERROR_TYPE_ITEM,
1988                                          items,
1989                                          "inner eth_type conflict");
1990                         if (mask.eth->type &&
1991                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1992                             outer_etype != RTE_BE16(ETH_P_ALL) &&
1993                             outer_etype != spec.eth->type)
1994                                 return rte_flow_error_set
1995                                         (error, EINVAL,
1996                                          RTE_FLOW_ERROR_TYPE_ITEM,
1997                                          items,
1998                                          "outer eth_type conflict");
1999                         if (mask.eth->type) {
2000                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2001                                         inner_etype = spec.eth->type;
2002                                 else
2003                                         outer_etype = spec.eth->type;
2004                         }
2005                         break;
2006                 case RTE_FLOW_ITEM_TYPE_VLAN:
2007                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2008                                 return rte_flow_error_set
2009                                         (error, ENOTSUP,
2010                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2011                                          "inner tunnel VLAN"
2012                                          " is not supported");
2013                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2014                                                            error);
2015                         if (ret < 0)
2016                                 return ret;
2017                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2018                         mask.vlan = flow_tcf_item_mask
2019                                 (items, &rte_flow_item_vlan_mask,
2020                                  &flow_tcf_mask_supported.vlan,
2021                                  &flow_tcf_mask_empty.vlan,
2022                                  sizeof(flow_tcf_mask_supported.vlan),
2023                                  error);
2024                         if (!mask.vlan)
2025                                 return -rte_errno;
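                        /*
                         * The PCP (TCI bits 15-13) and VID (TCI bits
                         * 11-0) parts may each be matched only as a
                         * whole: a partially masked part is rejected
                         * below.
                         */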
2026                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2027                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2028                               RTE_BE16(0xe000)) ||
2029                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2030                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2031                               RTE_BE16(0x0fff)) ||
2032                             (mask.vlan->inner_type &&
2033                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2034                                 return rte_flow_error_set
2035                                         (error, ENOTSUP,
2036                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2037                                          mask.vlan,
2038                                          "no support for partial masks on"
2039                                          " \"tci\" (PCP and VID parts) and"
2040                                          " \"inner_type\" fields");
2041                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2042                             outer_etype != RTE_BE16(ETH_P_8021Q))
2043                                 return rte_flow_error_set
2044                                         (error, EINVAL,
2045                                          RTE_FLOW_ERROR_TYPE_ITEM,
2046                                          items,
2047                                          "outer eth_type conflict,"
2048                                          " must be 802.1Q");
2049                         outer_etype = RTE_BE16(ETH_P_8021Q);
2050                         assert(items->spec);
2051                         spec.vlan = items->spec;
2052                         if (mask.vlan->inner_type &&
2053                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2054                             vlan_etype != spec.vlan->inner_type)
2055                                 return rte_flow_error_set
2056                                         (error, EINVAL,
2057                                          RTE_FLOW_ERROR_TYPE_ITEM,
2058                                          items,
2059                                          "vlan eth_type conflict");
2060                         if (mask.vlan->inner_type)
2061                                 vlan_etype = spec.vlan->inner_type;
2062                         break;
2063                 case RTE_FLOW_ITEM_TYPE_IPV4:
2064                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2065                                                            error);
2066                         if (ret < 0)
2067                                 return ret;
2068                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2069                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2070                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2071                         mask.ipv4 = flow_tcf_item_mask
2072                                 (items, &rte_flow_item_ipv4_mask,
2073                                  &flow_tcf_mask_supported.ipv4,
2074                                  &flow_tcf_mask_empty.ipv4,
2075                                  sizeof(flow_tcf_mask_supported.ipv4),
2076                                  error);
2077                         if (!mask.ipv4)
2078                                 return -rte_errno;
2079                         if (mask.ipv4->hdr.next_proto_id &&
2080                             mask.ipv4->hdr.next_proto_id != 0xff)
2081                                 return rte_flow_error_set
2082                                         (error, ENOTSUP,
2083                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2084                                          mask.ipv4,
2085                                          "no support for partial mask on"
2086                                          " \"hdr.next_proto_id\" field");
2087                         else if (mask.ipv4->hdr.next_proto_id)
2088                                 next_protocol =
2089                                         ((const struct rte_flow_item_ipv4 *)
2090                                          (items->spec))->hdr.next_proto_id;
2091                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2092                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2093                                     inner_etype != RTE_BE16(ETH_P_IP))
2094                                         return rte_flow_error_set
2095                                                 (error, EINVAL,
2096                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2097                                                  items,
2098                                                  "inner eth_type conflict,"
2099                                                  " IPv4 is required");
2100                                 inner_etype = RTE_BE16(ETH_P_IP);
2101                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2102                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2103                                     vlan_etype != RTE_BE16(ETH_P_IP))
2104                                         return rte_flow_error_set
2105                                                 (error, EINVAL,
2106                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2107                                                  items,
2108                                                  "vlan eth_type conflict,"
2109                                                  " IPv4 is required");
2110                                 vlan_etype = RTE_BE16(ETH_P_IP);
2111                         } else {
2112                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2113                                     outer_etype != RTE_BE16(ETH_P_IP))
2114                                         return rte_flow_error_set
2115                                                 (error, EINVAL,
2116                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2117                                                  items,
2118                                                  "eth_type conflict,"
2119                                                  " IPv4 is required");
2120                                 outer_etype = RTE_BE16(ETH_P_IP);
2121                         }
2122                         break;
2123                 case RTE_FLOW_ITEM_TYPE_IPV6:
2124                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2125                                                            error);
2126                         if (ret < 0)
2127                                 return ret;
2128                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2129                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2130                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2131                         mask.ipv6 = flow_tcf_item_mask
2132                                 (items, &rte_flow_item_ipv6_mask,
2133                                  &flow_tcf_mask_supported.ipv6,
2134                                  &flow_tcf_mask_empty.ipv6,
2135                                  sizeof(flow_tcf_mask_supported.ipv6),
2136                                  error);
2137                         if (!mask.ipv6)
2138                                 return -rte_errno;
2139                         if (mask.ipv6->hdr.proto &&
2140                             mask.ipv6->hdr.proto != 0xff)
2141                                 return rte_flow_error_set
2142                                         (error, ENOTSUP,
2143                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2144                                          mask.ipv6,
2145                                          "no support for partial mask on"
2146                                          " \"hdr.proto\" field");
2147                         else if (mask.ipv6->hdr.proto)
2148                                 next_protocol =
2149                                         ((const struct rte_flow_item_ipv6 *)
2150                                          (items->spec))->hdr.proto;
2151                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2152                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2153                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2154                                         return rte_flow_error_set
2155                                                 (error, EINVAL,
2156                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2157                                                  items,
2158                                                  "inner eth_type conflict,"
2159                                                  " IPv6 is required");
2160                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2161                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2162                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2163                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2164                                         return rte_flow_error_set
2165                                                 (error, EINVAL,
2166                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2167                                                  items,
2168                                                  "vlan eth_type conflict,"
2169                                                  " IPv6 is required");
2170                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2171                         } else {
2172                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2173                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2174                                         return rte_flow_error_set
2175                                                 (error, EINVAL,
2176                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2177                                                  items,
2178                                                  "eth_type conflict,"
2179                                                  " IPv6 is required");
2180                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2181                         }
2182                         break;
2183                 case RTE_FLOW_ITEM_TYPE_UDP:
2184                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2185                                                           next_protocol, error);
2186                         if (ret < 0)
2187                                 return ret;
2188                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2189                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2190                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2191                         mask.udp = flow_tcf_item_mask
2192                                 (items, &rte_flow_item_udp_mask,
2193                                  &flow_tcf_mask_supported.udp,
2194                                  &flow_tcf_mask_empty.udp,
2195                                  sizeof(flow_tcf_mask_supported.udp),
2196                                  error);
2197                         if (!mask.udp)
2198                                 return -rte_errno;
2199                         /*
2200                          * Save the presumed outer UDP item for an extra check
2201                          * in case a tunnel item is found later in the list.
2202                          */
2203                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2204                                 outer_udp = items;
2205                         break;
2206                 case RTE_FLOW_ITEM_TYPE_TCP:
2207                         ret = mlx5_flow_validate_item_tcp
2208                                              (items, item_flags,
2209                                               next_protocol,
2210                                               &flow_tcf_mask_supported.tcp,
2211                                               error);
2212                         if (ret < 0)
2213                                 return ret;
2214                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2215                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2216                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2217                         mask.tcp = flow_tcf_item_mask
2218                                 (items, &rte_flow_item_tcp_mask,
2219                                  &flow_tcf_mask_supported.tcp,
2220                                  &flow_tcf_mask_empty.tcp,
2221                                  sizeof(flow_tcf_mask_supported.tcp),
2222                                  error);
2223                         if (!mask.tcp)
2224                                 return -rte_errno;
2225                         break;
2226                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2227                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2228                                 return rte_flow_error_set
2229                                         (error, ENOTSUP,
2230                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2231                                          "vxlan tunnel over vlan"
2232                                          " is not supported");
2233                         ret = mlx5_flow_validate_item_vxlan(items,
2234                                                             item_flags, error);
2235                         if (ret < 0)
2236                                 return ret;
2237                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2238                         mask.vxlan = flow_tcf_item_mask
2239                                 (items, &rte_flow_item_vxlan_mask,
2240                                  &flow_tcf_mask_supported.vxlan,
2241                                  &flow_tcf_mask_empty.vxlan,
2242                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2243                         if (!mask.vxlan)
2244                                 return -rte_errno;
2245                         if (mask.vxlan->vni[0] != 0xff ||
2246                             mask.vxlan->vni[1] != 0xff ||
2247                             mask.vxlan->vni[2] != 0xff)
2248                                 return rte_flow_error_set
2249                                         (error, ENOTSUP,
2250                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2251                                          mask.vxlan,
2252                                          "no support for partial or "
2253                                          "empty mask on \"vxlan.vni\" field");
2254                         /*
2255                          * The VNI item implies a VXLAN tunnel: at least the
2256                          * outer destination UDP port must be specified without
2257                          * wildcards so the kernel can select the virtual VXLAN
2258                          * device by port. An outer IPv4 or IPv6 item must also
2259                          * be present (wildcards or even a zero mask are
2260                          * allowed) to let the driver know the tunnel IP
2261                          * version and process UDP traffic correctly.
2262                          */
2263                         if (!(item_flags &
2264                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2265                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2266                                 return rte_flow_error_set
2267                                                  (error, EINVAL,
2268                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2269                                                   NULL,
2270                                                   "no outer IP pattern found"
2271                                                   " for vxlan tunnel");
2272                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2273                                 return rte_flow_error_set
2274                                                  (error, EINVAL,
2275                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2276                                                   NULL,
2277                                                   "no outer UDP pattern found"
2278                                                   " for vxlan tunnel");
2279                         /*
2280                          * All items preceding the tunnel item become outer
2281                          * ones and we should do extra validation for them
2282                          * due to tc limitations for tunnel outer parameters.
2283                          * Currently only the outer UDP item requires an extra
2284                          * check; use the saved pointer to avoid a rescan.
2285                          */
2286                         assert(outer_udp);
2287                         ret = flow_tcf_validate_vxlan_decap_udp
2288                                                 (outer_udp, error);
2289                         if (ret < 0)
2290                                 return ret;
2291                         /* Reset L4 protocol for inner parameters. */
2292                         next_protocol = 0xff;
2293                         break;
2294                 default:
2295                         return rte_flow_error_set(error, ENOTSUP,
2296                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2297                                                   items, "item not supported");
2298                 }
2299         }
2300         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2301             (action_flags & MLX5_FLOW_ACTION_DROP))
2302                 return rte_flow_error_set(error, ENOTSUP,
2303                                           RTE_FLOW_ERROR_TYPE_ACTION,
2304                                           actions,
2305                                           "set action is not compatible with "
2306                                           "drop action");
2307         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2308             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2309                 return rte_flow_error_set(error, ENOTSUP,
2310                                           RTE_FLOW_ERROR_TYPE_ACTION,
2311                                           actions,
2312                                           "set action must be followed by "
2313                                           "port_id action");
2314         if (action_flags &
2315            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2316                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2317                         return rte_flow_error_set(error, EINVAL,
2318                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2319                                                   actions,
2320                                                   "no ipv4 item found in"
2321                                                   " pattern");
2322         }
2323         if (action_flags &
2324            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2325                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2326                         return rte_flow_error_set(error, EINVAL,
2327                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2328                                                   actions,
2329                                                   "no ipv6 item found in"
2330                                                   " pattern");
2331         }
2332         if (action_flags &
2333            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2334                 if (!(item_flags &
2335                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2336                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2337                         return rte_flow_error_set(error, EINVAL,
2338                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2339                                                   actions,
2340                                                   "no TCP/UDP item found in"
2341                                                   " pattern");
2342         }
2343         /*
2344          * FW syndrome (0xA9C090):
2345          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2346          *     forward to the uplink.
2347          */
2348         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2349             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2350             ((struct priv *)port_id_dev->data->dev_private)->representor)
2351                 return rte_flow_error_set(error, ENOTSUP,
2352                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2353                                           "vlan push can only be applied"
2354                                           " when forwarding to uplink port");
2355         /*
2356          * FW syndrome (0x294609):
2357          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2358          *     are supported only while forwarding to vport.
2359          */
2360         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2361             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2362                 return rte_flow_error_set(error, ENOTSUP,
2363                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2364                                           "vlan actions are supported"
2365                                           " only with port_id action");
2366         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2367             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2368                 return rte_flow_error_set(error, ENOTSUP,
2369                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2370                                           "vxlan actions are supported"
2371                                           " only with port_id action");
2372         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2373                 return rte_flow_error_set(error, EINVAL,
2374                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2375                                           "no fate action is found");
2376         if (action_flags &
2377            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2378                 if (!(item_flags &
2379                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2380                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2381                         return rte_flow_error_set(error, EINVAL,
2382                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2383                                                   actions,
2384                                                   "no IP found in pattern");
2385         }
2386         if (action_flags &
2387             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2388                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2389                         return rte_flow_error_set(error, ENOTSUP,
2390                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2391                                                   actions,
2392                                                   "no ethernet found in"
2393                                                   " pattern");
2394         }
2395         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2396             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2397                 return rte_flow_error_set(error, EINVAL,
2398                                           RTE_FLOW_ERROR_TYPE_ACTION,
2399                                           NULL,
2400                                           "no VNI pattern found"
2401                                           " for vxlan decap action");
2402         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2403             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2404                 return rte_flow_error_set(error, EINVAL,
2405                                           RTE_FLOW_ERROR_TYPE_ACTION,
2406                                           NULL,
2407                                           "vxlan encap not supported"
2408                                           " for tunneled traffic");
2409         return 0;
2410 }
2411
2412 /**
2413  * Calculate maximum size of memory for flow items of Linux TC flower.
2414  *
2415  * @param[in] attr
2416  *   Pointer to the flow attributes.
2417  * @param[in] items
2418  *   Pointer to the list of items.
2419  * @param[out] action_flags
2420  *   Pointer to the detected actions.
2421  *
2422  * @return
2423  *   Maximum size of memory for items.
2424  */
2425 static int
2426 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2427                         const struct rte_flow_item items[],
2428                         uint64_t *action_flags)
2429 {
2430         int size = 0;
2431
2432         size += SZ_NLATTR_STRZ_OF("flower") +
2433                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2434                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2435                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2436         if (attr->group > 0)
2437                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2438         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2439                 switch (items->type) {
2440                 case RTE_FLOW_ITEM_TYPE_VOID:
2441                         break;
2442                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2443                         break;
2444                 case RTE_FLOW_ITEM_TYPE_ETH:
2445                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2446                                 /* dst/src MAC addr and mask. */
2447                         break;
2448                 case RTE_FLOW_ITEM_TYPE_VLAN:
2449                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2450                                 /* VLAN Ether type. */
2451                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2452                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2453                         break;
2454                 case RTE_FLOW_ITEM_TYPE_IPV4:
2455                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2456                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2457                                 /* dst/src IP addr and mask. */
2458                         break;
2459                 case RTE_FLOW_ITEM_TYPE_IPV6:
2460                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2461                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2462                                 /* dst/src IP addr and mask. */
2463                         break;
2464                 case RTE_FLOW_ITEM_TYPE_UDP:
2465                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2466                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2467                                 /* dst/src port and mask. */
2468                         break;
2469                 case RTE_FLOW_ITEM_TYPE_TCP:
2470                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2471                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2472                                 /* dst/src port and mask. */
2473                         break;
2474                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2475                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2476                         /*
2477                          * There might be no VXLAN decap action in the action
2478                          * list, nonetheless the VXLAN tunnel flow requires
2479                          * the decap structure to be correctly applied to the
2480                          * VXLAN device, so set the flag to create the structure.
2481                          * The translation routine will not put the decap action
2482                          * in the Netlink message if there is no actual action
2483                          * in the list.
2484                          */
2485                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2486                         break;
2487                 default:
2488                         DRV_LOG(WARNING,
2489                                 "unsupported item %p type %d,"
2490                                 " items must be validated before flow creation",
2491                                 (const void *)items, items->type);
2492                         break;
2493                 }
2494         }
2495         return size;
2496 }
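
/*
 * Illustrative sketch (not part of the driver): exercising the size
 * estimator above for a simple ETH + IPV4 pattern. The MLX5_TCF_EXAMPLES
 * guard is hypothetical and never defined, so this block is not compiled.
 */
#ifdef MLX5_TCF_EXAMPLES
static int
flow_tcf_items_size_example(void)
{
	const struct rte_flow_attr attr = { .group = 1, .ingress = 1 };
	const struct rte_flow_item items[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	uint64_t flags = 0;

	/*
	 * Upper bound: base flower attributes, TCA_CHAIN (group > 0),
	 * 4 MAC address attributes, plus IP proto and 4 address
	 * attributes for IPv4.
	 */
	return flow_tcf_get_items_size(&attr, items, &flags);
}
#endif /* MLX5_TCF_EXAMPLES */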
2497
2498 /**
2499  * Calculate the size of memory needed to store the VXLAN encapsulation
2500  * related items in the Netlink message buffer. The item list is
2501  * specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action and
2502  * should be validated beforehand.
2503  *
2504  * @param[in] action
2505  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2506  *   List of pattern items to scan data from.
2507  *
2508  * @return
2509  *   The size of the part of the Netlink message buffer needed to store
2510  *   the VXLAN encapsulation item attributes.
2511  */
2512 static int
2513 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2514 {
2515         const struct rte_flow_item *items;
2516         int size = 0;
2517
2518         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2519         assert(action->conf);
2520
2521         items = ((const struct rte_flow_action_vxlan_encap *)
2522                                         action->conf)->definition;
2523         assert(items);
2524         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2525                 switch (items->type) {
2526                 case RTE_FLOW_ITEM_TYPE_VOID:
2527                         break;
2528                 case RTE_FLOW_ITEM_TYPE_ETH:
2529                         /* This item does not require message buffer. */
2530                         break;
2531                 case RTE_FLOW_ITEM_TYPE_IPV4:
2532                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2533                         break;
2534                 case RTE_FLOW_ITEM_TYPE_IPV6:
2535                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2536                         break;
2537                 case RTE_FLOW_ITEM_TYPE_UDP: {
2538                         const struct rte_flow_item_udp *udp = items->mask;
2539
2540                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2541                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2542                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2543                         break;
2544                 }
2545                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2546                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2547                         break;
2548                 default:
2549                         assert(false);
2550                         DRV_LOG(WARNING,
2551                                 "unsupported item %p type %d,"
2552                                 " items must be validated"
2553                                 " before flow creation",
2554                                 (const void *)items, items->type);
2555                         return 0;
2556                 }
2557         }
2558         return size;
2559 }
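
/*
 * Illustrative sketch (not part of the driver): a worked example of the
 * encapsulation size estimation above. The MLX5_TCF_EXAMPLES guard is
 * hypothetical and never defined, so this block is not compiled.
 */
#ifdef MLX5_TCF_EXAMPLES
static int
flow_tcf_vxlan_encap_size_example(void)
{
	/* Fully masked UDP source port: both port attributes counted. */
	static const struct rte_flow_item_udp udp_mask = {
		.hdr = { .src_port = RTE_BE16(0xffff) },
	};
	const struct rte_flow_item def[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_UDP, .mask = &udp_mask },
		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	const struct rte_flow_action_vxlan_encap conf = { .definition = def };
	const struct rte_flow_action action = {
		.type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
		.conf = &conf,
	};

	/*
	 * Expected: 2 * SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) for addresses,
	 * 2 * SZ_NLATTR_TYPE_OF(uint16_t) for the UDP ports and
	 * SZ_NLATTR_TYPE_OF(uint32_t) for the VNI.
	 */
	return flow_tcf_vxlan_encap_size(&action);
}
#endif /* MLX5_TCF_EXAMPLES */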
2560
2561 /**
2562  * Calculate maximum size of memory for flow actions of Linux TC flower and
2563  * extract specified actions.
2564  *
2565  * @param[in] actions
2566  *   Pointer to the list of actions.
2567  * @param[out] action_flags
2568  *   Pointer to the detected actions.
2569  *
2570  * @return
2571  *   Maximum size of memory for actions.
2572  */
2573 static int
2574 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2575                               uint64_t *action_flags)
2576 {
2577         int size = 0;
2578         uint64_t flags = 0;
2579
2580         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2581         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2582                 switch (actions->type) {
2583                 case RTE_FLOW_ACTION_TYPE_VOID:
2584                         break;
2585                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2586                         size += SZ_NLATTR_NEST + /* na_act_index. */
2587                                 SZ_NLATTR_STRZ_OF("mirred") +
2588                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2589                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2590                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2591                         break;
2592                 case RTE_FLOW_ACTION_TYPE_JUMP:
2593                         size += SZ_NLATTR_NEST + /* na_act_index. */
2594                                 SZ_NLATTR_STRZ_OF("gact") +
2595                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2596                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2597                         flags |= MLX5_FLOW_ACTION_JUMP;
2598                         break;
2599                 case RTE_FLOW_ACTION_TYPE_DROP:
2600                         size += SZ_NLATTR_NEST + /* na_act_index. */
2601                                 SZ_NLATTR_STRZ_OF("gact") +
2602                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2603                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2604                         flags |= MLX5_FLOW_ACTION_DROP;
2605                         break;
2606                 case RTE_FLOW_ACTION_TYPE_COUNT:
2607                         break;
2608                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2609                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2610                         goto action_of_vlan;
2611                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2612                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2613                         goto action_of_vlan;
2614                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2615                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2616                         goto action_of_vlan;
2617                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2618                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2619                         goto action_of_vlan;
2620 action_of_vlan:
2621                         size += SZ_NLATTR_NEST + /* na_act_index. */
2622                                 SZ_NLATTR_STRZ_OF("vlan") +
2623                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2624                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2625                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2626                                 /* VLAN protocol. */
2627                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2628                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2629                         break;
2630                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2631                         size += SZ_NLATTR_NEST + /* na_act_index. */
2632                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2633                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2634                                 SZ_NLATTR_TYPE_OF(uint8_t);
2635                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2636                         size += flow_tcf_vxlan_encap_size(actions) +
2637                                 RTE_ALIGN_CEIL /* preceding encap params. */
2638                                 (sizeof(struct flow_tcf_vxlan_encap),
2639                                 MNL_ALIGNTO);
2640                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2641                         break;
2642                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2643                         size += SZ_NLATTR_NEST + /* na_act_index. */
2644                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2645                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2646                                 SZ_NLATTR_TYPE_OF(uint8_t);
2647                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2648                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2649                                 (sizeof(struct flow_tcf_vxlan_decap),
2650                                 MNL_ALIGNTO);
2651                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2652                         break;
2653                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2654                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2655                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2656                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2657                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2658                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2659                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2660                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2661                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2662                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2663                         size += flow_tcf_get_pedit_actions_size(&actions,
2664                                                                 &flags);
2665                         break;
2666                 default:
2667                         DRV_LOG(WARNING,
2668                                 "unsupported action %p type %d,"
2669                                 " actions must be validated before flow creation",
2670                                 (const void *)actions, actions->type);
2671                         break;
2672                 }
2673         }
2674         *action_flags = flags;
2675         return size;
2676 }
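
/*
 * Illustrative sketch (not part of the driver): exercising the action
 * size estimator above with a VLAN pop forwarded to a port. The
 * MLX5_TCF_EXAMPLES guard is hypothetical and never defined.
 */
#ifdef MLX5_TCF_EXAMPLES
static int
flow_tcf_actions_size_example(void)
{
	static const struct rte_flow_action_port_id port_id = { .id = 1 };
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_OF_POP_VLAN },
		{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &port_id },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	uint64_t flags = 0;
	int size = flow_tcf_get_actions_and_size(actions, &flags);

	/*
	 * flags now holds MLX5_FLOW_ACTION_OF_POP_VLAN |
	 * MLX5_FLOW_ACTION_PORT_ID; size bounds one "vlan" and one
	 * "mirred" action nest under TCA_FLOWER_ACT.
	 */
	return size;
}
#endif /* MLX5_TCF_EXAMPLES */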
2677
2678 /**
2679  * Brand rtnetlink buffer with unique handle.
2680  *
2681  * This handle should be unique for a given network interface to avoid
2682  * collisions.
2683  *
2684  * @param nlh
2685  *   Pointer to Netlink message.
2686  * @param handle
2687  *   Unique 32-bit handle to use.
2688  */
2689 static void
2690 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2691 {
2692         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2693
2694         tcm->tcm_handle = handle;
2695         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2696                 (void *)nlh, handle);
2697 }
2698
2699 /**
2700  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2701  * memory required, allocates the memory, initializes Netlink message headers
2702  * and sets a unique TC message handle.
2703  *
2704  * @param[in] attr
2705  *   Pointer to the flow attributes.
2706  * @param[in] items
2707  *   Pointer to the list of items.
2708  * @param[in] actions
2709  *   Pointer to the list of actions.
2710  * @param[out] error
2711  *   Pointer to the error structure.
2712  *
2713  * @return
2714  *   Pointer to mlx5_flow object on success,
2715  *   otherwise NULL and rte_errno is set.
2716  */
2717 static struct mlx5_flow *
2718 flow_tcf_prepare(const struct rte_flow_attr *attr,
2719                  const struct rte_flow_item items[],
2720                  const struct rte_flow_action actions[],
2721                  struct rte_flow_error *error)
2722 {
2723         size_t size = RTE_ALIGN_CEIL
2724                         (sizeof(struct mlx5_flow),
2725                          alignof(struct flow_tcf_tunnel_hdr)) +
2726                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2727                       MNL_ALIGN(sizeof(struct tcmsg));
2728         struct mlx5_flow *dev_flow;
2729         uint64_t action_flags = 0;
2730         struct nlmsghdr *nlh;
2731         struct tcmsg *tcm;
2732         uint8_t *sp, *tun = NULL;
2733
2734         size += flow_tcf_get_items_size(attr, items, &action_flags);
2735         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2736         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2737         if (!dev_flow) {
2738                 rte_flow_error_set(error, ENOMEM,
2739                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2740                                    "not enough memory to create E-Switch flow");
2741                 return NULL;
2742         }
2743         sp = (uint8_t *)(dev_flow + 1);
2744         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2745                 sp = RTE_PTR_ALIGN
2746                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2747                 tun = sp;
2748                 sp += RTE_ALIGN_CEIL
2749                         (sizeof(struct flow_tcf_vxlan_encap),
2750                         MNL_ALIGNTO);
2751 #ifndef NDEBUG
2752                 size -= RTE_ALIGN_CEIL
2753                         (sizeof(struct flow_tcf_vxlan_encap),
2754                         MNL_ALIGNTO);
2755 #endif
2756         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2757                 sp = RTE_PTR_ALIGN
2758                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2759                 tun = sp;
2760                 sp += RTE_ALIGN_CEIL
2761                         (sizeof(struct flow_tcf_vxlan_decap),
2762                         MNL_ALIGNTO);
2763 #ifndef NDEBUG
2764                 size -= RTE_ALIGN_CEIL
2765                         (sizeof(struct flow_tcf_vxlan_decap),
2766                         MNL_ALIGNTO);
2767 #endif
2768         } else {
2769                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2770         }
2771         nlh = mnl_nlmsg_put_header(sp);
2772         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2773         *dev_flow = (struct mlx5_flow){
2774                 .tcf = (struct mlx5_flow_tcf){
2775 #ifndef NDEBUG
2776                         .nlsize = size - RTE_ALIGN_CEIL
2777                                 (sizeof(struct mlx5_flow),
2778                                  alignof(struct flow_tcf_tunnel_hdr)),
2779 #endif
2780                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2781                         .nlh = nlh,
2782                         .tcm = tcm,
2783                 },
2784         };
2785         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2786                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2787         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2788                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2789         /*
2790          * Generate a reasonably unique handle based on the address of the
2791          * target buffer.
2792          *
2793          * This is straightforward on 32-bit systems where the flow pointer can
2794          * be used directly. Otherwise, the pointer is shifted right by the
2795          * base-2 logarithm of the previous power of two of the pointed
2796          * buffer size and its least significant 32 bits are used.
2797          */
2798         if (sizeof(dev_flow) <= 4)
2799                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2800         else
2801                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2802                                        rte_log2_u32(rte_align32prevpow2(size)));
2803         return dev_flow;
2804 }
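
/*
 * Illustrative sketch (not part of the driver): the branding arithmetic
 * used above, isolated for clarity. With size = 0x300 the previous power
 * of two is 0x200, so a 64-bit pointer is shifted right by
 * log2(0x200) = 9 bits before truncation to the 32-bit TC handle; two
 * non-overlapping buffers of that size then cannot map to the same
 * shifted value. The MLX5_TCF_EXAMPLES guard is hypothetical and never
 * defined.
 */
#ifdef MLX5_TCF_EXAMPLES
static uint32_t
flow_tcf_handle_example(const void *buf, uint32_t size)
{
	if (sizeof(buf) <= 4)
		return (uint32_t)(uintptr_t)buf;
	return (uint32_t)((uintptr_t)buf >>
			  rte_log2_u32(rte_align32prevpow2(size)));
}
#endif /* MLX5_TCF_EXAMPLES */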
2805
2806 /**
2807  * Make adjustments for supporting count actions.
2808  *
2809  * @param[in] dev
2810  *   Pointer to the Ethernet device structure.
2811  * @param[in] dev_flow
2812  *   Pointer to mlx5_flow.
2813  * @param[out] error
2814  *   Pointer to error structure.
2815  *
2816  * @return
2817  *   0 on success, otherwise a negative errno value is returned and rte_errno is set.
2818  */
2819 static int
2820 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2821                                   struct mlx5_flow *dev_flow,
2822                                   struct rte_flow_error *error)
2823 {
2824         struct rte_flow *flow = dev_flow->flow;
2825
2826         if (!flow->counter) {
2827                 flow->counter = flow_tcf_counter_new();
2828                 if (!flow->counter)
2829                         return rte_flow_error_set(error, rte_errno,
2830                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2831                                                   NULL,
2832                                                   "cannot get counter"
2833                                                   " context.");
2834         }
2835         return 0;
2836 }
2837
2838 /**
2839  * Convert VXLAN VNI to 32-bit integer.
2840  *
2841  * @param[in] vni
2842  *   VXLAN VNI in 24-bit wire format.
2843  *
2844  * @return
2845  *   VXLAN VNI as a 32-bit integer value in network endian.
2846  */
2847 static inline rte_be32_t
2848 vxlan_vni_as_be32(const uint8_t vni[3])
2849 {
2850         union {
2851                 uint8_t vni[4];
2852                 rte_be32_t dword;
2853         } ret = {
2854                 .vni = { 0, vni[0], vni[1], vni[2] },
2855         };
2856         return ret.dword;
2857 }
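
/*
 * Illustrative sketch (not part of the driver): for VNI 0x123456 stored
 * as the wire bytes { 0x12, 0x34, 0x56 }, the helper above yields the
 * network-endian dword with bytes 00 12 34 56, i.e. RTE_BE32(0x123456).
 * The MLX5_TCF_EXAMPLES guard is hypothetical and never defined.
 */
#ifdef MLX5_TCF_EXAMPLES
static void
vxlan_vni_example(void)
{
	static const uint8_t vni[3] = { 0x12, 0x34, 0x56 };

	assert(vxlan_vni_as_be32(vni) == RTE_BE32(0x123456));
}
#endif /* MLX5_TCF_EXAMPLES */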
2858
2859 /**
2860  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2861  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2862  * in the encapsulation parameters structure. The item must be prevalidated;
2863  * no validation checks are performed by this function.
2864  *
2865  * @param[in] spec
2866  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2867  * @param[in] mask
2868  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2869  * @param[out] encap
2870  *   Structure to fill the gathered MAC address data.
2871  */
2872 static void
2873 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2874                                const struct rte_flow_item_eth *mask,
2875                                struct flow_tcf_vxlan_encap *encap)
2876 {
2877         /* Item must be validated before. No redundant checks. */
2878         assert(spec);
2879         if (!mask || !memcmp(&mask->dst,
2880                              &rte_flow_item_eth_mask.dst,
2881                              sizeof(rte_flow_item_eth_mask.dst))) {
2882                 /*
2883                  * Ethernet addresses are not supported by
2884                  * tc as tunnel_key parameters. The destination
2885                  * address is needed to form the encap packet
2886                  * header and is retrieved by the kernel from
2887                  * implicit sources (ARP table, etc.);
2888                  * address masks are not supported at all.
2889                  */
2890                 encap->eth.dst = spec->dst;
2891                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2892         }
2893         if (!mask || !memcmp(&mask->src,
2894                              &rte_flow_item_eth_mask.src,
2895                              sizeof(rte_flow_item_eth_mask.src))) {
2896                 /*
2897                  * Ethernet addresses are not supported by
2898                  * tc as tunnel_key parameters. The source Ethernet
2899                  * address is ignored anyway.
2900                  */
2901                 encap->eth.src = spec->src;
2902                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2903         }
2904 }
2905
2906 /**
2907  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2908  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2909  * in the encapsulation parameters structure. The item must be prevalidated;
2910  * no validation checks are performed by this function.
2911  *
2912  * @param[in] spec
2913  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2914  * @param[out] encap
2915  *   Structure to fill the gathered IPV4 address data.
2916  */
2917 static void
2918 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2919                                 struct flow_tcf_vxlan_encap *encap)
2920 {
2921         /* Item must be validated before. No redundant checks. */
2922         assert(spec);
2923         encap->ipv4.dst = spec->hdr.dst_addr;
2924         encap->ipv4.src = spec->hdr.src_addr;
2925         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2926                        FLOW_TCF_ENCAP_IPV4_DST;
2927 }
2928
2929 /**
2930  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2931  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2932  * in the encapsulation parameters structure. The item must be prevalidated;
2933  * no validation checks are performed by this function.
2934  *
2935  * @param[in] spec
2936  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2937  * @param[out] encap
2938  *   Structure to fill the gathered IPV6 address data.
2939  */
2940 static void
2941 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2942                                 struct flow_tcf_vxlan_encap *encap)
2943 {
2944         /* Item must be validated before. No redundant checks. */
2945         assert(spec);
2946         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2947         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2948         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2949                        FLOW_TCF_ENCAP_IPV6_DST;
2950 }
2951
2952 /**
2953  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2954  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2955  * in the encapsulation parameters structure. The item must be prevalidated;
2956  * no validation checks are performed by this function.
2957  *
2958  * @param[in] spec
2959  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2960  * @param[in] mask
2961  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2962  * @param[out] encap
2963  *   Structure to fill the gathered UDP port data.
2964  */
2965 static void
2966 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2967                                const struct rte_flow_item_udp *mask,
2968                                struct flow_tcf_vxlan_encap *encap)
2969 {
2970         assert(spec);
2971         encap->udp.dst = spec->hdr.dst_port;
2972         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2973         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2974                 encap->udp.src = spec->hdr.src_port;
2975                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2976         }
2977 }
2978
2979 /**
2980  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2981  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2982  * in the encapsulation parameters structure. The item must be prevalidated;
2983  * no validation checks are performed by this function.
2984  *
2985  * @param[in] spec
2986  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2987  * @param[out] encap
2988  *   Structure to fill the gathered VNI address data.
2989  */
2990 static void
2991 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2992                                struct flow_tcf_vxlan_encap *encap)
2993 {
2994         /* Item must be validated before. No redundant checks. */
2995         assert(spec);
2996         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2997         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2998 }
2999
3000 /**
3001  * Populate consolidated encapsulation object from list of pattern items.
3002  *
3003  * Helper function to process configuration of action such as
3004  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3005  * validated; there is no way to return a meaningful error.
3006  *
3007  * @param[in] action
3008  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3009  *   List of pattern items to gather data from.
3010  * @param[out] encap
3011  *   Structure to fill with gathered data.
3012  */
3013 static void
3014 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3015                            struct flow_tcf_vxlan_encap *encap)
3016 {
3017         union {
3018                 const struct rte_flow_item_eth *eth;
3019                 const struct rte_flow_item_ipv4 *ipv4;
3020                 const struct rte_flow_item_ipv6 *ipv6;
3021                 const struct rte_flow_item_udp *udp;
3022                 const struct rte_flow_item_vxlan *vxlan;
3023         } spec, mask;
3024         const struct rte_flow_item *items;
3025
3026         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3027         assert(action->conf);
3028
3029         items = ((const struct rte_flow_action_vxlan_encap *)
3030                                         action->conf)->definition;
3031         assert(items);
3032         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3033                 switch (items->type) {
3034                 case RTE_FLOW_ITEM_TYPE_VOID:
3035                         break;
3036                 case RTE_FLOW_ITEM_TYPE_ETH:
3037                         mask.eth = items->mask;
3038                         spec.eth = items->spec;
3039                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3040                                                        encap);
3041                         break;
3042                 case RTE_FLOW_ITEM_TYPE_IPV4:
3043                         spec.ipv4 = items->spec;
3044                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3045                         break;
3046                 case RTE_FLOW_ITEM_TYPE_IPV6:
3047                         spec.ipv6 = items->spec;
3048                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3049                         break;
3050                 case RTE_FLOW_ITEM_TYPE_UDP:
3051                         mask.udp = items->mask;
3052                         spec.udp = items->spec;
3053                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3054                                                        encap);
3055                         break;
3056                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3057                         spec.vxlan = items->spec;
3058                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3059                         break;
3060                 default:
3061                         assert(false);
3062                         DRV_LOG(WARNING,
3063                                 "unsupported item %p type %d,"
3064                                 " items must be validated"
3065                                 " before flow creation",
3066                                 (const void *)items, items->type);
3067                         encap->mask = 0;
3068                         return;
3069                 }
3070         }
3071 }
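
/*
 * Illustrative sketch (not part of the driver): a minimal encapsulation
 * definition accepted by the parser above, with arbitrary example
 * addresses and the IANA VXLAN port. The MLX5_TCF_EXAMPLES guard is
 * hypothetical and never defined, so this block is not compiled.
 */
#ifdef MLX5_TCF_EXAMPLES
static void
flow_tcf_vxlan_encap_parse_example(void)
{
	static const struct rte_flow_item_eth eth = {
		.dst.addr_bytes = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 },
	};
	static const struct rte_flow_item_ipv4 ipv4 = {
		.hdr = {
			.src_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
			.dst_addr = RTE_BE32(0x0a000002), /* 10.0.0.2 */
		},
	};
	static const struct rte_flow_item_udp udp = {
		.hdr = { .dst_port = RTE_BE16(4789) },
	};
	static const struct rte_flow_item_vxlan vxlan = {
		.vni = { 0x12, 0x34, 0x56 },
	};
	const struct rte_flow_item def[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
		{ .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	const struct rte_flow_action_vxlan_encap conf = { .definition = def };
	const struct rte_flow_action action = {
		.type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
		.conf = &conf,
	};
	struct flow_tcf_vxlan_encap encap = { .mask = 0 };

	flow_tcf_vxlan_encap_parse(&action, &encap);
	/* encap.mask now has the ETH, IPV4, UDP and VNI bits set. */
}
#endif /* MLX5_TCF_EXAMPLES */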
3072
3073 /**
3074  * Translate flow for Linux TC flower and construct Netlink message.
3075  *
3076  * @param[in] dev
3077  *   Pointer to the Ethernet device structure.
3078  * @param[in, out] dev_flow
3079  *   Pointer to the mlx5_flow sub flow.
3080  * @param[in] attr
3081  *   Pointer to the flow attributes.
3082  * @param[in] items
3083  *   Pointer to the list of items.
3084  * @param[in] actions
3085  *   Pointer to the list of actions.
3086  * @param[out] error
3087  *   Pointer to the error structure.
3088  *
3089  * @return
3090  *   0 on success, a negative errno value otherwise and rte_errno is set.
3091  */
3092 static int
3093 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3094                    const struct rte_flow_attr *attr,
3095                    const struct rte_flow_item items[],
3096                    const struct rte_flow_action actions[],
3097                    struct rte_flow_error *error)
3098 {
3099         union {
3100                 const struct rte_flow_item_port_id *port_id;
3101                 const struct rte_flow_item_eth *eth;
3102                 const struct rte_flow_item_vlan *vlan;
3103                 const struct rte_flow_item_ipv4 *ipv4;
3104                 const struct rte_flow_item_ipv6 *ipv6;
3105                 const struct rte_flow_item_tcp *tcp;
3106                 const struct rte_flow_item_udp *udp;
3107                 const struct rte_flow_item_vxlan *vxlan;
3108         } spec, mask;
3109         union {
3110                 const struct rte_flow_action_port_id *port_id;
3111                 const struct rte_flow_action_jump *jump;
3112                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3113                 const struct rte_flow_action_of_set_vlan_vid *
3114                         of_set_vlan_vid;
3115                 const struct rte_flow_action_of_set_vlan_pcp *
3116                         of_set_vlan_pcp;
3117         } conf;
3118         union {
3119                 struct flow_tcf_tunnel_hdr *hdr;
3120                 struct flow_tcf_vxlan_decap *vxlan;
3121         } decap = {
3122                 .hdr = NULL,
3123         };
3124         union {
3125                 struct flow_tcf_tunnel_hdr *hdr;
3126                 struct flow_tcf_vxlan_encap *vxlan;
3127         } encap = {
3128                 .hdr = NULL,
3129         };
3130         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3131         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3132         struct tcmsg *tcm = dev_flow->tcf.tcm;
3133         uint32_t na_act_index_cur;
3134         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3135         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3136         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3137         bool ip_proto_set = 0;
3138         bool tunnel_outer = 0;
3139         struct nlattr *na_flower;
3140         struct nlattr *na_flower_act;
3141         struct nlattr *na_vlan_id = NULL;
3142         struct nlattr *na_vlan_priority = NULL;
3143         uint64_t item_flags = 0;
3144         int ret;
3145
3146         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3147                                                 PTOI_TABLE_SZ_MAX(dev)));
3148         if (dev_flow->tcf.tunnel) {
3149                 switch (dev_flow->tcf.tunnel->type) {
3150                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3151                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3152                         tunnel_outer = 1;
3153                         break;
3154                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3155                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3156                         break;
3157                 /* New tunnel actions can be added here. */
3158                 default:
3159                         assert(false);
3160                         break;
3161                 }
3162         }
3163         nlh = dev_flow->tcf.nlh;
3164         tcm = dev_flow->tcf.tcm;
3165         /* Prepare API must have been called beforehand. */
3166         assert(nlh != NULL && tcm != NULL);
3167         tcm->tcm_family = AF_UNSPEC;
3168         tcm->tcm_ifindex = ptoi[0].ifindex;
3169         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3170         /*
3171          * Priority cannot be zero to prevent the kernel from picking one
3172          * automatically.
3173          */
3174         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3175         if (attr->group > 0)
3176                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3177         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3178         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3179         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3180                 unsigned int i;
3181
3182                 switch (items->type) {
3183                 case RTE_FLOW_ITEM_TYPE_VOID:
3184                         break;
3185                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3186                         mask.port_id = flow_tcf_item_mask
3187                                 (items, &rte_flow_item_port_id_mask,
3188                                  &flow_tcf_mask_supported.port_id,
3189                                  &flow_tcf_mask_empty.port_id,
3190                                  sizeof(flow_tcf_mask_supported.port_id),
3191                                  error);
3192                         assert(mask.port_id);
3193                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3194                                 break;
3195                         spec.port_id = items->spec;
3196                         if (!mask.port_id->id)
3197                                 i = 0;
3198                         else
3199                                 for (i = 0; ptoi[i].ifindex; ++i)
3200                                         if (ptoi[i].port_id == spec.port_id->id)
3201                                                 break;
3202                         assert(ptoi[i].ifindex);
3203                         tcm->tcm_ifindex = ptoi[i].ifindex;
3204                         break;
3205                 case RTE_FLOW_ITEM_TYPE_ETH:
3206                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3207                                       MLX5_FLOW_LAYER_INNER_L2 :
3208                                       MLX5_FLOW_LAYER_OUTER_L2;
3209                         mask.eth = flow_tcf_item_mask
3210                                 (items, &rte_flow_item_eth_mask,
3211                                  &flow_tcf_mask_supported.eth,
3212                                  &flow_tcf_mask_empty.eth,
3213                                  sizeof(flow_tcf_mask_supported.eth),
3214                                  error);
3215                         assert(mask.eth);
3216                         if (mask.eth == &flow_tcf_mask_empty.eth)
3217                                 break;
3218                         spec.eth = items->spec;
3219                         if (mask.eth->type) {
3220                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3221                                         inner_etype = spec.eth->type;
3222                                 else
3223                                         outer_etype = spec.eth->type;
3224                         }
3225                         if (tunnel_outer) {
3226                                 DRV_LOG(WARNING,
3227                                         "outer L2 addresses cannot be"
3228                                         " forced for the tunnel outer"
3229                                         " header, parameter is ignored");
3230                                 break;
3231                         }
3232                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3233                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3234                                              ETHER_ADDR_LEN,
3235                                              spec.eth->dst.addr_bytes);
3236                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3237                                              ETHER_ADDR_LEN,
3238                                              mask.eth->dst.addr_bytes);
3239                         }
3240                         if (!is_zero_ether_addr(&mask.eth->src)) {
3241                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3242                                              ETHER_ADDR_LEN,
3243                                              spec.eth->src.addr_bytes);
3244                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3245                                              ETHER_ADDR_LEN,
3246                                              mask.eth->src.addr_bytes);
3247                         }
3248                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3249                         break;
3250                 case RTE_FLOW_ITEM_TYPE_VLAN:
3251                         assert(!encap.hdr);
3252                         assert(!decap.hdr);
3253                         assert(!tunnel_outer);
3254                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3255                         mask.vlan = flow_tcf_item_mask
3256                                 (items, &rte_flow_item_vlan_mask,
3257                                  &flow_tcf_mask_supported.vlan,
3258                                  &flow_tcf_mask_empty.vlan,
3259                                  sizeof(flow_tcf_mask_supported.vlan),
3260                                  error);
3261                         assert(mask.vlan);
3262                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3263                                 break;
3264                         spec.vlan = items->spec;
3265                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3266                                outer_etype == RTE_BE16(ETH_P_8021Q));
3267                         outer_etype = RTE_BE16(ETH_P_8021Q);
3268                         if (mask.vlan->inner_type)
3269                                 vlan_etype = spec.vlan->inner_type;
3270                         if (mask.vlan->tci & RTE_BE16(0xe000))
3271                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3272                                                 (rte_be_to_cpu_16
3273                                                  (spec.vlan->tci) >> 13) & 0x7);
3274                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3275                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3276                                                  rte_be_to_cpu_16
3277                                                  (spec.vlan->tci &
3278                                                   RTE_BE16(0x0fff)));
3279                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3280                         break;
3281                 case RTE_FLOW_ITEM_TYPE_IPV4:
3282                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3283                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3284                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3285                         mask.ipv4 = flow_tcf_item_mask
3286                                 (items, &rte_flow_item_ipv4_mask,
3287                                  &flow_tcf_mask_supported.ipv4,
3288                                  &flow_tcf_mask_empty.ipv4,
3289                                  sizeof(flow_tcf_mask_supported.ipv4),
3290                                  error);
3291                         assert(mask.ipv4);
3292                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3293                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3294                                        inner_etype == RTE_BE16(ETH_P_IP));
3295                                 inner_etype = RTE_BE16(ETH_P_IP);
3296                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3297                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3298                                        vlan_etype == RTE_BE16(ETH_P_IP));
3299                                 vlan_etype = RTE_BE16(ETH_P_IP);
3300                         } else {
3301                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3302                                        outer_etype == RTE_BE16(ETH_P_IP));
3303                                 outer_etype = RTE_BE16(ETH_P_IP);
3304                         }
3305                         spec.ipv4 = items->spec;
3306                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3307                                 /*
3308                                  * No way to set IP protocol for outer tunnel
3309                                  * layers. Usually it is fixed, for example,
3310                                  * to UDP for VXLAN/GPE.
3311                                  */
3312                                 assert(spec.ipv4); /* Mask is not empty. */
3313                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3314                                                 spec.ipv4->hdr.next_proto_id);
3315                                 ip_proto_set = 1;
3316                         }
3317                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3318                              (!mask.ipv4->hdr.src_addr &&
3319                               !mask.ipv4->hdr.dst_addr)) {
3320                                 if (!tunnel_outer)
3321                                         break;
3322                                 /*
3323                                  * For tunnel outer we must set outer IP key
3324                                  * anyway, even if the specification/mask is
3325                                  * empty. There is no other way to tell the
3326                                  * kernel about the outer layer protocol.
3327                                  */
3328                                 mnl_attr_put_u32
3329                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3330                                          mask.ipv4->hdr.src_addr);
3331                                 mnl_attr_put_u32
3332                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3333                                          mask.ipv4->hdr.src_addr);
3334                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3335                                 break;
3336                         }
3337                         if (mask.ipv4->hdr.src_addr) {
3338                                 mnl_attr_put_u32
3339                                         (nlh, tunnel_outer ?
3340                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3341                                          TCA_FLOWER_KEY_IPV4_SRC,
3342                                          spec.ipv4->hdr.src_addr);
3343                                 mnl_attr_put_u32
3344                                         (nlh, tunnel_outer ?
3345                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3346                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3347                                          mask.ipv4->hdr.src_addr);
3348                         }
3349                         if (mask.ipv4->hdr.dst_addr) {
3350                                 mnl_attr_put_u32
3351                                         (nlh, tunnel_outer ?
3352                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3353                                          TCA_FLOWER_KEY_IPV4_DST,
3354                                          spec.ipv4->hdr.dst_addr);
3355                                 mnl_attr_put_u32
3356                                         (nlh, tunnel_outer ?
3357                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3358                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3359                                          mask.ipv4->hdr.dst_addr);
3360                         }
3361                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3362                         break;
3363                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3364                         bool ipv6_src, ipv6_dst;
3365
3366                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3367                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3368                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3369                         mask.ipv6 = flow_tcf_item_mask
3370                                 (items, &rte_flow_item_ipv6_mask,
3371                                  &flow_tcf_mask_supported.ipv6,
3372                                  &flow_tcf_mask_empty.ipv6,
3373                                  sizeof(flow_tcf_mask_supported.ipv6),
3374                                  error);
3375                         assert(mask.ipv6);
3376                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3377                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3378                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3379                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3380                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3381                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3382                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3383                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3384                         } else {
3385                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3386                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3387                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3388                         }
3389                         spec.ipv6 = items->spec;
3390                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3391                                 /*
3392                                  * No way to set IP protocol for outer tunnel
3393                                  * layers. Usually it is fixed, for example,
3394                                  * to UDP for VXLAN/GPE.
3395                                  */
3396                                 assert(spec.ipv6); /* Mask is not empty. */
3397                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3398                                                 spec.ipv6->hdr.proto);
3399                                 ip_proto_set = 1;
3400                         }
3401                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3402                                                 (mask.ipv6->hdr.dst_addr);
3403                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3404                                                 (mask.ipv6->hdr.src_addr);
3405                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3406                              (!ipv6_dst && !ipv6_src)) {
3407                                 if (!tunnel_outer)
3408                                         break;
3409                                 /*
3410                                  * For tunnel outer we must set outer IP key
3411                                  * anyway, even if the specification/mask is
3412                                  * empty. There is no other way to tell the
3413                                  * kernel about the outer layer protocol.
3414                                  */
3415                                 mnl_attr_put(nlh,
3416                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3417                                              IPV6_ADDR_LEN,
3418                                              mask.ipv6->hdr.src_addr);
3419                                 mnl_attr_put(nlh,
3420                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3421                                              IPV6_ADDR_LEN,
3422                                              mask.ipv6->hdr.src_addr);
3423                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3424                                 break;
3425                         }
3426                         if (ipv6_src) {
3427                                 mnl_attr_put(nlh, tunnel_outer ?
3428                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3429                                              TCA_FLOWER_KEY_IPV6_SRC,
3430                                              IPV6_ADDR_LEN,
3431                                              spec.ipv6->hdr.src_addr);
3432                                 mnl_attr_put(nlh, tunnel_outer ?
3433                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3434                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3435                                              IPV6_ADDR_LEN,
3436                                              mask.ipv6->hdr.src_addr);
3437                         }
3438                         if (ipv6_dst) {
3439                                 mnl_attr_put(nlh, tunnel_outer ?
3440                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3441                                              TCA_FLOWER_KEY_IPV6_DST,
3442                                              IPV6_ADDR_LEN,
3443                                              spec.ipv6->hdr.dst_addr);
3444                                 mnl_attr_put(nlh, tunnel_outer ?
3445                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3446                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3447                                              IPV6_ADDR_LEN,
3448                                              mask.ipv6->hdr.dst_addr);
3449                         }
3450                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3451                         break;
3452                 }
3453                 case RTE_FLOW_ITEM_TYPE_UDP:
3454                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3455                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3456                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3457                         mask.udp = flow_tcf_item_mask
3458                                 (items, &rte_flow_item_udp_mask,
3459                                  &flow_tcf_mask_supported.udp,
3460                                  &flow_tcf_mask_empty.udp,
3461                                  sizeof(flow_tcf_mask_supported.udp),
3462                                  error);
3463                         assert(mask.udp);
3464                         spec.udp = items->spec;
3465                         if (!tunnel_outer) {
3466                                 if (!ip_proto_set)
3467                                         mnl_attr_put_u8
3468                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3469                                                 IPPROTO_UDP);
3470                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3471                                         break;
3472                         } else {
3473                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3474                                 decap.vxlan->udp_port =
3475                                         rte_be_to_cpu_16
3476                                                 (spec.udp->hdr.dst_port);
3477                         }
3478                         if (mask.udp->hdr.src_port) {
3479                                 mnl_attr_put_u16
3480                                         (nlh, tunnel_outer ?
3481                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3482                                          TCA_FLOWER_KEY_UDP_SRC,
3483                                          spec.udp->hdr.src_port);
3484                                 mnl_attr_put_u16
3485                                         (nlh, tunnel_outer ?
3486                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3487                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3488                                          mask.udp->hdr.src_port);
3489                         }
3490                         if (mask.udp->hdr.dst_port) {
3491                                 mnl_attr_put_u16
3492                                         (nlh, tunnel_outer ?
3493                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3494                                          TCA_FLOWER_KEY_UDP_DST,
3495                                          spec.udp->hdr.dst_port);
3496                                 mnl_attr_put_u16
3497                                         (nlh, tunnel_outer ?
3498                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3499                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3500                                          mask.udp->hdr.dst_port);
3501                         }
3502                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3503                         break;
3504                 case RTE_FLOW_ITEM_TYPE_TCP:
3505                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3506                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3507                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3508                         mask.tcp = flow_tcf_item_mask
3509                                 (items, &rte_flow_item_tcp_mask,
3510                                  &flow_tcf_mask_supported.tcp,
3511                                  &flow_tcf_mask_empty.tcp,
3512                                  sizeof(flow_tcf_mask_supported.tcp),
3513                                  error);
3514                         assert(mask.tcp);
3515                         if (!ip_proto_set)
3516                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3517                                                 IPPROTO_TCP);
3518                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3519                                 break;
3520                         spec.tcp = items->spec;
3521                         if (mask.tcp->hdr.src_port) {
3522                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3523                                                  spec.tcp->hdr.src_port);
3524                                 mnl_attr_put_u16(nlh,
3525                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3526                                                  mask.tcp->hdr.src_port);
3527                         }
3528                         if (mask.tcp->hdr.dst_port) {
3529                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3530                                                  spec.tcp->hdr.dst_port);
3531                                 mnl_attr_put_u16(nlh,
3532                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3533                                                  mask.tcp->hdr.dst_port);
3534                         }
3535                         if (mask.tcp->hdr.tcp_flags) {
3536                                 mnl_attr_put_u16
3537                                         (nlh,
3538                                          TCA_FLOWER_KEY_TCP_FLAGS,
3539                                          rte_cpu_to_be_16
3540                                                 (spec.tcp->hdr.tcp_flags));
3541                                 mnl_attr_put_u16
3542                                         (nlh,
3543                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3544                                          rte_cpu_to_be_16
3545                                                 (mask.tcp->hdr.tcp_flags));
3546                         }
3547                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3548                         break;
3549                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3550                         assert(decap.vxlan);
3551                         tunnel_outer = 0;
3552                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3553                         spec.vxlan = items->spec;
3554                         mnl_attr_put_u32(nlh,
3555                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3556                                          vxlan_vni_as_be32(spec.vxlan->vni));
3557                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3558                         break;
3559                 default:
3560                         return rte_flow_error_set(error, ENOTSUP,
3561                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3562                                                   NULL, "item not supported");
3563                 }
3564         }
3565         /*
3566          * Set the ether_type flower key and tc rule protocol:
3567          * - if there is neither a VLAN nor a VXLAN item, the key is
3568          *   taken from the eth item directly or deduced from L3 items.
3569          * - if there is a VLAN item, the key is fixed to 802.1Q.
3570          * - if there is a VXLAN item, the key is set to the inner tunnel type.
3571          * - simultaneous VLAN and VXLAN items are prohibited.
3572          */
3573         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3574                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3575                                            outer_etype);
3576                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3577                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3578                                 mnl_attr_put_u16(nlh,
3579                                                  TCA_FLOWER_KEY_ETH_TYPE,
3580                                                  inner_etype);
3581                 } else {
3582                         mnl_attr_put_u16(nlh,
3583                                          TCA_FLOWER_KEY_ETH_TYPE,
3584                                          outer_etype);
3585                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3586                             vlan_etype != RTE_BE16(ETH_P_ALL))
3587                                 mnl_attr_put_u16(nlh,
3588                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3589                                                  vlan_etype);
3590                 }
3591                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3592         }
3593         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3594         na_act_index_cur = 1;
3595         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3596                 struct nlattr *na_act_index;
3597                 struct nlattr *na_act;
3598                 unsigned int vlan_act;
3599                 unsigned int i;
3600
3601                 switch (actions->type) {
3602                 case RTE_FLOW_ACTION_TYPE_VOID:
3603                         break;
3604                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3605                         conf.port_id = actions->conf;
3606                         if (conf.port_id->original)
3607                                 i = 0;
3608                         else
3609                                 for (i = 0; ptoi[i].ifindex; ++i)
3610                                         if (ptoi[i].port_id == conf.port_id->id)
3611                                                 break;
3612                         assert(ptoi[i].ifindex);
3613                         na_act_index =
3614                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3615                         assert(na_act_index);
3616                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3617                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3618                         assert(na_act);
3619                         if (encap.hdr) {
3620                                 assert(dev_flow->tcf.tunnel);
3621                                 dev_flow->tcf.tunnel->ifindex_ptr =
3622                                         &((struct tc_mirred *)
3623                                         mnl_attr_get_payload
3624                                         (mnl_nlmsg_get_payload_tail
3625                                                 (nlh)))->ifindex;
3626                         }
3627                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3628                                      sizeof(struct tc_mirred),
3629                                      &(struct tc_mirred){
3630                                         .action = TC_ACT_STOLEN,
3631                                         .eaction = TCA_EGRESS_REDIR,
3632                                         .ifindex = ptoi[i].ifindex,
3633                                      });
3634                         mnl_attr_nest_end(nlh, na_act);
3635                         mnl_attr_nest_end(nlh, na_act_index);
3636                         break;
3637                 case RTE_FLOW_ACTION_TYPE_JUMP:
3638                         conf.jump = actions->conf;
3639                         na_act_index =
3640                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3641                         assert(na_act_index);
3642                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3643                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3644                         assert(na_act);
3645                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3646                                      sizeof(struct tc_gact),
3647                                      &(struct tc_gact){
3648                                         .action = TC_ACT_GOTO_CHAIN |
3649                                                   conf.jump->group,
3650                                      });
3651                         mnl_attr_nest_end(nlh, na_act);
3652                         mnl_attr_nest_end(nlh, na_act_index);
3653                         break;
3654                 case RTE_FLOW_ACTION_TYPE_DROP:
3655                         na_act_index =
3656                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3657                         assert(na_act_index);
3658                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3659                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3660                         assert(na_act);
3661                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3662                                      sizeof(struct tc_gact),
3663                                      &(struct tc_gact){
3664                                         .action = TC_ACT_SHOT,
3665                                      });
3666                         mnl_attr_nest_end(nlh, na_act);
3667                         mnl_attr_nest_end(nlh, na_act_index);
3668                         break;
3669                 case RTE_FLOW_ACTION_TYPE_COUNT:
3670                         /*
3671                          * Driver adds the count action implicitly for
3672                          * each rule it creates.
3673                          */
3674                         ret = flow_tcf_translate_action_count(dev,
3675                                                               dev_flow, error);
3676                         if (ret < 0)
3677                                 return ret;
3678                         break;
3679                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3680                         conf.of_push_vlan = NULL;
3681                         vlan_act = TCA_VLAN_ACT_POP;
3682                         goto action_of_vlan;
3683                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3684                         conf.of_push_vlan = actions->conf;
3685                         vlan_act = TCA_VLAN_ACT_PUSH;
3686                         goto action_of_vlan;
3687                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3688                         conf.of_set_vlan_vid = actions->conf;
3689                         if (na_vlan_id)
3690                                 goto override_na_vlan_id;
3691                         vlan_act = TCA_VLAN_ACT_MODIFY;
3692                         goto action_of_vlan;
3693                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3694                         conf.of_set_vlan_pcp = actions->conf;
3695                         if (na_vlan_priority)
3696                                 goto override_na_vlan_priority;
3697                         vlan_act = TCA_VLAN_ACT_MODIFY;
3698                         goto action_of_vlan;
3699 action_of_vlan:
3700                         na_act_index =
3701                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3702                         assert(na_act_index);
3703                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3704                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3705                         assert(na_act);
3706                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3707                                      sizeof(struct tc_vlan),
3708                                      &(struct tc_vlan){
3709                                         .action = TC_ACT_PIPE,
3710                                         .v_action = vlan_act,
3711                                      });
3712                         if (vlan_act == TCA_VLAN_ACT_POP) {
3713                                 mnl_attr_nest_end(nlh, na_act);
3714                                 mnl_attr_nest_end(nlh, na_act_index);
3715                                 break;
3716                         }
3717                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3718                                 mnl_attr_put_u16(nlh,
3719                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3720                                                  conf.of_push_vlan->ethertype);
3721                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3722                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3723                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3724                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3725                         mnl_attr_nest_end(nlh, na_act);
3726                         mnl_attr_nest_end(nlh, na_act_index);
3727                         if (actions->type ==
3728                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3729 override_na_vlan_id:
3730                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3731                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3732                                         rte_be_to_cpu_16
3733                                         (conf.of_set_vlan_vid->vlan_vid);
3734                         } else if (actions->type ==
3735                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3736 override_na_vlan_priority:
3737                                 na_vlan_priority->nla_type =
3738                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3739                                 *(uint8_t *)mnl_attr_get_payload
3740                                         (na_vlan_priority) =
3741                                         conf.of_set_vlan_pcp->vlan_pcp;
3742                         }
3743                         break;
3744                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3745                         assert(decap.vxlan);
3746                         assert(dev_flow->tcf.tunnel);
3747                         dev_flow->tcf.tunnel->ifindex_ptr =
3748                                 (unsigned int *)&tcm->tcm_ifindex;
3749                         na_act_index =
3750                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3751                         assert(na_act_index);
3752                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3753                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3754                         assert(na_act);
3755                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3756                                 sizeof(struct tc_tunnel_key),
3757                                 &(struct tc_tunnel_key){
3758                                         .action = TC_ACT_PIPE,
3759                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3760                                         });
3761                         mnl_attr_nest_end(nlh, na_act);
3762                         mnl_attr_nest_end(nlh, na_act_index);
3763                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3764                         break;
3765                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3766                         assert(encap.vxlan);
3767                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3768                         na_act_index =
3769                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3770                         assert(na_act_index);
3771                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3772                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3773                         assert(na_act);
3774                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3775                                 sizeof(struct tc_tunnel_key),
3776                                 &(struct tc_tunnel_key){
3777                                         .action = TC_ACT_PIPE,
3778                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3779                                         });
3780                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3781                                 mnl_attr_put_u16(nlh,
3782                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3783                                          encap.vxlan->udp.dst);
3784                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3785                                 mnl_attr_put_u32(nlh,
3786                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3787                                          encap.vxlan->ipv4.src);
3788                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3789                                 mnl_attr_put_u32(nlh,
3790                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3791                                          encap.vxlan->ipv4.dst);
3792                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3793                                 mnl_attr_put(nlh,
3794                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3795                                          sizeof(encap.vxlan->ipv6.src),
3796                                          &encap.vxlan->ipv6.src);
3797                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3798                                 mnl_attr_put(nlh,
3799                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3800                                          sizeof(encap.vxlan->ipv6.dst),
3801                                          &encap.vxlan->ipv6.dst);
3802                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3803                                 mnl_attr_put_u32(nlh,
3804                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3805                                          vxlan_vni_as_be32
3806                                                 (encap.vxlan->vxlan.vni));
3807                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3808                         mnl_attr_nest_end(nlh, na_act);
3809                         mnl_attr_nest_end(nlh, na_act_index);
3810                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3811                         break;
3812                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3813                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3814                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3815                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3816                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3817                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3818                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3819                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3820                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3821                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3822                         na_act_index =
3823                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3824                         flow_tcf_create_pedit_mnl_msg(nlh,
3825                                                       &actions, item_flags);
3826                         mnl_attr_nest_end(nlh, na_act_index);
3827                         break;
3828                 default:
3829                         return rte_flow_error_set(error, ENOTSUP,
3830                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3831                                                   actions,
3832                                                   "action not supported");
3833                 }
3834         }
3835         assert(na_flower);
3836         assert(na_flower_act);
3837         mnl_attr_nest_end(nlh, na_flower_act);
3838         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3839                                         (mnl_nlmsg_get_payload_tail(nlh));
3840         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3841                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3842         mnl_attr_nest_end(nlh, na_flower);
3843         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3844                 dev_flow->tcf.tunnel->ifindex_org =
3845                         *dev_flow->tcf.tunnel->ifindex_ptr;
3846         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3847         return 0;
3848 }
3849
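/*
 * For illustration only: a rule translated by the function above is
 * roughly equivalent to an iproute2 command of the following shape
 * (hypothetical example, interface names and match values are made up):
 *
 *   tc filter add dev <ifin> parent ffff: protocol ip chain 0 \
 *      flower skip_sw ip_proto tcp dst_port 80 \
 *      action mirred egress redirect dev <ifout>
 *
 * The message built above carries the same flower keys and actions as
 * binary Netlink TLVs instead of the iproute2 text syntax.
 */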
3850 /**
3851  * Send Netlink message with acknowledgment.
3852  *
3853  * @param tcf
3854  *   Flow context to use.
3855  * @param nlh
3856  *   Message to send. This function always raises the NLM_F_ACK flag before
3857  *   sending.
3858  * @param[in] cb
3859  *   Callback handler for received message.
3860  * @param[in] arg
3861  *   Context pointer for callback handler.
3862  *
3863  * @return
3864  *   0 on success, a negative errno value otherwise and rte_errno is set.
3865  */
3866 static int
3867 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3868                 struct nlmsghdr *nlh,
3869                 mnl_cb_t cb, void *arg)
3870 {
3871         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3872         uint32_t seq = tcf->seq++;
3873         int ret, err = 0;
3874
3875         assert(tcf->nl);
3876         assert(tcf->buf);
3877         if (!seq) {
3878                 /* seq 0 is reserved for kernel event-driven notifications. */
3879                 seq = tcf->seq++;
3880         }
3881         nlh->nlmsg_seq = seq;
3882         nlh->nlmsg_flags |= NLM_F_ACK;
3883         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3884         if (ret <= 0) {
3885                 /* Message send error occurred. */
3886                 rte_errno = errno;
3887                 return -rte_errno;
3888         }
3889         nlh = (struct nlmsghdr *)(tcf->buf);
3890         /*
3891          * The following loop postpones non-fatal errors until multipart
3892          * messages are complete.
3893          */
3894         while (true) {
3895                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3896                 if (ret < 0) {
3897                         err = errno;
3898                         /*
3899                          * In case of overflow, keep receiving until the
3900                          * end of the multipart message. Part of the reply
3901                          * may be lost, but mark and return an error anyway.
3902                          */
3903                         if (err != ENOSPC ||
3904                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3905                             nlh->nlmsg_type == NLMSG_DONE)
3906                                 break;
3907                 } else {
3908                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3909                         if (!ret) {
3910                                 /*
3911                                  * libmnl returns 0 if DONE or
3912                                  * success ACK message found.
3913                                  */
3914                                 break;
3915                         }
3916                         if (ret < 0) {
3917                                 /*
3918                                  * ACK message with error found
3919                                  * or some error occurred.
3920                                  */
3921                                 err = errno;
3922                                 break;
3923                         }
3924                         /* We should continue receiving. */
3925                 }
3926         }
3927         if (!err)
3928                 return 0;
3929         rte_errno = err;
3930         return -err;
3931 }
3932
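/*
 * A minimal usage sketch for flow_tcf_nl_ack(), compiled out by default.
 * FLOW_TCF_USAGE_EXAMPLES is a hypothetical macro used only to keep the
 * illustration out of the build; it is not part of the driver.
 */
#ifdef FLOW_TCF_USAGE_EXAMPLES
static int
flow_tcf_example_link_dump(struct mlx5_flow_tcf_context *tcf,
                           mnl_cb_t cb, void *arg)
{
        struct nlmsghdr *nlh;
        struct ifinfomsg *ifm;

        /* Build the dump request in the shared context buffer. */
        nlh = mnl_nlmsg_put_header(tcf->buf);
        nlh->nlmsg_type = RTM_GETLINK;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
        ifm->ifi_family = AF_UNSPEC;
        /* Sequence number and NLM_F_ACK are handled by flow_tcf_nl_ack(). */
        return flow_tcf_nl_ack(tcf, nlh, cb, arg);
}
#endif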
3933 #define MNL_BUF_EXTRA_SPACE 16
3934 #define MNL_REQUEST_SIZE_MIN 256
3935 #define MNL_REQUEST_SIZE_MAX 2048
3936 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3937                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
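/*
 * Worked example (assuming a typical 4 KiB page): sysconf(_SC_PAGESIZE)
 * returns 4096, RTE_MAX(4096, 256) yields 4096, and RTE_MIN(4096, 2048)
 * clamps the result, so MNL_REQUEST_SIZE evaluates to 2048 bytes.
 */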
3938
3939 /* Data structures used by flow_tcf_xxx_cb() routines. */
3940 struct tcf_nlcb_buf {
3941         LIST_ENTRY(tcf_nlcb_buf) next;
3942         uint32_t size;
3943         alignas(struct nlmsghdr)
3944         uint8_t msg[]; /**< Netlink message data. */
3945 };
3946
3947 struct tcf_nlcb_context {
3948         unsigned int ifindex; /**< Base interface index. */
3949         uint32_t bufsize;
3950         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3951 };
3952
3953 /**
3954  * Allocate space for a netlink command in the buffer list.
3955  *
3956  * @param[in, out] ctx
3957  *   Pointer to callback context with command buffers list.
3958  * @param[in] size
3959  *   Required size of data buffer to be allocated.
3960  *
3961  * @return
3962  *   Pointer to allocated memory, aligned as message header.
3963  *   NULL if some error occurred.
3964  */
3965 static struct nlmsghdr *
3966 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3967 {
3968         struct tcf_nlcb_buf *buf;
3969         struct nlmsghdr *nlh;
3970
3971         size = NLMSG_ALIGN(size);
3972         buf = LIST_FIRST(&ctx->nlbuf);
3973         if (buf && (buf->size + size) <= ctx->bufsize) {
3974                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3975                 buf->size += size;
3976                 return nlh;
3977         }
3978         if (size > ctx->bufsize) {
3979                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3980                 return NULL;
3981         }
3982         buf = rte_malloc(__func__,
3983                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3984                         alignof(struct tcf_nlcb_buf));
3985         if (!buf) {
3986                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3987                 return NULL;
3988         }
3989         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3990         buf->size = size;
3991         nlh = (struct nlmsghdr *)&buf->msg[0];
3992         return nlh;
3993 }
3994
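/*
 * Allocation pattern, for illustration: commands are bump-allocated from
 * the tcf_nlcb_buf at the list head, so e.g. two consecutive 64-byte
 * requests land back to back in the same buffer; once a request no longer
 * fits within ctx->bufsize, a fresh buffer is inserted at the head and
 * filled next. Buffers are only reclaimed by flow_tcf_send_nlcmd() below.
 */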
3995 /**
3996  * Send the buffers with prepared netlink commands. Scans the list and
3997  * sends all found buffers. Buffers are sent and freed unconditionally
3998  * in order to prevent memory leakage if an error occurs for some message.
3999  *
4000  * @param[in] tcf
4001  *   Context object initialized by mlx5_flow_tcf_context_create().
4002  * @param[in, out] ctx
4003  *   Pointer to callback context with command buffers list.
4004  *
4005  * @return
4006  *   Zero value on success, negative errno value otherwise
4007  *   and rte_errno is set.
4008  */
4009 static int
4010 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4011                     struct tcf_nlcb_context *ctx)
4012 {
4013         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4014         int ret = 0;
4015
4016         while (bc) {
4017                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4018                 struct nlmsghdr *nlh;
4019                 uint32_t msg = 0;
4020                 int rc;
4021
4022                 while (msg < bc->size) {
4023                         /*
4024                          * Send the Netlink commands from the buffer one by
4025                          * one. If we sent multiple rule deletion commands in
4026                          * one Netlink message and some error occurred, it
4027                          * could cause multiple ACK error messages and break
4028                          * the sequence numbers of the Netlink communication,
4029                          * because we expect only one ACK reply.
4030                          */
4031                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4032                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4033                         assert((bc->size - msg) >= nlh->nlmsg_len);
4034                         msg += nlh->nlmsg_len;
4035                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4036                         if (rc) {
4037                                 DRV_LOG(WARNING,
4038                                         "netlink: cleanup error %d", rc);
4039                                 if (!ret)
4040                                         ret = rc;
4041                         }
4042                 }
4043                 rte_free(bc);
4044                 bc = bn;
4045         }
4046         LIST_INIT(&ctx->nlbuf);
4047         return ret;
4048 }
4049
4050 /**
4051  * Collect local IP address rules with the scope link attribute on the
4052  * specified network device. This is a callback routine called by libmnl
4053  * mnl_cb_run() in a loop for every message in the received packet.
4054  *
4055  * @param[in] nlh
4056  *   Pointer to reply header.
4057  * @param[in, out] arg
4058  *   Opaque data pointer for this callback.
4059  *
4060  * @return
4061  *   A positive, nonzero value on success, negative errno value otherwise
4062  *   and rte_errno is set.
4063  */
4064 static int
4065 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4066 {
4067         struct tcf_nlcb_context *ctx = arg;
4068         struct nlmsghdr *cmd;
4069         struct ifaddrmsg *ifa;
4070         struct nlattr *na;
4071         struct nlattr *na_local = NULL;
4072         struct nlattr *na_peer = NULL;
4073         unsigned char family;
4074         uint32_t size;
4075
4076         if (nlh->nlmsg_type != RTM_NEWADDR) {
4077                 rte_errno = EINVAL;
4078                 return -rte_errno;
4079         }
4080         ifa = mnl_nlmsg_get_payload(nlh);
4081         family = ifa->ifa_family;
4082         if (ifa->ifa_index != ctx->ifindex ||
4083             ifa->ifa_scope != RT_SCOPE_LINK ||
4084             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4085             (family != AF_INET && family != AF_INET6))
4086                 return 1;
4087         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4088                 switch (mnl_attr_get_type(na)) {
4089                 case IFA_LOCAL:
4090                         na_local = na;
4091                         break;
4092                 case IFA_ADDRESS:
4093                         na_peer = na;
4094                         break;
4095                 }
4096                 if (na_local && na_peer)
4097                         break;
4098         }
4099         if (!na_local || !na_peer)
4100                 return 1;
4101         /* Local rule found with scope link, permanent and assigned peer. */
4102         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4103                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4104                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4105                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4106         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4107         if (!cmd) {
4108                 rte_errno = ENOMEM;
4109                 return -rte_errno;
4110         }
4111         cmd = mnl_nlmsg_put_header(cmd);
4112         cmd->nlmsg_type = RTM_DELADDR;
4113         cmd->nlmsg_flags = NLM_F_REQUEST;
4114         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4115         ifa->ifa_flags = IFA_F_PERMANENT;
4116         ifa->ifa_scope = RT_SCOPE_LINK;
4117         ifa->ifa_index = ctx->ifindex;
4118         if (family == AF_INET) {
4119                 ifa->ifa_family = AF_INET;
4120                 ifa->ifa_prefixlen = 32;
4121                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4122                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4123         } else {
4124                 ifa->ifa_family = AF_INET6;
4125                 ifa->ifa_prefixlen = 128;
4126                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4127                         mnl_attr_get_payload(na_local));
4128                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4129                         mnl_attr_get_payload(na_peer));
4130         }
4131         assert(size == cmd->nlmsg_len);
4132         return 1;
4133 }
4134
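/*
 * For illustration: each command queued by the callback above is the
 * binary equivalent of (addresses are hypothetical):
 *
 *   ip addr del 10.0.0.1/32 peer 10.0.0.2 scope link dev <ifouter>
 */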
4135 /**
4136  * Cleanup the local IP addresses on outer interface.
4137  *
4138  * @param[in] tcf
4139  *   Context object initialized by mlx5_flow_tcf_context_create().
4140  * @param[in] ifindex
4141  *   Network interface index to perform the cleanup.
4142  */
4143 static void
4144 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4145                             unsigned int ifindex)
4146 {
4147         struct nlmsghdr *nlh;
4148         struct ifaddrmsg *ifa;
4149         struct tcf_nlcb_context ctx = {
4150                 .ifindex = ifindex,
4151                 .bufsize = MNL_REQUEST_SIZE,
4152                 .nlbuf = LIST_HEAD_INITIALIZER(),
4153         };
4154         int ret;
4155
4156         assert(ifindex);
4157         /*
4158          * Seek and destroy leftovers of local IP addresses with
4159          * matching properties "scope link".
4160          */
4161         nlh = mnl_nlmsg_put_header(tcf->buf);
4162         nlh->nlmsg_type = RTM_GETADDR;
4163         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4164         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4165         ifa->ifa_family = AF_UNSPEC;
4166         ifa->ifa_index = ifindex;
4167         ifa->ifa_scope = RT_SCOPE_LINK;
4168         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4169         if (ret)
4170                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4171         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4172         if (ret)
4173                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4174 }
4175
4176 /**
4177  * Collect permanent neigh rules on the specified network device.
4178  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4179  * for every message in the received packet.
4180  *
4181  * @param[in] nlh
4182  *   Pointer to reply header.
4183  * @param[in, out] arg
4184  *   Opaque data pointer for this callback.
4185  *
4186  * @return
4187  *   A positive, nonzero value on success, negative errno value otherwise
4188  *   and rte_errno is set.
4189  */
4190 static int
4191 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4192 {
4193         struct tcf_nlcb_context *ctx = arg;
4194         struct nlmsghdr *cmd;
4195         struct ndmsg *ndm;
4196         struct nlattr *na;
4197         struct nlattr *na_ip = NULL;
4198         struct nlattr *na_mac = NULL;
4199         unsigned char family;
4200         uint32_t size;
4201
4202         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4203                 rte_errno = EINVAL;
4204                 return -rte_errno;
4205         }
4206         ndm = mnl_nlmsg_get_payload(nlh);
4207         family = ndm->ndm_family;
4208         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4209            !(ndm->ndm_state & NUD_PERMANENT) ||
4210            (family != AF_INET && family != AF_INET6))
4211                 return 1;
4212         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4213                 switch (mnl_attr_get_type(na)) {
4214                 case NDA_DST:
4215                         na_ip = na;
4216                         break;
4217                 case NDA_LLADDR:
4218                         na_mac = na;
4219                         break;
4220                 }
4221                 if (na_mac && na_ip)
4222                         break;
4223         }
4224         if (!na_mac || !na_ip)
4225                 return 1;
4226         /* Neigh rule with permanent attribute found. */
4227         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4228                MNL_ALIGN(sizeof(struct ndmsg)) +
4229                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4230                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4231                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4232         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4233         if (!cmd) {
4234                 rte_errno = ENOMEM;
4235                 return -rte_errno;
4236         }
4237         cmd = mnl_nlmsg_put_header(cmd);
4238         cmd->nlmsg_type = RTM_DELNEIGH;
4239         cmd->nlmsg_flags = NLM_F_REQUEST;
4240         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4241         ndm->ndm_ifindex = ctx->ifindex;
4242         ndm->ndm_state = NUD_PERMANENT;
4243         ndm->ndm_flags = 0;
4244         ndm->ndm_type = 0;
4245         if (family == AF_INET) {
4246                 ndm->ndm_family = AF_INET;
4247                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4248         } else {
4249                 ndm->ndm_family = AF_INET6;
4250                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4251                              mnl_attr_get_payload(na_ip));
4252         }
4253         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4254                      mnl_attr_get_payload(na_mac));
4255         assert(size == cmd->nlmsg_len);
4256         return 1;
4257 }
4258
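/*
 * For illustration: each command queued by the callback above is the
 * binary equivalent of (addresses are hypothetical):
 *
 *   ip neigh del 10.0.0.2 lladdr 00:11:22:33:44:55 dev <ifouter>
 */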
4259 /**
4260  * Cleanup the neigh rules on outer interface.
4261  *
4262  * @param[in] tcf
4263  *   Context object initialized by mlx5_flow_tcf_context_create().
4264  * @param[in] ifindex
4265  *   Network interface index to perform the cleanup.
4266  */
4267 static void
4268 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4269                             unsigned int ifindex)
4270 {
4271         struct nlmsghdr *nlh;
4272         struct ndmsg *ndm;
4273         struct tcf_nlcb_context ctx = {
4274                 .ifindex = ifindex,
4275                 .bufsize = MNL_REQUEST_SIZE,
4276                 .nlbuf = LIST_HEAD_INITIALIZER(),
4277         };
4278         int ret;
4279
4280         assert(ifindex);
4281         /* Seek and destroy leftovers of neigh rules. */
4282         nlh = mnl_nlmsg_put_header(tcf->buf);
4283         nlh->nlmsg_type = RTM_GETNEIGH;
4284         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4285         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4286         ndm->ndm_family = AF_UNSPEC;
4287         ndm->ndm_ifindex = ifindex;
4288         ndm->ndm_state = NUD_PERMANENT;
4289         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4290         if (ret)
4291                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4292         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4293         if (ret)
4294                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4295 }
4296
4297 /**
4298  * Collect indices of VXLAN encap/decap interfaces associated with the
4299  * device. This is a callback routine called by libmnl mnl_cb_run() in a
4300  * loop for every message in the received packet.
4301  *
4302  * @param[in] nlh
4303  *   Pointer to reply header.
4304  * @param[in, out] arg
4305  *   Opaque data pointer for this callback.
4306  *
4307  * @return
4308  *   A positive, nonzero value on success, negative errno value otherwise
4309  *   and rte_errno is set.
4310  */
4311 static int
4312 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4313 {
4314         struct tcf_nlcb_context *ctx = arg;
4315         struct nlmsghdr *cmd;
4316         struct ifinfomsg *ifm;
4317         struct nlattr *na;
4318         struct nlattr *na_info = NULL;
4319         struct nlattr *na_vxlan = NULL;
4320         bool found = false;
4321         unsigned int vxindex;
4322         uint32_t size;
4323
4324         if (nlh->nlmsg_type != RTM_NEWLINK) {
4325                 rte_errno = EINVAL;
4326                 return -rte_errno;
4327         }
4328         ifm = mnl_nlmsg_get_payload(nlh);
4329         if (!ifm->ifi_index) {
4330                 rte_errno = EINVAL;
4331                 return -rte_errno;
4332         }
4333         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4334                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4335                         na_info = na;
4336                         break;
4337                 }
4338         if (!na_info)
4339                 return 1;
4340         mnl_attr_for_each_nested(na, na_info) {
4341                 switch (mnl_attr_get_type(na)) {
4342                 case IFLA_INFO_KIND:
4343                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4344                                      mnl_attr_get_len(na)))
4345                                 found = true;
4346                         break;
4347                 case IFLA_INFO_DATA:
4348                         na_vxlan = na;
4349                         break;
4350                 }
4351                 if (found && na_vxlan)
4352                         break;
4353         }
4354         if (!found || !na_vxlan)
4355                 return 1;
4356         found = false;
4357         mnl_attr_for_each_nested(na, na_vxlan) {
4358                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4359                     mnl_attr_get_u32(na) == ctx->ifindex) {
4360                         found = true;
4361                         break;
4362                 }
4363         }
4364         if (!found)
4365                 return 1;
4366         /* Attached VXLAN device found, store the command to delete. */
4367         vxindex = ifm->ifi_index;
4368         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4369                MNL_ALIGN(sizeof(struct ifinfomsg));
4370         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4371         if (!cmd) {
4372                 rte_errno = ENOMEM;
4373                 return -rte_errno;
4374         }
4375         cmd = mnl_nlmsg_put_header(cmd);
4376         cmd->nlmsg_type = RTM_DELLINK;
4377         cmd->nlmsg_flags = NLM_F_REQUEST;
4378         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4379         ifm->ifi_family = AF_UNSPEC;
4380         ifm->ifi_index = vxindex;
4381         assert(size == cmd->nlmsg_len);
4382         return 1;
4383 }
4384
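/*
 * For illustration: each command queued by the callback above is the
 * binary equivalent of:
 *
 *   ip link del <vxlan_device>
 *
 * where <vxlan_device> is any vxlan netdev whose IFLA_VXLAN_LINK refers
 * to the outer interface being cleaned up.
 */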
4385 /**
4386  * Cleanup the outer interface. Removes all found vxlan devices
4387  * attached to specified index, flushes the neigh and local IP
4388  * database.
4389  *
4390  * @param[in] tcf
4391  *   Context object initialized by mlx5_flow_tcf_context_create().
4392  * @param[in] ifindex
4393  *   Network interface index to perform the cleanup.
4394  */
4395 static void
4396 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4397                             unsigned int ifindex)
4398 {
4399         struct nlmsghdr *nlh;
4400         struct ifinfomsg *ifm;
4401         struct tcf_nlcb_context ctx = {
4402                 .ifindex = ifindex,
4403                 .bufsize = MNL_REQUEST_SIZE,
4404                 .nlbuf = LIST_HEAD_INITIALIZER(),
4405         };
4406         int ret;
4407
4408         assert(ifindex);
4409         /*
4410          * Seek and destroy leftover VXLAN encap/decap interfaces with
4411          * matching properties.
4412          */
4413         nlh = mnl_nlmsg_put_header(tcf->buf);
4414         nlh->nlmsg_type = RTM_GETLINK;
4415         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4416         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4417         ifm->ifi_family = AF_UNSPEC;
4418         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4419         if (ret)
4420                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4421         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4422         if (ret)
4423                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4424 }
4425
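/*
 * A minimal sketch of how the three cleanup helpers above could be
 * combined before (re)creating encap rules on an outer interface.
 * Compiled out; FLOW_TCF_USAGE_EXAMPLES is a hypothetical macro, not
 * part of the driver.
 */
#ifdef FLOW_TCF_USAGE_EXAMPLES
static void
flow_tcf_example_outer_cleanup(struct mlx5_flow_tcf_context *tcf,
                               unsigned int ifindex)
{
        /* Remove leftover vxlan devices attached to the interface. */
        flow_tcf_encap_iface_cleanup(tcf, ifindex);
        /* Flush "scope link" local addresses used as tunnel sources. */
        flow_tcf_encap_local_cleanup(tcf, ifindex);
        /* Flush permanent neigh entries used for destination MACs. */
        flow_tcf_encap_neigh_cleanup(tcf, ifindex);
}
#endif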
4426 /**
4427  * Emit Netlink message to add/remove local address to the outer device.
4428  * The address being added is visible within the link only (scope link).
4429  *
4430  * Note that an implicit route is maintained by the kernel due to the
4431  * presence of a peer address (IFA_ADDRESS).
4432  *
4433  * These rules are used for encapsulation only and allow assigning
4434  * the outer tunnel source IP address.
4435  *
4436  * @param[in] tcf
4437  *   Libmnl socket context object.
4438  * @param[in] encap
4439  *   Encapsulation properties (source address and its peer).
4440  * @param[in] ifindex
4441  *   Network interface to apply rule.
4442  * @param[in] enable
4443  *   Toggle between add and remove.
4444  * @param[out] error
4445  *   Perform verbose error reporting if not NULL.
4446  *
4447  * @return
4448  *   0 on success, a negative errno value otherwise and rte_errno is set.
4449  */
4450 static int
4451 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4452                     const struct flow_tcf_vxlan_encap *encap,
4453                     unsigned int ifindex,
4454                     bool enable,
4455                     struct rte_flow_error *error)
4456 {
4457         struct nlmsghdr *nlh;
4458         struct ifaddrmsg *ifa;
4459         alignas(struct nlmsghdr)
4460         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4461
4462         nlh = mnl_nlmsg_put_header(buf);
4463         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4464         nlh->nlmsg_flags =
4465                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4466         nlh->nlmsg_seq = 0;
4467         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4468         ifa->ifa_flags = IFA_F_PERMANENT;
4469         ifa->ifa_scope = RT_SCOPE_LINK;
4470         ifa->ifa_index = ifindex;
4471         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4472                 ifa->ifa_family = AF_INET;
4473                 ifa->ifa_prefixlen = 32;
4474                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4475                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4476                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4477                                               encap->ipv4.dst);
4478         } else {
4479                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4480                 ifa->ifa_family = AF_INET6;
4481                 ifa->ifa_prefixlen = 128;
4482                 mnl_attr_put(nlh, IFA_LOCAL,
4483                                   sizeof(encap->ipv6.src),
4484                                   &encap->ipv6.src);
4485                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4486                         mnl_attr_put(nlh, IFA_ADDRESS,
4487                                           sizeof(encap->ipv6.dst),
4488                                           &encap->ipv6.dst);
4489         }
4490         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4491                 return 0;
4492         return rte_flow_error_set(error, rte_errno,
4493                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4494                                   "netlink: cannot complete IFA request"
4495                                   " (ip addr add)");
4496 }
4497
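/*
 * Usage sketch for flow_tcf_rule_local(), compiled out by default.
 * FLOW_TCF_USAGE_EXAMPLES and the literal addresses are hypothetical
 * and only serve the illustration.
 */
#ifdef FLOW_TCF_USAGE_EXAMPLES
static int
flow_tcf_example_rule_local(struct mlx5_flow_tcf_context *tcf,
                            unsigned int ifindex,
                            struct rte_flow_error *error)
{
        struct flow_tcf_vxlan_encap encap = {
                .mask = FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST,
                .ipv4.src = RTE_BE32(0x0A000001), /* 10.0.0.1 */
                .ipv4.dst = RTE_BE32(0x0A000002), /* 10.0.0.2 */
        };

        /* Equivalent of: ip addr add 10.0.0.1 peer 10.0.0.2 scope link ... */
        return flow_tcf_rule_local(tcf, &encap, ifindex, true, error);
}
#endif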
4498 /**
4499  * Emit Netlink message to add/remove neighbor.
4500  *
4501  * @param[in] tcf
4502  *   Libmnl socket context object.
4503  * @param[in] encap
4504  *   Encapsulation properties (destination address).
4505  * @param[in] ifindex
4506  *   Network interface.
4507  * @param[in] enable
4508  *   Toggle between add and remove.
4509  * @param[out] error
4510  *   Perform verbose error reporting if not NULL.
4511  *
4512  * @return
4513  *   0 on success, a negative errno value otherwise and rte_errno is set.
4514  */
4515 static int
4516 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4517                      const struct flow_tcf_vxlan_encap *encap,
4518                      unsigned int ifindex,
4519                      bool enable,
4520                      struct rte_flow_error *error)
4521 {
4522         struct nlmsghdr *nlh;
4523         struct ndmsg *ndm;
4524         alignas(struct nlmsghdr)
4525         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4526
4527         nlh = mnl_nlmsg_put_header(buf);
4528         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4529         nlh->nlmsg_flags =
4530                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4531         nlh->nlmsg_seq = 0;
4532         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4533         ndm->ndm_ifindex = ifindex;
4534         ndm->ndm_state = NUD_PERMANENT;
4535         ndm->ndm_flags = 0;
4536         ndm->ndm_type = 0;
4537         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4538                 ndm->ndm_family = AF_INET;
4539                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4540         } else {
4541                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4542                 ndm->ndm_family = AF_INET6;
4543                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4544                                                  &encap->ipv6.dst);
4545         }
4546         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4547                 DRV_LOG(WARNING,
4548                         "outer ethernet source address cannot be "
4549                         "forced for VXLAN encapsulation");
4550         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4551                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4552                                                     &encap->eth.dst);
4553         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4554                 return 0;
4555         return rte_flow_error_set(error, rte_errno,
4556                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4557                                   "netlink: cannot complete ND request"
4558                                   " (ip neigh)");
4559 }
4560
4561 /**
4562  * Manage the local IP addresses and their peer IP addresses on the
4563  * outer interface for encapsulation purposes. The kernel searches for
4564  * the appropriate device for tunnel egress traffic using the outer
4565  * source IP; this IP should be assigned to the outer network device,
4566  * otherwise the kernel rejects the rule.
4567  *
4568  * Adds or removes the addresses using a Netlink command like this:
4569  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4570  *
4571  * The addresses are local to the netdev ("scope link"), which reduces
4572  * the risk of conflicts. Note that an implicit route is maintained by
4573  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4574  *
4575  * @param[in] tcf
4576  *   Libmnl socket context object.
4577  * @param[in] iface
4578  *   Object, contains rule database and ifouter index.
4579  * @param[in] dev_flow
4580  *   Flow object, contains the tunnel parameters (for encap only).
4581  * @param[in] enable
4582  *   Toggle between add and remove.
4583  * @param[out] error
4584  *   Perform verbose error reporting if not NULL.
4585  *
4586  * @return
4587  *   0 on success, a negative errno value otherwise and rte_errno is set.
4588  */
4589 static int
4590 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4591                      struct tcf_irule *iface,
4592                      struct mlx5_flow *dev_flow,
4593                      bool enable,
4594                      struct rte_flow_error *error)
4595 {
4596         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4597         struct tcf_local_rule *rule = NULL;
4598         int ret;
4599
4600         assert(encap);
4601         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4602         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4603                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4604                 LIST_FOREACH(rule, &iface->local, next) {
4605                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4606                             encap->ipv4.src == rule->ipv4.src &&
4607                             encap->ipv4.dst == rule->ipv4.dst) {
4608                                 break;
4609                         }
4610                 }
4611         } else {
4612                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4613                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4614                 LIST_FOREACH(rule, &iface->local, next) {
4615                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4616                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4617                                             sizeof(encap->ipv6.src)) &&
4618                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4619                                             sizeof(encap->ipv6.dst))) {
4620                                 break;
4621                         }
4622                 }
4623         }
4624         if (rule) {
4625                 if (enable) {
4626                         rule->refcnt++;
4627                         return 0;
4628                 }
4629                 if (!rule->refcnt || !--rule->refcnt) {
4630                         LIST_REMOVE(rule, next);
4631                         return flow_tcf_rule_local(tcf, encap,
4632                                         iface->ifouter, false, error);
4633                 }
4634                 return 0;
4635         }
4636         if (!enable) {
4637                 DRV_LOG(WARNING, "disabling not existing local rule");
4638                 rte_flow_error_set(error, ENOENT,
4639                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4640                                    "disabling not existing local rule");
4641                 return -ENOENT;
4642         }
4643         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4644                                 alignof(struct tcf_local_rule));
4645         if (!rule) {
4646                 rte_flow_error_set(error, ENOMEM,
4647                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4648                                    "unable to allocate memory for local rule");
4649                 return -rte_errno;
4650         }
4651         *rule = (struct tcf_local_rule){.refcnt = 0,
4652                                         .mask = 0,
4653                                         };
4654         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4655                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4656                            | FLOW_TCF_ENCAP_IPV4_DST;
4657                 rule->ipv4.src = encap->ipv4.src;
4658                 rule->ipv4.dst = encap->ipv4.dst;
4659         } else {
4660                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4661                            | FLOW_TCF_ENCAP_IPV6_DST;
4662                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4663                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4664         }
4665         ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4666         if (ret) {
4667                 rte_free(rule);
4668                 return ret;
4669         }
4670         rule->refcnt++;
4671         LIST_INSERT_HEAD(&iface->local, rule, next);
4672         return 0;
4673 }
4674
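/*
 * Reference counting, for illustration: the first enable for a given
 * src/dst pair emits the Netlink RTM_NEWADDR request and inserts a rule
 * with refcnt == 1; subsequent enables for the same pair only bump the
 * counter. Disables decrement it, and the RTM_DELADDR request is emitted
 * only when the counter drops to zero and the rule leaves the list.
 */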
4675 /**
4676  * Manage the destination MAC/IP address neigh database; the kernel uses
4677  * it to determine the destination MAC address within the encapsulation
4678  * header. Adds or removes the entries using a Netlink command like this:
4679  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4680  *
4681  * @param[in] tcf
4682  *   Libmnl socket context object.
4683  * @param[in] iface
4684  *   Object, contains rule database and ifouter index.
4685  * @param[in] dev_flow
4686  *   Flow object, contains the tunnel parameters (for encap only).
4687  * @param[in] enable
4688  *   Toggle between add and remove.
4689  * @param[out] error
4690  *   Perform verbose error reporting if not NULL.
4691  *
4692  * @return
4693  *   0 on success, a negative errno value otherwise and rte_errno is set.
4694  */
4695 static int
4696 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4697                      struct tcf_irule *iface,
4698                      struct mlx5_flow *dev_flow,
4699                      bool enable,
4700                      struct rte_flow_error *error)
4701 {
4702         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4703         struct tcf_neigh_rule *rule = NULL;
4704         int ret;
4705
4706         assert(encap);
4707         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4708         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4709                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4710                 LIST_FOREACH(rule, &iface->neigh, next) {
4711                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4712                             encap->ipv4.dst == rule->ipv4.dst) {
4713                                 break;
4714                         }
4715                 }
4716         } else {
4717                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4718                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4719                 LIST_FOREACH(rule, &iface->neigh, next) {
4720                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4721                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4722                                                 sizeof(encap->ipv6.dst))) {
4723                                 break;
4724                         }
4725                 }
4726         }
4727         if (rule) {
4728                 if (memcmp(&encap->eth.dst, &rule->eth,
4729                            sizeof(encap->eth.dst))) {
4730                         DRV_LOG(WARNING, "Destination MAC differs"
4731                                          " in neigh rule");
4732                         rte_flow_error_set(error, EEXIST,
4733                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4734                                            NULL, "Different MAC address"
4735                                            " neigh rule for the same"
4736                                            " destination IP");
4737                         return -EEXIST;
4738                 }
4739                 if (enable) {
4740                         rule->refcnt++;
4741                         return 0;
4742                 }
4743                 if (!rule->refcnt || !--rule->refcnt) {
4744                         LIST_REMOVE(rule, next);
4745                         return flow_tcf_rule_neigh(tcf, encap,
4746                                                    iface->ifouter,
4747                                                    false, error);
4748                 }
4749                 return 0;
4750         }
4751         if (!enable) {
4752                 DRV_LOG(WARNING, "disabling nonexistent neigh rule");
4753                 rte_flow_error_set(error, ENOENT,
4754                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4755                                    "disabling nonexistent neigh rule");
4756                 return -ENOENT;
4757         }
4758         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4759                                 alignof(struct tcf_neigh_rule));
4760         if (!rule) {
4761                 rte_flow_error_set(error, ENOMEM,
4762                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4763                                    "unable to allocate memory for neigh rule");
4764                 return -rte_errno;
4765         }
4766         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4767                                         .mask = 0,
4768                                         };
4769         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4770                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4771                 rule->ipv4.dst = encap->ipv4.dst;
4772         } else {
4773                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4774                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4775         }
4776         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4777         ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4778         if (ret) {
4779                 rte_free(rule);
4780                 return ret;
4781         }
4782         rule->refcnt++;
4783         LIST_INSERT_HEAD(&iface->neigh, rule, next);
4784         return 0;
4785 }
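
/*
 * For reference, the kernel-visible effect of the first enable and the
 * last disable above roughly matches these iproute2 commands (sketch,
 * placeholders in angle brackets):
 *
 *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
 *   ip neigh del dev <ifouter> to <dst_ip>
 */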
4786
4787 /* VXLAN encap rule database for outer interfaces. */
4788 static  LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4789
4790 /* VTEP device list is shared between PMD port instances. */
4791 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4792 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4793
4794 /**
4795  * Acquire the VXLAN encap rules container for the specified interface.
4796  * First looks up the container in the list of existing ones; creates
4797  * and initializes a new container if none is found.
4798  *
4799  * @param[in] tcf
4800  *   Context object initialized by mlx5_flow_tcf_context_create().
4801  * @param[in] ifouter
4802  *   Network interface index to create VXLAN encap rules on.
4803  * @param[out] error
4804  *   Perform verbose error reporting if not NULL.
4805  * @return
4806  *   Rule container pointer on success,
4807  *   NULL otherwise and rte_errno is set.
4808  */
4809 static struct tcf_irule*
4810 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4811                              unsigned int ifouter,
4812                              struct rte_flow_error *error)
4813 {
4814         struct tcf_irule *iface;
4815
4816         /* Look whether the container for encap rules is created. */
4817         assert(ifouter);
4818         LIST_FOREACH(iface, &iface_list_vxlan, next) {
4819                 if (iface->ifouter == ifouter)
4820                         break;
4821         }
4822         if (iface) {
4823                 /* Container already exists, just increment the reference. */
4824                 iface->refcnt++;
4825                 return iface;
4826         }
4827         /* Not found, create a new container. */
4828         iface = rte_zmalloc(__func__, sizeof(*iface),
4829                             alignof(struct tcf_irule));
4830         if (!iface) {
4831                 rte_flow_error_set(error, ENOMEM,
4832                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4833                                    "unable to allocate memory for container");
4834                 return NULL;
4835         }
4836         *iface = (struct tcf_irule){
4837                         .local = LIST_HEAD_INITIALIZER(),
4838                         .neigh = LIST_HEAD_INITIALIZER(),
4839                         .ifouter = ifouter,
4840                         .refcnt = 1,
4841         };
4842         /* Clean up the interface for the newly created container. */
4843         flow_tcf_encap_iface_cleanup(tcf, ifouter);
4844         flow_tcf_encap_local_cleanup(tcf, ifouter);
4845         flow_tcf_encap_neigh_cleanup(tcf, ifouter);
4846         LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
4847         return iface;
4848 }
4849
4850 /**
4851  * Releases VXLAN encap rules container by pointer. Decrements the
4852  * reference counter and deletes the container if the counter is zero.
4853  *
4854  * @param[in] iface
4855  *   VXLAN rule container pointer to release.
4856  */
4857 static void
4858 flow_tcf_encap_irule_release(struct tcf_irule *iface)
4859 {
4860         assert(iface->refcnt);
4861         if (--iface->refcnt == 0) {
4862                 /* Reference counter is zero, delete the container. */
4863                 assert(LIST_EMPTY(&iface->local));
4864                 assert(LIST_EMPTY(&iface->neigh));
4865                 LIST_REMOVE(iface, next);
4866                 rte_free(iface);
4867         }
4868 }
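
/*
 * Sketch of the expected acquire/release pairing (illustrative, error
 * handling elided):
 *
 *   struct tcf_irule *iface;
 *
 *   iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
 *   if (iface) {
 *           ... populate/drain iface->local and iface->neigh ...
 *           flow_tcf_encap_irule_release(iface);
 *   }
 */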
4869
4870 /**
4871  * Deletes VTEP network device.
4872  *
4873  * @param[in] tcf
4874  *   Context object initialized by mlx5_flow_tcf_context_create().
4875  * @param[in] vtep
4876  *   Object representing the network device to delete. Memory
4877  *   allocated for this object is freed by this routine.
4878  */
4879 static void
4880 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4881                      struct tcf_vtep *vtep)
4882 {
4883         struct nlmsghdr *nlh;
4884         struct ifinfomsg *ifm;
4885         alignas(struct nlmsghdr)
4886         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4887                     MNL_BUF_EXTRA_SPACE];
4888         int ret;
4889
4890         assert(!vtep->refcnt);
4891         /* Delete only ifaces we actually created. */
4892         if (vtep->created && vtep->ifindex) {
4893                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4894                 nlh = mnl_nlmsg_put_header(buf);
4895                 nlh->nlmsg_type = RTM_DELLINK;
4896                 nlh->nlmsg_flags = NLM_F_REQUEST;
4897                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4898                 ifm->ifi_family = AF_UNSPEC;
4899                 ifm->ifi_index = vtep->ifindex;
4900                 assert(sizeof(buf) >= nlh->nlmsg_len);
4901                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4902                 if (ret)
4903                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4904                                          " encap/decap ifindex %u",
4905                                          ifm->ifi_index);
4906         }
4907         rte_free(vtep);
4908 }
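
/*
 * The RTM_DELLINK request above is roughly equivalent to the following
 * iproute2 command (illustration only):
 *
 *   ip link del dev <vtep_ifname>
 */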
4909
4910 /**
4911  * Creates VTEP network device.
4912  *
4913  * @param[in] tcf
4914  *   Context object initialized by mlx5_flow_tcf_context_create().
4915  * @param[in] ifouter
4916  *   Outer interface to attach the newly created VXLAN device to.
4917  *   If zero, the VXLAN device will not be attached to any device.
4918  *   These VTEPs are used for decapsulation and can be precreated
4919  *   and shared between processes.
4920  * @param[in] port
4921  *   UDP port of created VTEP device.
4922  * @param[out] error
4923  *   Perform verbose error reporting if not NULL.
4924  *
4925  * @return
4926  *   Pointer to created device structure on success,
4927  *   NULL otherwise and rte_errno is set.
4928  */
4929 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4930 static struct tcf_vtep*
4931 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4932                      unsigned int ifouter,
4933                      uint16_t port, struct rte_flow_error *error)
4934 {
4935         struct tcf_vtep *vtep;
4936         struct nlmsghdr *nlh;
4937         struct ifinfomsg *ifm;
4938         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4939         alignas(struct nlmsghdr)
4940         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4941                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4942                     SZ_NLATTR_NEST * 2 +
4943                     SZ_NLATTR_STRZ_OF("vxlan") +
4944                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4945                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4946                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4947                     MNL_BUF_EXTRA_SPACE];
4948         struct nlattr *na_info;
4949         struct nlattr *na_vxlan;
4950         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4951         int ret;
4952
4953         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4954         if (!vtep) {
4955                 rte_flow_error_set(error, ENOMEM,
4956                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4957                                    "unable to allocate memory for VTEP");
4958                 return NULL;
4959         }
4960         *vtep = (struct tcf_vtep){
4961                         .port = port,
4962                         .local = LIST_HEAD_INITIALIZER(),
4963                         .neigh = LIST_HEAD_INITIALIZER(),
4964         };
4965         memset(buf, 0, sizeof(buf));
4966         nlh = mnl_nlmsg_put_header(buf);
4967         nlh->nlmsg_type = RTM_NEWLINK;
4968         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4969         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4970         ifm->ifi_family = AF_UNSPEC;
4971         ifm->ifi_type = 0;
4972         ifm->ifi_index = 0;
4973         ifm->ifi_flags = IFF_UP;
4974         ifm->ifi_change = 0xffffffff;
4975         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4976         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4977         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4978         assert(na_info);
4979         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4980         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4981         if (ifouter)
4982                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4983         assert(na_vxlan);
4984         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4985         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4986         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4987         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4988         mnl_attr_nest_end(nlh, na_vxlan);
4989         mnl_attr_nest_end(nlh, na_info);
4990         assert(sizeof(buf) >= nlh->nlmsg_len);
4991         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4992         if (ret) {
4993                 DRV_LOG(WARNING,
4994                         "netlink: VTEP %s create failure (%d)",
4995                         name, rte_errno);
4996                 if (rte_errno != EEXIST || ifouter)
4997                         /*
4998                          * Some unhandled error occurred or device is
4999                          * for encapsulation and cannot be shared.
5000                          */
5001                         goto error;
5002         } else {
5003                 /*
5004                  * Mark the device as actually created by us.
5005                  * It must be explicitly deleted
5006                  * when no longer needed.
5007                  */
5008                 vtep->created = 1;
5009         }
5010         /* Try to get ifindex of created or pre-existing device. */
5011         ret = if_nametoindex(name);
5012         if (!ret) {
5013                 DRV_LOG(WARNING,
5014                         "VTEP %s failed to get index (%d)", name, errno);
5015                 rte_flow_error_set
5016                         (error, -errno,
5017                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5018                          "netlink: failed to retrieve VTEP ifindex");
5019                 goto error;
5020         }
5021         vtep->ifindex = ret;
5022         vtep->ifouter = ifouter;
5023         memset(buf, 0, sizeof(buf));
5024         nlh = mnl_nlmsg_put_header(buf);
5025         nlh->nlmsg_type = RTM_NEWLINK;
5026         nlh->nlmsg_flags = NLM_F_REQUEST;
5027         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5028         ifm->ifi_family = AF_UNSPEC;
5029         ifm->ifi_type = 0;
5030         ifm->ifi_index = vtep->ifindex;
5031         ifm->ifi_flags = IFF_UP;
5032         ifm->ifi_change = IFF_UP;
5033         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5034         if (ret) {
5035                 rte_flow_error_set(error, -errno,
5036                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5037                                    "netlink: failed to set VTEP link up");
5038                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5039                         name, rte_errno);
5040                 goto clean;
5041         }
5042         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5043         if (ret) {
5044                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5045                 goto clean;
5046         }
5047         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5048         vtep->refcnt = 1;
5049         return vtep;
5050 clean:
5051         flow_tcf_vtep_delete(tcf, vtep);
5052         return NULL;
5053 error:
5054         rte_free(vtep);
5055         return NULL;
5056 }
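
/*
 * The two Netlink exchanges above roughly mirror these iproute2 commands
 * (sketch; "external" corresponds to IFLA_VXLAN_COLLECT_METADATA and
 * "nolearning" to IFLA_VXLAN_LEARNING = 0, the device name is
 * MLX5_VXLAN_DEVICE_PFX followed by the UDP port number):
 *
 *   ip link add <name> type vxlan dstport <port> external nolearning \
 *           [dev <ifouter>]
 *   ip link set dev <name> up
 */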
5057 #else
5058 static struct tcf_vtep*
5059 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
5060                      unsigned int ifouter __rte_unused,
5061                      uint16_t port __rte_unused,
5062                      struct rte_flow_error *error)
5063 {
5064         rte_flow_error_set(error, ENOTSUP,
5065                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5066                            "netlink: failed to create VTEP, "
5067                            "vxlan metadata are not supported by kernel");
5068         return NULL;
5069 }
5070 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
5071
5072 /**
5073  * Acquire target interface index for VXLAN tunneling decapsulation.
5074  * In order to share the UDP port with other interfaces, the VXLAN
5075  * device is created without being attached to any interface (if created).
5076  *
5077  * @param[in] tcf
5078  *   Context object initialized by mlx5_flow_tcf_context_create().
5079  * @param[in] dev_flow
5080  *   Flow tcf object with tunnel structure pointer set.
5081  * @param[out] error
5082  *   Perform verbose error reporting if not NULL.
5083  * @return
5084  *   Interface descriptor pointer on success,
5085  *   NULL otherwise and rte_errno is set.
5086  */
5087 static struct tcf_vtep*
5088 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5089                             struct mlx5_flow *dev_flow,
5090                             struct rte_flow_error *error)
5091 {
5092         struct tcf_vtep *vtep;
5093         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5094
5095         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5096                 if (vtep->port == port)
5097                         break;
5098         }
5099         if (vtep && vtep->ifouter) {
5100                 rte_flow_error_set(error, -errno,
5101                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5102                                    "failed to create decap VTEP with specified"
5103                                    " UDP port, attached device exists");
5104                 return NULL;
5105         }
5106         if (vtep) {
5107                 /* Device exists, just increment the reference counter. */
5108                 vtep->refcnt++;
5109                 assert(vtep->ifindex);
5110                 return vtep;
5111         }
5112         /* No decapsulation device exists, try to create a new one. */
5113         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
5114         if (vtep)
5115                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5116         return vtep;
5117 }
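
/*
 * Illustration only: decap VTEPs are keyed by UDP port, so two flows
 * decapsulating on the same port share one unattached device (flow_a
 * and flow_b are hypothetical dev_flow objects):
 *
 *   vtep = flow_tcf_decap_vtep_acquire(tcf, flow_a, e); - created, refcnt 1
 *   vtep = flow_tcf_decap_vtep_acquire(tcf, flow_b, e); - reused, refcnt 2
 */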
5118
5119 /**
5120  * Acquire target interface index for VXLAN tunneling encapsulation.
5121  *
5122  * @param[in] tcf
5123  *   Context object initialized by mlx5_flow_tcf_context_create().
5124  * @param[in] ifouter
5125  *   Network interface index to attach VXLAN encap device to.
5126  * @param[in] dev_flow
5127  *   Flow tcf object with tunnel structure pointer set.
5128  * @param[out] error
5129  *   Perform verbose error reporting if not NULL.
5130  * @return
5131  *   Interface descriptor pointer on success,
5132  *   NULL otherwise and rte_errno is set.
5133  */
5134 static struct tcf_vtep*
5135 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5136                             unsigned int ifouter,
5137                             struct mlx5_flow *dev_flow __rte_unused,
5138                             struct rte_flow_error *error)
5139 {
5140         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
5141         struct tcf_vtep *vtep;
5142         struct tcf_irule *iface;
5143         int ret;
5144
5145         assert(ifouter);
5146         /* Look whether the attached VTEP for encap is created. */
5147         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5148                 if (vtep->ifouter == ifouter)
5149                         break;
5150         }
5151         if (vtep) {
5152                 /* VTEP already exists, just increment the reference. */
5153                 vtep->refcnt++;
5154         } else {
5155                 uint16_t pcnt;
5156
5157                 /* Not found, create a new attached VTEP. */
5158                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5159                 flow_tcf_encap_local_cleanup(tcf, ifouter);
5160                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5161                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
5162                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
5163                         encap_port++;
5164                         /* Wrap around the UDP port index. */
5165                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
5166                             encap_port > MLX5_VXLAN_PORT_MAX)
5167                                 encap_port = MLX5_VXLAN_PORT_MIN;
5168                         /* Check whether the UDP port is already in use. */
5169                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5170                                 if (vtep->port == encap_port)
5171                                         break;
5172                         }
5173                         if (vtep) {
5174                                 /* Port is in use, try the next one. */
5175                                 vtep = NULL;
5176                                 continue;
5177                         }
5178                         vtep = flow_tcf_vtep_create(tcf, ifouter,
5179                                                     encap_port, error);
5180                         if (vtep) {
5181                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5182                                 break;
5183                         }
5184                         if (rte_errno != EEXIST)
5185                                 break;
5186                 }
5187                 if (!vtep)
5188                         return NULL;
5189         }
5190         assert(vtep->ifouter == ifouter);
5191         assert(vtep->ifindex);
5192         iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5193         if (!iface) {
5194                 if (--vtep->refcnt == 0)
5195                         flow_tcf_vtep_delete(tcf, vtep);
5196                 return NULL;
5197         }
5198         dev_flow->tcf.vxlan_encap->iface = iface;
5199         /* Create local ipaddr with peer to specify the outer IPs. */
5200         ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5201         if (!ret) {
5202                 /* Create neigh rule to specify outer destination MAC. */
5203                 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5204                 if (ret)
5205                         flow_tcf_encap_local(tcf, iface,
5206                                              dev_flow, false, error);
5207         }
5208         if (ret) {
5209                 dev_flow->tcf.vxlan_encap->iface = NULL;
5210                 flow_tcf_encap_irule_release(iface);
5211                 if (--vtep->refcnt == 0)
5212                         flow_tcf_vtep_delete(tcf, vtep);
5213                 return NULL;
5214         }
5215         return vtep;
5216 }
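
/*
 * Note on the UDP port allocation above (behavioral sketch): encap_port
 * persists across calls, so successive VTEP creations walk the
 * [MLX5_VXLAN_PORT_MIN, MLX5_VXLAN_PORT_MAX] range in a round-robin
 * fashion, skipping ports already present in vtep_list_vxlan and giving
 * up after one full scan or on any error other than EEXIST.
 */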
5217
5218 /**
5219  * Acquires target interface index for tunneling of any type.
5220  * Creates a new VTEP if needed.
5221  *
5222  * @param[in] tcf
5223  *   Context object initialized by mlx5_flow_tcf_context_create().
5224  * @param[in] ifouter
5225  *   Network interface index to attach VXLAN encap device to.
5226  * @param[in] dev_flow
5227  *   Flow tcf object with tunnel structure pointer set.
5228  * @param[out] error
5229  *   Perform verbose error reporting if not NULL.
5230  * @return
5231  *   Interface descriptor pointer on success,
5232  *   NULL otherwise and rte_errno is set.
5233  */
5234 static struct tcf_vtep*
5235 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5236                       unsigned int ifouter,
5237                       struct mlx5_flow *dev_flow,
5238                       struct rte_flow_error *error)
5239 {
5240         struct tcf_vtep *vtep = NULL;
5241
5242         assert(dev_flow->tcf.tunnel);
5243         pthread_mutex_lock(&vtep_list_mutex);
5244         switch (dev_flow->tcf.tunnel->type) {
5245         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5246                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5247                                                   dev_flow, error);
5248                 break;
5249         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5250                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5251                 break;
5252         default:
5253                 rte_flow_error_set(error, ENOTSUP,
5254                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5255                                    "unsupported tunnel type");
5256                 break;
5257         }
5258         pthread_mutex_unlock(&vtep_list_mutex);
5259         return vtep;
5260 }
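
/*
 * Usage sketch (illustrative): flow_tcf_apply() and flow_tcf_remove()
 * below pair these calls around the lifetime of an applied tunnel rule:
 *
 *   dev_flow->tcf.tunnel->vtep =
 *           flow_tcf_vtep_acquire(ctx, ifindex_org, dev_flow, error);
 *   ... rule applied via RTM_NEWTFILTER ...
 *   flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep, dev_flow);
 */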
5261
5262 /**
5263  * Release tunneling interface by ifindex. Decrements reference
5264  * counter and actually removes the device if counter is zero.
5265  *
5266  * @param[in] tcf
5267  *   Context object initialized by mlx5_flow_tcf_context_create().
5268  * @param[in] vtep
5269  *   VTEP device descriptor structure.
5270  * @param[in] dev_flow
5271  *   Flow tcf object with tunnel structure pointer set.
5272  */
5273 static void
5274 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5275                       struct tcf_vtep *vtep,
5276                       struct mlx5_flow *dev_flow)
5277 {
5278         assert(dev_flow->tcf.tunnel);
5279         pthread_mutex_lock(&vtep_list_mutex);
5280         switch (dev_flow->tcf.tunnel->type) {
5281         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5282                 break;
5283         case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5284                 struct tcf_irule *iface;
5285
5286                 /* Remove the encap ancillary rules first. */
5287                 iface = dev_flow->tcf.vxlan_encap->iface;
5288                 assert(iface);
5289                 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5290                 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5291                 flow_tcf_encap_irule_release(iface);
5292                 dev_flow->tcf.vxlan_encap->iface = NULL;
5293                 break;
5294         }
5295         default:
5296                 assert(false);
5297                 DRV_LOG(WARNING, "Unsupported tunnel type");
5298                 break;
5299         }
5300         assert(vtep->refcnt);
5301         if (--vtep->refcnt == 0) {
5302                 LIST_REMOVE(vtep, next);
5303                 flow_tcf_vtep_delete(tcf, vtep);
5304         }
5305         pthread_mutex_unlock(&vtep_list_mutex);
5306 }
5307
5308 struct tcf_nlcb_query {
5309         uint32_t handle;
5310         uint32_t tc_flags;
5311         uint32_t flags_valid:1;
5312 };
5313
5314 /**
5315  * Collect queried rule attributes. This is a callback routine called by
5316  * libmnl mnl_cb_run() in a loop for every message in a received packet.
5317  * The current implementation collects the flower flags only.
5318  *
5319  * @param[in] nlh
5320  *   Pointer to reply header.
5321  * @param[in, out] arg
5322  *   Context pointer for this callback.
5323  *
5324  * @return
5325  *   A positive, nonzero value on success (required by libmnl
5326  *   to continue message processing).
5327  */
5328 static int
5329 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5330 {
5331         struct tcf_nlcb_query *query = arg;
5332         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5333         struct nlattr *na, *na_opt;
5334         bool flower = false;
5335
5336         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5337             tcm->tcm_handle != query->handle)
5338                 return 1;
5339         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5340                 switch (mnl_attr_get_type(na)) {
5341                 case TCA_KIND:
5342                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5343                                 /* Not flower filter, drop entire message. */
5344                                 return 1;
5345                         }
5346                         flower = true;
5347                         break;
5348                 case TCA_OPTIONS:
5349                         if (!flower) {
5350                                 /* Not flower options, drop entire message. */
5351                                 return 1;
5352                         }
5353                         /* Check nested flower options. */
5354                         mnl_attr_for_each_nested(na_opt, na) {
5355                                 switch (mnl_attr_get_type(na_opt)) {
5356                                 case TCA_FLOWER_FLAGS:
5357                                         query->flags_valid = 1;
5358                                         query->tc_flags =
5359                                                 mnl_attr_get_u32(na_opt);
5360                                         break;
5361                                 }
5362                         }
5363                         break;
5364                 }
5365         }
5366         return 1;
5367 }
5368
5369 /**
5370  * Query TC flower rule flags via Netlink.
5371  *
5372  * @param[in] tcf
5373  *   Context object initialized by mlx5_flow_tcf_context_create().
5374  * @param[in] dev_flow
5375  *   Pointer to the flow.
5376  * @param[out] pflags
5377  *   Pointer to the data retrieved by the query.
5378  *
5379  * @return
5380  *   0 on success, a negative errno value otherwise.
5381  */
5382 static int
5383 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5384                      struct mlx5_flow *dev_flow,
5385                      uint32_t *pflags)
5386 {
5387         struct nlmsghdr *nlh;
5388         struct tcmsg *tcm;
5389         struct tcf_nlcb_query query = {
5390                 .handle = dev_flow->tcf.tcm->tcm_handle,
5391         };
5392
5393         nlh = mnl_nlmsg_put_header(tcf->buf);
5394         nlh->nlmsg_type = RTM_GETTFILTER;
5395         nlh->nlmsg_flags = NLM_F_REQUEST;
5396         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5397         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5398         /*
5399          * Ignore Netlink error for filter query operations.
5400          * The reply length is sent by kernel as errno.
5401          * Just check we got the flags option.
5402          */
5403         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5404         if (!query.flags_valid) {
5405                 *pflags = 0;
5406                 return -ENOENT;
5407         }
5408         *pflags = query.tc_flags;
5409         return 0;
5410 }
5411
5412 /**
5413  * Query and check whether the in_hw flag is set for the specified rule.
5414  *
5415  * @param[in] tcf
5416  *   Context object initialized by mlx5_flow_tcf_context_create().
5417  * @param[in] dev_flow
5418  *   Pointer to the flow to check.
5419  *
5420  * @return
5421  *   0 on success, a negative errno value otherwise.
5422  */
5423 static int
5424 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5425                     struct mlx5_flow *dev_flow)
5426 {
5427         uint32_t flags;
5428         int ret;
5429
5430         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5431         if (ret)
5432                 return ret;
5433         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5434 }
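
/*
 * For reference, the same state can be inspected from the command line
 * (illustration; recent tc versions print "in_hw" or "not_in_hw" for
 * flower rules):
 *
 *   tc -s filter show dev <ifname> ingress
 */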
5435
5436 /**
5437  * Remove flow from E-Switch by sending Netlink message.
5438  *
5439  * @param[in] dev
5440  *   Pointer to Ethernet device.
5441  * @param[in, out] flow
5442  *   Pointer to the sub flow.
5443  */
5444 static void
5445 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5446 {
5447         struct priv *priv = dev->data->dev_private;
5448         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5449         struct mlx5_flow *dev_flow;
5450         struct nlmsghdr *nlh;
5451
5452         if (!flow)
5453                 return;
5454         dev_flow = LIST_FIRST(&flow->dev_flows);
5455         if (!dev_flow)
5456                 return;
5457         /* E-Switch flow can't be expanded. */
5458         assert(!LIST_NEXT(dev_flow, next));
5459         if (dev_flow->tcf.applied) {
5460                 nlh = dev_flow->tcf.nlh;
5461                 nlh->nlmsg_type = RTM_DELTFILTER;
5462                 nlh->nlmsg_flags = NLM_F_REQUEST;
5463                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5464                 if (dev_flow->tcf.tunnel) {
5465                         assert(dev_flow->tcf.tunnel->vtep);
5466                         flow_tcf_vtep_release(ctx,
5467                                 dev_flow->tcf.tunnel->vtep,
5468                                 dev_flow);
5469                         dev_flow->tcf.tunnel->vtep = NULL;
5470                 }
5471                 dev_flow->tcf.applied = 0;
5472         }
5473 }
5474
5475 /**
5476  * Apply flow to E-Switch by sending Netlink message.
5477  *
5478  * @param[in] dev
5479  *   Pointer to Ethernet device.
5480  * @param[in, out] flow
5481  *   Pointer to the sub flow.
5482  * @param[out] error
5483  *   Pointer to the error structure.
5484  *
5485  * @return
5486  *   0 on success, a negative errno value otherwise and rte_errno is set.
5487  */
5488 static int
5489 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5490                struct rte_flow_error *error)
5491 {
5492         struct priv *priv = dev->data->dev_private;
5493         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5494         struct mlx5_flow *dev_flow;
5495         struct nlmsghdr *nlh;
5496
5497         dev_flow = LIST_FIRST(&flow->dev_flows);
5498         /* E-Switch flow can't be expanded. */
5499         assert(!LIST_NEXT(dev_flow, next));
5500         if (dev_flow->tcf.applied)
5501                 return 0;
5502         nlh = dev_flow->tcf.nlh;
5503         nlh->nlmsg_type = RTM_NEWTFILTER;
5504         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5505         if (dev_flow->tcf.tunnel) {
5506                 /*
5507                  * Replace the interface index, target for
5508                  * encapsulation, source for decapsulation.
5509                  */
5510                 assert(!dev_flow->tcf.tunnel->vtep);
5511                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5512                 /* Acquire actual VTEP device when rule is being applied. */
5513                 dev_flow->tcf.tunnel->vtep =
5514                         flow_tcf_vtep_acquire(ctx,
5515                                         dev_flow->tcf.tunnel->ifindex_org,
5516                                         dev_flow, error);
5517                 if (!dev_flow->tcf.tunnel->vtep)
5518                         return -rte_errno;
5519                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5520                                 dev_flow->tcf.tunnel->vtep->ifindex,
5521                                 dev_flow->tcf.tunnel->ifindex_org);
5522                 *dev_flow->tcf.tunnel->ifindex_ptr =
5523                         dev_flow->tcf.tunnel->vtep->ifindex;
5524         }
5525         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5526                 dev_flow->tcf.applied = 1;
5527                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5528                         return 0;
5529                 /*
5530                  * Rule was applied without the skip_sw flag set.
5531                  * We should check whether the rule was actually
5532                  * accepted by hardware (check the in_hw flag).
5533                  */
5534                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5535                         flow_tcf_remove(dev, flow);
5536                         return rte_flow_error_set
5537                                 (error, ENOENT,
5538                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5539                                  "netlink: rule has no in_hw flag set");
5540                 }
5541                 return 0;
5542         }
5543         if (dev_flow->tcf.tunnel) {
5544                 /* Rollback the VTEP configuration if rule apply failed. */
5545                 assert(dev_flow->tcf.tunnel->vtep);
5546                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5547                                       dev_flow);
5548                 dev_flow->tcf.tunnel->vtep = NULL;
5549         }
5550         return rte_flow_error_set(error, rte_errno,
5551                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5552                                   "netlink: failed to create TC flow rule");
5553 }
5554
5555 /**
5556  * Remove flow from E-Switch and release resources of the device flow.
5557  *
5558  * @param[in] dev
5559  *   Pointer to Ethernet device.
5560  * @param[in, out] flow
5561  *   Pointer to the sub flow.
5562  */
5563 static void
5564 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5565 {
5566         struct mlx5_flow *dev_flow;
5567
5568         if (!flow)
5569                 return;
5570         flow_tcf_remove(dev, flow);
5571         if (flow->counter) {
5572                 if (--flow->counter->ref_cnt == 0) {
5573                         rte_free(flow->counter);
5574                         flow->counter = NULL;
5575                 }
5576         }
5577         dev_flow = LIST_FIRST(&flow->dev_flows);
5578         if (!dev_flow)
5579                 return;
5580         /* E-Switch flow can't be expanded. */
5581         assert(!LIST_NEXT(dev_flow, next));
5582         LIST_REMOVE(dev_flow, next);
5583         rte_free(dev_flow);
5584 }
5585
5586 /**
5587  * Helper routine for figuring out the size required for a parse buffer.
5588  *
5589  * @param array
5590  *   Array of values to use.
5591  * @param idx
5592  *   Current location in array.
5593  * @param value
5594  *   Value to compare with.
5595  *
5596  * @return
5597  *   The maximum between the given value and the array value on index.
5598  */
5599 static uint16_t
5600 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5601 {
5602         return idx < 0 ? value : RTE_MAX(array[idx], value);
5603 }
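
/*
 * Semantics sketch (hypothetical values):
 *
 *   uint16_t arr[] = { 4, 9 };
 *
 *   flow_tcf_arr_val_max(arr, 1, 7)  - returns 9, i.e. RTE_MAX(arr[1], 7)
 *   flow_tcf_arr_val_max(arr, -1, 7) - returns 7, negative index skips array
 */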
5604
5605 /**
5606  * Parse rtnetlink message attributes filling the attribute table with the info
5607  * retrieved.
5608  *
5609  * @param tb
5610  *   Attribute table to be filled.
5611  * @param max
5612  *   Maximum entry index in the attribute table.
5613  * @param rta
5614  *   The attributes section in the message to be parsed.
5615  * @param len
5616  *   The length of the attributes section in the message.
5617  */
5618 static void
5619 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5620                          struct rtattr *rta, int len)
5621 {
5622         unsigned short type;
5623         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5624         while (RTA_OK(rta, len)) {
5625                 type = rta->rta_type;
5626                 if (type <= max && !tb[type])
5627                         tb[type] = rta;
5628                 rta = RTA_NEXT(rta, len);
5629         }
5630 }
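
/*
 * Typical use (sketch, mirroring flow_tcf_nl_filter_parse_and_get()
 * further below):
 *
 *   struct rtattr *tb[TCA_MAX + 1];
 *
 *   flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
 *   if (tb[TCA_KIND] && !strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
 *           ... the message describes a flower classifier ...
 */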
5631
5632 /**
5633  * Extract flow counters from flower action.
5634  *
5635  * @param rta
5636  *   flower action stats properties in the Netlink message received.
5637  * @param rta_type
5638  *   The backward sequence of rta_types, as written in the attribute table,
5639  *   we need to traverse in order to get to the requested object.
5640  * @param idx
5641  *   Current location in rta_type table.
5642  * @param[out] data
5643  *   data holding the count statistics of the rte_flow retrieved from
5644  *   the message.
5645  *
5646  * @return
5647  *   0 if data was found and retrieved, -1 otherwise.
5648  */
5649 static int
5650 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5651                                        uint16_t rta_type[], int idx,
5652                                        struct gnet_stats_basic *data)
5653 {
5654         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5655                                                  TCA_STATS_BASIC);
5656         struct rtattr *tbs[tca_stats_max + 1];
5657
5658         if (rta == NULL || idx < 0)
5659                 return -1;
5660         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5661                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5662         switch (rta_type[idx]) {
5663         case TCA_STATS_BASIC:
5664                 if (tbs[TCA_STATS_BASIC]) {
5665                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5666                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5667                                sizeof(*data)));
5668                         return 0;
5669                 }
5670                 break;
5671         default:
5672                 break;
5673         }
5674         return -1;
5675 }
5676
5677 /**
5678  * Parse flower single action retrieving the requested action attribute,
5679  * if found.
5680  *
5681  * @param arg
5682  *   flower action properties in the Netlink message received.
5683  * @param rta_type
5684  *   The backward sequence of rta_types, as written in the attribute table,
5685  *   we need to traverse in order to get to the requested object.
5686  * @param idx
5687  *   Current location in rta_type table.
5688  * @param[out] data
5689  *   Count statistics retrieved from the message query.
5690  *
5691  * @return
5692  *   0 if data was found and retrieved, -1 otherwise.
5693  */
5694 static int
5695 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5696                                      uint16_t rta_type[], int idx, void *data)
5697 {
5698         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5699         struct rtattr *tb[tca_act_max + 1];
5700
5701         if (arg == NULL || idx < 0)
5702                 return -1;
5703         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5704                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5705         if (tb[TCA_ACT_KIND] == NULL)
5706                 return -1;
5707         switch (rta_type[idx]) {
5708         case TCA_ACT_STATS:
5709                 if (tb[TCA_ACT_STATS])
5710                         return flow_tcf_nl_action_stats_parse_and_get
5711                                         (tb[TCA_ACT_STATS],
5712                                          rta_type, --idx,
5713                                          (struct gnet_stats_basic *)data);
5714                 break;
5715         default:
5716                 break;
5717         }
5718         return -1;
5719 }
5720
5721 /**
5722  * Parse flower action section in the message retrieving the requested
5723  * attribute from the first action that provides it.
5724  *
5725  * @param arg
5726  *   flower action section in the Netlink message received.
5727  * @param rta_type
5728  *   The backward sequence of rta_types, as written in the attribute table,
5729  *   we need to traverse in order to get to the requested object.
5730  * @param idx
5731  *   Current location in rta_type table.
5732  * @param[out] data
5733  *   data retrieved from the message query.
5734  *
5735  * @return
5736  *   0 if data was found and retrieved, -1 otherwise.
5737  */
5738 static int
5739 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5740                                  uint16_t rta_type[], int idx, void *data)
5741 {
5742         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5743         int i;
5744
5745         if (arg == NULL || idx < 0)
5746                 return -1;
5747         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5748                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5749         switch (rta_type[idx]) {
5750         /*
5751          * flow counters are stored in the actions defined by the flow
5752          * and not in the flow itself, therefore we need to traverse the
5753          * flower chain of actions in search for them.
5754          *
5755          * Note that the index is not decremented here.
5756          */
5757         case TCA_ACT_STATS:
5758                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5759                         if (tb[i] &&
5760                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5761                                                               rta_type,
5762                                                               idx, data))
5763                                 return 0;
5764                 }
5765                 break;
5766         default:
5767                 break;
5768         }
5769         return -1;
5770 }
5771
5772 /**
5773  * Parse flower classifier options in the message, retrieving the requested
5774  * attribute if found.
5775  *
5776  * @param opt
5777  *   flower section in the Netlink message received.
5778  * @param rta_type
5779  *   The backward sequence of rta_types, as written in the attribute table,
5780  *   we need to traverse in order to get to the requested object.
5781  * @param idx
5782  *   Current location in rta_type table.
5783  * @param[out] data
5784  *   data retrieved from the message query.
5785  *
5786  * @return
5787  *   0 if data was found and retrieved, -1 otherwise.
5788  */
5789 static int
5790 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5791                                uint16_t rta_type[], int idx, void *data)
5792 {
5793         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5794                                                   TCA_FLOWER_ACT);
5795         struct rtattr *tb[tca_flower_max + 1];
5796
5797         if (!opt || idx < 0)
5798                 return -1;
5799         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5800                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5801         switch (rta_type[idx]) {
5802         case TCA_FLOWER_ACT:
5803                 if (tb[TCA_FLOWER_ACT])
5804                         return flow_tcf_nl_action_parse_and_get
5805                                                         (tb[TCA_FLOWER_ACT],
5806                                                          rta_type, --idx, data);
5807                 break;
5808         default:
5809                 break;
5810         }
5811         return -1;
5812 }
5813
5814 /**
5815  * Parse Netlink reply on filter query, retrieving the flow counters.
5816  *
5817  * @param cnlh
5818  *   Message received from Netlink.
5819  * @param rta_type
5820  *   The backward sequence of rta_types, as written in the attribute table,
5821  *   we need to traverse in order to get to the requested object.
5822  * @param idx
5823  *   Current location in rta_type table.
5824  * @param[out] data
5825  *   data retrieved from the message query.
5826  *
5827  * @return
5828  *   0 if data was found and retrieved, -1 otherwise.
5829  */
5830 static int
5831 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5832                                  uint16_t rta_type[], int idx, void *data)
5833 {
5834         struct nlmsghdr *nlh = cnlh;
5835         struct tcmsg *t = NLMSG_DATA(nlh);
5836         int len = nlh->nlmsg_len;
5837         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5838         struct rtattr *tb[tca_max + 1];
5839
5840         if (idx < 0)
5841                 return -1;
5842         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5843             nlh->nlmsg_type != RTM_GETTFILTER &&
5844             nlh->nlmsg_type != RTM_DELTFILTER)
5845                 return -1;
5846         len -= NLMSG_LENGTH(sizeof(*t));
5847         if (len < 0)
5848                 return -1;
5849         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5850         /* Not a TC flower flow - bail out */
5851         if (!tb[TCA_KIND] ||
5852             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5853                 return -1;
5854         switch (rta_type[idx]) {
5855         case TCA_OPTIONS:
5856                 if (tb[TCA_OPTIONS])
5857                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5858                                                               rta_type,
5859                                                               --idx, data);
5860                 break;
5861         default:
5862                 break;
5863         }
5864         return -1;
5865 }
5866
5867 /**
5868  * A callback to parse Netlink reply on TC flower query.
5869  *
5870  * @param nlh
5871  *   Message received from Netlink.
5872  * @param[out] data
5873  *   Pointer to the data area to be filled by the parsing routine,
5874  *   assumed to be a pointer to struct flow_tcf_stats_basic.
5875  *
5876  * @return
5877  *   MNL_CB_OK value.
5878  */
5879 static int
5880 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5881 {
5882         /*
5883          * The backward sequence of rta_types to pass in order to get
5884          * to the counters.
5885          */
5886         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5887                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5888         struct flow_tcf_stats_basic *sb_data = data;
5889         union {
5890                 const struct nlmsghdr *c;
5891                 struct nlmsghdr *nc;
5892         } tnlh = { .c = nlh };
5893
5894         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5895                                               RTE_DIM(rta_type) - 1,
5896                                               (void *)&sb_data->counters))
5897                 sb_data->valid = true;
5898         return MNL_CB_OK;
5899 }
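
/*
 * The rta_type[] chain above is walked from the last entry down, i.e.
 * the reply nesting is unwrapped in this order:
 *
 *   TCA_OPTIONS -> TCA_FLOWER_ACT -> TCA_ACT_STATS -> TCA_STATS_BASIC
 */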
5900
5901 /**
5902  * Query a TC flower rule for its statistics via netlink.
5903  *
5904  * @param[in] dev
5905  *   Pointer to Ethernet device.
5906  * @param[in] flow
5907  *   Pointer to the sub flow.
5908  * @param[out] data
5909  *   data retrieved by the query.
5910  * @param[out] error
5911  *   Perform verbose error reporting if not NULL.
5912  *
5913  * @return
5914  *   0 on success, a negative errno value otherwise and rte_errno is set.
5915  */
5916 static int
5917 flow_tcf_query_count(struct rte_eth_dev *dev,
5918                           struct rte_flow *flow,
5919                           void *data,
5920                           struct rte_flow_error *error)
5921 {
5922         struct flow_tcf_stats_basic sb_data;
5923         struct rte_flow_query_count *qc = data;
5924         struct priv *priv = dev->data->dev_private;
5925         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5926         struct mnl_socket *nl = ctx->nl;
5927         struct mlx5_flow *dev_flow;
5928         struct nlmsghdr *nlh;
5929         uint32_t seq = priv->tcf_context->seq++;
5930         ssize_t ret;
5931         assert(qc);
5932
5933         memset(&sb_data, 0, sizeof(sb_data));
5934         dev_flow = LIST_FIRST(&flow->dev_flows);
5935         /* E-Switch flow can't be expanded. */
5936         assert(!LIST_NEXT(dev_flow, next));
5937         if (!dev_flow->flow->counter)
5938                 goto notsup_exit;
5939         nlh = dev_flow->tcf.nlh;
5940         nlh->nlmsg_type = RTM_GETTFILTER;
5941         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5942         nlh->nlmsg_seq = seq;
5943         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5944                 goto error_exit;
5945         do {
5946                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5947                 if (ret <= 0)
5948                         break;
5949                 ret = mnl_cb_run(ctx->buf, ret, seq,
5950                                  mnl_socket_get_portid(nl),
5951                                  flow_tcf_nl_message_get_stats_basic,
5952                                  (void *)&sb_data);
5953         } while (ret > 0);
5955         if (sb_data.valid) {
5956                 /* Return the delta from last reset. */
5957                 qc->hits_set = 1;
5958                 qc->bytes_set = 1;
5959                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5960                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5961                 if (qc->reset) {
5962                         flow->counter->hits = sb_data.counters.packets;
5963                         flow->counter->bytes = sb_data.counters.bytes;
5964                 }
5965                 return 0;
5966         }
5967         return rte_flow_error_set(error, EINVAL,
5968                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5969                                   NULL,
5970                                   "flow does not have counter");
5971 error_exit:
5972         return rte_flow_error_set
5973                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5974                          NULL, "netlink: failed to read flow rule counters");
5975 notsup_exit:
5976         return rte_flow_error_set
5977                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5978                          NULL, "counters are not available");
5979 }
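
/*
 * Reached through the generic API roughly as follows (illustrative;
 * port_id, flow and error are application-side variables):
 *
 *   struct rte_flow_action actions[] = {
 *           { .type = RTE_FLOW_ACTION_TYPE_COUNT },
 *           { .type = RTE_FLOW_ACTION_TYPE_END },
 *   };
 *   struct rte_flow_query_count qc = { .reset = 1 };
 *
 *   rte_flow_query(port_id, flow, actions, &qc, &error);
 */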
5980
5981 /**
5982  * Query a flow.
5983  *
5984  * @see rte_flow_query()
5985  * @see rte_flow_ops
5986  */
5987 static int
5988 flow_tcf_query(struct rte_eth_dev *dev,
5989                struct rte_flow *flow,
5990                const struct rte_flow_action *actions,
5991                void *data,
5992                struct rte_flow_error *error)
5993 {
5994         int ret = -EINVAL;
5995
5996         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5997                 switch (actions->type) {
5998                 case RTE_FLOW_ACTION_TYPE_VOID:
5999                         break;
6000                 case RTE_FLOW_ACTION_TYPE_COUNT:
6001                         ret = flow_tcf_query_count(dev, flow, data, error);
6002                         break;
6003                 default:
6004                         return rte_flow_error_set(error, ENOTSUP,
6005                                                   RTE_FLOW_ERROR_TYPE_ACTION,
6006                                                   actions,
6007                                                   "action not supported");
6008                 }
6009         }
6010         return ret;
6011 }
6012
6013 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6014         .validate = flow_tcf_validate,
6015         .prepare = flow_tcf_prepare,
6016         .translate = flow_tcf_translate,
6017         .apply = flow_tcf_apply,
6018         .remove = flow_tcf_remove,
6019         .destroy = flow_tcf_destroy,
6020         .query = flow_tcf_query,
6021 };
6022
6023 /**
6024  * Create and configure a libmnl socket for Netlink flow rules.
6025  *
6026  * @return
6027  *   A valid libmnl socket object pointer on success, NULL otherwise and
6028  *   rte_errno is set.
6029  */
6030 static struct mnl_socket *
6031 flow_tcf_mnl_socket_create(void)
6032 {
6033         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6034
6035         if (nl) {
6036                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6037                                       sizeof(int));
6038                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6039                         return nl;
6040         }
6041         rte_errno = errno;
6042         if (nl)
6043                 mnl_socket_close(nl);
6044         return NULL;
6045 }
6046
6047 /**
6048  * Destroy a libmnl socket.
6049  *
6050  * @param nl
6051  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6052  */
6053 static void
6054 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6055 {
6056         if (nl)
6057                 mnl_socket_close(nl);
6058 }
6059
6060 /**
6061  * Initialize ingress qdisc of a given network interface.
6062  *
6063  * @param ctx
6064  *   Pointer to tc-flower context to use.
6065  * @param ifindex
6066  *   Index of network interface to initialize.
6067  * @param[out] error
6068  *   Perform verbose error reporting if not NULL.
6069  *
6070  * @return
6071  *   0 on success, a negative errno value otherwise and rte_errno is set.
6072  */
6073 int
6074 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6075                    unsigned int ifindex, struct rte_flow_error *error)
6076 {
6077         struct nlmsghdr *nlh;
6078         struct tcmsg *tcm;
6079         alignas(struct nlmsghdr)
6080         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6081                     SZ_NLATTR_STRZ_OF("ingress") +
6082                     MNL_BUF_EXTRA_SPACE];
6083
6084         /* Destroy existing ingress qdisc and everything attached to it. */
6085         nlh = mnl_nlmsg_put_header(buf);
6086         nlh->nlmsg_type = RTM_DELQDISC;
6087         nlh->nlmsg_flags = NLM_F_REQUEST;
6088         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6089         tcm->tcm_family = AF_UNSPEC;
6090         tcm->tcm_ifindex = ifindex;
6091         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6092         tcm->tcm_parent = TC_H_INGRESS;
6093         assert(sizeof(buf) >= nlh->nlmsg_len);
6094         /* Ignore errors when qdisc is already absent. */
6095         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6096             rte_errno != EINVAL && rte_errno != ENOENT)
6097                 return rte_flow_error_set(error, rte_errno,
6098                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6099                                           "netlink: failed to remove ingress"
6100                                           " qdisc");
6101         /* Create fresh ingress qdisc. */
6102         nlh = mnl_nlmsg_put_header(buf);
6103         nlh->nlmsg_type = RTM_NEWQDISC;
6104         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6105         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6106         tcm->tcm_family = AF_UNSPEC;
6107         tcm->tcm_ifindex = ifindex;
6108         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6109         tcm->tcm_parent = TC_H_INGRESS;
6110         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6111         assert(sizeof(buf) >= nlh->nlmsg_len);
6112         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6113                 return rte_flow_error_set(error, rte_errno,
6114                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6115                                           "netlink: failed to create ingress"
6116                                           " qdisc");
6117         return 0;
6118 }
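
/*
 * The two requests above are roughly equivalent to (illustration only):
 *
 *   tc qdisc del dev <ifname> ingress
 *   tc qdisc add dev <ifname> ingress
 */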
6119
6120 /**
6121  * Create libmnl context for Netlink flow rules.
6122  *
6123  * @return
6124  *   A valid tc-flower context object pointer on success, NULL otherwise
6125  *   and rte_errno is set.
6126  */
6127 struct mlx5_flow_tcf_context *
6128 mlx5_flow_tcf_context_create(void)
6129 {
6130         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6131                                                         sizeof(*ctx),
6132                                                         sizeof(uint32_t));
6133         if (!ctx)
6134                 goto error;
6135         ctx->nl = flow_tcf_mnl_socket_create();
6136         if (!ctx->nl)
6137                 goto error;
6138         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6139         ctx->buf = rte_zmalloc(__func__,
6140                                ctx->buf_size, sizeof(uint32_t));
6141         if (!ctx->buf)
6142                 goto error;
6143         ctx->seq = random();
6144         return ctx;
6145 error:
6146         mlx5_flow_tcf_context_destroy(ctx);
6147         return NULL;
6148 }
6149
6150 /**
6151  * Destroy a libmnl context.
6152  *
6153  * @param ctx
6154  *   Pointer to tc-flower context to destroy.
6155  */
6156 void
6157 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6158 {
6159         if (!ctx)
6160                 return;
6161         flow_tcf_mnl_socket_destroy(ctx->nl);
6162         rte_free(ctx->buf);
6163         rte_free(ctx);
6164 }