net/mlx5: optimize neigh and local encap rules search
drivers/net/mlx5/mlx5_flow_tcf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* offset */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** UDP port range of VXLAN devices created by driver. */
#define MLX5_VXLAN_PORT_MIN 30000
#define MLX5_VXLAN_PORT_MAX 60000
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
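
/*
 * Illustration (not from the original code): a plain IPv4/UDP VXLAN
 * encapsulation descriptor would typically set
 *   FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST |
 *   FLOW_TCF_ENCAP_UDP_DST | FLOW_TCF_ENCAP_VXLAN_VNI
 * in @p mask, plus the ETH_SRC/ETH_DST bits when L2 addresses are
 * supplied explicitly by the application.
 */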

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8KB) buffer size ensures that netlink messages will never be
 * truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * the outer tunnel iface in order to provide the destination MAC
 * address for the VXLAN encapsulation. The neigh rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port;
        uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty = {
        {0},
};

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
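
/*
 * Worked example (illustrative): SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) is
 * MNL_ALIGN(4 + 6) = 12 bytes -- the 4-byte nlattr header plus a 6-byte
 * payload, rounded up to the 4-byte netlink alignment.
 */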

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation in the driver/FW, priority ranges from 1 to 16 in
 * the kernel. Priority in the rte_flow attribute starts from 0 and 1 is
 * added during translation. This is subject to change: the maximum
 * priority may be determined by trial and error, as in the Verbs driver,
 * once the restriction is lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
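
/*
 * Examples: NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) = 6/4 + 1 = 2 keys for a
 * MAC address, NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN) = 16/4 = 4 keys for an
 * IPv6 address; each pedit key rewrites one 32-bit word.
 */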

struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-Switch counters cannot be shared and their IDs are unknown;
         * all are currently returned with ID 0. Switching to unique IDs
         * may be better in the future.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}
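
/*
 * Illustration: a 6-byte MAC rewrite is emitted above as two 32-bit
 * keys. The first covers bytes 0-3 with mask 0 (~UINT32_MAX), replacing
 * the whole word. The second covers bytes 4-7 with mask 0xFFFF0000;
 * since pedit applies *ptr = (*ptr & mask) ^ val, the set mask bits keep
 * the original data, preserving (on little-endian hosts) the two bytes
 * of the neighbouring field that share this word.
 */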

/**
 * Set pedit key of decrease/set ttl
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* Offset of src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of ipv6 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}
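
/*
 * Example: IPV6_ADDR_LEN is 16, so the loop above generates exactly
 * NUM_OF_PEDIT_KEYS(16) = 4 keys, each fully overwriting one 32-bit
 * word of the address (mask 0 keeps nothing from the original packet).
 */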

/**
 * Set pedit key of ipv4 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's na attribute in netlink message
 * on a pre-allocated message buffer
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification.
 * @param[in] item_flags
 *   flags of all items presented
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* all modify header actions should be in one tc-pedit action */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
        (*actions)--;
}
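
/*
 * Resulting netlink attribute layout produced above (sketch):
 *
 *   TCA_ACT_KIND    = "pedit"
 *   TCA_ACT_OPTIONS (nest)
 *     TCA_PEDIT_PARMS_EX  = tc_pedit_sel + nkeys * tc_pedit_key
 *     TCA_PEDIT_KEYS_EX (nest)
 *       TCA_PEDIT_KEY_EX (nest) { HTYPE, CMD }  -- one per key
 */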

/**
 * Calculate max memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   actions specification.
 * @param[in,out] action_flags
 *   actions flags
 *
 * @return
 *   Max memory size of one TC-pedit action
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        (*actions)--;
        return pedit_size;
}

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}
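
/*
 * Typical use (sketch), e.g. when validating an IPv4 item:
 *
 *   mask = flow_tcf_item_mask(items, &rte_flow_item_ipv4_mask,
 *                             &flow_tcf_mask_supported.ipv4,
 *                             &flow_tcf_mask_empty.ipv4,
 *                             sizeof(flow_tcf_mask_supported.ipv4),
 *                             error);
 *   if (!mask)
 *           return -rte_errno;
 */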

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}
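
/*
 * Example result for a two-port switch domain queried from port 1
 * (ifindex values are illustrative):
 *
 *   ptoi[0] = { .port_id = 1, .ifindex = 7 }   <- own device first
 *   ptoi[1] = { .port_id = 0, .ifindex = 6 }
 *   ptoi[2] = { .port_id = 0, .ifindex = 0 }   <- zero-ifindex terminator
 */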

/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * group is supported only if kernel supports chain. Don't care about
         * transfer as it is the caller's problem.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
                                          attr, "egress is not supported");
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for L2 addresses can be empty
                 * because these are optional and not required
                 * directly by the tc rule. The kernel tries to
                 * resolve them on its own.
                 */
                return 0;
        }
        if (!mask) {
                /* If mask is not specified use the default one. */
                mask = &rte_flow_item_eth_mask;
        }
        if (memcmp(&mask->dst,
                   &flow_tcf_mask_empty.eth.dst,
                   sizeof(flow_tcf_mask_empty.eth.dst))) {
                if (memcmp(&mask->dst,
                           &rte_flow_item_eth_mask.dst,
                           sizeof(rte_flow_item_eth_mask.dst)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.dst\" field");
        }
        if (memcmp(&mask->src,
                   &flow_tcf_mask_empty.eth.src,
                   sizeof(flow_tcf_mask_empty.eth.src))) {
                if (memcmp(&mask->src,
                           &rte_flow_item_eth_mask.src,
                           sizeof(rte_flow_item_eth_mask.src)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.src\" field");
        }
        if (mask->type != RTE_BE16(0x0000)) {
                if (mask->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.type\" field");
                DRV_LOG(WARNING,
                        "outer ethernet type field"
                        " cannot be forced for vxlan"
                        " encapsulation, parameter ignored");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv4 address"
                                          " specification for vxlan"
                                          " encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv4_mask;
        if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.dst_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address, so IP destination address must be
                 * specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.src_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv6 address"
                                          " specification for"
                                          " vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv6_mask;
        if (memcmp(&mask->hdr.dst_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.dst_addr,
                           &rte_flow_item_ipv6_mask.hdr.dst_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.dst_addr\" field"
                                         " for vxlan encapsulation");
                /* More IPv6 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address (neigh or gateway), so the IP destination
                 * address must be specified within the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv6 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (memcmp(&mask->hdr.src_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.src_addr,
                           &rte_flow_item_ipv6_mask.hdr.src_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.src_addr\" field"
                                         " for vxlan encapsulation");
                /* More L3 address validation can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer L3 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
 * The routine checks the UDP fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_udp *spec = item->spec;
        const struct rte_flow_item_udp *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for UDP ports cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL UDP port specification"
                                          " for vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_udp_mask;
        if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
                if (mask->hdr.dst_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.dst_port\" field"
                                         " for vxlan encapsulation");
                if (!spec->hdr.dst_port)
                        return rte_flow_error_set
                                        (error, EINVAL,
                                         RTE_FLOW_ERROR_TYPE_ITEM, item,
                                         "outer UDP remote port cannot be"
                                         " 0 for vxlan encapsulation");
        } else {
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer UDP remote port"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_port != RTE_BE16(0x0000)) {
                if (mask->hdr.src_port != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"udp.hdr.src_port\" field"
                                         " for vxlan encapsulation");
                DRV_LOG(WARNING,
                        "outer UDP source port cannot be"
                        " forced for vxlan encapsulation,"
                        " parameter ignored");
        }
        return 0;
}
1427
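/*
 * A minimal sketch, assuming application-side code, of a UDP item that
 * passes the checks above: destination port fully masked and non-zero
 * (4789, the IANA VXLAN port, is an arbitrary choice here), source port
 * left wildcarded so only the "parameter ignored" warning applies.
 *
 *	struct rte_flow_item_udp udp_spec = {
 *		.hdr = { .dst_port = RTE_BE16(4789) },
 *	};
 *	struct rte_flow_item_udp udp_mask = {
 *		.hdr = { .dst_port = RTE_BE16(0xffff) },
 *	};
 */
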
1428 /**
1429  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1430  * The routine checks the VNI field to be used in the encapsulation header.
1431  *
1432  * @param[in] item
1433  *   Pointer to the item structure.
1434  * @param[out] error
1435  *   Pointer to the error structure.
1436  *
1437  * @return
1438  *   0 on success, a negative errno value otherwise and rte_errno is set.
1439  */
1440 static int
1441 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1442                                   struct rte_flow_error *error)
1443 {
1444         const struct rte_flow_item_vxlan *spec = item->spec;
1445         const struct rte_flow_item_vxlan *mask = item->mask;
1446
1447         if (!spec) {
1448                 /* Outer VNI is required by tunnel_key parameter. */
1449                 return rte_flow_error_set(error, EINVAL,
1450                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1451                                           "NULL VNI specification"
1452                                           " for vxlan encapsulation");
1453         }
1454         if (!mask)
1455                 mask = &rte_flow_item_vxlan_mask;
1456         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1457                 return rte_flow_error_set(error, EINVAL,
1458                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1459                                           "outer VNI must be specified "
1460                                           "for vxlan encapsulation");
1461         if (mask->vni[0] != 0xff ||
1462             mask->vni[1] != 0xff ||
1463             mask->vni[2] != 0xff)
1464                 return rte_flow_error_set(error, ENOTSUP,
1465                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1466                                           "no support for partial mask on"
1467                                           " \"vxlan.vni\" field");
1468
1469         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1470                 return rte_flow_error_set(error, EINVAL,
1471                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1472                                           "vxlan vni cannot be 0");
1473         return 0;
1474 }
1475
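/*
 * A minimal sketch, assuming application-side code, of a VNI item that
 * satisfies the checks above: all three VNI bytes exactly masked and a
 * non-zero value (VNI 42 is arbitrary).
 *
 *	struct rte_flow_item_vxlan vxlan_spec = {
 *		.vni = { 0x00, 0x00, 0x2a },
 *	};
 *	struct rte_flow_item_vxlan vxlan_mask = {
 *		.vni = { 0xff, 0xff, 0xff },
 *	};
 */
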
1476 /**
1477  * Validate VXLAN_ENCAP action item list for E-Switch.
1478  * The routine checks the items to be used in the encapsulation header.
1479  *
1480  * @param[in] action
1481  *   Pointer to the VXLAN_ENCAP action structure.
1482  * @param[out] error
1483  *   Pointer to the error structure.
1484  *
1485  * @return
1486  *   0 on success, a negative errno value otherwise and rte_errno is set.
1487  */
1488 static int
1489 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1490                               struct rte_flow_error *error)
1491 {
1492         const struct rte_flow_item *items;
1493         int ret;
1494         uint32_t item_flags = 0;
1495
1496         if (!action->conf)
1497                 return rte_flow_error_set(error, EINVAL,
1498                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1499                                           "Missing vxlan tunnel"
1500                                           " action configuration");
1501         items = ((const struct rte_flow_action_vxlan_encap *)
1502                                         action->conf)->definition;
1503         if (!items)
1504                 return rte_flow_error_set(error, EINVAL,
1505                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1506                                           "Missing vxlan tunnel"
1507                                           " encapsulation parameters");
1508         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1509                 switch (items->type) {
1510                 case RTE_FLOW_ITEM_TYPE_VOID:
1511                         break;
1512                 case RTE_FLOW_ITEM_TYPE_ETH:
1513                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1514                                                           error);
1515                         if (ret < 0)
1516                                 return ret;
1517                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1518                         if (ret < 0)
1519                                 return ret;
1520                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1521                         break;
1523                 case RTE_FLOW_ITEM_TYPE_IPV4:
1524                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1525                                                            error);
1526                         if (ret < 0)
1527                                 return ret;
1528                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1529                         if (ret < 0)
1530                                 return ret;
1531                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1532                         break;
1533                 case RTE_FLOW_ITEM_TYPE_IPV6:
1534                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1535                                                            error);
1536                         if (ret < 0)
1537                                 return ret;
1538                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1539                         if (ret < 0)
1540                                 return ret;
1541                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1542                         break;
1543                 case RTE_FLOW_ITEM_TYPE_UDP:
1544                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1545                                                            0xFF, error);
1546                         if (ret < 0)
1547                                 return ret;
1548                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1549                         if (ret < 0)
1550                                 return ret;
1551                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1552                         break;
1553                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1554                         ret = mlx5_flow_validate_item_vxlan(items,
1555                                                             item_flags, error);
1556                         if (ret < 0)
1557                                 return ret;
1558                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1559                         if (ret < 0)
1560                                 return ret;
1561                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1562                         break;
1563                 default:
1564                         return rte_flow_error_set
1565                                         (error, ENOTSUP,
1566                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1567                                          "vxlan encap item not supported");
1568                 }
1569         }
1570         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1571                 return rte_flow_error_set(error, EINVAL,
1572                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1573                                           "no outer IP layer found"
1574                                           " for vxlan encapsulation");
1575         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1576                 return rte_flow_error_set(error, EINVAL,
1577                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1578                                           "no outer UDP layer found"
1579                                           " for vxlan encapsulation");
1580         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1581                 return rte_flow_error_set(error, EINVAL,
1582                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1583                                           "no VXLAN VNI found"
1584                                           " for vxlan encapsulation");
1585         return 0;
1586 }
1587
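/*
 * A minimal sketch, assuming application-side code, of an encapsulation
 * definition accepted by the validator above: outer L2, L3 and L4 items
 * plus the VNI item, terminated by END. The spec/mask variables are
 * hypothetical placeholders.
 *
 *	const struct rte_flow_item defn[] = {
 *		{ .type = RTE_FLOW_ITEM_TYPE_ETH,
 *		  .spec = &eth_spec, .mask = &eth_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_IPV4,
 *		  .spec = &ipv4_spec, .mask = &ipv4_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_UDP,
 *		  .spec = &udp_spec, .mask = &udp_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_VXLAN,
 *		  .spec = &vxlan_spec, .mask = &vxlan_mask },
 *		{ .type = RTE_FLOW_ITEM_TYPE_END },
 *	};
 *	const struct rte_flow_action_vxlan_encap encap_conf = {
 *		.definition = defn,
 *	};
 */
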
1588 /**
1589  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1590  * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1591  *
1592  * @param[in] udp
1593  *   Outer UDP layer item (if any, NULL otherwise).
1594  * @param[out] error
1595  *   Pointer to the error structure.
1596  *
1597  * @return
1598  *   0 on success, a negative errno value otherwise and rte_errno is set.
1599  */
1600 static int
1601 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1602                                   struct rte_flow_error *error)
1603 {
1604         const struct rte_flow_item_udp *spec = udp->spec;
1605         const struct rte_flow_item_udp *mask = udp->mask;
1606
1607         if (!spec)
1608                 /*
1609                  * Specification for UDP ports cannot be empty
1610                  * because it is required as a decap parameter.
1611                  */
1612                 return rte_flow_error_set(error, EINVAL,
1613                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1614                                           "NULL UDP port specification"
1615                                           " for VXLAN decapsulation");
1616         if (!mask)
1617                 mask = &rte_flow_item_udp_mask;
1618         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1619                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1620                         return rte_flow_error_set
1621                                         (error, ENOTSUP,
1622                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1623                                          "no support for partial mask on"
1624                                          " \"udp.hdr.dst_port\" field");
1625                 if (!spec->hdr.dst_port)
1626                         return rte_flow_error_set
1627                                         (error, EINVAL,
1628                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1629                                          "zero decap local UDP port");
1630         } else {
1631                 return rte_flow_error_set(error, EINVAL,
1632                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1633                                           "outer UDP destination port must be "
1634                                           "specified for vxlan decapsulation");
1635         }
1636         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1637                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1638                         return rte_flow_error_set
1639                                         (error, ENOTSUP,
1640                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1641                                          "no support for partial mask on"
1642                                          " \"udp.hdr.src_port\" field");
1643                 DRV_LOG(WARNING,
1644                         "outer UDP local port cannot be "
1645                         "forced for VXLAN decapsulation, "
1646                         "parameter ignored");
1647         }
1648         return 0;
1649 }
1650
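/*
 * A minimal sketch, assuming application-side usage, of a decapsulation
 * rule satisfying the checks above and the cross-checks in
 * flow_tcf_validate() below:
 *
 *	pattern: ETH / IPV4 / UDP (dst_port fully masked, non-zero) /
 *		 VXLAN (vni fully masked) / END
 *	actions: VXLAN_DECAP / PORT_ID / END
 */
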
1651 /**
1652  * Validate flow for E-Switch.
1653  *
1654  * @param[in] dev
1655  *   Pointer to the Ethernet device structure.
1656  * @param[in] attr
1657  *   Pointer to the flow attributes.
1658  * @param[in] items
1659  *   Pointer to the list of items.
1660  * @param[in] actions
1661  *   Pointer to the list of actions.
1662  * @param[out] error
1663  *   Pointer to the error structure.
1664  *
1665  * @return
1666  *   0 on success, a negative errno value otherwise and rte_errno is set.
1667  */
1668 static int
1669 flow_tcf_validate(struct rte_eth_dev *dev,
1670                   const struct rte_flow_attr *attr,
1671                   const struct rte_flow_item items[],
1672                   const struct rte_flow_action actions[],
1673                   struct rte_flow_error *error)
1674 {
1675         union {
1676                 const struct rte_flow_item_port_id *port_id;
1677                 const struct rte_flow_item_eth *eth;
1678                 const struct rte_flow_item_vlan *vlan;
1679                 const struct rte_flow_item_ipv4 *ipv4;
1680                 const struct rte_flow_item_ipv6 *ipv6;
1681                 const struct rte_flow_item_tcp *tcp;
1682                 const struct rte_flow_item_udp *udp;
1683                 const struct rte_flow_item_vxlan *vxlan;
1684         } spec, mask;
1685         union {
1686                 const struct rte_flow_action_port_id *port_id;
1687                 const struct rte_flow_action_jump *jump;
1688                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1689                 const struct rte_flow_action_of_set_vlan_vid *
1690                         of_set_vlan_vid;
1691                 const struct rte_flow_action_of_set_vlan_pcp *
1692                         of_set_vlan_pcp;
1693                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1694                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1695                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1696         } conf;
1697         const struct rte_flow_item *outer_udp = NULL;
1698         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1699         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1700         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1701         uint64_t item_flags = 0;
1702         uint64_t action_flags = 0;
1703         uint8_t next_protocol = 0xff;
1704         unsigned int tcm_ifindex = 0;
1705         uint8_t pedit_validated = 0;
1706         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1707         struct rte_eth_dev *port_id_dev = NULL;
1708         bool in_port_id_set = false;
1709         int ret;
1710
1711         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1712                                                 PTOI_TABLE_SZ_MAX(dev)));
1713         ret = flow_tcf_validate_attributes(attr, error);
1714         if (ret < 0)
1715                 return ret;
1716         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1717                 unsigned int i;
1718                 uint64_t current_action_flag = 0;
1719
1720                 switch (actions->type) {
1721                 case RTE_FLOW_ACTION_TYPE_VOID:
1722                         break;
1723                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1724                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1725                         if (!actions->conf)
1726                                 break;
1727                         conf.port_id = actions->conf;
1728                         if (conf.port_id->original)
1729                                 i = 0;
1730                         else
1731                                 for (i = 0; ptoi[i].ifindex; ++i)
1732                                         if (ptoi[i].port_id == conf.port_id->id)
1733                                                 break;
1734                         if (!ptoi[i].ifindex)
1735                                 return rte_flow_error_set
1736                                         (error, ENODEV,
1737                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1738                                          conf.port_id,
1739                                          "missing data to convert port ID to"
1740                                          " ifindex");
1741                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1742                         break;
1743                 case RTE_FLOW_ACTION_TYPE_JUMP:
1744                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1745                         if (!actions->conf)
1746                                 break;
1747                         conf.jump = actions->conf;
1748                         if (attr->group >= conf.jump->group)
1749                                 return rte_flow_error_set
1750                                         (error, ENOTSUP,
1751                                          RTE_FLOW_ERROR_TYPE_ACTION,
1752                                          actions,
1753                                          "can jump only to a higher group");
1754                         break;
1755                 case RTE_FLOW_ACTION_TYPE_DROP:
1756                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1757                         break;
1758                 case RTE_FLOW_ACTION_TYPE_COUNT:
1759                         break;
1760                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1761                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1762                         break;
1763                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1764                         rte_be16_t ethertype;
1765
1766                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1767                         if (!actions->conf)
1768                                 break;
1769                         conf.of_push_vlan = actions->conf;
1770                         ethertype = conf.of_push_vlan->ethertype;
1771                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1772                             ethertype != RTE_BE16(ETH_P_8021AD))
1773                                 return rte_flow_error_set
1774                                         (error, EINVAL,
1775                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1776                                          "vlan push TPID must be "
1777                                          "802.1Q or 802.1AD");
1778                         break;
1779                 }
1780                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1781                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1782                                 return rte_flow_error_set
1783                                         (error, ENOTSUP,
1784                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1785                                          "vlan modify is not supported,"
1786                                          " set action must follow push action");
1787                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1788                         break;
1789                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1790                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1791                                 return rte_flow_error_set
1792                                         (error, ENOTSUP,
1793                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1794                                          "vlan modify is not supported,"
1795                                          " set action must follow push action");
1796                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1797                         break;
1798                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1799                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1800                         break;
1801                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1802                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1803                         if (ret < 0)
1804                                 return ret;
1805                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1806                         break;
1807                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1808                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1809                         break;
1810                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1811                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1812                         break;
1813                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1814                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1815                         break;
1816                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1817                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1818                         break;
1819                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1820                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1821                         break;
1822                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1823                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1824                         break;
1825                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1826                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1827                         break;
1828                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1829                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1830                         break;
1831                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1832                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1833                         break;
1834                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1835                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1836                         break;
1837                 default:
1838                         return rte_flow_error_set(error, ENOTSUP,
1839                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1840                                                   actions,
1841                                                   "action not supported");
1842                 }
1843                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1844                         if (!actions->conf)
1845                                 return rte_flow_error_set
1846                                         (error, EINVAL,
1847                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1848                                          actions,
1849                                          "action configuration not set");
1850                 }
1851                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1852                     pedit_validated)
1853                         return rte_flow_error_set(error, ENOTSUP,
1854                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1855                                                   actions,
1856                                                   "set actions should be "
1857                                                   "listed successively");
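                /*
                 * A non-pedit action closes any preceding group of pedit
                 * ("set") actions; a later pedit action then triggers the
                 * "listed successively" error above.
                 */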
1858                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1859                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1860                         pedit_validated = 1;
1861                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1862                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1863                         return rte_flow_error_set(error, EINVAL,
1864                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1865                                                   actions,
1866                                                   "can't have multiple fate"
1867                                                   " actions");
1868                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1869                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1870                         return rte_flow_error_set(error, EINVAL,
1871                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1872                                                   actions,
1873                                                   "can't have multiple vxlan"
1874                                                   " actions");
1875                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1876                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1877                         return rte_flow_error_set(error, ENOTSUP,
1878                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1879                                                   actions,
1880                                                   "can't have vxlan and vlan"
1881                                                   " actions in the same rule");
1882                 action_flags |= current_action_flag;
1883         }
1884         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1885                 unsigned int i;
1886
1887                 switch (items->type) {
1888                 case RTE_FLOW_ITEM_TYPE_VOID:
1889                         break;
1890                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1891                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1892                                 return rte_flow_error_set
1893                                         (error, ENOTSUP,
1894                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1895                                          "inner tunnel port id"
1896                                          " item is not supported");
1897                         mask.port_id = flow_tcf_item_mask
1898                                 (items, &rte_flow_item_port_id_mask,
1899                                  &flow_tcf_mask_supported.port_id,
1900                                  &flow_tcf_mask_empty.port_id,
1901                                  sizeof(flow_tcf_mask_supported.port_id),
1902                                  error);
1903                         if (!mask.port_id)
1904                                 return -rte_errno;
1905                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1906                                 in_port_id_set = 1;
1907                                 break;
1908                         }
1909                         spec.port_id = items->spec;
1910                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1911                                 return rte_flow_error_set
1912                                         (error, ENOTSUP,
1913                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1914                                          mask.port_id,
1915                                          "no support for partial mask on"
1916                                          " \"id\" field");
1917                         if (!mask.port_id->id)
1918                                 i = 0;
1919                         else
1920                                 for (i = 0; ptoi[i].ifindex; ++i)
1921                                         if (ptoi[i].port_id == spec.port_id->id)
1922                                                 break;
1923                         if (!ptoi[i].ifindex)
1924                                 return rte_flow_error_set
1925                                         (error, ENODEV,
1926                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1927                                          spec.port_id,
1928                                          "missing data to convert port ID to"
1929                                          " ifindex");
1930                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1931                                 return rte_flow_error_set
1932                                         (error, ENOTSUP,
1933                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1934                                          spec.port_id,
1935                                          "cannot match traffic for"
1936                                          " several port IDs through"
1937                                          " a single flow rule");
1938                         tcm_ifindex = ptoi[i].ifindex;
1939                         in_port_id_set = 1;
1940                         break;
1941                 case RTE_FLOW_ITEM_TYPE_ETH:
1942                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1943                                                           error);
1944                         if (ret < 0)
1945                                 return ret;
1946                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1947                                       MLX5_FLOW_LAYER_INNER_L2 :
1948                                       MLX5_FLOW_LAYER_OUTER_L2;
1949                         /* TODO:
1950                          * Redundant check due to different supported mask.
1951                          * Same for the rest of items.
1952                          */
1953                         mask.eth = flow_tcf_item_mask
1954                                 (items, &rte_flow_item_eth_mask,
1955                                  &flow_tcf_mask_supported.eth,
1956                                  &flow_tcf_mask_empty.eth,
1957                                  sizeof(flow_tcf_mask_supported.eth),
1958                                  error);
1959                         if (!mask.eth)
1960                                 return -rte_errno;
1961                         if (mask.eth->type && mask.eth->type !=
1962                             RTE_BE16(0xffff))
1963                                 return rte_flow_error_set
1964                                         (error, ENOTSUP,
1965                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1966                                          mask.eth,
1967                                          "no support for partial mask on"
1968                                          " \"type\" field");
1969                         assert(items->spec);
1970                         spec.eth = items->spec;
1971                         if (mask.eth->type &&
1972                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1973                             inner_etype != RTE_BE16(ETH_P_ALL) &&
1974                             inner_etype != spec.eth->type)
1975                                 return rte_flow_error_set
1976                                         (error, EINVAL,
1977                                          RTE_FLOW_ERROR_TYPE_ITEM,
1978                                          items,
1979                                          "inner eth_type conflict");
1980                         if (mask.eth->type &&
1981                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1982                             outer_etype != RTE_BE16(ETH_P_ALL) &&
1983                             outer_etype != spec.eth->type)
1984                                 return rte_flow_error_set
1985                                         (error, EINVAL,
1986                                          RTE_FLOW_ERROR_TYPE_ITEM,
1987                                          items,
1988                                          "outer eth_type conflict");
1989                         if (mask.eth->type) {
1990                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1991                                         inner_etype = spec.eth->type;
1992                                 else
1993                                         outer_etype = spec.eth->type;
1994                         }
1995                         break;
1996                 case RTE_FLOW_ITEM_TYPE_VLAN:
1997                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1998                                 return rte_flow_error_set
1999                                         (error, ENOTSUP,
2000                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2001                                          "inner tunnel VLAN"
2002                                          " is not supported");
2003                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2004                                                            error);
2005                         if (ret < 0)
2006                                 return ret;
2007                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2008                         mask.vlan = flow_tcf_item_mask
2009                                 (items, &rte_flow_item_vlan_mask,
2010                                  &flow_tcf_mask_supported.vlan,
2011                                  &flow_tcf_mask_empty.vlan,
2012                                  sizeof(flow_tcf_mask_supported.vlan),
2013                                  error);
2014                         if (!mask.vlan)
2015                                 return -rte_errno;
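                        /*
                         * VLAN TCI layout: PCP in bits 15-13 (0xe000 mask),
                         * DEI in bit 12, VID in bits 11-0 (0x0fff mask).
                         * The PCP and VID sub-fields must each be matched
                         * fully or not at all, hence the checks below.
                         */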
2016                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2017                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2018                               RTE_BE16(0xe000)) ||
2019                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2020                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2021                               RTE_BE16(0x0fff)) ||
2022                             (mask.vlan->inner_type &&
2023                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2024                                 return rte_flow_error_set
2025                                         (error, ENOTSUP,
2026                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2027                                          mask.vlan,
2028                                          "no support for partial masks on"
2029                                          " \"tci\" (PCP and VID parts) and"
2030                                          " \"inner_type\" fields");
2031                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2032                             outer_etype != RTE_BE16(ETH_P_8021Q))
2033                                 return rte_flow_error_set
2034                                         (error, EINVAL,
2035                                          RTE_FLOW_ERROR_TYPE_ITEM,
2036                                          items,
2037                                          "outer eth_type conflict,"
2038                                          " must be 802.1Q");
2039                         outer_etype = RTE_BE16(ETH_P_8021Q);
2040                         assert(items->spec);
2041                         spec.vlan = items->spec;
2042                         if (mask.vlan->inner_type &&
2043                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2044                             vlan_etype != spec.vlan->inner_type)
2045                                 return rte_flow_error_set
2046                                         (error, EINVAL,
2047                                          RTE_FLOW_ERROR_TYPE_ITEM,
2048                                          items,
2049                                          "vlan eth_type conflict");
2050                         if (mask.vlan->inner_type)
2051                                 vlan_etype = spec.vlan->inner_type;
2052                         break;
2053                 case RTE_FLOW_ITEM_TYPE_IPV4:
2054                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2055                                                            error);
2056                         if (ret < 0)
2057                                 return ret;
2058                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2059                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2060                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2061                         mask.ipv4 = flow_tcf_item_mask
2062                                 (items, &rte_flow_item_ipv4_mask,
2063                                  &flow_tcf_mask_supported.ipv4,
2064                                  &flow_tcf_mask_empty.ipv4,
2065                                  sizeof(flow_tcf_mask_supported.ipv4),
2066                                  error);
2067                         if (!mask.ipv4)
2068                                 return -rte_errno;
2069                         if (mask.ipv4->hdr.next_proto_id &&
2070                             mask.ipv4->hdr.next_proto_id != 0xff)
2071                                 return rte_flow_error_set
2072                                         (error, ENOTSUP,
2073                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2074                                          mask.ipv4,
2075                                          "no support for partial mask on"
2076                                          " \"hdr.next_proto_id\" field");
2077                         else if (mask.ipv4->hdr.next_proto_id)
2078                                 next_protocol =
2079                                         ((const struct rte_flow_item_ipv4 *)
2080                                          (items->spec))->hdr.next_proto_id;
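                        /*
                         * The L3 item pins the ether type of its enclosing
                         * layer: inner_etype for tunneled traffic, vlan_etype
                         * after a VLAN item, outer_etype otherwise.
                         */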
2081                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2082                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2083                                     inner_etype != RTE_BE16(ETH_P_IP))
2084                                         return rte_flow_error_set
2085                                                 (error, EINVAL,
2086                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2087                                                  items,
2088                                                  "inner eth_type conflict,"
2089                                                  " IPv4 is required");
2090                                 inner_etype = RTE_BE16(ETH_P_IP);
2091                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2092                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2093                                     vlan_etype != RTE_BE16(ETH_P_IP))
2094                                         return rte_flow_error_set
2095                                                 (error, EINVAL,
2096                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2097                                                  items,
2098                                                  "vlan eth_type conflict,"
2099                                                  " IPv4 is required");
2100                                 vlan_etype = RTE_BE16(ETH_P_IP);
2101                         } else {
2102                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2103                                     outer_etype != RTE_BE16(ETH_P_IP))
2104                                         return rte_flow_error_set
2105                                                 (error, EINVAL,
2106                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2107                                                  items,
2108                                                  "eth_type conflict,"
2109                                                  " IPv4 is required");
2110                                 outer_etype = RTE_BE16(ETH_P_IP);
2111                         }
2112                         break;
2113                 case RTE_FLOW_ITEM_TYPE_IPV6:
2114                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2115                                                            error);
2116                         if (ret < 0)
2117                                 return ret;
2118                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2119                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2120                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2121                         mask.ipv6 = flow_tcf_item_mask
2122                                 (items, &rte_flow_item_ipv6_mask,
2123                                  &flow_tcf_mask_supported.ipv6,
2124                                  &flow_tcf_mask_empty.ipv6,
2125                                  sizeof(flow_tcf_mask_supported.ipv6),
2126                                  error);
2127                         if (!mask.ipv6)
2128                                 return -rte_errno;
2129                         if (mask.ipv6->hdr.proto &&
2130                             mask.ipv6->hdr.proto != 0xff)
2131                                 return rte_flow_error_set
2132                                         (error, ENOTSUP,
2133                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2134                                          mask.ipv6,
2135                                          "no support for partial mask on"
2136                                          " \"hdr.proto\" field");
2137                         else if (mask.ipv6->hdr.proto)
2138                                 next_protocol =
2139                                         ((const struct rte_flow_item_ipv6 *)
2140                                          (items->spec))->hdr.proto;
2141                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2142                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2143                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2144                                         return rte_flow_error_set
2145                                                 (error, EINVAL,
2146                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2147                                                  items,
2148                                                  "inner eth_type conflict,"
2149                                                  " IPv6 is required");
2150                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2151                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2152                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2153                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2154                                         return rte_flow_error_set
2155                                                 (error, EINVAL,
2156                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2157                                                  items,
2158                                                  "vlan eth_type conflict,"
2159                                                  " IPv6 is required");
2160                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2161                         } else {
2162                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2163                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2164                                         return rte_flow_error_set
2165                                                 (error, EINVAL,
2166                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2167                                                  items,
2168                                                  "eth_type conflict,"
2169                                                  " IPv6 is required");
2170                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2171                         }
2172                         break;
2173                 case RTE_FLOW_ITEM_TYPE_UDP:
2174                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2175                                                           next_protocol, error);
2176                         if (ret < 0)
2177                                 return ret;
2178                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2179                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2180                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2181                         mask.udp = flow_tcf_item_mask
2182                                 (items, &rte_flow_item_udp_mask,
2183                                  &flow_tcf_mask_supported.udp,
2184                                  &flow_tcf_mask_empty.udp,
2185                                  sizeof(flow_tcf_mask_supported.udp),
2186                                  error);
2187                         if (!mask.udp)
2188                                 return -rte_errno;
2189                         /*
2190                          * Save the presumed outer UDP item for extra check
2191                          * if the tunnel item will be found later in the list.
2192                          */
2193                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2194                                 outer_udp = items;
2195                         break;
2196                 case RTE_FLOW_ITEM_TYPE_TCP:
2197                         ret = mlx5_flow_validate_item_tcp
2198                                              (items, item_flags,
2199                                               next_protocol,
2200                                               &flow_tcf_mask_supported.tcp,
2201                                               error);
2202                         if (ret < 0)
2203                                 return ret;
2204                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2205                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2206                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2207                         mask.tcp = flow_tcf_item_mask
2208                                 (items, &rte_flow_item_tcp_mask,
2209                                  &flow_tcf_mask_supported.tcp,
2210                                  &flow_tcf_mask_empty.tcp,
2211                                  sizeof(flow_tcf_mask_supported.tcp),
2212                                  error);
2213                         if (!mask.tcp)
2214                                 return -rte_errno;
2215                         break;
2216                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2217                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2218                                 return rte_flow_error_set
2219                                         (error, ENOTSUP,
2220                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2221                                          "vxlan tunnel over vlan"
2222                                          " is not supported");
2223                         ret = mlx5_flow_validate_item_vxlan(items,
2224                                                             item_flags, error);
2225                         if (ret < 0)
2226                                 return ret;
2227                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2228                         mask.vxlan = flow_tcf_item_mask
2229                                 (items, &rte_flow_item_vxlan_mask,
2230                                  &flow_tcf_mask_supported.vxlan,
2231                                  &flow_tcf_mask_empty.vxlan,
2232                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2233                         if (!mask.vxlan)
2234                                 return -rte_errno;
2235                         if (mask.vxlan->vni[0] != 0xff ||
2236                             mask.vxlan->vni[1] != 0xff ||
2237                             mask.vxlan->vni[2] != 0xff)
2238                                 return rte_flow_error_set
2239                                         (error, ENOTSUP,
2240                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2241                                          mask.vxlan,
2242                                          "no support for partial or "
2243                                          "empty mask on \"vxlan.vni\" field");
2244                         /*
2245                          * The VNI item implies a VXLAN tunnel; at least the
2246                          * outer destination UDP port must be specified
2247                          * without wildcards so the kernel can select the
2248                          * virtual VXLAN device by port. An outer IPv4 or
2249                          * IPv6 item must also be present (wildcards or even
2250                          * a zero mask are allowed) to let the driver know
2251                          * the tunnel IP version and process UDP traffic correctly.
2252                          */
2253                         if (!(item_flags &
2254                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2255                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2256                                 return rte_flow_error_set
2257                                                  (error, EINVAL,
2258                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2259                                                   items,
2260                                                   "no outer IP pattern found"
2261                                                   " for vxlan tunnel");
2262                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2263                                 return rte_flow_error_set
2264                                                  (error, EINVAL,
2265                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2266                                                   items,
2267                                                   "no outer UDP pattern found"
2268                                                   " for vxlan tunnel");
2269                         /*
2270                          * All items preceding the tunnel item become outer
2271                          * ones and need extra validation due to tc
2272                          * limitations for tunnel outer parameters. Currently
2273                          * only the outer UDP item requires an extra check;
2274                          * use the saved pointer instead of rescanning the item list.
2275                          */
2276                         assert(outer_udp);
2277                         ret = flow_tcf_validate_vxlan_decap_udp
2278                                                 (outer_udp, error);
2279                         if (ret < 0)
2280                                 return ret;
2281                         /* Reset L4 protocol for inner parameters. */
2282                         next_protocol = 0xff;
2283                         break;
2284                 default:
2285                         return rte_flow_error_set(error, ENOTSUP,
2286                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2287                                                   items, "item not supported");
2288                 }
2289         }
2290         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2291             (action_flags & MLX5_FLOW_ACTION_DROP))
2292                 return rte_flow_error_set(error, ENOTSUP,
2293                                           RTE_FLOW_ERROR_TYPE_ACTION,
2294                                           actions,
2295                                           "set action is not compatible with "
2296                                           "drop action");
2297         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2298             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2299                 return rte_flow_error_set(error, ENOTSUP,
2300                                           RTE_FLOW_ERROR_TYPE_ACTION,
2301                                           actions,
2302                                           "set action must be followed by "
2303                                           "port_id action");
2304         if (action_flags &
2305            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2306                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2307                         return rte_flow_error_set(error, EINVAL,
2308                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2309                                                   actions,
2310                                                   "no ipv4 item found in"
2311                                                   " pattern");
2312         }
2313         if (action_flags &
2314            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2315                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2316                         return rte_flow_error_set(error, EINVAL,
2317                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2318                                                   actions,
2319                                                   "no ipv6 item found in"
2320                                                   " pattern");
2321         }
2322         if (action_flags &
2323            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2324                 if (!(item_flags &
2325                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2326                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2327                         return rte_flow_error_set(error, EINVAL,
2328                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2329                                                   actions,
2330                                                   "no TCP/UDP item found in"
2331                                                   " pattern");
2332         }
2333         /*
2334          * FW syndrome (0xA9C090):
2335          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2336          *     forward to the uplink.
2337  *     forwarded to the uplink.
2338         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2339             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2340             ((struct priv *)port_id_dev->data->dev_private)->representor)
2341                 return rte_flow_error_set(error, ENOTSUP,
2342                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2343                                           "vlan push can only be applied"
2344                                           " when forwarding to uplink port");
2345         /*
2346          * FW syndrome (0x294609):
2347          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2348          *     are supported only while forwarding to vport.
2349          */
2350         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2351             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2352                 return rte_flow_error_set(error, ENOTSUP,
2353                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2354                                           "vlan actions are supported"
2355                                           " only with port_id action");
2356         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2357             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2358                 return rte_flow_error_set(error, ENOTSUP,
2359                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2360                                           "vxlan actions are supported"
2361                                           " only with port_id action");
2362         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2363                 return rte_flow_error_set(error, EINVAL,
2364                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2365                                           "no fate action is found");
2366         if (action_flags &
2367            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2368                 if (!(item_flags &
2369                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2370                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2371                         return rte_flow_error_set(error, EINVAL,
2372                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2373                                                   actions,
2374                                                   "no IP found in pattern");
2375         }
2376         if (action_flags &
2377             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2378                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2379                         return rte_flow_error_set(error, ENOTSUP,
2380                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2381                                                   actions,
2382                                                   "no ethernet found in"
2383                                                   " pattern");
2384         }
2385         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2386             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2387                 return rte_flow_error_set(error, EINVAL,
2388                                           RTE_FLOW_ERROR_TYPE_ACTION,
2389                                           NULL,
2390                                           "no VNI pattern found"
2391                                           " for vxlan decap action");
2392         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2393             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2394                 return rte_flow_error_set(error, EINVAL,
2395                                           RTE_FLOW_ERROR_TYPE_ACTION,
2396                                           NULL,
2397                                           "vxlan encap not supported"
2398                                           " for tunneled traffic");
2399         return 0;
2400 }
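/*
 * Example (illustrative only, testpmd flow syntax): assuming port 1
 * is a VF representor, a rule such as
 *
 *   flow create 0 transfer ingress pattern eth / end
 *        actions of_push_vlan ethertype 0x8100 /
 *                of_set_vlan_vid vlan_vid 100 / port_id id 1 / end
 *
 * is rejected by the checks above with ENOTSUP (FW syndrome
 * 0xA9C090): VLAN push can only be combined with forwarding to the
 * uplink, not to a representor.
 */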
2401
2402 /**
2403  * Calculate maximum size of memory for flow items of Linux TC flower.
2404  *
2405  * @param[in] attr
2406  *   Pointer to the flow attributes.
2407  * @param[in] items
2408  *   Pointer to the list of items.
2409  * @param[out] action_flags
2410  *   Pointer to the detected actions.
2411  *
2412  * @return
2413  *   Maximum size of memory for items.
2414  */
2415 static int
2416 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2417                         const struct rte_flow_item items[],
2418                         uint64_t *action_flags)
2419 {
2420         int size = 0;
2421
2422         size += SZ_NLATTR_STRZ_OF("flower") +
2423                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2424                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2425                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2426         if (attr->group > 0)
2427                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2428         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2429                 switch (items->type) {
2430                 case RTE_FLOW_ITEM_TYPE_VOID:
2431                         break;
2432                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2433                         break;
2434                 case RTE_FLOW_ITEM_TYPE_ETH:
2435                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2436                                 /* dst/src MAC addr and mask. */
2437                         break;
2438                 case RTE_FLOW_ITEM_TYPE_VLAN:
2439                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2440                                 /* VLAN Ether type. */
2441                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2442                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2443                         break;
2444                 case RTE_FLOW_ITEM_TYPE_IPV4:
2445                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2446                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2447                                 /* dst/src IP addr and mask. */
2448                         break;
2449                 case RTE_FLOW_ITEM_TYPE_IPV6:
2450                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2451                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2452                                 /* dst/src IP addr and mask. */
2453                         break;
2454                 case RTE_FLOW_ITEM_TYPE_UDP:
2455                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2456                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2457                                 /* dst/src port and mask. */
2458                         break;
2459                 case RTE_FLOW_ITEM_TYPE_TCP:
2460                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2461                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2462                                 /* dst/src port and mask. */
2463                         break;
2464                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2465                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2466                         /*
2467                          * There might be no VXLAN decap action in the action
2468                          * list, nonetheless the VXLAN tunnel flow requires
2469                          * the decap structure to be correctly applied to
2470                          * VXLAN device, set the flag to create the structure.
2471                          * Translation routine will not put the decap action
2472                          * in the Netlink message if there is no actual action
2473                          * in the list.
2474                          */
2475                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2476                         break;
2477                 default:
2478                         DRV_LOG(WARNING,
2479                                 "unsupported item %p type %d,"
2480                                 " items must be validated before flow creation",
2481                                 (const void *)items, items->type);
2482                         break;
2483                 }
2484         }
2485         return size;
2486 }
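/*
 * Worked example (illustrative only, assuming the SZ_NLATTR_* helpers
 * round the 4-byte nlattr header plus payload up to MNL_ALIGNTO == 4):
 * a single RTE_FLOW_ITEM_TYPE_ETH item reserves
 *
 *   4 * SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN)
 *     = 4 * MNL_ALIGN(4 + 6) = 4 * 12 = 48 bytes
 *
 * for the destination/source addresses and their masks. This is a safe
 * upper bound; the translation routine asserts that the actual message
 * never outgrows the buffer.
 */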
2487
2488 /**
2489  * Calculate size of memory to store the VXLAN encapsulation
2490  * related items in the Netlink message buffer. Items list
2491  * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2492  * The item list should be validated.
2493  *
2494  * @param[in] action
2495  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2496  *   List of pattern items to scan data from.
2497  *
2498  * @return
2499  *   The size of the part of the Netlink message buffer needed to
2500  *   store the VXLAN encapsulation item attributes.
2501  */
2502 static int
2503 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2504 {
2505         const struct rte_flow_item *items;
2506         int size = 0;
2507
2508         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2509         assert(action->conf);
2510
2511         items = ((const struct rte_flow_action_vxlan_encap *)
2512                                         action->conf)->definition;
2513         assert(items);
2514         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2515                 switch (items->type) {
2516                 case RTE_FLOW_ITEM_TYPE_VOID:
2517                         break;
2518                 case RTE_FLOW_ITEM_TYPE_ETH:
2519                         /* This item does not require message buffer. */
2520                         break;
2521                 case RTE_FLOW_ITEM_TYPE_IPV4:
2522                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2523                         break;
2524                 case RTE_FLOW_ITEM_TYPE_IPV6:
2525                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2526                         break;
2527                 case RTE_FLOW_ITEM_TYPE_UDP: {
2528                         const struct rte_flow_item_udp *udp = items->mask;
2529
2530                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2531                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2532                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2533                         break;
2534                 }
2535                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2536                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2537                         break;
2538                 default:
2539                         assert(false);
2540                         DRV_LOG(WARNING,
2541                                 "unsupported item %p type %d,"
2542                                 " items must be validated"
2543                                 " before flow creation",
2544                                 (const void *)items, items->type);
2545                         return 0;
2546                 }
2547         }
2548         return size;
2549 }
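/*
 * Worked example (illustrative only, same MNL_ALIGNTO == 4 assumption
 * as above): for a definition list of eth / ipv4 / udp / vxlan with
 * both UDP ports specified, the reserved size is
 *
 *   ipv4:  2 * MNL_ALIGN(4 + 4) = 16 bytes (src/dst addresses)
 *   udp:   2 * MNL_ALIGN(4 + 2) = 16 bytes (src/dst ports)
 *   vxlan: 1 * MNL_ALIGN(4 + 4) =  8 bytes (VNI)
 *                         total = 40 bytes
 *
 * The eth item needs no room because MAC addresses are not passed to
 * the tunnel_key action at all.
 */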
2550
2551 /**
2552  * Calculate maximum size of memory for flow actions of Linux TC flower and
2553  * extract specified actions.
2554  *
2555  * @param[in] actions
2556  *   Pointer to the list of actions.
2557  * @param[out] action_flags
2558  *   Pointer to the detected actions.
2559  *
2560  * @return
2561  *   Maximum size of memory for actions.
2562  */
2563 static int
2564 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2565                               uint64_t *action_flags)
2566 {
2567         int size = 0;
2568         uint64_t flags = 0;
2569
2570         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2571         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2572                 switch (actions->type) {
2573                 case RTE_FLOW_ACTION_TYPE_VOID:
2574                         break;
2575                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2576                         size += SZ_NLATTR_NEST + /* na_act_index. */
2577                                 SZ_NLATTR_STRZ_OF("mirred") +
2578                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2579                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2580                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2581                         break;
2582                 case RTE_FLOW_ACTION_TYPE_JUMP:
2583                         size += SZ_NLATTR_NEST + /* na_act_index. */
2584                                 SZ_NLATTR_STRZ_OF("gact") +
2585                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2586                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2587                         flags |= MLX5_FLOW_ACTION_JUMP;
2588                         break;
2589                 case RTE_FLOW_ACTION_TYPE_DROP:
2590                         size += SZ_NLATTR_NEST + /* na_act_index. */
2591                                 SZ_NLATTR_STRZ_OF("gact") +
2592                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2593                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2594                         flags |= MLX5_FLOW_ACTION_DROP;
2595                         break;
2596                 case RTE_FLOW_ACTION_TYPE_COUNT:
2597                         break;
2598                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2599                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2600                         goto action_of_vlan;
2601                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2602                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2603                         goto action_of_vlan;
2604                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2605                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2606                         goto action_of_vlan;
2607                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2608                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2609                         goto action_of_vlan;
2610 action_of_vlan:
2611                         size += SZ_NLATTR_NEST + /* na_act_index. */
2612                                 SZ_NLATTR_STRZ_OF("vlan") +
2613                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2614                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2615                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2616                                 /* VLAN protocol. */
2617                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2618                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2619                         break;
2620                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2621                         size += SZ_NLATTR_NEST + /* na_act_index. */
2622                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2623                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2624                                 SZ_NLATTR_TYPE_OF(uint8_t);
2625                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2626                         size += flow_tcf_vxlan_encap_size(actions) +
2627                                 RTE_ALIGN_CEIL /* preceding encap params. */
2628                                 (sizeof(struct flow_tcf_vxlan_encap),
2629                                 MNL_ALIGNTO);
2630                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2631                         break;
2632                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2633                         size += SZ_NLATTR_NEST + /* na_act_index. */
2634                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2635                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2636                                 SZ_NLATTR_TYPE_OF(uint8_t);
2637                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2638                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2639                                 (sizeof(struct flow_tcf_vxlan_decap),
2640                                 MNL_ALIGNTO);
2641                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2642                         break;
2643                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2644                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2645                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2646                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2647                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2648                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2649                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2650                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2651                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2652                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2653                         size += flow_tcf_get_pedit_actions_size(&actions,
2654                                                                 &flags);
2655                         break;
2656                 default:
2657                         DRV_LOG(WARNING,
2658                                 "unsupported action %p type %d,"
2659                                 " actions must be validated before flow creation",
2660                                 (const void *)actions, actions->type);
2661                         break;
2662                 }
2663         }
2664         *action_flags = flags;
2665         return size;
2666 }
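/*
 * Worked example (illustrative only, assuming MNL_ALIGNTO == 4 and the
 * usual 20-byte tc_gen layout): RTE_FLOW_ACTION_TYPE_DROP reserves
 *
 *   SZ_NLATTR_NEST                    =  4 (na_act_index)
 *   SZ_NLATTR_STRZ_OF("gact")         = 12 (4 + "gact\0", aligned)
 *   SZ_NLATTR_NEST                    =  4 (TCA_ACT_OPTIONS)
 *   SZ_NLATTR_TYPE_OF(struct tc_gact) = 24 (4 + 20)
 *                               total = 44 bytes
 */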
2667
2668 /**
2669  * Brand rtnetlink buffer with unique handle.
2670  *
2671  * This handle should be unique for a given network interface to avoid
2672  * collisions.
2673  *
2674  * @param nlh
2675  *   Pointer to Netlink message.
2676  * @param handle
2677  *   Unique 32-bit handle to use.
2678  */
2679 static void
2680 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2681 {
2682         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2683
2684         tcm->tcm_handle = handle;
2685         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2686                 (void *)nlh, handle);
2687 }
2688
2689 /**
2690  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2691  * memory required, allocates the memory, initializes Netlink message headers
2692  * and sets a unique TC message handle.
2693  *
2694  * @param[in] attr
2695  *   Pointer to the flow attributes.
2696  * @param[in] items
2697  *   Pointer to the list of items.
2698  * @param[in] actions
2699  *   Pointer to the list of actions.
2700  * @param[out] error
2701  *   Pointer to the error structure.
2702  *
2703  * @return
2704  *   Pointer to mlx5_flow object on success,
2705  *   otherwise NULL and rte_errno is set.
2706  */
2707 static struct mlx5_flow *
2708 flow_tcf_prepare(const struct rte_flow_attr *attr,
2709                  const struct rte_flow_item items[],
2710                  const struct rte_flow_action actions[],
2711                  struct rte_flow_error *error)
2712 {
2713         size_t size = RTE_ALIGN_CEIL
2714                         (sizeof(struct mlx5_flow),
2715                          alignof(struct flow_tcf_tunnel_hdr)) +
2716                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2717                       MNL_ALIGN(sizeof(struct tcmsg));
2718         struct mlx5_flow *dev_flow;
2719         uint64_t action_flags = 0;
2720         struct nlmsghdr *nlh;
2721         struct tcmsg *tcm;
2722         uint8_t *sp, *tun = NULL;
2723
2724         size += flow_tcf_get_items_size(attr, items, &action_flags);
2725         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2726         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2727         if (!dev_flow) {
2728                 rte_flow_error_set(error, ENOMEM,
2729                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2730                                    "not enough memory to create E-Switch flow");
2731                 return NULL;
2732         }
2733         sp = (uint8_t *)(dev_flow + 1);
2734         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2735                 sp = RTE_PTR_ALIGN
2736                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2737                 tun = sp;
2738                 sp += RTE_ALIGN_CEIL
2739                         (sizeof(struct flow_tcf_vxlan_encap),
2740                         MNL_ALIGNTO);
2741 #ifndef NDEBUG
2742                 size -= RTE_ALIGN_CEIL
2743                         (sizeof(struct flow_tcf_vxlan_encap),
2744                         MNL_ALIGNTO);
2745 #endif
2746         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2747                 sp = RTE_PTR_ALIGN
2748                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2749                 tun = sp;
2750                 sp += RTE_ALIGN_CEIL
2751                         (sizeof(struct flow_tcf_vxlan_decap),
2752                         MNL_ALIGNTO);
2753 #ifndef NDEBUG
2754                 size -= RTE_ALIGN_CEIL
2755                         (sizeof(struct flow_tcf_vxlan_decap),
2756                         MNL_ALIGNTO);
2757 #endif
2758         } else {
2759                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2760         }
2761         nlh = mnl_nlmsg_put_header(sp);
2762         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2763         *dev_flow = (struct mlx5_flow){
2764                 .tcf = (struct mlx5_flow_tcf){
2765 #ifndef NDEBUG
2766                         .nlsize = size - RTE_ALIGN_CEIL
2767                                 (sizeof(struct mlx5_flow),
2768                                  alignof(struct flow_tcf_tunnel_hdr)),
2769 #endif
2770                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2771                         .nlh = nlh,
2772                         .tcm = tcm,
2773                 },
2774         };
2775         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2776                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2777         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2778                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2779         /*
2780          * Generate a reasonably unique handle based on the address of the
2781          * target buffer.
2782          *
2783          * This is straightforward on 32-bit systems where the flow pointer can
2784  * be used directly. Otherwise, the pointer is shifted right by log2 of
2785  * the previous power of two of the buffer size and its least
2786  * significant bits are used.
2787          */
2788         if (sizeof(dev_flow) <= 4)
2789                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2790         else
2791                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2792                                        rte_log2_u32(rte_align32prevpow2(size)));
2793         return dev_flow;
2794 }
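/*
 * Resulting buffer layout (illustrative sketch):
 *
 *   +----------------------+ <- rte_zmalloc(__func__, size, MNL_ALIGNTO)
 *   | struct mlx5_flow     |
 *   +----------------------+ <- aligned to flow_tcf_tunnel_hdr
 *   | encap/decap params   |    (VXLAN encap/decap flows only)
 *   +----------------------+ <- MNL_ALIGNTO aligned
 *   | struct nlmsghdr      |
 *   | struct tcmsg         |
 *   | flower attributes... |    (filled by flow_tcf_translate())
 *   +----------------------+
 *
 * Branding example: for a 64-bit pointer and size == 300,
 * rte_align32prevpow2(300) == 256, so the pointer is shifted right by
 * log2(256) == 8 bits before being truncated to the 32-bit handle.
 */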
2795
2796 /**
2797  * Make adjustments for supporting count actions.
2798  *
2799  * @param[in] dev
2800  *   Pointer to the Ethernet device structure.
2801  * @param[in] dev_flow
2802  *   Pointer to mlx5_flow.
2803  * @param[out] error
2804  *   Pointer to error structure.
2805  *
2806  * @return
2807  *   0 on success, a negative errno value otherwise and rte_errno is set.
2808  */
2809 static int
2810 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2811                                   struct mlx5_flow *dev_flow,
2812                                   struct rte_flow_error *error)
2813 {
2814         struct rte_flow *flow = dev_flow->flow;
2815
2816         if (!flow->counter) {
2817                 flow->counter = flow_tcf_counter_new();
2818                 if (!flow->counter)
2819                         return rte_flow_error_set(error, rte_errno,
2820                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2821                                                   NULL,
2822                                                   "cannot get counter"
2823                                                   " context.");
2824         }
2825         return 0;
2826 }
2827
2828 /**
2829  * Convert VXLAN VNI to 32-bit integer.
2830  *
2831  * @param[in] vni
2832  *   VXLAN VNI in 24-bit wire format.
2833  *
2834  * @return
2835  *   VXLAN VNI as a 32-bit integer value in network byte order.
2836  */
2837 static inline rte_be32_t
2838 vxlan_vni_as_be32(const uint8_t vni[3])
2839 {
2840         union {
2841                 uint8_t vni[4];
2842                 rte_be32_t dword;
2843         } ret = {
2844                 .vni = { 0, vni[0], vni[1], vni[2] },
2845         };
2846         return ret.dword;
2847 }
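/*
 * Example (illustrative only): VNI 0x123456 arrives on the wire as the
 * byte sequence {0x12, 0x34, 0x56}; the union above prepends a zero
 * byte, yielding the 32-bit big-endian value 0x00123456 expected by
 * TCA_FLOWER_KEY_ENC_KEY_ID.
 */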
2848
2849 /**
2850  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2851  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2852  * in the encapsulation parameters structure. The item must be prevalidated,
2853  * and no validation checks are performed by this function.
2854  *
2855  * @param[in] spec
2856  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2857  * @param[in] mask
2858  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2859  * @param[out] encap
2860  *   Structure to fill the gathered MAC address data.
2861  */
2862 static void
2863 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2864                                const struct rte_flow_item_eth *mask,
2865                                struct flow_tcf_vxlan_encap *encap)
2866 {
2867         /* Item must be validated before. No redundant checks. */
2868         assert(spec);
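        /*
         * The address is copied from the specification only when its
         * mask is absent (so the default all-ones mask applies) or
         * matches the default mask exactly: partial MAC address masks
         * cannot be expressed for the tunnel_key action.
         */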
2869         if (!mask || !memcmp(&mask->dst,
2870                              &rte_flow_item_eth_mask.dst,
2871                              sizeof(rte_flow_item_eth_mask.dst))) {
2872                 /*
2873                  * Ethernet addresses are not supported by
2874                  * tc as tunnel_key parameters. The destination
2875                  * address is needed to form the encap packet
2876                  * header and is retrieved by the kernel from
2877                  * implicit sources (ARP table, etc.); address
2878                  * masks are not supported at all.
2879                  */
2880                 encap->eth.dst = spec->dst;
2881                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2882         }
2883         if (!mask || !memcmp(&mask->src,
2884                              &rte_flow_item_eth_mask.src,
2885                              sizeof(rte_flow_item_eth_mask.src))) {
2886                 /*
2887                  * Ethernet addresses are not supported by
2888                  * tc as tunnel_key parameters. Source ethernet
2889                  * address is ignored anyway.
2890                  */
2891                 encap->eth.src = spec->src;
2892                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2893         }
2894 }
2895
2896 /**
2897  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2898  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2899  * in the encapsulation parameters structure. The item must be prevalidated,
2900  * and no validation checks are performed by this function.
2901  *
2902  * @param[in] spec
2903  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2904  * @param[out] encap
2905  *   Structure to fill the gathered IPV4 address data.
2906  */
2907 static void
2908 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2909                                 struct flow_tcf_vxlan_encap *encap)
2910 {
2911         /* Item must be validated before. No redundant checks. */
2912         assert(spec);
2913         encap->ipv4.dst = spec->hdr.dst_addr;
2914         encap->ipv4.src = spec->hdr.src_addr;
2915         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2916                        FLOW_TCF_ENCAP_IPV4_DST;
2917 }
2918
2919 /**
2920  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2921  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2922  * in the encapsulation parameters structure. The item must be prevalidated,
2923  * and no validation checks are performed by this function.
2924  *
2925  * @param[in] spec
2926  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2927  * @param[out] encap
2928  *   Structure to fill the gathered IPV6 address data.
2929  */
2930 static void
2931 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2932                                 struct flow_tcf_vxlan_encap *encap)
2933 {
2934         /* Item must be validated before. No redundant checks. */
2935         assert(spec);
2936         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2937         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2938         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2939                        FLOW_TCF_ENCAP_IPV6_DST;
2940 }
2941
2942 /**
2943  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2944  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2945  * in the encapsulation parameters structure. The item must be prevalidated,
2946  * and no validation checks are performed by this function.
2947  *
2948  * @param[in] spec
2949  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2950  * @param[in] mask
2951  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2952  * @param[out] encap
2953  *   Structure to fill the gathered UDP port data.
2954  */
2955 static void
2956 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2957                                const struct rte_flow_item_udp *mask,
2958                                struct flow_tcf_vxlan_encap *encap)
2959 {
2960         assert(spec);
2961         encap->udp.dst = spec->hdr.dst_port;
2962         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2963         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2964                 encap->udp.src = spec->hdr.src_port;
2965                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2966         }
2967 }
2968
2969 /**
2970  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2971  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2972  * in the encapsulation parameters structure. The item must be prevalidated,
2973  * and no validation checks are performed by this function.
2974  *
2975  * @param[in] spec
2976  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2977  * @param[out] encap
2978  *   Structure to fill the gathered VNI address data.
2979  */
2980 static void
2981 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2982                                struct flow_tcf_vxlan_encap *encap)
2983 {
2984         /* Item must be validated before. No redundant checks. */
2985         assert(spec);
2986         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2987         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2988 }
2989
2990 /**
2991  * Populate consolidated encapsulation object from list of pattern items.
2992  *
2993  * Helper function to process configuration of action such as
2994  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
2995  * validated, since there is no way to return a meaningful error.
2996  *
2997  * @param[in] action
2998  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2999  *   List of pattern items to gather data from.
3000  * @param[out] encap
3001  *   Structure to fill gathered data.
3002  */
3003 static void
3004 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3005                            struct flow_tcf_vxlan_encap *encap)
3006 {
3007         union {
3008                 const struct rte_flow_item_eth *eth;
3009                 const struct rte_flow_item_ipv4 *ipv4;
3010                 const struct rte_flow_item_ipv6 *ipv6;
3011                 const struct rte_flow_item_udp *udp;
3012                 const struct rte_flow_item_vxlan *vxlan;
3013         } spec, mask;
3014         const struct rte_flow_item *items;
3015
3016         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3017         assert(action->conf);
3018
3019         items = ((const struct rte_flow_action_vxlan_encap *)
3020                                         action->conf)->definition;
3021         assert(items);
3022         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3023                 switch (items->type) {
3024                 case RTE_FLOW_ITEM_TYPE_VOID:
3025                         break;
3026                 case RTE_FLOW_ITEM_TYPE_ETH:
3027                         mask.eth = items->mask;
3028                         spec.eth = items->spec;
3029                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3030                                                        encap);
3031                         break;
3032                 case RTE_FLOW_ITEM_TYPE_IPV4:
3033                         spec.ipv4 = items->spec;
3034                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3035                         break;
3036                 case RTE_FLOW_ITEM_TYPE_IPV6:
3037                         spec.ipv6 = items->spec;
3038                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3039                         break;
3040                 case RTE_FLOW_ITEM_TYPE_UDP:
3041                         mask.udp = items->mask;
3042                         spec.udp = items->spec;
3043                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3044                                                        encap);
3045                         break;
3046                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3047                         spec.vxlan = items->spec;
3048                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3049                         break;
3050                 default:
3051                         assert(false);
3052                         DRV_LOG(WARNING,
3053                                 "unsupported item %p type %d,"
3054                                 " items must be validated"
3055                                 " before flow creation",
3056                                 (const void *)items, items->type);
3057                         encap->mask = 0;
3058                         return;
3059                 }
3060         }
3061 }
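/*
 * Example (illustrative only): a minimal definition list as an
 * application might attach to RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP,
 * assuming statically initialized *_spec structures:
 *
 *   static const struct rte_flow_item vxlan_encap_def[] = {
 *           { .type = RTE_FLOW_ITEM_TYPE_ETH,   .spec = &eth_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_IPV4,  .spec = &ipv4_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_UDP,   .spec = &udp_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *
 * Each entry is dispatched to its flow_tcf_parse_vxlan_encap_*()
 * helper above, accumulating header fields and FLOW_TCF_ENCAP_* bits
 * in the consolidated structure.
 */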
3062
3063 /**
3064  * Translate flow for Linux TC flower and construct Netlink message.
3065  *
3066  * @param[in] dev
3067  *   Pointer to the Ethernet device structure.
3068  * @param[in, out] dev_flow
3069  *   Pointer to the sub flow.
3070  * @param[in] attr
3071  *   Pointer to the flow attributes.
3072  * @param[in] items
3073  *   Pointer to the list of items.
3074  * @param[in] actions
3075  *   Pointer to the list of actions.
3076  * @param[out] error
3077  *   Pointer to the error structure.
3078  *
3079  * @return
3080  *   0 on success, a negative errno value otherwise and rte_errno is set.
3081  */
3082 static int
3083 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3084                    const struct rte_flow_attr *attr,
3085                    const struct rte_flow_item items[],
3086                    const struct rte_flow_action actions[],
3087                    struct rte_flow_error *error)
3088 {
3089         union {
3090                 const struct rte_flow_item_port_id *port_id;
3091                 const struct rte_flow_item_eth *eth;
3092                 const struct rte_flow_item_vlan *vlan;
3093                 const struct rte_flow_item_ipv4 *ipv4;
3094                 const struct rte_flow_item_ipv6 *ipv6;
3095                 const struct rte_flow_item_tcp *tcp;
3096                 const struct rte_flow_item_udp *udp;
3097                 const struct rte_flow_item_vxlan *vxlan;
3098         } spec, mask;
3099         union {
3100                 const struct rte_flow_action_port_id *port_id;
3101                 const struct rte_flow_action_jump *jump;
3102                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3103                 const struct rte_flow_action_of_set_vlan_vid *
3104                         of_set_vlan_vid;
3105                 const struct rte_flow_action_of_set_vlan_pcp *
3106                         of_set_vlan_pcp;
3107         } conf;
3108         union {
3109                 struct flow_tcf_tunnel_hdr *hdr;
3110                 struct flow_tcf_vxlan_decap *vxlan;
3111         } decap = {
3112                 .hdr = NULL,
3113         };
3114         union {
3115                 struct flow_tcf_tunnel_hdr *hdr;
3116                 struct flow_tcf_vxlan_encap *vxlan;
3117         } encap = {
3118                 .hdr = NULL,
3119         };
3120         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3121         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3122         struct tcmsg *tcm = dev_flow->tcf.tcm;
3123         uint32_t na_act_index_cur;
3124         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3125         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3126         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3127         bool ip_proto_set = 0;
3128         bool tunnel_outer = 0;
3129         struct nlattr *na_flower;
3130         struct nlattr *na_flower_act;
3131         struct nlattr *na_vlan_id = NULL;
3132         struct nlattr *na_vlan_priority = NULL;
3133         uint64_t item_flags = 0;
3134         int ret;
3135
3136         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3137                                                 PTOI_TABLE_SZ_MAX(dev)));
3138         if (dev_flow->tcf.tunnel) {
3139                 switch (dev_flow->tcf.tunnel->type) {
3140                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3141                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3142                         tunnel_outer = 1;
3143                         break;
3144                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3145                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3146                         break;
3147                 /* New tunnel actions can be added here. */
3148                 default:
3149                         assert(false);
3150                         break;
3151                 }
3152         }
3153         nlh = dev_flow->tcf.nlh;
3154         tcm = dev_flow->tcf.tcm;
3155         /* Prepare API must have been called beforehand. */
3156         assert(nlh != NULL && tcm != NULL);
3157         tcm->tcm_family = AF_UNSPEC;
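        /*
         * Default to the first entry of the port-to-ifindex table; a
         * PORT_ID pattern item below may override it with the ifindex
         * matching the requested port.
         */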
3158         tcm->tcm_ifindex = ptoi[0].ifindex;
3159         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3160         /*
3161          * Priority cannot be zero to prevent the kernel from picking one
3162          * automatically.
3163          */
3164         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3165         if (attr->group > 0)
3166                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3167         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3168         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3169         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3170                 unsigned int i;
3171
3172                 switch (items->type) {
3173                 case RTE_FLOW_ITEM_TYPE_VOID:
3174                         break;
3175                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3176                         mask.port_id = flow_tcf_item_mask
3177                                 (items, &rte_flow_item_port_id_mask,
3178                                  &flow_tcf_mask_supported.port_id,
3179                                  &flow_tcf_mask_empty.port_id,
3180                                  sizeof(flow_tcf_mask_supported.port_id),
3181                                  error);
3182                         assert(mask.port_id);
3183                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3184                                 break;
3185                         spec.port_id = items->spec;
3186                         if (!mask.port_id->id)
3187                                 i = 0;
3188                         else
3189                                 for (i = 0; ptoi[i].ifindex; ++i)
3190                                         if (ptoi[i].port_id == spec.port_id->id)
3191                                                 break;
3192                         assert(ptoi[i].ifindex);
3193                         tcm->tcm_ifindex = ptoi[i].ifindex;
3194                         break;
3195                 case RTE_FLOW_ITEM_TYPE_ETH:
3196                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3197                                       MLX5_FLOW_LAYER_INNER_L2 :
3198                                       MLX5_FLOW_LAYER_OUTER_L2;
3199                         mask.eth = flow_tcf_item_mask
3200                                 (items, &rte_flow_item_eth_mask,
3201                                  &flow_tcf_mask_supported.eth,
3202                                  &flow_tcf_mask_empty.eth,
3203                                  sizeof(flow_tcf_mask_supported.eth),
3204                                  error);
3205                         assert(mask.eth);
3206                         if (mask.eth == &flow_tcf_mask_empty.eth)
3207                                 break;
3208                         spec.eth = items->spec;
3209                         if (mask.eth->type) {
3210                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3211                                         inner_etype = spec.eth->type;
3212                                 else
3213                                         outer_etype = spec.eth->type;
3214                         }
3215                         if (tunnel_outer) {
3216                                 DRV_LOG(WARNING,
3217                                         "outer L2 addresses cannot be"
3218                                         " forced for tunnel outer headers,"
3219                                         " parameter is ignored");
3220                                 break;
3221                         }
3222                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3223                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3224                                              ETHER_ADDR_LEN,
3225                                              spec.eth->dst.addr_bytes);
3226                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3227                                              ETHER_ADDR_LEN,
3228                                              mask.eth->dst.addr_bytes);
3229                         }
3230                         if (!is_zero_ether_addr(&mask.eth->src)) {
3231                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3232                                              ETHER_ADDR_LEN,
3233                                              spec.eth->src.addr_bytes);
3234                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3235                                              ETHER_ADDR_LEN,
3236                                              mask.eth->src.addr_bytes);
3237                         }
3238                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3239                         break;
3240                 case RTE_FLOW_ITEM_TYPE_VLAN:
3241                         assert(!encap.hdr);
3242                         assert(!decap.hdr);
3243                         assert(!tunnel_outer);
3244                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3245                         mask.vlan = flow_tcf_item_mask
3246                                 (items, &rte_flow_item_vlan_mask,
3247                                  &flow_tcf_mask_supported.vlan,
3248                                  &flow_tcf_mask_empty.vlan,
3249                                  sizeof(flow_tcf_mask_supported.vlan),
3250                                  error);
3251                         assert(mask.vlan);
3252                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3253                                 break;
3254                         spec.vlan = items->spec;
3255                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3256                                outer_etype == RTE_BE16(ETH_P_8021Q));
3257                         outer_etype = RTE_BE16(ETH_P_8021Q);
3258                         if (mask.vlan->inner_type)
3259                                 vlan_etype = spec.vlan->inner_type;
3260                         if (mask.vlan->tci & RTE_BE16(0xe000))
3261                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3262                                                 (rte_be_to_cpu_16
3263                                                  (spec.vlan->tci) >> 13) & 0x7);
3264                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3265                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3266                                                  rte_be_to_cpu_16
3267                                                  (spec.vlan->tci &
3268                                                   RTE_BE16(0x0fff)));
3269                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3270                         break;
3271                 case RTE_FLOW_ITEM_TYPE_IPV4:
3272                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3273                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3274                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3275                         mask.ipv4 = flow_tcf_item_mask
3276                                 (items, &rte_flow_item_ipv4_mask,
3277                                  &flow_tcf_mask_supported.ipv4,
3278                                  &flow_tcf_mask_empty.ipv4,
3279                                  sizeof(flow_tcf_mask_supported.ipv4),
3280                                  error);
3281                         assert(mask.ipv4);
3282                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3283                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3284                                        inner_etype == RTE_BE16(ETH_P_IP));
3285                                 inner_etype = RTE_BE16(ETH_P_IP);
3286                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3287                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3288                                        vlan_etype == RTE_BE16(ETH_P_IP));
3289                                 vlan_etype = RTE_BE16(ETH_P_IP);
3290                         } else {
3291                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3292                                        outer_etype == RTE_BE16(ETH_P_IP));
3293                                 outer_etype = RTE_BE16(ETH_P_IP);
3294                         }
3295                         spec.ipv4 = items->spec;
3296                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3297                                 /*
3298                                  * No way to set IP protocol for outer tunnel
3299                                  * layers. Usually it is fixed, for example,
3300                                  * to UDP for VXLAN/GPE.
3301                                  */
3302                                 assert(spec.ipv4); /* Mask is not empty. */
3303                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3304                                                 spec.ipv4->hdr.next_proto_id);
3305                                 ip_proto_set = 1;
3306                         }
3307                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3308                              (!mask.ipv4->hdr.src_addr &&
3309                               !mask.ipv4->hdr.dst_addr)) {
3310                                 if (!tunnel_outer)
3311                                         break;
3312                                 /*
3313                                  * For tunnel outer we must set outer IP key
3314                                  * anyway, even if the specification/mask is
3315                                  * empty. There is no other way to tell the
3316                                  * kernel about the outer layer protocol.
3317                                  */
3318                                 mnl_attr_put_u32
3319                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3320                                          mask.ipv4->hdr.src_addr);
3321                                 mnl_attr_put_u32
3322                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3323                                          mask.ipv4->hdr.src_addr);
3324                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3325                                 break;
3326                         }
3327                         if (mask.ipv4->hdr.src_addr) {
3328                                 mnl_attr_put_u32
3329                                         (nlh, tunnel_outer ?
3330                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3331                                          TCA_FLOWER_KEY_IPV4_SRC,
3332                                          spec.ipv4->hdr.src_addr);
3333                                 mnl_attr_put_u32
3334                                         (nlh, tunnel_outer ?
3335                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3336                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3337                                          mask.ipv4->hdr.src_addr);
3338                         }
3339                         if (mask.ipv4->hdr.dst_addr) {
3340                                 mnl_attr_put_u32
3341                                         (nlh, tunnel_outer ?
3342                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3343                                          TCA_FLOWER_KEY_IPV4_DST,
3344                                          spec.ipv4->hdr.dst_addr);
3345                                 mnl_attr_put_u32
3346                                         (nlh, tunnel_outer ?
3347                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3348                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3349                                          mask.ipv4->hdr.dst_addr);
3350                         }
3351                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3352                         break;
3353                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3354                         bool ipv6_src, ipv6_dst;
3355
3356                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3357                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3358                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3359                         mask.ipv6 = flow_tcf_item_mask
3360                                 (items, &rte_flow_item_ipv6_mask,
3361                                  &flow_tcf_mask_supported.ipv6,
3362                                  &flow_tcf_mask_empty.ipv6,
3363                                  sizeof(flow_tcf_mask_supported.ipv6),
3364                                  error);
3365                         assert(mask.ipv6);
3366                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3367                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3368                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3369                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3370                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3371                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3372                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3373                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3374                         } else {
3375                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3376                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3377                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3378                         }
3379                         spec.ipv6 = items->spec;
3380                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3381                                 /*
3382                                  * No way to set IP protocol for outer tunnel
3383                                  * layers. Usually it is fixed, for example,
3384                                  * to UDP for VXLAN/GPE.
3385                                  */
3386                                 assert(spec.ipv6); /* Mask is not empty. */
3387                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3388                                                 spec.ipv6->hdr.proto);
3389                                 ip_proto_set = 1;
3390                         }
3391                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3392                                                 (mask.ipv6->hdr.dst_addr);
3393                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3394                                                 (mask.ipv6->hdr.src_addr);
3395                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3396                              (!ipv6_dst && !ipv6_src)) {
3397                                 if (!tunnel_outer)
3398                                         break;
3399                                 /*
3400                                  * For tunnel outer we must set outer IP key
3401                                  * anyway, even if the specification/mask is
3402                                  * empty. There is no other way to tell the
3403                                  * kernel about the outer layer protocol.
3404                                  */
3405                                 mnl_attr_put(nlh,
3406                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3407                                              IPV6_ADDR_LEN,
3408                                              mask.ipv6->hdr.src_addr);
3409                                 mnl_attr_put(nlh,
3410                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3411                                              IPV6_ADDR_LEN,
3412                                              mask.ipv6->hdr.src_addr);
3413                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3414                                 break;
3415                         }
3416                         if (ipv6_src) {
3417                                 mnl_attr_put(nlh, tunnel_outer ?
3418                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3419                                              TCA_FLOWER_KEY_IPV6_SRC,
3420                                              IPV6_ADDR_LEN,
3421                                              spec.ipv6->hdr.src_addr);
3422                                 mnl_attr_put(nlh, tunnel_outer ?
3423                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3424                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3425                                              IPV6_ADDR_LEN,
3426                                              mask.ipv6->hdr.src_addr);
3427                         }
3428                         if (ipv6_dst) {
3429                                 mnl_attr_put(nlh, tunnel_outer ?
3430                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3431                                              TCA_FLOWER_KEY_IPV6_DST,
3432                                              IPV6_ADDR_LEN,
3433                                              spec.ipv6->hdr.dst_addr);
3434                                 mnl_attr_put(nlh, tunnel_outer ?
3435                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3436                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3437                                              IPV6_ADDR_LEN,
3438                                              mask.ipv6->hdr.dst_addr);
3439                         }
3440                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3441                         break;
3442                 }
3443                 case RTE_FLOW_ITEM_TYPE_UDP:
3444                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3445                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3446                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3447                         mask.udp = flow_tcf_item_mask
3448                                 (items, &rte_flow_item_udp_mask,
3449                                  &flow_tcf_mask_supported.udp,
3450                                  &flow_tcf_mask_empty.udp,
3451                                  sizeof(flow_tcf_mask_supported.udp),
3452                                  error);
3453                         assert(mask.udp);
3454                         spec.udp = items->spec;
3455                         if (!tunnel_outer) {
3456                                 if (!ip_proto_set)
3457                                         mnl_attr_put_u8
3458                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3459                                                 IPPROTO_UDP);
3460                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3461                                         break;
3462                         } else {
3463                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3464                                 decap.vxlan->udp_port =
3465                                         rte_be_to_cpu_16
3466                                                 (spec.udp->hdr.dst_port);
3467                         }
3468                         if (mask.udp->hdr.src_port) {
3469                                 mnl_attr_put_u16
3470                                         (nlh, tunnel_outer ?
3471                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3472                                          TCA_FLOWER_KEY_UDP_SRC,
3473                                          spec.udp->hdr.src_port);
3474                                 mnl_attr_put_u16
3475                                         (nlh, tunnel_outer ?
3476                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3477                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3478                                          mask.udp->hdr.src_port);
3479                         }
3480                         if (mask.udp->hdr.dst_port) {
3481                                 mnl_attr_put_u16
3482                                         (nlh, tunnel_outer ?
3483                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3484                                          TCA_FLOWER_KEY_UDP_DST,
3485                                          spec.udp->hdr.dst_port);
3486                                 mnl_attr_put_u16
3487                                         (nlh, tunnel_outer ?
3488                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3489                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3490                                          mask.udp->hdr.dst_port);
3491                         }
3492                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3493                         break;
3494                 case RTE_FLOW_ITEM_TYPE_TCP:
3495                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3496                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3497                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3498                         mask.tcp = flow_tcf_item_mask
3499                                 (items, &rte_flow_item_tcp_mask,
3500                                  &flow_tcf_mask_supported.tcp,
3501                                  &flow_tcf_mask_empty.tcp,
3502                                  sizeof(flow_tcf_mask_supported.tcp),
3503                                  error);
3504                         assert(mask.tcp);
3505                         if (!ip_proto_set)
3506                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3507                                                 IPPROTO_TCP);
3508                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3509                                 break;
3510                         spec.tcp = items->spec;
3511                         if (mask.tcp->hdr.src_port) {
3512                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3513                                                  spec.tcp->hdr.src_port);
3514                                 mnl_attr_put_u16(nlh,
3515                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3516                                                  mask.tcp->hdr.src_port);
3517                         }
3518                         if (mask.tcp->hdr.dst_port) {
3519                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3520                                                  spec.tcp->hdr.dst_port);
3521                                 mnl_attr_put_u16(nlh,
3522                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3523                                                  mask.tcp->hdr.dst_port);
3524                         }
3525                         if (mask.tcp->hdr.tcp_flags) {
3526                                 mnl_attr_put_u16
3527                                         (nlh,
3528                                          TCA_FLOWER_KEY_TCP_FLAGS,
3529                                          rte_cpu_to_be_16
3530                                                 (spec.tcp->hdr.tcp_flags));
3531                                 mnl_attr_put_u16
3532                                         (nlh,
3533                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3534                                          rte_cpu_to_be_16
3535                                                 (mask.tcp->hdr.tcp_flags));
3536                         }
3537                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3538                         break;
3539                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3540                         assert(decap.vxlan);
3541                         tunnel_outer = 0;
3542                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3543                         spec.vxlan = items->spec;
3544                         mnl_attr_put_u32(nlh,
3545                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3546                                          vxlan_vni_as_be32(spec.vxlan->vni));
3547                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3548                         break;
3549                 default:
3550                         return rte_flow_error_set(error, ENOTSUP,
3551                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3552                                                   NULL, "item not supported");
3553                 }
3554         }
3555         /*
3556          * Set the ether_type flower key and tc rule protocol:
3557          * - if there is neither VLAN nor VXLAN item, the key is taken
3558          *   from the eth item directly or deduced from the L3 items.
3559          * - if there is a VLAN item, the key is fixed to 802.1q.
3560          * - if there is a VXLAN item, the key is set to the inner tunnel type.
3561          * - simultaneous VLAN and VXLAN items are prohibited.
3562          */
3563         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3564                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3565                                            outer_etype);
3566                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3567                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3568                                 mnl_attr_put_u16(nlh,
3569                                                  TCA_FLOWER_KEY_ETH_TYPE,
3570                                                  inner_etype);
3571                 } else {
3572                         mnl_attr_put_u16(nlh,
3573                                          TCA_FLOWER_KEY_ETH_TYPE,
3574                                          outer_etype);
3575                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3576                             vlan_etype != RTE_BE16(ETH_P_ALL))
3577                                 mnl_attr_put_u16(nlh,
3578                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3579                                                  vlan_etype);
3580                 }
3581                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3582         }
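        /*
         * Illustrative outcomes of the selection above (a sketch derived
         * from the rules in the comment, not extra driver logic):
         * - eth/ipv4: protocol (tcm_info) and TCA_FLOWER_KEY_ETH_TYPE are
         *   both ETH_P_IP.
         * - eth/vlan/ipv4: protocol and TCA_FLOWER_KEY_ETH_TYPE are
         *   ETH_P_8021Q, TCA_FLOWER_KEY_VLAN_ETH_TYPE is ETH_P_IP.
         * - VXLAN decap with outer ipv4 and inner ipv6: protocol is the
         *   outer ETH_P_IP, TCA_FLOWER_KEY_ETH_TYPE is the inner ETH_P_IPV6.
         */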
3583         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3584         na_act_index_cur = 1;
3585         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3586                 struct nlattr *na_act_index;
3587                 struct nlattr *na_act;
3588                 unsigned int vlan_act;
3589                 unsigned int i;
3590
3591                 switch (actions->type) {
3592                 case RTE_FLOW_ACTION_TYPE_VOID:
3593                         break;
3594                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3595                         conf.port_id = actions->conf;
3596                         if (conf.port_id->original)
3597                                 i = 0;
3598                         else
3599                                 for (i = 0; ptoi[i].ifindex; ++i)
3600                                         if (ptoi[i].port_id == conf.port_id->id)
3601                                                 break;
3602                         assert(ptoi[i].ifindex);
3603                         na_act_index =
3604                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3605                         assert(na_act_index);
3606                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3607                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3608                         assert(na_act);
3609                         if (encap.hdr) {
3610                                 assert(dev_flow->tcf.tunnel);
3611                                 dev_flow->tcf.tunnel->ifindex_ptr =
3612                                         &((struct tc_mirred *)
3613                                         mnl_attr_get_payload
3614                                         (mnl_nlmsg_get_payload_tail
3615                                                 (nlh)))->ifindex;
3616                         }
3617                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3618                                      sizeof(struct tc_mirred),
3619                                      &(struct tc_mirred){
3620                                         .action = TC_ACT_STOLEN,
3621                                         .eaction = TCA_EGRESS_REDIR,
3622                                         .ifindex = ptoi[i].ifindex,
3623                                      });
3624                         mnl_attr_nest_end(nlh, na_act);
3625                         mnl_attr_nest_end(nlh, na_act_index);
3626                         break;
3627                 case RTE_FLOW_ACTION_TYPE_JUMP:
3628                         conf.jump = actions->conf;
3629                         na_act_index =
3630                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3631                         assert(na_act_index);
3632                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3633                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3634                         assert(na_act);
3635                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3636                                      sizeof(struct tc_gact),
3637                                      &(struct tc_gact){
3638                                         .action = TC_ACT_GOTO_CHAIN |
3639                                                   conf.jump->group,
3640                                      });
3641                         mnl_attr_nest_end(nlh, na_act);
3642                         mnl_attr_nest_end(nlh, na_act_index);
3643                         break;
3644                 case RTE_FLOW_ACTION_TYPE_DROP:
3645                         na_act_index =
3646                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3647                         assert(na_act_index);
3648                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3649                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3650                         assert(na_act);
3651                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3652                                      sizeof(struct tc_gact),
3653                                      &(struct tc_gact){
3654                                         .action = TC_ACT_SHOT,
3655                                      });
3656                         mnl_attr_nest_end(nlh, na_act);
3657                         mnl_attr_nest_end(nlh, na_act_index);
3658                         break;
3659                 case RTE_FLOW_ACTION_TYPE_COUNT:
3660                         /*
3661                          * Driver adds the count action implicitly for
3662                          * each rule it creates.
3663                          */
3664                         ret = flow_tcf_translate_action_count(dev,
3665                                                               dev_flow, error);
3666                         if (ret < 0)
3667                                 return ret;
3668                         break;
3669                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3670                         conf.of_push_vlan = NULL;
3671                         vlan_act = TCA_VLAN_ACT_POP;
3672                         goto action_of_vlan;
3673                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3674                         conf.of_push_vlan = actions->conf;
3675                         vlan_act = TCA_VLAN_ACT_PUSH;
3676                         goto action_of_vlan;
3677                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3678                         conf.of_set_vlan_vid = actions->conf;
3679                         if (na_vlan_id)
3680                                 goto override_na_vlan_id;
3681                         vlan_act = TCA_VLAN_ACT_MODIFY;
3682                         goto action_of_vlan;
3683                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3684                         conf.of_set_vlan_pcp = actions->conf;
3685                         if (na_vlan_priority)
3686                                 goto override_na_vlan_priority;
3687                         vlan_act = TCA_VLAN_ACT_MODIFY;
3688                         goto action_of_vlan;
3689 action_of_vlan:
3690                         na_act_index =
3691                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3692                         assert(na_act_index);
3693                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3694                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3695                         assert(na_act);
3696                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3697                                      sizeof(struct tc_vlan),
3698                                      &(struct tc_vlan){
3699                                         .action = TC_ACT_PIPE,
3700                                         .v_action = vlan_act,
3701                                      });
3702                         if (vlan_act == TCA_VLAN_ACT_POP) {
3703                                 mnl_attr_nest_end(nlh, na_act);
3704                                 mnl_attr_nest_end(nlh, na_act_index);
3705                                 break;
3706                         }
3707                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3708                                 mnl_attr_put_u16(nlh,
3709                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3710                                                  conf.of_push_vlan->ethertype);
3711                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3712                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3713                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3714                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3715                         mnl_attr_nest_end(nlh, na_act);
3716                         mnl_attr_nest_end(nlh, na_act_index);
3717                         if (actions->type ==
3718                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3719 override_na_vlan_id:
3720                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3721                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3722                                         rte_be_to_cpu_16
3723                                         (conf.of_set_vlan_vid->vlan_vid);
3724                         } else if (actions->type ==
3725                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3726 override_na_vlan_priority:
3727                                 na_vlan_priority->nla_type =
3728                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3729                                 *(uint8_t *)mnl_attr_get_payload
3730                                         (na_vlan_priority) =
3731                                         conf.of_set_vlan_pcp->vlan_pcp;
3732                         }
3733                         break;
3734                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3735                         assert(decap.vxlan);
3736                         assert(dev_flow->tcf.tunnel);
3737                         dev_flow->tcf.tunnel->ifindex_ptr =
3738                                 (unsigned int *)&tcm->tcm_ifindex;
3739                         na_act_index =
3740                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3741                         assert(na_act_index);
3742                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3743                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3744                         assert(na_act);
3745                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3746                                 sizeof(struct tc_tunnel_key),
3747                                 &(struct tc_tunnel_key){
3748                                         .action = TC_ACT_PIPE,
3749                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3750                                         });
3751                         mnl_attr_nest_end(nlh, na_act);
3752                         mnl_attr_nest_end(nlh, na_act_index);
3753                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3754                         break;
3755                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3756                         assert(encap.vxlan);
3757                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3758                         na_act_index =
3759                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3760                         assert(na_act_index);
3761                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3762                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3763                         assert(na_act);
3764                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3765                                 sizeof(struct tc_tunnel_key),
3766                                 &(struct tc_tunnel_key){
3767                                         .action = TC_ACT_PIPE,
3768                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3769                                         });
3770                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3771                                 mnl_attr_put_u16(nlh,
3772                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3773                                          encap.vxlan->udp.dst);
3774                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3775                                 mnl_attr_put_u32(nlh,
3776                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3777                                          encap.vxlan->ipv4.src);
3778                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3779                                 mnl_attr_put_u32(nlh,
3780                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3781                                          encap.vxlan->ipv4.dst);
3782                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3783                                 mnl_attr_put(nlh,
3784                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3785                                          sizeof(encap.vxlan->ipv6.src),
3786                                          &encap.vxlan->ipv6.src);
3787                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3788                                 mnl_attr_put(nlh,
3789                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3790                                          sizeof(encap.vxlan->ipv6.dst),
3791                                          &encap.vxlan->ipv6.dst);
3792                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3793                                 mnl_attr_put_u32(nlh,
3794                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3795                                          vxlan_vni_as_be32
3796                                                 (encap.vxlan->vxlan.vni));
3797                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3798                         mnl_attr_nest_end(nlh, na_act);
3799                         mnl_attr_nest_end(nlh, na_act_index);
3800                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3801                         break;
3802                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3803                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3804                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3805                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3806                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3807                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3808                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3809                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3810                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3811                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3812                         na_act_index =
3813                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3814                         flow_tcf_create_pedit_mnl_msg(nlh,
3815                                                       &actions, item_flags);
3816                         mnl_attr_nest_end(nlh, na_act_index);
3817                         break;
3818                 default:
3819                         return rte_flow_error_set(error, ENOTSUP,
3820                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3821                                                   actions,
3822                                                   "action not supported");
3823                 }
3824         }
3825         assert(na_flower);
3826         assert(na_flower_act);
3827         mnl_attr_nest_end(nlh, na_flower_act);
3828         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3829                                         (mnl_nlmsg_get_payload_tail(nlh));
3830         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3831                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3832         mnl_attr_nest_end(nlh, na_flower);
3833         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3834                 dev_flow->tcf.tunnel->ifindex_org =
3835                         *dev_flow->tcf.tunnel->ifindex_ptr;
3836         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3837         return 0;
3838 }
3839
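/*
 * For reference, a translated rule yields a Netlink message laid out
 * roughly as follows (an illustrative sketch, not an exhaustive map;
 * the message header and the "flower" kind are emitted by the rule
 * preparation code, only the parts below are built here):
 *
 *   struct nlmsghdr + struct tcmsg (ifindex, handle, prio/protocol)
 *   TCA_KIND = "flower"
 *   TCA_OPTIONS nest:
 *     TCA_FLOWER_KEY_* attributes (match keys and their masks)
 *     TCA_FLOWER_ACT nest:
 *       1: { TCA_ACT_KIND, TCA_ACT_OPTIONS { ... } }
 *       2: { ... }              (one numbered nest per action)
 *     TCA_FLOWER_FLAGS
 */
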
3840 /**
3841  * Send Netlink message with acknowledgment.
3842  *
3843  * @param tcf
3844  *   Flow context to use.
3845  * @param nlh
3846  *   Message to send. This function always raises the NLM_F_ACK flag before
3847  *   sending.
3848  * @param[in] cb
3849  *   Callback handler for received message.
3850  * @param[in] arg
3851  *   Context pointer for callback handler.
3852  *
3853  * @return
3854  *   0 on success, a negative errno value otherwise and rte_errno is set.
3855  */
3856 static int
3857 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3858                 struct nlmsghdr *nlh,
3859                 mnl_cb_t cb, void *arg)
3860 {
3861         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3862         uint32_t seq = tcf->seq++;
3863         int ret, err = 0;
3864
3865         assert(tcf->nl);
3866         assert(tcf->buf);
3867         if (!seq) {
3868                 /* seq 0 is reserved for kernel event-driven notifications. */
3869                 seq = tcf->seq++;
3870         }
3871         nlh->nlmsg_seq = seq;
3872         nlh->nlmsg_flags |= NLM_F_ACK;
3873         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3874         if (ret <= 0) {
3875                 /* Message send error occurred. */
3876                 rte_errno = errno;
3877                 return -rte_errno;
3878         }
3879         nlh = (struct nlmsghdr *)(tcf->buf);
3880         /*
3881          * The following loop postpones non-fatal errors until multipart
3882          * messages are complete.
3883          */
3884         while (true) {
3885                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3886                 if (ret < 0) {
3887                         err = errno;
3888                         /*
3889                          * In case of overflow, keep receiving until the
3890                          * end of the multipart message. We may lose part
3891                          * of the reply messages, but mark and return an error.
3892                          */
3893                         if (err != ENOSPC ||
3894                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3895                             nlh->nlmsg_type == NLMSG_DONE)
3896                                 break;
3897                 } else {
3898                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3899                         if (!ret) {
3900                                 /*
3901                                  * libmnl returns 0 when a DONE or
3902                                  * success ACK message is found.
3903                                  */
3904                                 break;
3905                         }
3906                         if (ret < 0) {
3907                                 /*
3908                                  * ACK message with error found
3909                                  * or some error occurred.
3910                                  */
3911                                 err = errno;
3912                                 break;
3913                         }
3914                         /* We should continue receiving. */
3915                 }
3916         }
3917         if (!err)
3918                 return 0;
3919         rte_errno = err;
3920         return -err;
3921 }
3922
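/*
 * Usage sketch for flow_tcf_nl_ack() (illustrative only): a request is
 * built in the shared tcf->buf and the kernel acknowledgment is awaited,
 * optionally running a callback on every reply message, e.g.:
 *
 *	struct nlmsghdr *nlh = mnl_nlmsg_put_header(tcf->buf);
 *	struct ifaddrmsg *ifa;
 *
 *	nlh->nlmsg_type = RTM_GETADDR;
 *	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
 *	ifa->ifa_family = AF_UNSPEC;
 *	if (flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
 *		DRV_LOG(WARNING, "netlink: query failed %d", rte_errno);
 */
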
3923 #define MNL_BUF_EXTRA_SPACE 16
3924 #define MNL_REQUEST_SIZE_MIN 256
3925 #define MNL_REQUEST_SIZE_MAX 2048
3926 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3927                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
3928
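/*
 * Worked example of the clamp above: with a common 4 KiB page size,
 * RTE_MAX(4096, MNL_REQUEST_SIZE_MIN) = 4096 and
 * RTE_MIN(4096, MNL_REQUEST_SIZE_MAX) = 2048, so MNL_REQUEST_SIZE
 * evaluates to 2048 bytes. This is the bufsize used for the command
 * buffers in the cleanup routines below.
 */
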
3929 /* Data structures used by flow_tcf_xxx_cb() routines. */
3930 struct tcf_nlcb_buf {
3931         LIST_ENTRY(tcf_nlcb_buf) next;
3932         uint32_t size;
3933         alignas(struct nlmsghdr)
3934         uint8_t msg[]; /**< Netlink message data. */
3935 };
3936
3937 struct tcf_nlcb_context {
3938         unsigned int ifindex; /**< Base interface index. */
3939         uint32_t bufsize;
3940         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3941 };
3942
3943 /**
3944  * Allocate space for a Netlink command in the buffer list.
3945  *
3946  * @param[in, out] ctx
3947  *   Pointer to callback context with command buffers list.
3948  * @param[in] size
3949  *   Required size of data buffer to be allocated.
3950  *
3951  * @return
3952  *   Pointer to allocated memory, aligned as message header.
3953  *   NULL if some error occurred.
3954  */
3955 static struct nlmsghdr *
3956 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3957 {
3958         struct tcf_nlcb_buf *buf;
3959         struct nlmsghdr *nlh;
3960
3961         size = NLMSG_ALIGN(size);
3962         buf = LIST_FIRST(&ctx->nlbuf);
3963         if (buf && (buf->size + size) <= ctx->bufsize) {
3964                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3965                 buf->size += size;
3966                 return nlh;
3967         }
3968         if (size > ctx->bufsize) {
3969                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3970                 return NULL;
3971         }
3972         buf = rte_malloc(__func__,
3973                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3974                         alignof(struct tcf_nlcb_buf));
3975         if (!buf) {
3976                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3977                 return NULL;
3978         }
3979         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3980         buf->size = size;
3981         nlh = (struct nlmsghdr *)&buf->msg[0];
3982         return nlh;
3983 }
3984
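/*
 * Behavior sketch for flow_tcf_alloc_nlcmd() (illustrative, assuming
 * ctx->bufsize == 2048): the list head acts as a bump allocator, so
 * consecutive commands share one buffer until it fills up:
 *
 *	nlh = flow_tcf_alloc_nlcmd(&ctx, 64);   new buffer, bytes 0..63
 *	nlh = flow_tcf_alloc_nlcmd(&ctx, 128);  same buffer, bytes 64..191
 *	nlh = flow_tcf_alloc_nlcmd(&ctx, 2048); does not fit the remainder,
 *	                                        a new buffer is prepended
 *
 * Requests larger than ctx->bufsize fail with NULL.
 */
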
3985 /**
3986  * Send the buffers with prepared Netlink commands. Scans the list and
3987  * sends all found buffers. Buffers are sent and freed unconditionally
3988  * in order to prevent memory leakage if any command fails to be sent.
3989  *
3990  * @param[in] tcf
3991  *   Context object initialized by mlx5_flow_tcf_context_create().
3992  * @param[in, out] ctx
3993  *   Pointer to callback context with command buffers list.
3994  *
3995  * @return
3996  *   Zero value on success, negative errno value otherwise
3997  *   and rte_errno is set.
3998  */
3999 static int
4000 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4001                     struct tcf_nlcb_context *ctx)
4002 {
4003         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4004         int ret = 0;
4005
4006         while (bc) {
4007                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4008                 struct nlmsghdr *nlh;
4009                 uint32_t msg = 0;
4010                 int rc;
4011
4012                 while (msg < bc->size) {
4013                         /*
4014                          * Send the Netlink commands from the buffer one by
4015                          * one. If we sent multiple rule deletion commands
4016                          * in one Netlink message and some error occurred,
4017                          * it could cause multiple ACK error messages and
4018                          * break the sequence numbers of the Netlink
4019                          * communication, because we expect only one ACK reply.
4020                          */
4021                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4022                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4023                         assert((bc->size - msg) >= nlh->nlmsg_len);
4024                         msg += nlh->nlmsg_len;
4025                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4026                         if (rc) {
4027                                 DRV_LOG(WARNING,
4028                                         "netlink: cleanup error %d", rc);
4029                                 if (!ret)
4030                                         ret = rc;
4031                         }
4032                 }
4033                 rte_free(bc);
4034                 bc = bn;
4035         }
4036         LIST_INIT(&ctx->nlbuf);
4037         return ret;
4038 }
4039
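/*
 * The two helpers above are used in a collect-then-send pattern
 * (sketch): a dump request is issued with one of the collector
 * callbacks below, which stores a deletion command per matching
 * object via flow_tcf_alloc_nlcmd(), then the stored commands are
 * flushed:
 *
 *	ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
 *	...
 *	ret = flow_tcf_send_nlcmd(tcf, &ctx);
 */
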
4040 /**
4041  * Collect local IP address rules with the scope link attribute on the
4042  * specified network device. This is a callback routine called by libmnl
4043  * mnl_cb_run() in a loop for every message in a received packet.
4044  *
4045  * @param[in] nlh
4046  *   Pointer to reply header.
4047  * @param[in, out] arg
4048  *   Opaque data pointer for this callback.
4049  *
4050  * @return
4051  *   A positive, nonzero value on success, negative errno value otherwise
4052  *   and rte_errno is set.
4053  */
4054 static int
4055 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4056 {
4057         struct tcf_nlcb_context *ctx = arg;
4058         struct nlmsghdr *cmd;
4059         struct ifaddrmsg *ifa;
4060         struct nlattr *na;
4061         struct nlattr *na_local = NULL;
4062         struct nlattr *na_peer = NULL;
4063         unsigned char family;
4064         uint32_t size;
4065
4066         if (nlh->nlmsg_type != RTM_NEWADDR) {
4067                 rte_errno = EINVAL;
4068                 return -rte_errno;
4069         }
4070         ifa = mnl_nlmsg_get_payload(nlh);
4071         family = ifa->ifa_family;
4072         if (ifa->ifa_index != ctx->ifindex ||
4073             ifa->ifa_scope != RT_SCOPE_LINK ||
4074             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4075             (family != AF_INET && family != AF_INET6))
4076                 return 1;
4077         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4078                 switch (mnl_attr_get_type(na)) {
4079                 case IFA_LOCAL:
4080                         na_local = na;
4081                         break;
4082                 case IFA_ADDRESS:
4083                         na_peer = na;
4084                         break;
4085                 }
4086                 if (na_local && na_peer)
4087                         break;
4088         }
4089         if (!na_local || !na_peer)
4090                 return 1;
4091         /* Local rule found with scope link, permanent and assigned peer. */
4092         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4093                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4094                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4095                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4096         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4097         if (!cmd) {
4098                 rte_errno = ENOMEM;
4099                 return -rte_errno;
4100         }
4101         cmd = mnl_nlmsg_put_header(cmd);
4102         cmd->nlmsg_type = RTM_DELADDR;
4103         cmd->nlmsg_flags = NLM_F_REQUEST;
4104         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4105         ifa->ifa_flags = IFA_F_PERMANENT;
4106         ifa->ifa_scope = RT_SCOPE_LINK;
4107         ifa->ifa_index = ctx->ifindex;
4108         if (family == AF_INET) {
4109                 ifa->ifa_family = AF_INET;
4110                 ifa->ifa_prefixlen = 32;
4111                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4112                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4113         } else {
4114                 ifa->ifa_family = AF_INET6;
4115                 ifa->ifa_prefixlen = 128;
4116                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4117                         mnl_attr_get_payload(na_local));
4118                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4119                         mnl_attr_get_payload(na_peer));
4120         }
4121         assert(size == cmd->nlmsg_len);
4122         return 1;
4123 }
4124
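/*
 * For reference, the RTM_DELADDR command built above is the Netlink
 * equivalent of the following iproute2 invocation (illustrative, IPv4
 * case):
 *
 *	ip addr del <local_ip>/32 peer <peer_ip> scope link dev <ifouter>
 */
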
4125 /**
4126  * Cleanup the local IP addresses on outer interface.
4127  *
4128  * @param[in] tcf
4129  *   Context object initialized by mlx5_flow_tcf_context_create().
4130  * @param[in] ifindex
4131  *   Network interface index to perform cleanup on.
4132  */
4133 static void
4134 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4135                             unsigned int ifindex)
4136 {
4137         struct nlmsghdr *nlh;
4138         struct ifaddrmsg *ifa;
4139         struct tcf_nlcb_context ctx = {
4140                 .ifindex = ifindex,
4141                 .bufsize = MNL_REQUEST_SIZE,
4142                 .nlbuf = LIST_HEAD_INITIALIZER(),
4143         };
4144         int ret;
4145
4146         assert(ifindex);
4147         /*
4148          * Seek and destroy leftover local IP addresses with the
4149          * matching "scope link" property.
4150          */
4151         nlh = mnl_nlmsg_put_header(tcf->buf);
4152         nlh->nlmsg_type = RTM_GETADDR;
4153         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4154         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4155         ifa->ifa_family = AF_UNSPEC;
4156         ifa->ifa_index = ifindex;
4157         ifa->ifa_scope = RT_SCOPE_LINK;
4158         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4159         if (ret)
4160                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4161         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4162         if (ret)
4163                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4164 }
4165
4166 /**
4167  * Collect permanent neigh rules on the specified network device.
4168  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4169  * for every message in a received packet.
4170  *
4171  * @param[in] nlh
4172  *   Pointer to reply header.
4173  * @param[in, out] arg
4174  *   Opaque data pointer for this callback.
4175  *
4176  * @return
4177  *   A positive, nonzero value on success, negative errno value otherwise
4178  *   and rte_errno is set.
4179  */
4180 static int
4181 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4182 {
4183         struct tcf_nlcb_context *ctx = arg;
4184         struct nlmsghdr *cmd;
4185         struct ndmsg *ndm;
4186         struct nlattr *na;
4187         struct nlattr *na_ip = NULL;
4188         struct nlattr *na_mac = NULL;
4189         unsigned char family;
4190         uint32_t size;
4191
4192         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4193                 rte_errno = EINVAL;
4194                 return -rte_errno;
4195         }
4196         ndm = mnl_nlmsg_get_payload(nlh);
4197         family = ndm->ndm_family;
4198         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4199            !(ndm->ndm_state & NUD_PERMANENT) ||
4200            (family != AF_INET && family != AF_INET6))
4201                 return 1;
4202         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4203                 switch (mnl_attr_get_type(na)) {
4204                 case NDA_DST:
4205                         na_ip = na;
4206                         break;
4207                 case NDA_LLADDR:
4208                         na_mac = na;
4209                         break;
4210                 }
4211                 if (na_mac && na_ip)
4212                         break;
4213         }
4214         if (!na_mac || !na_ip)
4215                 return 1;
4216         /* Neigh rule with the permanent attribute found. */
4217         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4218                MNL_ALIGN(sizeof(struct ndmsg)) +
4219                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4220                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4221                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4222         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4223         if (!cmd) {
4224                 rte_errno = ENOMEM;
4225                 return -rte_errno;
4226         }
4227         cmd = mnl_nlmsg_put_header(cmd);
4228         cmd->nlmsg_type = RTM_DELNEIGH;
4229         cmd->nlmsg_flags = NLM_F_REQUEST;
4230         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4231         ndm->ndm_ifindex = ctx->ifindex;
4232         ndm->ndm_state = NUD_PERMANENT;
4233         ndm->ndm_flags = 0;
4234         ndm->ndm_type = 0;
4235         if (family == AF_INET) {
4236                 ndm->ndm_family = AF_INET;
4237                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4238         } else {
4239                 ndm->ndm_family = AF_INET6;
4240                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4241                              mnl_attr_get_payload(na_ip));
4242         }
4243         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4244                      mnl_attr_get_payload(na_mac));
4245         assert(size == cmd->nlmsg_len);
4246         return 1;
4247 }
4248
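/*
 * For reference, the RTM_DELNEIGH command built above is the Netlink
 * equivalent of the following iproute2 invocation (illustrative, IPv4
 * case):
 *
 *	ip neigh del <dst_ip> lladdr <dst_mac> dev <ifouter> nud permanent
 */
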
4249 /**
4250  * Cleanup the neigh rules on outer interface.
4251  *
4252  * @param[in] tcf
4253  *   Context object initialized by mlx5_flow_tcf_context_create().
4254  * @param[in] ifindex
4255  *   Network interface index to perform cleanup on.
4256  */
4257 static void
4258 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4259                             unsigned int ifindex)
4260 {
4261         struct nlmsghdr *nlh;
4262         struct ndmsg *ndm;
4263         struct tcf_nlcb_context ctx = {
4264                 .ifindex = ifindex,
4265                 .bufsize = MNL_REQUEST_SIZE,
4266                 .nlbuf = LIST_HEAD_INITIALIZER(),
4267         };
4268         int ret;
4269
4270         assert(ifindex);
4271         /* Seek and destroy leftover neigh rules. */
4272         nlh = mnl_nlmsg_put_header(tcf->buf);
4273         nlh->nlmsg_type = RTM_GETNEIGH;
4274         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4275         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4276         ndm->ndm_family = AF_UNSPEC;
4277         ndm->ndm_ifindex = ifindex;
4278         ndm->ndm_state = NUD_PERMANENT;
4279         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4280         if (ret)
4281                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4282         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4283         if (ret)
4284                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4285 }
4286
4287 /**
4288  * Collect indices of VXLAN encap/decap interfaces associated with the
4289  * device. This is a callback routine called by libmnl mnl_cb_run() in a
4290  * loop for every message in a received packet.
4291  *
4292  * @param[in] nlh
4293  *   Pointer to reply header.
4294  * @param[in, out] arg
4295  *   Opaque data pointer for this callback.
4296  *
4297  * @return
4298  *   A positive, nonzero value on success, negative errno value otherwise
4299  *   and rte_errno is set.
4300  */
4301 static int
4302 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4303 {
4304         struct tcf_nlcb_context *ctx = arg;
4305         struct nlmsghdr *cmd;
4306         struct ifinfomsg *ifm;
4307         struct nlattr *na;
4308         struct nlattr *na_info = NULL;
4309         struct nlattr *na_vxlan = NULL;
4310         bool found = false;
4311         unsigned int vxindex;
4312         uint32_t size;
4313
4314         if (nlh->nlmsg_type != RTM_NEWLINK) {
4315                 rte_errno = EINVAL;
4316                 return -rte_errno;
4317         }
4318         ifm = mnl_nlmsg_get_payload(nlh);
4319         if (!ifm->ifi_index) {
4320                 rte_errno = EINVAL;
4321                 return -rte_errno;
4322         }
4323         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4324                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4325                         na_info = na;
4326                         break;
4327                 }
4328         if (!na_info)
4329                 return 1;
4330         mnl_attr_for_each_nested(na, na_info) {
4331                 switch (mnl_attr_get_type(na)) {
4332                 case IFLA_INFO_KIND:
4333                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4334                                      mnl_attr_get_len(na)))
4335                                 found = true;
4336                         break;
4337                 case IFLA_INFO_DATA:
4338                         na_vxlan = na;
4339                         break;
4340                 }
4341                 if (found && na_vxlan)
4342                         break;
4343         }
4344         if (!found || !na_vxlan)
4345                 return 1;
4346         found = false;
4347         mnl_attr_for_each_nested(na, na_vxlan) {
4348                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4349                     mnl_attr_get_u32(na) == ctx->ifindex) {
4350                         found = true;
4351                         break;
4352                 }
4353         }
4354         if (!found)
4355                 return 1;
4356         /* Attached VXLAN device found, store the command to delete. */
4357         vxindex = ifm->ifi_index;
4358         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4359                MNL_ALIGN(sizeof(struct ifinfomsg));
4360         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4361         if (!cmd) {
4362                 rte_errno = ENOMEM;
4363                 return -rte_errno;
4364         }
4365         cmd = mnl_nlmsg_put_header(cmd);
4366         cmd->nlmsg_type = RTM_DELLINK;
4367         cmd->nlmsg_flags = NLM_F_REQUEST;
4368         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4369         ifm->ifi_family = AF_UNSPEC;
4370         ifm->ifi_index = vxindex;
4371         assert(size == cmd->nlmsg_len);
4372         return 1;
4373 }
4374
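/*
 * Note that the RTM_DELLINK command built above carries only the
 * ifinfomsg header: deletion is keyed by the interface index alone,
 * the equivalent of an illustrative "ip link del" on the found VXLAN
 * device.
 */
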
4375 /**
4376  * Cleanup the outer interface. Removes all found VXLAN devices
4377  * attached to the specified index and flushes the neigh and local
4378  * IP databases.
4379  *
4380  * @param[in] tcf
4381  *   Context object initialized by mlx5_flow_tcf_context_create().
4382  * @param[in] ifindex
4383  *   Network interface index to perform cleanup on.
4384  */
4385 static void
4386 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4387                             unsigned int ifindex)
4388 {
4389         struct nlmsghdr *nlh;
4390         struct ifinfomsg *ifm;
4391         struct tcf_nlcb_context ctx = {
4392                 .ifindex = ifindex,
4393                 .bufsize = MNL_REQUEST_SIZE,
4394                 .nlbuf = LIST_HEAD_INITIALIZER(),
4395         };
4396         int ret;
4397
4398         assert(ifindex);
4399         /*
4400          * Seek and destroy leftover VXLAN encap/decap interfaces with
4401          * matching properties.
4402          */
4403         nlh = mnl_nlmsg_put_header(tcf->buf);
4404         nlh->nlmsg_type = RTM_GETLINK;
4405         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4406         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4407         ifm->ifi_family = AF_UNSPEC;
4408         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4409         if (ret)
4410                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4411         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4412         if (ret)
4413                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4414 }
4415
4416 /**
4417  * Emit a Netlink message to add/remove a local address on the outer device.
4418  * The address being added is visible within the link only (scope link).
4419  *
4420  * Note that an implicit route is maintained by the kernel due to the
4421  * presence of a peer address (IFA_ADDRESS).
4422  *
4423  * These rules are used for encapsulation only and allow assigning
4424  * the outer tunnel source IP address.
4425  *
4426  * @param[in] tcf
4427  *   Libmnl socket context object.
4428  * @param[in] encap
4429  *   Encapsulation properties (source address and its peer).
4430  * @param[in] ifindex
4431  *   Network interface to apply rule.
4432  * @param[in] enable
4433  *   Toggle between add and remove.
4434  * @param[out] error
4435  *   Perform verbose error reporting if not NULL.
4436  *
4437  * @return
4438  *   0 on success, a negative errno value otherwise and rte_errno is set.
4439  */
4440 static int
4441 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4442                     const struct flow_tcf_vxlan_encap *encap,
4443                     unsigned int ifindex,
4444                     bool enable,
4445                     struct rte_flow_error *error)
4446 {
4447         struct nlmsghdr *nlh;
4448         struct ifaddrmsg *ifa;
4449         alignas(struct nlmsghdr)
4450         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4451
4452         nlh = mnl_nlmsg_put_header(buf);
4453         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4454         nlh->nlmsg_flags =
4455                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4456         nlh->nlmsg_seq = 0;
4457         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4458         ifa->ifa_flags = IFA_F_PERMANENT;
4459         ifa->ifa_scope = RT_SCOPE_LINK;
4460         ifa->ifa_index = ifindex;
4461         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4462                 ifa->ifa_family = AF_INET;
4463                 ifa->ifa_prefixlen = 32;
4464                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4465                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4466                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4467                                               encap->ipv4.dst);
4468         } else {
4469                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4470                 ifa->ifa_family = AF_INET6;
4471                 ifa->ifa_prefixlen = 128;
4472                 mnl_attr_put(nlh, IFA_LOCAL,
4473                                   sizeof(encap->ipv6.src),
4474                                   &encap->ipv6.src);
4475                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4476                         mnl_attr_put(nlh, IFA_ADDRESS,
4477                                           sizeof(encap->ipv6.dst),
4478                                           &encap->ipv6.dst);
4479         }
4480         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4481                 return 0;
4482         return rte_flow_error_set(error, rte_errno,
4483                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4484                                   "netlink: cannot complete IFA request"
4485                                   " (ip addr add)");
4486 }
4487
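/*
 * Usage sketch for flow_tcf_rule_local() (illustrative values, not
 * driver data): adding the local rule for an IPv4 VXLAN encapsulation
 * with source 192.168.1.1 and peer 192.168.1.2 on device "ifouter":
 *
 *	struct flow_tcf_vxlan_encap encap = {
 *		.mask = FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST,
 *		.ipv4.src = RTE_BE32(0xC0A80101),
 *		.ipv4.dst = RTE_BE32(0xC0A80102),
 *	};
 *
 *	ret = flow_tcf_rule_local(tcf, &encap, ifouter, true, error);
 */
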
4488 /**
4489  * Emit Netlink message to add/remove neighbor.
4490  *
4491  * @param[in] tcf
4492  *   Libmnl socket context object.
4493  * @param[in] encap
4494  *   Encapsulation properties (destination address).
4495  * @param[in] ifindex
4496  *   Network interface.
4497  * @param[in] enable
4498  *   Toggle between add and remove.
4499  * @param[out] error
4500  *   Perform verbose error reporting if not NULL.
4501  *
4502  * @return
4503  *   0 on success, a negative errno value otherwise and rte_errno is set.
4504  */
4505 static int
4506 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4507                      const struct flow_tcf_vxlan_encap *encap,
4508                      unsigned int ifindex,
4509                      bool enable,
4510                      struct rte_flow_error *error)
4511 {
4512         struct nlmsghdr *nlh;
4513         struct ndmsg *ndm;
4514         alignas(struct nlmsghdr)
4515         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4516
4517         nlh = mnl_nlmsg_put_header(buf);
4518         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4519         nlh->nlmsg_flags =
4520                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4521         nlh->nlmsg_seq = 0;
4522         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4523         ndm->ndm_ifindex = ifindex;
4524         ndm->ndm_state = NUD_PERMANENT;
4525         ndm->ndm_flags = 0;
4526         ndm->ndm_type = 0;
4527         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4528                 ndm->ndm_family = AF_INET;
4529                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4530         } else {
4531                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4532                 ndm->ndm_family = AF_INET6;
4533                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4534                                                  &encap->ipv6.dst);
4535         }
4536         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4537                 DRV_LOG(WARNING,
4538                         "outer ethernet source address cannot be "
4539                         "forced for VXLAN encapsulation");
4540         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4541                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4542                                                     &encap->eth.dst);
4543         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4544                 return 0;
4545         return rte_flow_error_set(error, rte_errno,
4546                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4547                                   "netlink: cannot complete ND request"
4548                                   " (ip neigh)");
4549 }
4550
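/*
 * Note the asymmetry with flow_tcf_rule_local() above: only the
 * destination MAC address can be programmed through the neigh
 * database. A requested source MAC is reported with a warning
 * because the kernel derives it from the outer device itself.
 */
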
4551 /**
4552  * Manage the local IP addresses and their peer IP addresses on the
4553  * outer interface for encapsulation purposes. The kernel searches the
4554  * appropriate device for tunnel egress traffic using the outer source
4555  * IP. This IP should be assigned to the outer network device, otherwise
4556  * the kernel rejects the rule.
4557  *
4558  * Adds or removes the addresses using the Netlink command like this:
4559  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4560  *
4561  * The addresses are local to the netdev ("scope link"), which reduces
4562  * the risk of conflicts. Note that an implicit route is maintained by
4563  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4564  *
4565  * @param[in] tcf
4566  *   Libmnl socket context object.
4567  * @param[in] vtep
4568  *   VTEP object, contains rule database and ifouter index.
4569  * @param[in] dev_flow
4570  *   Flow object, contains the tunnel parameters (for encap only).
4571  * @param[in] enable
4572  *   Toggle between add and remove.
4573  * @param[out] error
4574  *   Perform verbose error reporting if not NULL.
4575  *
4576  * @return
4577  *   0 on success, a negative errno value otherwise and rte_errno is set.
4578  */
4579 static int
4580 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4581                      struct tcf_vtep *vtep,
4582                      struct mlx5_flow *dev_flow,
4583                      bool enable,
4584                      struct rte_flow_error *error)
4585 {
4586         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4587         struct tcf_local_rule *rule = NULL;
4588         int ret;
4589
4590         assert(encap);
4591         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4592         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4593                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4594                 LIST_FOREACH(rule, &vtep->local, next) {
4595                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4596                             encap->ipv4.src == rule->ipv4.src &&
4597                             encap->ipv4.dst == rule->ipv4.dst) {
4598                                 break;
4599                         }
4600                 }
4601         } else {
4602                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4603                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4604                 LIST_FOREACH(rule, &vtep->local, next) {
4605                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4606                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4607                                             sizeof(encap->ipv6.src)) &&
4608                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4609                                             sizeof(encap->ipv6.dst))) {
4610                                 break;
4611                         }
4612                 }
4613         }
4614         if (rule) {
4615                 if (enable) {
4616                         rule->refcnt++;
4617                         return 0;
4618                 }
4619                 if (!rule->refcnt || !--rule->refcnt) {
4620                         LIST_REMOVE(rule, next);
4621                         return flow_tcf_rule_local(tcf, encap,
4622                                         vtep->ifouter, false, error);
4623                 }
4624                 return 0;
4625         }
4626         if (!enable) {
4627                 DRV_LOG(WARNING, "disabling non-existing local rule");
4628                 rte_flow_error_set(error, ENOENT,
4629                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4630                                    "disabling non-existing local rule");
4631                 return -ENOENT;
4632         }
4633         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4634                                 alignof(struct tcf_local_rule));
4635         if (!rule) {
4636                 rte_flow_error_set(error, ENOMEM,
4637                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4638                                    "unable to allocate memory for local rule");
4639                 return -rte_errno;
4640         }
4641         *rule = (struct tcf_local_rule){.refcnt = 0,
4642                                         .mask = 0,
4643                                         };
4644         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4645                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4646                            | FLOW_TCF_ENCAP_IPV4_DST;
4647                 rule->ipv4.src = encap->ipv4.src;
4648                 rule->ipv4.dst = encap->ipv4.dst;
4649         } else {
4650                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4651                            | FLOW_TCF_ENCAP_IPV6_DST;
4652                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4653                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4654         }
4655         ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
4656         if (ret) {
4657                 rte_free(rule);
4658                 return ret;
4659         }
4660         rule->refcnt++;
4661         LIST_INSERT_HEAD(&vtep->local, rule, next);
4662         return 0;
4663 }
4664
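/*
 * Reference-counting sketch for the local rule database managed above
 * (illustrative call sequence):
 *
 *	flow_tcf_encap_local(tcf, vtep, flow_a, true, e);   rule created,
 *	                                                    refcnt == 1
 *	flow_tcf_encap_local(tcf, vtep, flow_b, true, e);   rule shared,
 *	                                                    refcnt == 2
 *	flow_tcf_encap_local(tcf, vtep, flow_b, false, e);  refcnt == 1
 *	flow_tcf_encap_local(tcf, vtep, flow_a, false, e);  last reference,
 *	                                          RTM_DELADDR rule removal
 */
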
4665 /**
4666  * Manage the neigh database of destination MAC/IP addresses; the kernel
4667  * uses it to determine the destination MAC address for the encapsulation
4668  * header. Adds or removes the entries using a Netlink command like this:
4669  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4670  *
4671  * @param[in] tcf
4672  *   Libmnl socket context object.
4673  * @param[in] vtep
4674  *   VTEP object, contains rule database and ifouter index.
4675  * @param[in] dev_flow
4676  *   Flow object, contains the tunnel parameters (for encap only).
4677  * @param[in] enable
4678  *   Toggle between add and remove.
4679  * @param[out] error
4680  *   Perform verbose error reporting if not NULL.
4681  *
4682  * @return
4683  *   0 on success, a negative errno value otherwise and rte_errno is set.
4684  */
4685 static int
4686 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4687                      struct tcf_vtep *vtep,
4688                      struct mlx5_flow *dev_flow,
4689                      bool enable,
4690                      struct rte_flow_error *error)
4691 {
4692         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4693         struct tcf_neigh_rule *rule = NULL;
4694         int ret;
4695
4696         assert(encap);
4697         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4698         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4699                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4700                 LIST_FOREACH(rule, &vtep->neigh, next) {
4701                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4702                             encap->ipv4.dst == rule->ipv4.dst) {
4703                                 break;
4704                         }
4705                 }
4706         } else {
4707                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4708                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4709                 LIST_FOREACH(rule, &vtep->neigh, next) {
4710                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4711                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4712                                                 sizeof(encap->ipv6.dst))) {
4713                                 break;
4714                         }
4715                 }
4716         }
4717         if (rule) {
4718                 if (memcmp(&encap->eth.dst, &rule->eth,
4719                            sizeof(encap->eth.dst))) {
4720                         DRV_LOG(WARNING, "Destination MAC differs"
4721                                          " in neigh rule");
4722                         rte_flow_error_set(error, EEXIST,
4723                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4724                                            NULL, "neigh rule with a different"
4725                                            " MAC address exists for the same"
4726                                            " destination IP");
4727                         return -EEXIST;
4728                 }
4729                 if (enable) {
4730                         rule->refcnt++;
4731                         return 0;
4732                 }
4733                 if (!rule->refcnt || !--rule->refcnt) {
4734                         LIST_REMOVE(rule, next);
4735                         return flow_tcf_rule_neigh(tcf, encap,
4736                                                    vtep->ifouter,
4737                                                    false, error);
4738                 }
4739                 return 0;
4740         }
4741         if (!enable) {
4742                 DRV_LOG(WARNING, "disabling nonexistent neigh rule");
4743                 rte_flow_error_set(error, ENOENT,
4744                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4745                                    "disabling nonexistent neigh rule");
4746                 return -ENOENT;
4747         }
4748         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4749                                 alignof(struct tcf_neigh_rule));
4750         if (!rule) {
4751                 rte_flow_error_set(error, ENOMEM,
4752                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4753                                    "unable to allocate memory for neigh rule");
4754                 return -rte_errno;
4755         }
4756         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4757                                         .mask = 0,
4758                                         };
4759         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4760                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4761                 rule->ipv4.dst = encap->ipv4.dst;
4762         } else {
4763                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4764                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4765         }
4766         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4767         ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
4768         if (ret) {
4769                 rte_free(rule);
4770                 return ret;
4771         }
4772         rule->refcnt++;
4773         LIST_INSERT_HEAD(&vtep->neigh, rule, next);
4774         return 0;
4775 }
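
/*
 * For reference (illustrative only), removing the entry added by the
 * command shown in the comment above corresponds to:
 *   ip neigh del dev <ifouter> to <dst_ip>
 */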
4776
4777 /* VTEP device list is shared between PMD port instances. */
4778 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4779 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
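
/*
 * All acquire/release paths below take vtep_list_mutex, so VTEP
 * creation, sharing and deletion are serialized across all ports
 * of the process.
 */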
4780
4781 /**
4782  * Deletes VTEP network device.
4783  *
4784  * @param[in] tcf
4785  *   Context object initialized by mlx5_flow_tcf_context_create().
4786  * @param[in] vtep
4787  *   Object representing the network device to delete. Memory
4788  *   allocated for this object is freed by this routine.
4789  */
4790 static void
4791 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4792                      struct tcf_vtep *vtep)
4793 {
4794         struct nlmsghdr *nlh;
4795         struct ifinfomsg *ifm;
4796         alignas(struct nlmsghdr)
4797         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4798                     MNL_BUF_EXTRA_SPACE];
4799         int ret;
4800
4801         assert(!vtep->refcnt);
4802         /* Delete only interfaces that we actually created. */
4803         if (vtep->created && vtep->ifindex) {
4804                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4805                 nlh = mnl_nlmsg_put_header(buf);
4806                 nlh->nlmsg_type = RTM_DELLINK;
4807                 nlh->nlmsg_flags = NLM_F_REQUEST;
4808                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4809                 ifm->ifi_family = AF_UNSPEC;
4810                 ifm->ifi_index = vtep->ifindex;
4811                 assert(sizeof(buf) >= nlh->nlmsg_len);
4812                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4813                 if (ret)
4814                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4815                                          " encap/decap ifindex %u",
4816                                          ifm->ifi_index);
4817         }
4818         rte_free(vtep);
4819 }
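
/*
 * For reference (illustrative only), the RTM_DELLINK request built
 * above is roughly equivalent to:
 *   ip link delete dev <vtep_name>
 */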
4820
4821 /**
4822  * Creates VTEP network device.
4823  *
4824  * @param[in] tcf
4825  *   Context object initialized by mlx5_flow_tcf_context_create().
4826  * @param[in] ifouter
4827  *   Outer interface to attach the newly created VXLAN device to.
4828  *   If zero, the VXLAN device will not be attached to any device.
4829  *   Such VTEPs are used for decapsulation and can be precreated
4830  *   and shared between processes.
4831  * @param[in] port
4832  *   UDP port of created VTEP device.
4833  * @param[out] error
4834  *   Perform verbose error reporting if not NULL.
4835  *
4836  * @return
4837  * Pointer to created device structure on success,
4838  * NULL otherwise and rte_errno is set.
4839  */
4840 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4841 static struct tcf_vtep*
4842 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4843                      unsigned int ifouter,
4844                      uint16_t port, struct rte_flow_error *error)
4845 {
4846         struct tcf_vtep *vtep;
4847         struct nlmsghdr *nlh;
4848         struct ifinfomsg *ifm;
4849         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4850         alignas(struct nlmsghdr)
4851         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4852                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4853                     SZ_NLATTR_NEST * 2 +
4854                     SZ_NLATTR_STRZ_OF("vxlan") +
4855                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4856                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4857                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4858                     MNL_BUF_EXTRA_SPACE];
4859         struct nlattr *na_info;
4860         struct nlattr *na_vxlan;
4861         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4862         int ret;
4863
4864         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4865         if (!vtep) {
4866                 rte_flow_error_set(error, ENOMEM,
4867                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4868                                    "unable to allocate memory for VTEP");
4869                 return NULL;
4870         }
4871         *vtep = (struct tcf_vtep){
4872                         .port = port,
4873                         .local = LIST_HEAD_INITIALIZER(),
4874                         .neigh = LIST_HEAD_INITIALIZER(),
4875         };
4876         memset(buf, 0, sizeof(buf));
4877         nlh = mnl_nlmsg_put_header(buf);
4878         nlh->nlmsg_type = RTM_NEWLINK;
4879         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4880         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4881         ifm->ifi_family = AF_UNSPEC;
4882         ifm->ifi_type = 0;
4883         ifm->ifi_index = 0;
4884         ifm->ifi_flags = IFF_UP;
4885         ifm->ifi_change = 0xffffffff;
4886         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4887         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4888         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4889         assert(na_info);
4890         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4891         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4892         if (ifouter)
4893                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
4894         assert(na_vxlan);
4895         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4896         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4897         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4898         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4899         mnl_attr_nest_end(nlh, na_vxlan);
4900         mnl_attr_nest_end(nlh, na_info);
4901         assert(sizeof(buf) >= nlh->nlmsg_len);
4902         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4903         if (ret) {
4904                 DRV_LOG(WARNING,
4905                         "netlink: VTEP %s create failure (%d)",
4906                         name, rte_errno);
4907                 if (rte_errno != EEXIST || ifouter)
4908                         /*
4909                          * Some unhandled error occurred or device is
4910                          * for encapsulation and cannot be shared.
4911                          */
4912                         goto error;
4913         } else {
4914                 /*
4915                  * Mark the device as created by us.
4916                  * It must be explicitly deleted
4917                  * when no longer needed.
4918                  */
4919                 vtep->created = 1;
4920         }
4921         /* Try to get ifindex of created or pre-existing device. */
4922         ret = if_nametoindex(name);
4923         if (!ret) {
4924                 DRV_LOG(WARNING,
4925                         "VTEP %s failed to get index (%d)", name, errno);
4926                 rte_flow_error_set
4927                         (error, -errno,
4928                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4929                          "netlink: failed to retrieve VTEP ifindex");
4930                 goto error;
4931         }
4932         vtep->ifindex = ret;
4933         vtep->ifouter = ifouter;
4934         memset(buf, 0, sizeof(buf));
4935         nlh = mnl_nlmsg_put_header(buf);
4936         nlh->nlmsg_type = RTM_NEWLINK;
4937         nlh->nlmsg_flags = NLM_F_REQUEST;
4938         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4939         ifm->ifi_family = AF_UNSPEC;
4940         ifm->ifi_type = 0;
4941         ifm->ifi_index = vtep->ifindex;
4942         ifm->ifi_flags = IFF_UP;
4943         ifm->ifi_change = IFF_UP;
4944         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4945         if (ret) {
4946                 rte_flow_error_set(error, -errno,
4947                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4948                                    "netlink: failed to set VTEP link up");
4949                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
4950                         name, rte_errno);
4951                 goto clean;
4952         }
4953         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
4954         if (ret) {
4955                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
4956                 goto clean;
4957         }
4958         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
4959         vtep->refcnt = 1;
4960         return vtep;
4961 clean:
4962         flow_tcf_vtep_delete(tcf, vtep);
4963         return NULL;
4964 error:
4965         rte_free(vtep);
4966         return NULL;
4967 }
4968 #else
4969 static struct tcf_vtep*
4970 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
4971                      unsigned int ifouter __rte_unused,
4972                      uint16_t port __rte_unused,
4973                      struct rte_flow_error *error)
4974 {
4975         rte_flow_error_set(error, ENOTSUP,
4976                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4977                            "netlink: failed to create VTEP, "
4978                            "vxlan metadata are not supported by kernel");
4979         return NULL;
4980 }
4981 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
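
/*
 * For reference (illustrative only), the two Netlink requests built by
 * flow_tcf_vtep_create() roughly correspond to:
 *   ip link add <name> type vxlan dstport <port> external nolearning \
 *               udp6zerocsumrx [dev <ifouter>]
 *   ip link set <name> up
 */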
4982
4983 /**
4984  * Acquire target interface index for VXLAN tunneling decapsulation.
4985  * In order to share the UDP port with other interfaces, the VXLAN
4986  * device is created unattached to any interface (if it is created).
4987  *
4988  * @param[in] tcf
4989  *   Context object initialized by mlx5_flow_tcf_context_create().
4990  * @param[in] dev_flow
4991  *   Flow tcf object with tunnel structure pointer set.
4992  * @param[out] error
4993  *   Perform verbose error reporting if not NULL.
4994  * @return
4995  *   Interface descriptor pointer on success,
4996  *   NULL otherwise and rte_errno is set.
4997  */
4998 static struct tcf_vtep*
4999 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5000                             struct mlx5_flow *dev_flow,
5001                             struct rte_flow_error *error)
5002 {
5003         struct tcf_vtep *vtep;
5004         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5005
5006         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5007                 if (vtep->port == port)
5008                         break;
5009         }
5010         if (vtep && vtep->ifouter) {
5011                 rte_flow_error_set(error, -errno,
5012                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5013                                    "Failed to create decap VTEP with specified"
5014                                    " UDP port, attached device exists");
5015                 return NULL;
5016         }
5017         if (vtep) {
5018                 /* Device exists, just increment the reference counter. */
5019                 vtep->refcnt++;
5020                 assert(vtep->ifindex);
5021                 return vtep;
5022         }
5023         /* No decapsulation device exists, try to create the new one. */
5024         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
5025         if (vtep)
5026                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5027         return vtep;
5028 }
5029
5030 /**
5031  * Acquire target interface index for VXLAN tunneling encapsulation.
5032  *
5033  * @param[in] tcf
5034  *   Context object initialized by mlx5_flow_tcf_context_create().
5035  * @param[in] ifouter
5036  *   Network interface index to attach VXLAN encap device to.
5037  * @param[in] dev_flow
5038  *   Flow tcf object with tunnel structure pointer set.
5039  * @param[out] error
5040  *   Perform verbose error reporting if not NULL.
5041  * @return
5042  *   Interface descriptor pointer on success,
5043  *   NULL otherwise and rte_errno is set.
5044  */
5045 static struct tcf_vtep*
5046 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5047                             unsigned int ifouter,
5048                             struct mlx5_flow *dev_flow __rte_unused,
5049                             struct rte_flow_error *error)
5050 {
5051         static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
5052         struct tcf_vtep *vtep;
5053         int ret;
5054
5055         assert(ifouter);
5056         /* Check whether an attached encap VTEP already exists. */
5057         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5058                 if (vtep->ifouter == ifouter)
5059                         break;
5060         }
5061         if (vtep) {
5062                 /* VTEP already exists, just increment the reference. */
5063                 vtep->refcnt++;
5064         } else {
5065                 uint16_t pcnt;
5066
5067                 /* Not found, create a new attached VTEP. */
5068                 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5069                 flow_tcf_encap_local_cleanup(tcf, ifouter);
5070                 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5071                 for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
5072                                      - MLX5_VXLAN_PORT_MIN); pcnt++) {
5073                         encap_port++;
5074                         /* Wraparound the UDP port index. */
5075                         if (encap_port < MLX5_VXLAN_PORT_MIN ||
5076                             encap_port > MLX5_VXLAN_PORT_MAX)
5077                                 encap_port = MLX5_VXLAN_PORT_MIN;
5078                         /* Check whether the UDP port is already in use. */
5079                         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5080                                 if (vtep->port == encap_port)
5081                                         break;
5082                         }
5083                         if (vtep) {
5084                                 /* Port is in use, try the next one. */
5085                                 vtep = NULL;
5086                                 continue;
5087                         }
5088                         vtep = flow_tcf_vtep_create(tcf, ifouter,
5089                                                     encap_port, error);
5090                         if (vtep) {
5091                                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5092                                 break;
5093                         }
5094                         if (rte_errno != EEXIST)
5095                                 break;
5096                 }
5097                 if (!vtep)
5098                         return NULL;
5099         }
5100         assert(vtep->ifouter == ifouter);
5101         assert(vtep->ifindex);
5102         /* Create local IP address rule with peer to specify outer IPs. */
5103         ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
5104         if (!ret) {
5105                 /* Create neigh rule to specify outer destination MAC. */
5106                 ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
5107                 if (ret)
5108                         flow_tcf_encap_local(tcf, vtep,
5109                                              dev_flow, false, error);
5110         }
5111         if (ret) {
5112                 if (--vtep->refcnt == 0)
5113                         flow_tcf_vtep_delete(tcf, vtep);
5114                 return NULL;
5115         }
5116         return vtep;
5117 }
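
/*
 * Note: encap_port above cycles through the [MLX5_VXLAN_PORT_MIN,
 * MLX5_VXLAN_PORT_MAX] range with wraparound, so every UDP port in
 * the range is probed at most once per acquire before giving up.
 */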
5118
5119 /**
5120  * Acquires target interface index for tunneling of any type.
5121  * Creates the new VTEP if needed.
5122  *
5123  * @param[in] tcf
5124  *   Context object initialized by mlx5_flow_tcf_context_create().
5125  * @param[in] ifouter
5126  *   Network interface index to attach VXLAN encap device to.
5127  * @param[in] dev_flow
5128  *   Flow tcf object with tunnel structure pointer set.
5129  * @param[out] error
5130  *   Perform verbose error reporting if not NULL.
5131  * @return
5132  *   Interface descriptor pointer on success,
5133  *   NULL otherwise and rte_errno is set.
5134  */
5135 static struct tcf_vtep*
5136 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5137                       unsigned int ifouter,
5138                       struct mlx5_flow *dev_flow,
5139                       struct rte_flow_error *error)
5140 {
5141         struct tcf_vtep *vtep = NULL;
5142
5143         assert(dev_flow->tcf.tunnel);
5144         pthread_mutex_lock(&vtep_list_mutex);
5145         switch (dev_flow->tcf.tunnel->type) {
5146         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5147                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5148                                                   dev_flow, error);
5149                 break;
5150         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5151                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5152                 break;
5153         default:
5154                 rte_flow_error_set(error, ENOTSUP,
5155                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5156                                    "unsupported tunnel type");
5157                 break;
5158         }
5159         pthread_mutex_unlock(&vtep_list_mutex);
5160         return vtep;
5161 }
5162
5163 /**
5164  * Release tunneling interface by ifindex. Decrements reference
5165  * counter and actually removes the device if counter is zero.
5166  *
5167  * @param[in] tcf
5168  *   Context object initialized by mlx5_flow_tcf_context_create().
5169  * @param[in] vtep
5170  *   VTEP device descriptor structure.
5171  * @param[in] dev_flow
5172  *   Flow tcf object with tunnel structure pointer set.
5173  */
5174 static void
5175 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5176                       struct tcf_vtep *vtep,
5177                       struct mlx5_flow *dev_flow)
5178 {
5179         assert(dev_flow->tcf.tunnel);
5180         pthread_mutex_lock(&vtep_list_mutex);
5181         switch (dev_flow->tcf.tunnel->type) {
5182         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5183                 break;
5184         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5185                 /* Remove the encap ancillary rules first. */
5186                 flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
5187                 flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
5188                 break;
5189         default:
5190                 assert(false);
5191                 DRV_LOG(WARNING, "Unsupported tunnel type");
5192                 break;
5193         }
5194         assert(vtep->refcnt);
5195         if (--vtep->refcnt == 0) {
5196                 LIST_REMOVE(vtep, next);
5197                 flow_tcf_vtep_delete(tcf, vtep);
5198         }
5199         pthread_mutex_unlock(&vtep_list_mutex);
5200 }
5201
5202 struct tcf_nlcb_query {
5203         uint32_t handle;
5204         uint32_t tc_flags;
5205         uint32_t flags_valid:1;
5206 };
5207
5208 /**
5209  * Collect queried rule attributes. This is callback routine called by
5210  * libmnl mnl_cb_run() in loop for every message in received packet.
5211  * Current implementation collects the flower flags only.
5212  *
5213  * @param[in] nlh
5214  *   Pointer to reply header.
5215  * @param[in, out] arg
5216  *   Context pointer for this callback.
5217  *
5218  * @return
5219  *   A positive, nonzero value on success (required by libmnl
5220  *   to continue message processing).
5221  */
5222 static int
5223 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5224 {
5225         struct tcf_nlcb_query *query = arg;
5226         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5227         struct nlattr *na, *na_opt;
5228         bool flower = false;
5229
5230         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5231             tcm->tcm_handle != query->handle)
5232                 return 1;
5233         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5234                 switch (mnl_attr_get_type(na)) {
5235                 case TCA_KIND:
5236                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5237                                 /* Not flower filter, drop entire message. */
5238                                 return 1;
5239                         }
5240                         flower = true;
5241                         break;
5242                 case TCA_OPTIONS:
5243                         if (!flower) {
5244                                 /* Not flower options, drop entire message. */
5245                                 return 1;
5246                         }
5247                         /* Check nested flower options. */
5248                         mnl_attr_for_each_nested(na_opt, na) {
5249                                 switch (mnl_attr_get_type(na_opt)) {
5250                                 case TCA_FLOWER_FLAGS:
5251                                         query->flags_valid = 1;
5252                                         query->tc_flags =
5253                                                 mnl_attr_get_u32(na_opt);
5254                                         break;
5255                                 }
5256                         }
5257                         break;
5258                 }
5259         }
5260         return 1;
5261 }
5262
5263 /**
5264  * Query a TC flower rule flags via netlink.
5265  *
5266  * @param[in] tcf
5267  *   Context object initialized by mlx5_flow_tcf_context_create().
5268  * @param[in] dev_flow
5269  *   Pointer to the flow.
5270  * @param[out] pflags
5271  *   pointer to the data retrieved by the query.
5272  *
5273  * @return
5274  *   0 on success, a negative errno value otherwise.
5275  */
5276 static int
5277 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5278                      struct mlx5_flow *dev_flow,
5279                      uint32_t *pflags)
5280 {
5281         struct nlmsghdr *nlh;
5282         struct tcmsg *tcm;
5283         struct tcf_nlcb_query query = {
5284                 .handle = dev_flow->tcf.tcm->tcm_handle,
5285         };
5286
5287         nlh = mnl_nlmsg_put_header(tcf->buf);
5288         nlh->nlmsg_type = RTM_GETTFILTER;
5289         nlh->nlmsg_flags = NLM_F_REQUEST;
5290         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5291         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5292         /*
5293          * Ignore Netlink error for filter query operations.
5294          * The kernel reports the reply length in place of errno.
5295          * Just check that we got the flags option.
5296          */
5297         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5298         if (!query.flags_valid) {
5299                 *pflags = 0;
5300                 return -ENOENT;
5301         }
5302         *pflags = query.tc_flags;
5303         return 0;
5304 }
5305
5306 /**
5307  * Query and check the in_hw set for specified rule.
5308  *
5309  * @param[in] tcf
5310  *   Context object initialized by mlx5_flow_tcf_context_create().
5311  * @param[in] dev_flow
5312  *   Pointer to the flow to check.
5313  *
5314  * @return
5315  *   0 on success, a negative errno value otherwise.
5316  */
5317 static int
5318 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5319                     struct mlx5_flow *dev_flow)
5320 {
5321         uint32_t flags;
5322         int ret;
5323
5324         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5325         if (ret)
5326                 return ret;
5327         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5328 }
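
/*
 * TCA_CLS_FLAGS_IN_HW is reported back by the kernel once the flower
 * rule has been accepted by the hardware driver. Without the skip_sw
 * flag a rule may be silently handled in software only, hence the
 * explicit query above.
 */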
5329
5330 /**
5331  * Remove flow from E-Switch by sending Netlink message.
5332  *
5333  * @param[in] dev
5334  *   Pointer to Ethernet device.
5335  * @param[in, out] flow
5336  *   Pointer to the sub flow.
5337  */
5338 static void
5339 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5340 {
5341         struct priv *priv = dev->data->dev_private;
5342         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5343         struct mlx5_flow *dev_flow;
5344         struct nlmsghdr *nlh;
5345
5346         if (!flow)
5347                 return;
5348         dev_flow = LIST_FIRST(&flow->dev_flows);
5349         if (!dev_flow)
5350                 return;
5351         /* E-Switch flow can't be expanded. */
5352         assert(!LIST_NEXT(dev_flow, next));
5353         if (dev_flow->tcf.applied) {
5354                 nlh = dev_flow->tcf.nlh;
5355                 nlh->nlmsg_type = RTM_DELTFILTER;
5356                 nlh->nlmsg_flags = NLM_F_REQUEST;
5357                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5358                 if (dev_flow->tcf.tunnel) {
5359                         assert(dev_flow->tcf.tunnel->vtep);
5360                         flow_tcf_vtep_release(ctx,
5361                                 dev_flow->tcf.tunnel->vtep,
5362                                 dev_flow);
5363                         dev_flow->tcf.tunnel->vtep = NULL;
5364                 }
5365                 dev_flow->tcf.applied = 0;
5366         }
5367 }
5368
5369 /**
5370  * Apply flow to E-Switch by sending Netlink message.
5371  *
5372  * @param[in] dev
5373  *   Pointer to Ethernet device.
5374  * @param[in, out] flow
5375  *   Pointer to the sub flow.
5376  * @param[out] error
5377  *   Pointer to the error structure.
5378  *
5379  * @return
5380  *   0 on success, a negative errno value otherwise and rte_errno is set.
5381  */
5382 static int
5383 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5384                struct rte_flow_error *error)
5385 {
5386         struct priv *priv = dev->data->dev_private;
5387         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5388         struct mlx5_flow *dev_flow;
5389         struct nlmsghdr *nlh;
5390
5391         dev_flow = LIST_FIRST(&flow->dev_flows);
5392         /* E-Switch flow can't be expanded. */
5393         assert(!LIST_NEXT(dev_flow, next));
5394         if (dev_flow->tcf.applied)
5395                 return 0;
5396         nlh = dev_flow->tcf.nlh;
5397         nlh->nlmsg_type = RTM_NEWTFILTER;
5398         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5399         if (dev_flow->tcf.tunnel) {
5400                 /*
5401                  * Replace the interface index, target for
5402                  * encapsulation, source for decapsulation.
5403                  */
5404                 assert(!dev_flow->tcf.tunnel->vtep);
5405                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5406                 /* Acquire actual VTEP device when rule is being applied. */
5407                 dev_flow->tcf.tunnel->vtep =
5408                         flow_tcf_vtep_acquire(ctx,
5409                                         dev_flow->tcf.tunnel->ifindex_org,
5410                                         dev_flow, error);
5411                 if (!dev_flow->tcf.tunnel->vtep)
5412                         return -rte_errno;
5413                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5414                                 dev_flow->tcf.tunnel->vtep->ifindex,
5415                                 dev_flow->tcf.tunnel->ifindex_org);
5416                 *dev_flow->tcf.tunnel->ifindex_ptr =
5417                         dev_flow->tcf.tunnel->vtep->ifindex;
5418         }
5419         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5420                 dev_flow->tcf.applied = 1;
5421                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5422                         return 0;
5423                 /*
5424                  * Rule was applied without the skip_sw flag set.
5425                  * We should check whether the rule was actually
5426                  * accepted by hardware (look at the in_hw flag).
5427                  */
5428                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5429                         flow_tcf_remove(dev, flow);
5430                         return rte_flow_error_set
5431                                 (error, ENOENT,
5432                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5433                                  "netlink: rule has no in_hw flag set");
5434                 }
5435                 return 0;
5436         }
5437         if (dev_flow->tcf.tunnel) {
5438                 /* Rollback the VTEP configuration if rule apply failed. */
5439                 assert(dev_flow->tcf.tunnel->vtep);
5440                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5441                                       dev_flow);
5442                 dev_flow->tcf.tunnel->vtep = NULL;
5443         }
5444         return rte_flow_error_set(error, rte_errno,
5445                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5446                                   "netlink: failed to create TC flow rule");
5447 }
5448
5449 /**
5450  * Remove flow from E-Switch and release resources of the device flow.
5451  *
5452  * @param[in] dev
5453  *   Pointer to Ethernet device.
5454  * @param[in, out] flow
5455  *   Pointer to the sub flow.
5456  */
5457 static void
5458 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5459 {
5460         struct mlx5_flow *dev_flow;
5461
5462         if (!flow)
5463                 return;
5464         flow_tcf_remove(dev, flow);
5465         if (flow->counter) {
5466                 if (--flow->counter->ref_cnt == 0) {
5467                         rte_free(flow->counter);
5468                         flow->counter = NULL;
5469                 }
5470         }
5471         dev_flow = LIST_FIRST(&flow->dev_flows);
5472         if (!dev_flow)
5473                 return;
5474         /* E-Switch flow can't be expanded. */
5475         assert(!LIST_NEXT(dev_flow, next));
5476         LIST_REMOVE(dev_flow, next);
5477         rte_free(dev_flow);
5478 }
5479
5480 /**
5481  * Helper routine for figuring the space size required for a parse buffer.
5482  *
5483  * @param array
5484  *   Array of values to use.
5485  * @param idx
5486  *   Current location in array.
5487  * @param value
5488  *   Value to compare with.
5489  *
5490  * @return
5491  *   The maximum between the given value and the array value on index.
5492  */
5493 static uint16_t
5494 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5495 {
5496         return idx < 0 ? (value) : RTE_MAX((array)[idx], value);
5497 }
5498
5499 /**
5500  * Parse rtnetlink message attributes filling the attribute table with the info
5501  * retrieved.
5502  *
5503  * @param tb
5504  *   Attribute table to be filled.
5505  * @param max
5506  *   Maximum entry in the attribute table.
5507  * @param rta
5508  *   The attributes section in the message to be parsed.
5509  * @param len
5510  *   The length of the attributes section in the message.
5511  */
5512 static void
5513 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5514                          struct rtattr *rta, int len)
5515 {
5516         unsigned short type;
5517         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5518         while (RTA_OK(rta, len)) {
5519                 type = rta->rta_type;
5520                 if (type <= max && !tb[type])
5521                         tb[type] = rta;
5522                 rta = RTA_NEXT(rta, len);
5523         }
5524 }
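
#if 0
/*
 * Illustrative only (hypothetical helper, not built): extracting the
 * byte counter from a TCA_STATS_BASIC attribute with the parser above,
 * mirroring flow_tcf_nl_action_stats_parse_and_get() below.
 */
static uint64_t
example_read_basic_bytes(struct rtattr *stats)
{
	struct rtattr *tbs[TCA_STATS_BASIC + 1];
	struct gnet_stats_basic data = { .bytes = 0, .packets = 0 };

	flow_tcf_nl_parse_rtattr(tbs, TCA_STATS_BASIC,
				 RTA_DATA(stats), RTA_PAYLOAD(stats));
	if (tbs[TCA_STATS_BASIC])
		memcpy(&data, RTA_DATA(tbs[TCA_STATS_BASIC]),
		       RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
			       sizeof(data)));
	return data.bytes;
}
#endif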
5525
5526 /**
5527  * Extract flow counters from flower action.
5528  *
5529  * @param rta
5530  *   flower action stats properties in the Netlink message received.
5531  * @param rta_type
5532  *   The backward sequence of rta_types, as written in the attribute table,
5533  *   that we need to traverse in order to reach the requested object.
5534  * @param idx
5535  *   Current location in rta_type table.
5536  * @param[out] data
5537  *   data holding the count statistics of the rte_flow retrieved from
5538  *   the message.
5539  *
5540  * @return
5541  *   0 if data was found and retrieved, -1 otherwise.
5542  */
5543 static int
5544 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5545                                        uint16_t rta_type[], int idx,
5546                                        struct gnet_stats_basic *data)
5547 {
5548         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5549                                                  TCA_STATS_BASIC);
5550         struct rtattr *tbs[tca_stats_max + 1];
5551
5552         if (rta == NULL || idx < 0)
5553                 return -1;
5554         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5555                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5556         switch (rta_type[idx]) {
5557         case TCA_STATS_BASIC:
5558                 if (tbs[TCA_STATS_BASIC]) {
5559                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5560                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5561                                sizeof(*data)));
5562                         return 0;
5563                 }
5564                 break;
5565         default:
5566                 break;
5567         }
5568         return -1;
5569 }
5570
5571 /**
5572  * Parse flower single action retrieving the requested action attribute,
5573  * if found.
5574  *
5575  * @param arg
5576  *   flower action properties in the Netlink message received.
5577  * @param rta_type
5578  *   The backward sequence of rta_types, as written in the attribute table,
5579  *   that we need to traverse in order to reach the requested object.
5580  * @param idx
5581  *   Current location in rta_type table.
5582  * @param[out] data
5583  *   Count statistics retrieved from the message query.
5584  *
5585  * @return
5586  *   0 if data was found and retrieved, -1 otherwise.
5587  */
5588 static int
5589 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5590                                      uint16_t rta_type[], int idx, void *data)
5591 {
5592         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5593         struct rtattr *tb[tca_act_max + 1];
5594
5595         if (arg == NULL || idx < 0)
5596                 return -1;
5597         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5598                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5599         if (tb[TCA_ACT_KIND] == NULL)
5600                 return -1;
5601         switch (rta_type[idx]) {
5602         case TCA_ACT_STATS:
5603                 if (tb[TCA_ACT_STATS])
5604                         return flow_tcf_nl_action_stats_parse_and_get
5605                                         (tb[TCA_ACT_STATS],
5606                                          rta_type, --idx,
5607                                          (struct gnet_stats_basic *)data);
5608                 break;
5609         default:
5610                 break;
5611         }
5612         return -1;
5613 }
5614
5615 /**
5616  * Parse flower action section in the message retrieving the requested
5617  * attribute from the first action that provides it.
5618  *
5619  * @param arg
5620  *   flower action section in the Netlink message received.
5621  * @param rta_type
5622  *   The backward sequence of rta_types, as written in the attribute table,
5623  *   that we need to traverse in order to reach the requested object.
5624  * @param idx
5625  *   Current location in rta_type table.
5626  * @param[out] data
5627  *   data retrieved from the message query.
5628  *
5629  * @return
5630  *   0 if data was found and retrieved, -1 otherwise.
5631  */
5632 static int
5633 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5634                                  uint16_t rta_type[], int idx, void *data)
5635 {
5636         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5637         int i;
5638
5639         if (arg == NULL || idx < 0)
5640                 return -1;
5641         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5642                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5643         switch (rta_type[idx]) {
5644         /*
5645          * flow counters are stored in the actions defined by the flow
5646          * and not in the flow itself, therefore we need to traverse the
5647          * flower chain of actions in search for them.
5648          *
5649          * Note that the index is not decremented here.
5650          */
5651         case TCA_ACT_STATS:
5652                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5653                         if (tb[i] &&
5654                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5655                                                               rta_type,
5656                                                               idx, data))
5657                                 return 0;
5658                 }
5659                 break;
5660         default:
5661                 break;
5662         }
5663         return -1;
5664 }
5665
5666 /**
5667  * Parse flower classifier options in the message, retrieving the requested
5668  * attribute if found.
5669  *
5670  * @param opt
5671  *   flower section in the Netlink message received.
5672  * @param rta_type
5673  *   The backward sequence of rta_types, as written in the attribute table,
5674  *   we need to traverse in order to get to the requested object.
5675  * @param idx
5676  *   Current location in rta_type table.
5677  * @param[out] data
5678  *   data retrieved from the message query.
5679  *
5680  * @return
5681  *   0 if data was found and retrieved, -1 otherwise.
5682  */
5683 static int
5684 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5685                                uint16_t rta_type[], int idx, void *data)
5686 {
5687         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5688                                                   TCA_FLOWER_ACT);
5689         struct rtattr *tb[tca_flower_max + 1];
5690
5691         if (!opt || idx < 0)
5692                 return -1;
5693         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5694                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5695         switch (rta_type[idx]) {
5696         case TCA_FLOWER_ACT:
5697                 if (tb[TCA_FLOWER_ACT])
5698                         return flow_tcf_nl_action_parse_and_get
5699                                                         (tb[TCA_FLOWER_ACT],
5700                                                          rta_type, --idx, data);
5701                 break;
5702         default:
5703                 break;
5704         }
5705         return -1;
5706 }
5707
5708 /**
5709  * Parse Netlink reply on filter query, retrieving the flow counters.
5710  *
5711  * @param cnlh
5712  *   Message received from Netlink.
5713  * @param rta_type
5714  *   The backward sequence of rta_types, as written in the attribute table,
5715  *   that we need to traverse in order to reach the requested object.
5716  * @param idx
5717  *   Current location in rta_type table.
5718  * @param[out] data
5719  *   data retrieved from the message query.
5720  *
5721  * @return
5722  *   0 if data was found and retrieved, -1 otherwise.
5723  */
5724 static int
5725 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5726                                  uint16_t rta_type[], int idx, void *data)
5727 {
5728         struct nlmsghdr *nlh = cnlh;
5729         struct tcmsg *t = NLMSG_DATA(nlh);
5730         int len = nlh->nlmsg_len;
5731         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5732         struct rtattr *tb[tca_max + 1];
5733
5734         if (idx < 0)
5735                 return -1;
5736         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5737             nlh->nlmsg_type != RTM_GETTFILTER &&
5738             nlh->nlmsg_type != RTM_DELTFILTER)
5739                 return -1;
5740         len -= NLMSG_LENGTH(sizeof(*t));
5741         if (len < 0)
5742                 return -1;
5743         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5744         /* Not a TC flower flow - bail out */
5745         if (!tb[TCA_KIND] ||
5746             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5747                 return -1;
5748         switch (rta_type[idx]) {
5749         case TCA_OPTIONS:
5750                 if (tb[TCA_OPTIONS])
5751                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5752                                                               rta_type,
5753                                                               --idx, data);
5754                 break;
5755         default:
5756                 break;
5757         }
5758         return -1;
5759 }
5760
5761 /**
5762  * A callback to parse Netlink reply on TC flower query.
5763  *
5764  * @param nlh
5765  *   Message received from Netlink.
5766  * @param[out] data
5767  *   Pointer to data area to be filled by the parsing routine.
5768  *   assumed to be a pointer to struct flow_tcf_stats_basic.
5769  *
5770  * @return
5771  *   MNL_CB_OK value.
5772  */
5773 static int
5774 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5775 {
5776         /*
5777          * The backward sequence of rta_types to pass in order to get
5778          * to the counters.
5779          */
5780         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5781                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5782         struct flow_tcf_stats_basic *sb_data = data;
5783         union {
5784                 const struct nlmsghdr *c;
5785                 struct nlmsghdr *nc;
5786         } tnlh = { .c = nlh };
5787
5788         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5789                                               RTE_DIM(rta_type) - 1,
5790                                               (void *)&sb_data->counters))
5791                 sb_data->valid = true;
5792         return MNL_CB_OK;
5793 }
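
/*
 * Attribute nesting traversed by the parsers above (for reference):
 *
 *   TCA_OPTIONS
 *     TCA_FLOWER_ACT
 *       <action 1 .. TCA_ACT_MAX_PRIO>
 *         TCA_ACT_STATS
 *           TCA_STATS_BASIC (struct gnet_stats_basic)
 */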
5794
5795 /**
5796  * Query a TC flower rule for its statistics via netlink.
5797  *
5798  * @param[in] dev
5799  *   Pointer to Ethernet device.
5800  * @param[in] flow
5801  *   Pointer to the sub flow.
5802  * @param[out] data
5803  *   data retrieved by the query.
5804  * @param[out] error
5805  *   Perform verbose error reporting if not NULL.
5806  *
5807  * @return
5808  *   0 on success, a negative errno value otherwise and rte_errno is set.
5809  */
5810 static int
5811 flow_tcf_query_count(struct rte_eth_dev *dev,
5812                           struct rte_flow *flow,
5813                           void *data,
5814                           struct rte_flow_error *error)
5815 {
5816         struct flow_tcf_stats_basic sb_data;
5817         struct rte_flow_query_count *qc = data;
5818         struct priv *priv = dev->data->dev_private;
5819         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5820         struct mnl_socket *nl = ctx->nl;
5821         struct mlx5_flow *dev_flow;
5822         struct nlmsghdr *nlh;
5823         uint32_t seq = priv->tcf_context->seq++;
5824         ssize_t ret;
5825         assert(qc);
5826
5827         memset(&sb_data, 0, sizeof(sb_data));
5828         dev_flow = LIST_FIRST(&flow->dev_flows);
5829         /* E-Switch flow can't be expanded. */
5830         assert(!LIST_NEXT(dev_flow, next));
5831         if (!dev_flow->flow->counter)
5832                 goto notsup_exit;
5833         nlh = dev_flow->tcf.nlh;
5834         nlh->nlmsg_type = RTM_GETTFILTER;
5835         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5836         nlh->nlmsg_seq = seq;
5837         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5838                 goto error_exit;
5839         do {
5840                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5841                 if (ret <= 0)
5842                         break;
5843                 ret = mnl_cb_run(ctx->buf, ret, seq,
5844                                  mnl_socket_get_portid(nl),
5845                                  flow_tcf_nl_message_get_stats_basic,
5846                                  (void *)&sb_data);
5847         } while (ret > 0);
5849         if (sb_data.valid) {
5850                 /* Return the delta from last reset. */
5851                 qc->hits_set = 1;
5852                 qc->bytes_set = 1;
5853                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5854                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5855                 if (qc->reset) {
5856                         flow->counter->hits = sb_data.counters.packets;
5857                         flow->counter->bytes = sb_data.counters.bytes;
5858                 }
5859                 return 0;
5860         }
5861         return rte_flow_error_set(error, EINVAL,
5862                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5863                                   NULL,
5864                                   "flow does not have counter");
5865 error_exit:
5866         return rte_flow_error_set
5867                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5868                          NULL, "netlink: failed to read flow rule counters");
5869 notsup_exit:
5870         return rte_flow_error_set
5871                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5872                          NULL, "counters are not available");
5873 }
5874
5875 /**
5876  * Query a flow.
5877  *
5878  * @see rte_flow_query()
5879  * @see rte_flow_ops
5880  */
5881 static int
5882 flow_tcf_query(struct rte_eth_dev *dev,
5883                struct rte_flow *flow,
5884                const struct rte_flow_action *actions,
5885                void *data,
5886                struct rte_flow_error *error)
5887 {
5888         int ret = -EINVAL;
5889
5890         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5891                 switch (actions->type) {
5892                 case RTE_FLOW_ACTION_TYPE_VOID:
5893                         break;
5894                 case RTE_FLOW_ACTION_TYPE_COUNT:
5895                         ret = flow_tcf_query_count(dev, flow, data, error);
5896                         break;
5897                 default:
5898                         return rte_flow_error_set(error, ENOTSUP,
5899                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5900                                                   actions,
5901                                                   "action not supported");
5902                 }
5903         }
5904         return ret;
5905 }
5906
5907 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5908         .validate = flow_tcf_validate,
5909         .prepare = flow_tcf_prepare,
5910         .translate = flow_tcf_translate,
5911         .apply = flow_tcf_apply,
5912         .remove = flow_tcf_remove,
5913         .destroy = flow_tcf_destroy,
5914         .query = flow_tcf_query,
5915 };
5916
5917 /**
5918  * Create and configure a libmnl socket for Netlink flow rules.
5919  *
5920  * @return
5921  *   A valid libmnl socket object pointer on success, NULL otherwise and
5922  *   rte_errno is set.
5923  */
5924 static struct mnl_socket *
5925 flow_tcf_mnl_socket_create(void)
5926 {
5927         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
5928
5929         if (nl) {
5930                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
5931                                       sizeof(int));
5932                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
5933                         return nl;
5934         }
5935         rte_errno = errno;
5936         if (nl)
5937                 mnl_socket_close(nl);
5938         return NULL;
5939 }
5940
5941 /**
5942  * Destroy a libmnl socket.
5943  *
5944  * @param nl
5945  *   Libmnl socket of the @p NETLINK_ROUTE kind.
5946  */
5947 static void
5948 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
5949 {
5950         if (nl)
5951                 mnl_socket_close(nl);
5952 }
5953
5954 /**
5955  * Initialize ingress qdisc of a given network interface.
5956  *
5957  * @param ctx
5958  *   Pointer to tc-flower context to use.
5959  * @param ifindex
5960  *   Index of network interface to initialize.
5961  * @param[out] error
5962  *   Perform verbose error reporting if not NULL.
5963  *
5964  * @return
5965  *   0 on success, a negative errno value otherwise and rte_errno is set.
5966  */
5967 int
5968 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
5969                    unsigned int ifindex, struct rte_flow_error *error)
5970 {
5971         struct nlmsghdr *nlh;
5972         struct tcmsg *tcm;
5973         alignas(struct nlmsghdr)
5974         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
5975                     SZ_NLATTR_STRZ_OF("ingress") +
5976                     MNL_BUF_EXTRA_SPACE];
5977
5978         /* Destroy existing ingress qdisc and everything attached to it. */
5979         nlh = mnl_nlmsg_put_header(buf);
5980         nlh->nlmsg_type = RTM_DELQDISC;
5981         nlh->nlmsg_flags = NLM_F_REQUEST;
5982         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5983         tcm->tcm_family = AF_UNSPEC;
5984         tcm->tcm_ifindex = ifindex;
5985         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
5986         tcm->tcm_parent = TC_H_INGRESS;
5987         assert(sizeof(buf) >= nlh->nlmsg_len);
5988         /* Ignore errors when qdisc is already absent. */
5989         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
5990             rte_errno != EINVAL && rte_errno != ENOENT)
5991                 return rte_flow_error_set(error, rte_errno,
5992                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5993                                           "netlink: failed to remove ingress"
5994                                           " qdisc");
5995         /* Create fresh ingress qdisc. */
5996         nlh = mnl_nlmsg_put_header(buf);
5997         nlh->nlmsg_type = RTM_NEWQDISC;
5998         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5999         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6000         tcm->tcm_family = AF_UNSPEC;
6001         tcm->tcm_ifindex = ifindex;
6002         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6003         tcm->tcm_parent = TC_H_INGRESS;
6004         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6005         assert(sizeof(buf) >= nlh->nlmsg_len);
6006         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6007                 return rte_flow_error_set(error, rte_errno,
6008                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6009                                           "netlink: failed to create ingress"
6010                                           " qdisc");
6011         return 0;
6012 }
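
/*
 * For reference (illustrative only), the two requests above roughly
 * correspond to:
 *   tc qdisc del dev <ifname> ingress
 *   tc qdisc add dev <ifname> ingress
 */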
6013
6014 /**
6015  * Create libmnl context for Netlink flow rules.
6016  *
6017  * @return
6018  *   A valid libmnl socket object pointer on success, NULL otherwise and
6019  *   rte_errno is set.
6020  */
6021 struct mlx5_flow_tcf_context *
6022 mlx5_flow_tcf_context_create(void)
6023 {
6024         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6025                                                         sizeof(*ctx),
6026                                                         sizeof(uint32_t));
6027         if (!ctx)
6028                 goto error;
6029         ctx->nl = flow_tcf_mnl_socket_create();
6030         if (!ctx->nl)
6031                 goto error;
6032         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6033         ctx->buf = rte_zmalloc(__func__,
6034                                ctx->buf_size, sizeof(uint32_t));
6035         if (!ctx->buf)
6036                 goto error;
6037         ctx->seq = random();
6038         return ctx;
6039 error:
6040         mlx5_flow_tcf_context_destroy(ctx);
6041         return NULL;
6042 }
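
/*
 * Note: the error path above relies on mlx5_flow_tcf_context_destroy()
 * tolerating a partially initialized context: NULL ctx, ctx->nl and
 * ctx->buf are all handled safely.
 */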
6043
6044 /**
6045  * Destroy a libmnl context.
6046  *
6047  * @param ctx
6048  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6049  */
6050 void
6051 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6052 {
6053         if (!ctx)
6054                 return;
6055         flow_tcf_mnl_socket_destroy(ctx->nl);
6056         rte_free(ctx->buf);
6057         rte_free(ctx);
6058 }