net/mlx5: add RHEL-7.2 VXLAN device metadata workaround
[dpdk.git] drivers/net/mlx5/mlx5_flow_tcf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* Offset. */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** Parameters of VXLAN devices created by driver. */
#define MLX5_VXLAN_DEFAULT_VNI  1
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)

/**
 * Structure for holding netlink context.
 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
 * Using this (8KB) buffer size ensures that netlink messages will never
 * be truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * outer tunnel iface in order to provide destination MAC address
 * for the VXLAN encapsulation. The neigh rule is implicitly related
 * to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** Outer interface VXLAN encapsulation rules container. */
struct tcf_irule {
        LIST_ENTRY(tcf_irule) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifouter; /**< Own interface index. */
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        uint16_t port;
        uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface. */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        struct tcf_irule *iface;
        uint32_t mask;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty = {
        {0},
};

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
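
/*
 * Example (informative): with 4-byte netlink attribute headers,
 * SZ_NLATTR_TYPE_OF(uint32_t) yields MNL_ALIGN(4 + 4) = 8 bytes and
 * SZ_NLATTR_STRZ_OF("pedit") yields MNL_ALIGN(4 + 6) = 12 bytes; the
 * driver sums such terms to pre-compute an upper bound for the flow
 * message buffer.
 */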

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
 * Priority in the rte_flow attribute starts from 0 and is incremented by 1
 * during translation. This is subject to be changed to determine the max
 * priority based on trial-and-error like Verbs driver once the restriction
 * is lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
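
/*
 * Example (informative): a 16-byte IPv6 address needs
 * NUM_OF_PEDIT_KEYS(16) = 4 keys, while a 6-byte MAC address needs
 * NUM_OF_PEDIT_KEYS(6) = 2 keys (one full 4-byte key plus one
 * partially masked key).
 */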

struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};
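
/*
 * Note: flow_tcf_create_pedit_mnl_msg() emits TCA_PEDIT_PARMS_EX
 * directly from a struct pedit_parser, relying on "keys" immediately
 * following "sel" in memory (tc_pedit_sel ends with a zero-length
 * array), so the first two members above must stay adjacent and in
 * this order.
 */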

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-Switch counters cannot be shared and their ID is unknown.
         * Currently all are returned with ID 0. In the future it may
         * be better to switch to unique numbers.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

        /* First key: rewrite the first four bytes of the MAC address. */
        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        /* Second key: rewrite the remaining two bytes, preserve the rest. */
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}
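
/*
 * Note (assuming standard kernel act_pedit semantics): each key is
 * applied as "new = (old & mask) ^ val", so a mask of ~UINT32_MAX
 * (i.e. zero) overwrites all four bytes, while 0xFFFF0000 preserves
 * the upper half of the native 32-bit word. The constants above
 * therefore appear to assume a little-endian host, where the
 * preserved half maps to the two packet bytes beyond the 6-byte MAC
 * address.
 */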

/**
 * Set pedit key of decrease/set ttl.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}
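
/*
 * Note (assuming standard kernel act_pedit semantics): DEC_TTL is
 * implemented as an ADD of 0xFF to the single unmasked TTL byte,
 * which is equivalent to subtracting one modulo 256; the surrounding
 * bytes are preserved by the 0xFFFFFF00 mask (little-endian host
 * assumed, as above).
 */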

/**
 * Set pedit key of transport (TCP/UDP) port value.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* The offset of src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of ipv6 address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of ipv4 address.
 *
 * @param[in] actions
 *   Pointer to action specification.
 * @param[in,out] p_parser
 *   Pointer to pedit_parser.
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Create the pedit's na attribute in netlink message
 * on pre-allocated message buffer.
 *
 * @param[in,out] nl
 *   Pointer to pre-allocated netlink message buffer.
 * @param[in,out] actions
 *   Pointer to pointer of actions specification.
 * @param[in] item_flags
 *   Flags of all items presented.
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify header actions should be in one tc-pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
        /* Step back so the caller's loop re-examines the first unhandled
         * action. */
        (*actions)--;
}
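
/*
 * Resulting attribute layout (sketch):
 *
 *   TCA_ACT_KIND = "pedit"
 *   TCA_ACT_OPTIONS (nested)
 *     TCA_PEDIT_PARMS_EX = struct tc_pedit_sel + nkeys * tc_pedit_key
 *     TCA_PEDIT_KEYS_EX (nested)
 *       TCA_PEDIT_KEY_EX (nested) { HTYPE, CMD }  -- repeated nkeys times
 */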

/**
 * Calculate max memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   Actions specification.
 * @param[in,out] action_flags
 *   Actions flags.
 *
 * @return
 *   Max memory size of one TC-pedit action.
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* The key size is the same for TCP and UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* The key size is the same for TCP and UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX. */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        /* Step back so the caller's loop re-examines the first unhandled
         * action. */
        (*actions)--;
        return pedit_size;
}
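
/*
 * Note (informative): the size computed above mirrors the attribute
 * layout emitted by flow_tcf_create_pedit_mnl_msg(), so translating
 * these actions cannot overflow the pre-allocated netlink buffer.
 */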

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}
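
/*
 * Usage sketch (informative): retrieving the effective mask for an
 * ETH pattern item with the tables defined above:
 *
 *   const struct rte_flow_item_eth *mask =
 *           flow_tcf_item_mask(item, &rte_flow_item_eth_mask,
 *                              &flow_tcf_mask_supported.eth,
 *                              &flow_tcf_mask_empty.eth,
 *                              sizeof(flow_tcf_mask_empty.eth), error);
 */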

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        /* The table must also hold the zero-ifindex terminator entry. */
        if (n >= len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}
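
/*
 * Resulting table layout (sketch): ptoi[0] always describes the
 * current device, subsequent entries describe the other ports of the
 * same switch domain, and a zero-ifindex entry terminates the list:
 *
 *   { .port_id = own, .ifindex = N }, { ... }, { 0, 0 }
 */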
1086
1087 /**
1088  * Verify the @p attr will be correctly understood by the E-switch.
1089  *
1090  * @param[in] attr
1091  *   Pointer to flow attributes
1092  * @param[out] error
1093  *   Pointer to error structure.
1094  *
1095  * @return
1096  *   0 on success, a negative errno value otherwise and rte_errno is set.
1097  */
1098 static int
1099 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1100                              struct rte_flow_error *error)
1101 {
1102         /*
1103          * Supported attributes: groups, some priorities and ingress only.
1104          * group is supported only if kernel supports chain. Don't care about
1105          * transfer as it is the caller's problem.
1106          */
1107         if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1108                 return rte_flow_error_set(error, ENOTSUP,
1109                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1110                                           "group ID larger than "
1111                                           RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1112                                           " isn't supported");
1113         else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1114                 return rte_flow_error_set(error, ENOTSUP,
1115                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1116                                           attr,
1117                                           "priority more than "
1118                                           RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1119                                           " is not supported");
1120         if (!attr->ingress)
1121                 return rte_flow_error_set(error, EINVAL,
1122                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1123                                           attr, "only ingress is supported");
1124         if (attr->egress)
1125                 return rte_flow_error_set(error, ENOTSUP,
1126                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1127                                           attr, "egress is not supported");
1128         return 0;
1129 }
1130
1131 /**
1132  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1133  * The routine checks the L2 fields to be used in encapsulation header.
1134  *
1135  * @param[in] item
1136  *   Pointer to the item structure.
1137  * @param[out] error
1138  *   Pointer to the error structure.
1139  *
1140  * @return
1141  *   0 on success, a negative errno value otherwise and rte_errno is set.
1142  **/
1143 static int
1144 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1145                                   struct rte_flow_error *error)
1146 {
1147         const struct rte_flow_item_eth *spec = item->spec;
1148         const struct rte_flow_item_eth *mask = item->mask;
1149
1150         if (!spec) {
1151                 /*
1152                  * Specification for L2 addresses can be empty
1153                  * because these ones are optional and not
1154                  * required directly by tc rule. Kernel tries
1155                  * to resolve these ones on its own
1156                  */
1157                 return 0;
1158         }
1159         if (!mask) {
1160                 /* If mask is not specified use the default one. */
1161                 mask = &rte_flow_item_eth_mask;
1162         }
1163         if (memcmp(&mask->dst,
1164                    &flow_tcf_mask_empty.eth.dst,
1165                    sizeof(flow_tcf_mask_empty.eth.dst))) {
1166                 if (memcmp(&mask->dst,
1167                            &rte_flow_item_eth_mask.dst,
1168                            sizeof(rte_flow_item_eth_mask.dst)))
1169                         return rte_flow_error_set
1170                                 (error, ENOTSUP,
1171                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1172                                  "no support for partial mask on"
1173                                  " \"eth.dst\" field");
1174         }
1175         if (memcmp(&mask->src,
1176                    &flow_tcf_mask_empty.eth.src,
1177                    sizeof(flow_tcf_mask_empty.eth.src))) {
1178                 if (memcmp(&mask->src,
1179                            &rte_flow_item_eth_mask.src,
1180                            sizeof(rte_flow_item_eth_mask.src)))
1181                         return rte_flow_error_set
1182                                 (error, ENOTSUP,
1183                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1184                                  "no support for partial mask on"
1185                                  " \"eth.src\" field");
1186         }
1187         if (mask->type != RTE_BE16(0x0000)) {
1188                 if (mask->type != RTE_BE16(0xffff))
1189                         return rte_flow_error_set
1190                                 (error, ENOTSUP,
1191                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1192                                  "no support for partial mask on"
1193                                  " \"eth.type\" field");
1194                 DRV_LOG(WARNING,
1195                         "outer ethernet type field"
1196                         " cannot be forced for vxlan"
1197                         " encapsulation, parameter ignored");
1198         }
1199         return 0;
1200 }
1201
1202 /**
1203  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1204  * The routine checks the IPv4 fields to be used in encapsulation header.
1205  *
1206  * @param[in] item
1207  *   Pointer to the item structure.
1208  * @param[out] error
1209  *   Pointer to the error structure.
1210  *
1211  * @return
1212  *   0 on success, a negative errno value otherwise and rte_errno is set.
1213  **/
1214 static int
1215 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1216                                    struct rte_flow_error *error)
1217 {
1218         const struct rte_flow_item_ipv4 *spec = item->spec;
1219         const struct rte_flow_item_ipv4 *mask = item->mask;
1220
1221         if (!spec) {
1222                 /*
1223                  * Specification for IP addresses cannot be empty
1224                  * because it is required by tunnel_key parameter.
1225                  */
1226                 return rte_flow_error_set(error, EINVAL,
1227                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1228                                           "NULL outer ipv4 address"
1229                                           " specification for vxlan"
1230                                           " encapsulation");
1231         }
1232         if (!mask)
1233                 mask = &rte_flow_item_ipv4_mask;
1234         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1235                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1236                         return rte_flow_error_set
1237                                 (error, ENOTSUP,
1238                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1239                                  "no support for partial mask on"
1240                                  " \"ipv4.hdr.dst_addr\" field"
1241                                  " for vxlan encapsulation");
1242                 /* More IPv4 address validations can be put here. */
1243         } else {
1244                 /*
1245                  * Kernel uses the destination IP address to determine
1246                  * the routing path and obtain the MAC destination
1247                  * address, so IP destination address must be
1248                  * specified in the tc rule.
1249                  */
1250                 return rte_flow_error_set(error, EINVAL,
1251                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1252                                           "outer ipv4 destination address"
1253                                           " must be specified for"
1254                                           " vxlan encapsulation");
1255         }
1256         if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1257                 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1258                         return rte_flow_error_set
1259                                 (error, ENOTSUP,
1260                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1261                                  "no support for partial mask on"
1262                                  " \"ipv4.hdr.src_addr\" field"
1263                                  " for vxlan encapsulation");
1264                 /* More IPv4 address validations can be put here. */
1265         } else {
1266                 /*
1267                  * Kernel uses the source IP address to select the
1268                  * interface for egress encapsulated traffic, so
1269                  * it must be specified in the tc rule.
1270                  */
1271                 return rte_flow_error_set(error, EINVAL,
1272                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1273                                           "outer ipv4 source address"
1274                                           " must be specified for"
1275                                           " vxlan encapsulation");
1276         }
1277         return 0;
1278 }
1279
1280 /**
1281  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1282  * The routine checks the IPv6 fields to be used in encapsulation header.
1283  *
1284  * @param[in] item
1285  *   Pointer to the item structure.
1286  * @param[out] error
1287  *   Pointer to the error structure.
1288  *
1289  * @return
1290  *   0 on success, a negative errno value otherwise and rte_errno is set.
1291  **/
1292 static int
1293 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1294                                    struct rte_flow_error *error)
1295 {
1296         const struct rte_flow_item_ipv6 *spec = item->spec;
1297         const struct rte_flow_item_ipv6 *mask = item->mask;
1298
1299         if (!spec) {
1300                 /*
1301                  * Specification for IP addresses cannot be empty
1302                  * because it is required by tunnel_key parameter.
1303                  */
1304                 return rte_flow_error_set(error, EINVAL,
1305                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1306                                           "NULL outer ipv6 address"
1307                                           " specification for"
1308                                           " vxlan encapsulation");
1309         }
1310         if (!mask)
1311                 mask = &rte_flow_item_ipv6_mask;
1312         if (memcmp(&mask->hdr.dst_addr,
1313                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1314                    IPV6_ADDR_LEN)) {
1315                 if (memcmp(&mask->hdr.dst_addr,
1316                            &rte_flow_item_ipv6_mask.hdr.dst_addr,
1317                            IPV6_ADDR_LEN))
1318                         return rte_flow_error_set
1319                                         (error, ENOTSUP,
1320                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1321                                          "no support for partial mask on"
1322                                          " \"ipv6.hdr.dst_addr\" field"
1323                                          " for vxlan encapsulation");
1324                 /* More IPv6 address validations can be put here. */
1325         } else {
1326                 /*
1327                  * Kernel uses the destination IP address to determine
1328                  * the routing path and obtain the MAC destination
1329                  * address (heigh or gate), so IP destination address
1330                  * must be specified within the tc rule.
1331                  */
1332                 return rte_flow_error_set(error, EINVAL,
1333                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1334                                           "outer ipv6 destination address"
1335                                           " must be specified for"
1336                                           " vxlan encapsulation");
1337         }
1338         if (memcmp(&mask->hdr.src_addr,
1339                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1340                    IPV6_ADDR_LEN)) {
1341                 if (memcmp(&mask->hdr.src_addr,
1342                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1343                            IPV6_ADDR_LEN))
1344                         return rte_flow_error_set
1345                                         (error, ENOTSUP,
1346                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1347                                          "no support for partial mask on"
1348                                          " \"ipv6.hdr.src_addr\" field"
1349                                          " for vxlan encapsulation");
1350                 /* More L3 address validation can be put here. */
1351         } else {
1352                 /*
1353                  * Kernel uses the source IP address to select the
1354                  * interface for egress encapsulated traffic, so
1355                  * it must be specified in the tc rule.
1356                  */
1357                 return rte_flow_error_set(error, EINVAL,
1358                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1359                                           "outer L3 source address"
1360                                           " must be specified for"
1361                                           " vxlan encapsulation");
1362         }
1363         return 0;
1364 }
1365
1366 /**
1367  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1368  * The routine checks the UDP fields to be used in encapsulation header.
1369  *
1370  * @param[in] item
1371  *   Pointer to the item structure.
1372  * @param[out] error
1373  *   Pointer to the error structure.
1374  *
1375  * @return
1376  *   0 on success, a negative errno value otherwise and rte_errno is set.
1377  **/
1378 static int
1379 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1380                                   struct rte_flow_error *error)
1381 {
1382         const struct rte_flow_item_udp *spec = item->spec;
1383         const struct rte_flow_item_udp *mask = item->mask;
1384
1385         if (!spec) {
1386                 /*
1387                  * Specification for UDP ports cannot be empty
1388                  * because it is required by tunnel_key parameter.
1389                  */
1390                 return rte_flow_error_set(error, EINVAL,
1391                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1392                                           "NULL UDP port specification "
1393                                           " for vxlan encapsulation");
1394         }
1395         if (!mask)
1396                 mask = &rte_flow_item_udp_mask;
1397         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1398                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1399                         return rte_flow_error_set
1400                                         (error, ENOTSUP,
1401                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1402                                          "no support for partial mask on"
1403                                          " \"udp.hdr.dst_port\" field"
1404                                          " for vxlan encapsulation");
1405                 if (!spec->hdr.dst_port)
1406                         return rte_flow_error_set
1407                                         (error, EINVAL,
1408                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1409                                          "outer UDP remote port cannot be"
1410                                          " 0 for vxlan encapsulation");
1411         } else {
1412                 return rte_flow_error_set(error, EINVAL,
1413                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1414                                           "outer UDP remote port"
1415                                           " must be specified for"
1416                                           " vxlan encapsulation");
1417         }
1418         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1419                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1420                         return rte_flow_error_set
1421                                         (error, ENOTSUP,
1422                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1423                                          "no support for partial mask on"
1424                                          " \"udp.hdr.src_port\" field"
1425                                          " for vxlan encapsulation");
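                     /*
                      * The kernel normally derives the outer UDP source
                      * port from the inner packet hash for entropy, so a
                      * fixed value from the application cannot be honoured.
                      */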
1426                 DRV_LOG(WARNING,
1427                         "outer UDP source port cannot be"
1428                         " forced for vxlan encapsulation,"
1429                         " parameter ignored");
1430         }
1431         return 0;
1432 }
1433
1434 /**
1435  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1436  * The routine checks the VNI field to be used in the encapsulation header.
1437  *
1438  * @param[in] item
1439  *   Pointer to the item structure.
1440  * @param[out] error
1441  *   Pointer to the error structure.
1442  *
1443  * @return
1444  *   0 on success, a negative errno value otherwise and rte_errno is set.
1445  */
1446 static int
1447 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1448                                   struct rte_flow_error *error)
1449 {
1450         const struct rte_flow_item_vxlan *spec = item->spec;
1451         const struct rte_flow_item_vxlan *mask = item->mask;
1452
1453         if (!spec) {
1454         /* Outer VNI is required by the tunnel_key parameter. */
1455                 return rte_flow_error_set(error, EINVAL,
1456                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1457                                           "NULL VNI specification"
1458                                           " for vxlan encapsulation");
1459         }
1460         if (!mask)
1461                 mask = &rte_flow_item_vxlan_mask;
1462         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1463                 return rte_flow_error_set(error, EINVAL,
1464                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1465                                           "outer VNI must be specified "
1466                                           "for vxlan encapsulation");
1467         if (mask->vni[0] != 0xff ||
1468             mask->vni[1] != 0xff ||
1469             mask->vni[2] != 0xff)
1470                 return rte_flow_error_set(error, ENOTSUP,
1471                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1472                                           "no support for partial mask on"
1473                                           " \"vxlan.vni\" field");
1474
1475         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1476                 return rte_flow_error_set(error, EINVAL,
1477                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1478                                           "vxlan vni cannot be 0");
1479         return 0;
1480 }
1481
1482 /**
1483  * Validate VXLAN_ENCAP action item list for E-Switch.
1484  * The routine checks the items to be used in the encapsulation header.
1485  *
1486  * @param[in] action
1487  *   Pointer to the VXLAN_ENCAP action structure.
1488  * @param[out] error
1489  *   Pointer to the error structure.
1490  *
1491  * @return
1492  *   0 on success, a negative errno value otherwise and rte_errno is set.
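      *
      * The expected "definition" is an item list of the form
      * "eth / ipv4 (or ipv6) / udp / vxlan / end"; as validated below,
      * at least the outer source IP address, the outer UDP destination
      * port and the VNI must be fully specified.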
1493  */
1494 static int
1495 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1496                               struct rte_flow_error *error)
1497 {
1498         const struct rte_flow_item *items;
1499         int ret;
1500         uint32_t item_flags = 0;
1501
1502         if (!action->conf)
1503                 return rte_flow_error_set(error, EINVAL,
1504                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1505                                           "Missing vxlan tunnel"
1506                                           " action configuration");
1507         items = ((const struct rte_flow_action_vxlan_encap *)
1508                                         action->conf)->definition;
1509         if (!items)
1510                 return rte_flow_error_set(error, EINVAL,
1511                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1512                                           "Missing vxlan tunnel"
1513                                           " encapsulation parameters");
1514         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1515                 switch (items->type) {
1516                 case RTE_FLOW_ITEM_TYPE_VOID:
1517                         break;
1518                 case RTE_FLOW_ITEM_TYPE_ETH:
1519                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1520                                                           error);
1521                         if (ret < 0)
1522                                 return ret;
1523                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1524                         if (ret < 0)
1525                                 return ret;
1526                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1527                         break;
1529                 case RTE_FLOW_ITEM_TYPE_IPV4:
1530                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1531                                                            error);
1532                         if (ret < 0)
1533                                 return ret;
1534                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1535                         if (ret < 0)
1536                                 return ret;
1537                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1538                         break;
1539                 case RTE_FLOW_ITEM_TYPE_IPV6:
1540                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1541                                                            error);
1542                         if (ret < 0)
1543                                 return ret;
1544                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1545                         if (ret < 0)
1546                                 return ret;
1547                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1548                         break;
1549                 case RTE_FLOW_ITEM_TYPE_UDP:
1550                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1551                                                            0xFF, error);
1552                         if (ret < 0)
1553                                 return ret;
1554                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1555                         if (ret < 0)
1556                                 return ret;
1557                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1558                         break;
1559                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1560                         ret = mlx5_flow_validate_item_vxlan(items,
1561                                                             item_flags, error);
1562                         if (ret < 0)
1563                                 return ret;
1564                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1565                         if (ret < 0)
1566                                 return ret;
1567                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1568                         break;
1569                 default:
1570                         return rte_flow_error_set
1571                                         (error, ENOTSUP,
1572                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1573                                          "vxlan encap item not supported");
1574                 }
1575         }
1576         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1577                 return rte_flow_error_set(error, EINVAL,
1578                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1579                                           "no outer IP layer found"
1580                                           " for vxlan encapsulation");
1581         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1582                 return rte_flow_error_set(error, EINVAL,
1583                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1584                                           "no outer UDP layer found"
1585                                           " for vxlan encapsulation");
1586         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1587                 return rte_flow_error_set(error, EINVAL,
1588                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1589                                           "no VXLAN VNI found"
1590                                           " for vxlan encapsulation");
1591         return 0;
1592 }
1593
1594 /**
1595  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1596  * RTE_FLOW_ITEM_TYPE_VXLAN is present in the item list.
1597  *
1598  * @param[in] udp
1599  *   Outer UDP layer item (if any, NULL otherwise).
1600  * @param[out] error
1601  *   Pointer to the error structure.
1602  *
1603  * @return
1604  *   0 on success, a negative errno value otherwise and rte_errno is set.
1605  */
1606 static int
1607 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1608                                   struct rte_flow_error *error)
1609 {
1610         const struct rte_flow_item_udp *spec = udp->spec;
1611         const struct rte_flow_item_udp *mask = udp->mask;
1612
1613         if (!spec)
1614                 /*
1615                  * Specification for UDP ports cannot be empty
1616                  * because it is required as a decap parameter.
1617                  */
1618                 return rte_flow_error_set(error, EINVAL,
1619                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1620                                           "NULL UDP port specification"
1621                                           " for VXLAN decapsulation");
1622         if (!mask)
1623                 mask = &rte_flow_item_udp_mask;
1624         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1625                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1626                         return rte_flow_error_set
1627                                         (error, ENOTSUP,
1628                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1629                                          "no support for partial mask on"
1630                                          " \"udp.hdr.dst_port\" field");
1631                 if (!spec->hdr.dst_port)
1632                         return rte_flow_error_set
1633                                         (error, EINVAL,
1634                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1635                                          "zero decap local UDP port");
1636         } else {
1637                 return rte_flow_error_set(error, EINVAL,
1638                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1639                                           "outer UDP destination port must be "
1640                                           "specified for vxlan decapsulation");
1641         }
1642         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1643                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1644                         return rte_flow_error_set
1645                                         (error, ENOTSUP,
1646                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1647                                          "no support for partial mask on"
1648                                          " \"udp.hdr.src_port\" field");
1649                 DRV_LOG(WARNING,
1650                         "outer UDP local port cannot be "
1651                         "forced for VXLAN decapsulation, "
1652                         "parameter ignored");
1653         }
1654         return 0;
1655 }
1656
1657 /**
1658  * Validate flow for E-Switch.
1659  *
1660  * @param[in] dev
1661  *   Pointer to the Ethernet device structure.
1662  * @param[in] attr
1663  *   Pointer to the flow attributes.
1664  * @param[in] items
1665  *   Pointer to the list of items.
1666  * @param[in] actions
1667  *   Pointer to the list of actions.
1668  * @param[out] error
1669  *   Pointer to the error structure.
1670  *
1671  * @return
1672  *   0 on success, a negative errno value otherwise and rte_errno is set.
1673  */
1674 static int
1675 flow_tcf_validate(struct rte_eth_dev *dev,
1676                   const struct rte_flow_attr *attr,
1677                   const struct rte_flow_item items[],
1678                   const struct rte_flow_action actions[],
1679                   struct rte_flow_error *error)
1680 {
1681         union {
1682                 const struct rte_flow_item_port_id *port_id;
1683                 const struct rte_flow_item_eth *eth;
1684                 const struct rte_flow_item_vlan *vlan;
1685                 const struct rte_flow_item_ipv4 *ipv4;
1686                 const struct rte_flow_item_ipv6 *ipv6;
1687                 const struct rte_flow_item_tcp *tcp;
1688                 const struct rte_flow_item_udp *udp;
1689                 const struct rte_flow_item_vxlan *vxlan;
1690         } spec, mask;
1691         union {
1692                 const struct rte_flow_action_port_id *port_id;
1693                 const struct rte_flow_action_jump *jump;
1694                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1695                 const struct rte_flow_action_of_set_vlan_vid *
1696                         of_set_vlan_vid;
1697                 const struct rte_flow_action_of_set_vlan_pcp *
1698                         of_set_vlan_pcp;
1699                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1700                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1701                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1702         } conf;
1703         const struct rte_flow_item *outer_udp = NULL;
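             /*
              * ETH_P_ALL below stands for "not determined yet"; these three
              * trackers are used to detect Ethernet type conflicts between
              * the outer, VLAN and inner headers while scanning the items.
              */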
1704         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1705         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1706         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1707         uint64_t item_flags = 0;
1708         uint64_t action_flags = 0;
1709         uint8_t next_protocol = 0xff;
1710         unsigned int tcm_ifindex = 0;
1711         uint8_t pedit_validated = 0;
1712         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1713         struct rte_eth_dev *port_id_dev = NULL;
1714         bool in_port_id_set = false;
1715         int ret;
1716
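             /*
              * Build the port-to-ifindex table; it is terminated by an
              * entry with zero ifindex, which the lookup loops below
              * rely on.
              */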
1717         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1718                                                 PTOI_TABLE_SZ_MAX(dev)));
1719         ret = flow_tcf_validate_attributes(attr, error);
1720         if (ret < 0)
1721                 return ret;
1722         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1723                 unsigned int i;
1724                 uint64_t current_action_flag = 0;
1725
1726                 switch (actions->type) {
1727                 case RTE_FLOW_ACTION_TYPE_VOID:
1728                         break;
1729                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1730                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1731                         if (!actions->conf)
1732                                 break;
1733                         conf.port_id = actions->conf;
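                                     /*
                                      * ptoi[0] describes the device itself,
                                      * so the "original" attribute maps to
                                      * index zero; otherwise look the DPDK
                                      * port ID up in the table.
                                      */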
1734                         if (conf.port_id->original)
1735                                 i = 0;
1736                         else
1737                                 for (i = 0; ptoi[i].ifindex; ++i)
1738                                         if (ptoi[i].port_id == conf.port_id->id)
1739                                                 break;
1740                         if (!ptoi[i].ifindex)
1741                                 return rte_flow_error_set
1742                                         (error, ENODEV,
1743                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1744                                          conf.port_id,
1745                                          "missing data to convert port ID to"
1746                                          " ifindex");
1747                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1748                         break;
1749                 case RTE_FLOW_ACTION_TYPE_JUMP:
1750                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1751                         if (!actions->conf)
1752                                 break;
1753                         conf.jump = actions->conf;
1754                         if (attr->group >= conf.jump->group)
1755                                 return rte_flow_error_set
1756                                         (error, ENOTSUP,
1757                                          RTE_FLOW_ERROR_TYPE_ACTION,
1758                                          actions,
1759                                          "can only jump to a group ahead");
1760                         break;
1761                 case RTE_FLOW_ACTION_TYPE_DROP:
1762                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1763                         break;
1764                 case RTE_FLOW_ACTION_TYPE_COUNT:
1765                         break;
1766                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1767                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1768                         break;
1769                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1770                         rte_be16_t ethertype;
1771
1772                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1773                         if (!actions->conf)
1774                                 break;
1775                         conf.of_push_vlan = actions->conf;
1776                         ethertype = conf.of_push_vlan->ethertype;
1777                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1778                             ethertype != RTE_BE16(ETH_P_8021AD))
1779                                 return rte_flow_error_set
1780                                         (error, EINVAL,
1781                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1782                                          "vlan push TPID must be "
1783                                          "802.1Q or 802.1AD");
1784                         break;
1785                 }
1786                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1787                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1788                                 return rte_flow_error_set
1789                                         (error, ENOTSUP,
1790                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1791                                          "vlan modify is not supported,"
1792                                          " set action must follow push action");
1793                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1794                         break;
1795                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1796                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1797                                 return rte_flow_error_set
1798                                         (error, ENOTSUP,
1799                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1800                                          "vlan modify is not supported,"
1801                                          " set action must follow push action");
1802                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1803                         break;
1804                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1805                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1806                         break;
1807                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1808                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1809                         if (ret < 0)
1810                                 return ret;
1811                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1812                         break;
1813                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1814                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1815                         break;
1816                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1817                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1818                         break;
1819                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1820                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1821                         break;
1822                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1823                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1824                         break;
1825                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1826                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1827                         break;
1828                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1829                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1830                         break;
1831                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1832                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1833                         break;
1834                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1835                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1836                         break;
1837                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1838                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1839                         break;
1840                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1841                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1842                         break;
1843                 default:
1844                         return rte_flow_error_set(error, ENOTSUP,
1845                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1846                                                   actions,
1847                                                   "action not supported");
1848                 }
1849                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1850                         if (!actions->conf)
1851                                 return rte_flow_error_set
1852                                         (error, EINVAL,
1853                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1854                                          actions,
1855                                          "action configuration not set");
1856                 }
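                     /*
                      * All set_* (pedit) actions are combined into a single
                      * TC pedit action, hence they must be listed
                      * back-to-back; pedit_validated is raised once some
                      * other action follows a pedit one.
                      */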
1857                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1858                     pedit_validated)
1859                         return rte_flow_error_set(error, ENOTSUP,
1860                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1861                                                   actions,
1862                                                   "set actions should be "
1863                                                   "listed successively");
1864                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1865                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1866                         pedit_validated = 1;
1867                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1868                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1869                         return rte_flow_error_set(error, EINVAL,
1870                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1871                                                   actions,
1872                                                   "can't have multiple fate"
1873                                                   " actions");
1874                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1875                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1876                         return rte_flow_error_set(error, EINVAL,
1877                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1878                                                   actions,
1879                                                   "can't have multiple vxlan"
1880                                                   " actions");
1881                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1882                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1883                         return rte_flow_error_set(error, ENOTSUP,
1884                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1885                                                   actions,
1886                                                   "can't have vxlan and vlan"
1887                                                   " actions in the same rule");
1888                 action_flags |= current_action_flag;
1889         }
1890         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1891                 unsigned int i;
1892
1893                 switch (items->type) {
1894                 case RTE_FLOW_ITEM_TYPE_VOID:
1895                         break;
1896                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1897                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1898                                 return rte_flow_error_set
1899                                         (error, ENOTSUP,
1900                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1901                                          "inner tunnel port id"
1902                                          " item is not supported");
1903                         mask.port_id = flow_tcf_item_mask
1904                                 (items, &rte_flow_item_port_id_mask,
1905                                  &flow_tcf_mask_supported.port_id,
1906                                  &flow_tcf_mask_empty.port_id,
1907                                  sizeof(flow_tcf_mask_supported.port_id),
1908                                  error);
1909                         if (!mask.port_id)
1910                                 return -rte_errno;
1911                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1912                                 in_port_id_set = 1;
1913                                 break;
1914                         }
1915                         spec.port_id = items->spec;
1916                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1917                                 return rte_flow_error_set
1918                                         (error, ENOTSUP,
1919                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1920                                          mask.port_id,
1921                                          "no support for partial mask on"
1922                                          " \"id\" field");
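                                     /*
                                      * A zero "id" mask matches any port,
                                      * which maps to the device itself
                                      * (ptoi[0]).
                                      */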
1923                         if (!mask.port_id->id)
1924                                 i = 0;
1925                         else
1926                                 for (i = 0; ptoi[i].ifindex; ++i)
1927                                         if (ptoi[i].port_id == spec.port_id->id)
1928                                                 break;
1929                         if (!ptoi[i].ifindex)
1930                                 return rte_flow_error_set
1931                                         (error, ENODEV,
1932                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1933                                          spec.port_id,
1934                                          "missing data to convert port ID to"
1935                                          " ifindex");
1936                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1937                                 return rte_flow_error_set
1938                                         (error, ENOTSUP,
1939                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1940                                          spec.port_id,
1941                                          "cannot match traffic for"
1942                                          " several port IDs through"
1943                                          " a single flow rule");
1944                         tcm_ifindex = ptoi[i].ifindex;
1945                         in_port_id_set = 1;
1946                         break;
1947                 case RTE_FLOW_ITEM_TYPE_ETH:
1948                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1949                                                           error);
1950                         if (ret < 0)
1951                                 return ret;
1952                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1953                                       MLX5_FLOW_LAYER_INNER_L2 :
1954                                       MLX5_FLOW_LAYER_OUTER_L2;
1955                         /* TODO:
1956                          * Partly redundant check, kept because the
1957                          * supported mask differs. Same for other items.
1958                          */
1959                         mask.eth = flow_tcf_item_mask
1960                                 (items, &rte_flow_item_eth_mask,
1961                                  &flow_tcf_mask_supported.eth,
1962                                  &flow_tcf_mask_empty.eth,
1963                                  sizeof(flow_tcf_mask_supported.eth),
1964                                  error);
1965                         if (!mask.eth)
1966                                 return -rte_errno;
1967                         if (mask.eth->type && mask.eth->type !=
1968                             RTE_BE16(0xffff))
1969                                 return rte_flow_error_set
1970                                         (error, ENOTSUP,
1971                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1972                                          mask.eth,
1973                                          "no support for partial mask on"
1974                                          " \"type\" field");
1975                         assert(items->spec);
1976                         spec.eth = items->spec;
1977                         if (mask.eth->type &&
1978                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1979                             inner_etype != RTE_BE16(ETH_P_ALL) &&
1980                             inner_etype != spec.eth->type)
1981                                 return rte_flow_error_set
1982                                         (error, EINVAL,
1983                                          RTE_FLOW_ERROR_TYPE_ITEM,
1984                                          items,
1985                                          "inner eth_type conflict");
1986                         if (mask.eth->type &&
1987                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1988                             outer_etype != RTE_BE16(ETH_P_ALL) &&
1989                             outer_etype != spec.eth->type)
1990                                 return rte_flow_error_set
1991                                         (error, EINVAL,
1992                                          RTE_FLOW_ERROR_TYPE_ITEM,
1993                                          items,
1994                                          "outer eth_type conflict");
1995                         if (mask.eth->type) {
1996                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1997                                         inner_etype = spec.eth->type;
1998                                 else
1999                                         outer_etype = spec.eth->type;
2000                         }
2001                         break;
2002                 case RTE_FLOW_ITEM_TYPE_VLAN:
2003                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2004                                 return rte_flow_error_set
2005                                         (error, ENOTSUP,
2006                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2007                                          "inner tunnel VLAN"
2008                                          " is not supported");
2009                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2010                                                            error);
2011                         if (ret < 0)
2012                                 return ret;
2013                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2014                         mask.vlan = flow_tcf_item_mask
2015                                 (items, &rte_flow_item_vlan_mask,
2016                                  &flow_tcf_mask_supported.vlan,
2017                                  &flow_tcf_mask_empty.vlan,
2018                                  sizeof(flow_tcf_mask_supported.vlan),
2019                                  error);
2020                         if (!mask.vlan)
2021                                 return -rte_errno;
2022                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2023                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2024                               RTE_BE16(0xe000)) ||
2025                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2026                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2027                               RTE_BE16(0x0fff)) ||
2028                             (mask.vlan->inner_type &&
2029                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2030                                 return rte_flow_error_set
2031                                         (error, ENOTSUP,
2032                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2033                                          mask.vlan,
2034                                          "no support for partial masks on"
2035                                          " \"tci\" (PCP and VID parts) and"
2036                                          " \"inner_type\" fields");
2037                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2038                             outer_etype != RTE_BE16(ETH_P_8021Q))
2039                                 return rte_flow_error_set
2040                                         (error, EINVAL,
2041                                          RTE_FLOW_ERROR_TYPE_ITEM,
2042                                          items,
2043                                          "outer eth_type conflict,"
2044                                          " must be 802.1Q");
2045                         outer_etype = RTE_BE16(ETH_P_8021Q);
2046                         assert(items->spec);
2047                         spec.vlan = items->spec;
2048                         if (mask.vlan->inner_type &&
2049                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2050                             vlan_etype != spec.vlan->inner_type)
2051                                 return rte_flow_error_set
2052                                         (error, EINVAL,
2053                                          RTE_FLOW_ERROR_TYPE_ITEM,
2054                                          items,
2055                                          "vlan eth_type conflict");
2056                         if (mask.vlan->inner_type)
2057                                 vlan_etype = spec.vlan->inner_type;
2058                         break;
2059                 case RTE_FLOW_ITEM_TYPE_IPV4:
2060                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2061                                                            error);
2062                         if (ret < 0)
2063                                 return ret;
2064                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2065                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2066                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2067                         mask.ipv4 = flow_tcf_item_mask
2068                                 (items, &rte_flow_item_ipv4_mask,
2069                                  &flow_tcf_mask_supported.ipv4,
2070                                  &flow_tcf_mask_empty.ipv4,
2071                                  sizeof(flow_tcf_mask_supported.ipv4),
2072                                  error);
2073                         if (!mask.ipv4)
2074                                 return -rte_errno;
2075                         if (mask.ipv4->hdr.next_proto_id &&
2076                             mask.ipv4->hdr.next_proto_id != 0xff)
2077                                 return rte_flow_error_set
2078                                         (error, ENOTSUP,
2079                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2080                                          mask.ipv4,
2081                                          "no support for partial mask on"
2082                                          " \"hdr.next_proto_id\" field");
2083                         else if (mask.ipv4->hdr.next_proto_id)
2084                                 next_protocol =
2085                                         ((const struct rte_flow_item_ipv4 *)
2086                                          (items->spec))->hdr.next_proto_id;
2087                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2088                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2089                                     inner_etype != RTE_BE16(ETH_P_IP))
2090                                         return rte_flow_error_set
2091                                                 (error, EINVAL,
2092                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2093                                                  items,
2094                                                  "inner eth_type conflict,"
2095                                                  " IPv4 is required");
2096                                 inner_etype = RTE_BE16(ETH_P_IP);
2097                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2098                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2099                                     vlan_etype != RTE_BE16(ETH_P_IP))
2100                                         return rte_flow_error_set
2101                                                 (error, EINVAL,
2102                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2103                                                  items,
2104                                                  "vlan eth_type conflict,"
2105                                                  " IPv4 is required");
2106                                 vlan_etype = RTE_BE16(ETH_P_IP);
2107                         } else {
2108                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2109                                     outer_etype != RTE_BE16(ETH_P_IP))
2110                                         return rte_flow_error_set
2111                                                 (error, EINVAL,
2112                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2113                                                  items,
2114                                                  "eth_type conflict,"
2115                                                  " IPv4 is required");
2116                                 outer_etype = RTE_BE16(ETH_P_IP);
2117                         }
2118                         break;
2119                 case RTE_FLOW_ITEM_TYPE_IPV6:
2120                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2121                                                            error);
2122                         if (ret < 0)
2123                                 return ret;
2124                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2125                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2126                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2127                         mask.ipv6 = flow_tcf_item_mask
2128                                 (items, &rte_flow_item_ipv6_mask,
2129                                  &flow_tcf_mask_supported.ipv6,
2130                                  &flow_tcf_mask_empty.ipv6,
2131                                  sizeof(flow_tcf_mask_supported.ipv6),
2132                                  error);
2133                         if (!mask.ipv6)
2134                                 return -rte_errno;
2135                         if (mask.ipv6->hdr.proto &&
2136                             mask.ipv6->hdr.proto != 0xff)
2137                                 return rte_flow_error_set
2138                                         (error, ENOTSUP,
2139                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2140                                          mask.ipv6,
2141                                          "no support for partial mask on"
2142                                          " \"hdr.proto\" field");
2143                         else if (mask.ipv6->hdr.proto)
2144                                 next_protocol =
2145                                         ((const struct rte_flow_item_ipv6 *)
2146                                          (items->spec))->hdr.proto;
2147                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2148                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2149                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2150                                         return rte_flow_error_set
2151                                                 (error, EINVAL,
2152                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2153                                                  items,
2154                                                  "inner eth_type conflict,"
2155                                                  " IPv6 is required");
2156                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2157                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2158                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2159                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2160                                         return rte_flow_error_set
2161                                                 (error, EINVAL,
2162                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2163                                                  items,
2164                                                  "vlan eth_type conflict,"
2165                                                  " IPv6 is required");
2166                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2167                         } else {
2168                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2169                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2170                                         return rte_flow_error_set
2171                                                 (error, EINVAL,
2172                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2173                                                  items,
2174                                                  "eth_type conflict,"
2175                                                  " IPv6 is required");
2176                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2177                         }
2178                         break;
2179                 case RTE_FLOW_ITEM_TYPE_UDP:
2180                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2181                                                           next_protocol, error);
2182                         if (ret < 0)
2183                                 return ret;
2184                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2185                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2186                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2187                         mask.udp = flow_tcf_item_mask
2188                                 (items, &rte_flow_item_udp_mask,
2189                                  &flow_tcf_mask_supported.udp,
2190                                  &flow_tcf_mask_empty.udp,
2191                                  sizeof(flow_tcf_mask_supported.udp),
2192                                  error);
2193                         if (!mask.udp)
2194                                 return -rte_errno;
2195                         /*
2196                          * Save the presumed outer UDP item for an extra
2197                          * check if a tunnel item is found later in the list.
2198                          */
2199                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2200                                 outer_udp = items;
2201                         break;
2202                 case RTE_FLOW_ITEM_TYPE_TCP:
2203                         ret = mlx5_flow_validate_item_tcp
2204                                              (items, item_flags,
2205                                               next_protocol,
2206                                               &flow_tcf_mask_supported.tcp,
2207                                               error);
2208                         if (ret < 0)
2209                                 return ret;
2210                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2211                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2212                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2213                         mask.tcp = flow_tcf_item_mask
2214                                 (items, &rte_flow_item_tcp_mask,
2215                                  &flow_tcf_mask_supported.tcp,
2216                                  &flow_tcf_mask_empty.tcp,
2217                                  sizeof(flow_tcf_mask_supported.tcp),
2218                                  error);
2219                         if (!mask.tcp)
2220                                 return -rte_errno;
2221                         break;
2222                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2223                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2224                                 return rte_flow_error_set
2225                                         (error, ENOTSUP,
2226                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2227                                          "vxlan tunnel over vlan"
2228                                          " is not supported");
2229                         ret = mlx5_flow_validate_item_vxlan(items,
2230                                                             item_flags, error);
2231                         if (ret < 0)
2232                                 return ret;
2233                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2234                         mask.vxlan = flow_tcf_item_mask
2235                                 (items, &rte_flow_item_vxlan_mask,
2236                                  &flow_tcf_mask_supported.vxlan,
2237                                  &flow_tcf_mask_empty.vxlan,
2238                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2239                         if (!mask.vxlan)
2240                                 return -rte_errno;
2241                         if (mask.vxlan->vni[0] != 0xff ||
2242                             mask.vxlan->vni[1] != 0xff ||
2243                             mask.vxlan->vni[2] != 0xff)
2244                                 return rte_flow_error_set
2245                                         (error, ENOTSUP,
2246                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2247                                          mask.vxlan,
2248                                          "no support for partial or "
2249                                          "empty mask on \"vxlan.vni\" field");
2250                         /*
2251                          * The VNI item assumes a VXLAN tunnel; at least
2252                          * the outer destination UDP port must be given
2253                          * without wildcards so the kernel can select
2254                          * the virtual VXLAN device by port. An outer
2255                          * IPv4 or IPv6 item must also be present
2256                          * (wildcards or even a zero mask are allowed) so
2257                          * the driver knows the tunnel IP version and
2258                          * processes UDP traffic correctly.
2259                          */
2259                         if (!(item_flags &
2260                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2261                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2262                                 return rte_flow_error_set
2263                                                  (error, EINVAL,
2264                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2265                                                   NULL,
2266                                                   "no outer IP pattern found"
2267                                                   " for vxlan tunnel");
2268                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2269                                 return rte_flow_error_set
2270                                                  (error, EINVAL,
2271                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2272                                                   NULL,
2273                                                   "no outer UDP pattern found"
2274                                                   " for vxlan tunnel");
2275                         /*
2276                          * All items preceding the tunnel item become outer
2277                          * ones and need extra validation due to tc
2278                          * limitations on tunnel outer parameters. Currently
2279                          * only the outer UDP item requires such a check;
2280                          * use the saved pointer instead of rescanning.
2281                          */
2282                         assert(outer_udp);
2283                         ret = flow_tcf_validate_vxlan_decap_udp
2284                                                 (outer_udp, error);
2285                         if (ret < 0)
2286                                 return ret;
2287                         /* Reset L4 protocol for inner parameters. */
2288                         next_protocol = 0xff;
2289                         break;
2290                 default:
2291                         return rte_flow_error_set(error, ENOTSUP,
2292                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2293                                                   items, "item not supported");
2294                 }
2295         }
2296         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2297             (action_flags & MLX5_FLOW_ACTION_DROP))
2298                 return rte_flow_error_set(error, ENOTSUP,
2299                                           RTE_FLOW_ERROR_TYPE_ACTION,
2300                                           actions,
2301                                           "set action is not compatible with "
2302                                           "drop action");
2303         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2304             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2305                 return rte_flow_error_set(error, ENOTSUP,
2306                                           RTE_FLOW_ERROR_TYPE_ACTION,
2307                                           actions,
2308                                           "set action must be followed by "
2309                                           "port_id action");
2310         if (action_flags &
2311            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2312                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2313                         return rte_flow_error_set(error, EINVAL,
2314                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2315                                                   actions,
2316                                                   "no ipv4 item found in"
2317                                                   " pattern");
2318         }
2319         if (action_flags &
2320            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2321                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2322                         return rte_flow_error_set(error, EINVAL,
2323                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2324                                                   actions,
2325                                                   "no ipv6 item found in"
2326                                                   " pattern");
2327         }
2328         if (action_flags &
2329            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2330                 if (!(item_flags &
2331                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2332                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2333                         return rte_flow_error_set(error, EINVAL,
2334                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2335                                                   actions,
2336                                                   "no TCP/UDP item found in"
2337                                                   " pattern");
2338         }
2339         /*
2340          * FW syndrome (0xA9C090):
2341          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2342          *     forward to the uplink.
2343          */
2344         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2345             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2346             ((struct priv *)port_id_dev->data->dev_private)->representor)
2347                 return rte_flow_error_set(error, ENOTSUP,
2348                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2349                                           "vlan push can only be applied"
2350                                           " when forwarding to uplink port");
2351         /*
2352          * FW syndrome (0x294609):
2353          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2354          *     are supported only while forwarding to vport.
2355          */
2356         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2357             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2358                 return rte_flow_error_set(error, ENOTSUP,
2359                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2360                                           "vlan actions are supported"
2361                                           " only with port_id action");
2362         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2363             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2364                 return rte_flow_error_set(error, ENOTSUP,
2365                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2366                                           "vxlan actions are supported"
2367                                           " only with port_id action");
2368         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2369                 return rte_flow_error_set(error, EINVAL,
2370                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2371                                           "no fate action is found");
2372         if (action_flags &
2373            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2374                 if (!(item_flags &
2375                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2376                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2377                         return rte_flow_error_set(error, EINVAL,
2378                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2379                                                   actions,
2380                                                   "no IP found in pattern");
2381         }
2382         if (action_flags &
2383             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2384                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2385                         return rte_flow_error_set(error, ENOTSUP,
2386                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2387                                                   actions,
2388                                                   "no ethernet found in"
2389                                                   " pattern");
2390         }
2391         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2392             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2393                 return rte_flow_error_set(error, EINVAL,
2394                                           RTE_FLOW_ERROR_TYPE_ACTION,
2395                                           NULL,
2396                                           "no VNI pattern found"
2397                                           " for vxlan decap action");
2398         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2399             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2400                 return rte_flow_error_set(error, EINVAL,
2401                                           RTE_FLOW_ERROR_TYPE_ACTION,
2402                                           NULL,
2403                                           "vxlan encap not supported"
2404                                           " for tunneled traffic");
2405         return 0;
2406 }
2407
2408 /**
2409  * Calculate maximum size of memory for flow items of Linux TC flower.
2410  *
2411  * @param[in] attr
2412  *   Pointer to the flow attributes.
2413  * @param[in] items
2414  *   Pointer to the list of items.
2415  * @param[out] action_flags
2416  *   Pointer to the detected actions.
2417  *
2418  * @return
2419  *   Maximum size of memory for items.
2420  */
2421 static int
2422 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2423                         const struct rte_flow_item items[],
2424                         uint64_t *action_flags)
2425 {
2426         int size = 0;
2427
2428         size += SZ_NLATTR_STRZ_OF("flower") +
2429                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2430                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2431                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2432         if (attr->group > 0)
2433                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2434         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2435                 switch (items->type) {
2436                 case RTE_FLOW_ITEM_TYPE_VOID:
2437                         break;
2438                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2439                         break;
2440                 case RTE_FLOW_ITEM_TYPE_ETH:
2441                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2442                                 /* dst/src MAC addr and mask. */
2443                         break;
2444                 case RTE_FLOW_ITEM_TYPE_VLAN:
2445                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2446                                 /* VLAN Ether type. */
2447                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2448                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2449                         break;
2450                 case RTE_FLOW_ITEM_TYPE_IPV4:
2451                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2452                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2453                                 /* dst/src IP addr and mask. */
2454                         break;
2455                 case RTE_FLOW_ITEM_TYPE_IPV6:
2456                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2457                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2458                                 /* dst/src IP addr and mask. */
2459                         break;
2460                 case RTE_FLOW_ITEM_TYPE_UDP:
2461                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2462                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2463                                 /* dst/src port and mask. */
2464                         break;
2465                 case RTE_FLOW_ITEM_TYPE_TCP:
2466                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2467                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2468                                 /* dst/src port and mask. */
2469                         break;
2470                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2471                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2472                         /*
2473                          * There might be no VXLAN decap action in the action
2474                          * list, nonetheless the VXLAN tunnel flow requires
2475                          * the decap structure to be correctly applied to the
2476                          * VXLAN device, so set the flag to create the structure.
2477                          * The translation routine will not put the decap action
2478                          * in the Netlink message if there is no actual action
2479                          * in the list.
2480                          */
2481                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2482                         break;
2483                 default:
2484                         DRV_LOG(WARNING,
2485                                 "unsupported item %p type %d,"
2486                                 " items must be validated before flow creation",
2487                                 (const void *)items, items->type);
2488                         break;
2489                 }
2490         }
2491         return size;
2492 }
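/*
 * Worked example (illustrative; assumes the SZ_NLATTR_* helpers defined
 * earlier in this file wrap a 4-byte netlink attribute header plus the
 * payload, padded to MNL_ALIGNTO == 4): for the pattern eth / ipv4 / udp
 * the loop above accumulates
 *   ETH:  4 * MNL_ALIGN(4 + ETHER_ADDR_LEN) = 4 * 12 = 48 bytes,
 *   IPV4: MNL_ALIGN(4 + 1) + 4 * MNL_ALIGN(4 + 4) = 8 + 32 = 40 bytes,
 *   UDP:  MNL_ALIGN(4 + 1) + 4 * MNL_ALIGN(4 + 2) = 8 + 32 = 40 bytes
 * on top of the fixed "flower"/TCA_OPTIONS/flags overhead.
 */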
2493
2494 /**
2495  * Calculate size of memory to store the VXLAN encapsulation
2496  * related items in the Netlink message buffer. The item list
2497  * is specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action
2498  * and should be validated beforehand.
2499  *
2500  * @param[in] action
2501  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2502  *   List of pattern items to scan data from.
2503  *
2504  * @return
2505  *   The size of the part of the Netlink message buffer to store the
2506  *   VXLAN encapsulation item attributes.
2507  */
2508 static int
2509 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2510 {
2511         const struct rte_flow_item *items;
2512         int size = 0;
2513
2514         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2515         assert(action->conf);
2516
2517         items = ((const struct rte_flow_action_vxlan_encap *)
2518                                         action->conf)->definition;
2519         assert(items);
2520         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2521                 switch (items->type) {
2522                 case RTE_FLOW_ITEM_TYPE_VOID:
2523                         break;
2524                 case RTE_FLOW_ITEM_TYPE_ETH:
2525                         /* This item does not require message buffer. */
2526                         break;
2527                 case RTE_FLOW_ITEM_TYPE_IPV4:
2528                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2529                         break;
2530                 case RTE_FLOW_ITEM_TYPE_IPV6:
2531                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2532                         break;
2533                 case RTE_FLOW_ITEM_TYPE_UDP: {
2534                         const struct rte_flow_item_udp *udp = items->mask;
2535
2536                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2537                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2538                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2539                         break;
2540                 }
2541                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2542                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2543                         break;
2544                 default:
2545                         assert(false);
2546                         DRV_LOG(WARNING,
2547                                 "unsupported item %p type %d,"
2548                                 " items must be validated"
2549                                 " before flow creation",
2550                                 (const void *)items, items->type);
2551                         return 0;
2552                 }
2553         }
2554         return size;
2555 }
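/*
 * Illustrative example: for an encap definition of eth / ipv4 /
 * udp (destination port only) / vxlan the scan above counts two IPv4
 * address attributes, one UDP destination port attribute (the source
 * port attribute is skipped when its mask is explicitly zero) and one
 * 32-bit VNI attribute; the Ethernet item contributes nothing because
 * tunnel_key takes MAC headers from implicit kernel sources.
 */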
2556
2557 /**
2558  * Calculate maximum size of memory for flow actions of Linux TC flower and
2559  * extract specified actions.
2560  *
2561  * @param[in] actions
2562  *   Pointer to the list of actions.
2563  * @param[out] action_flags
2564  *   Pointer to the detected actions.
2565  *
2566  * @return
2567  *   Maximum size of memory for actions.
2568  */
2569 static int
2570 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2571                               uint64_t *action_flags)
2572 {
2573         int size = 0;
2574         uint64_t flags = 0;
2575
2576         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2577         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2578                 switch (actions->type) {
2579                 case RTE_FLOW_ACTION_TYPE_VOID:
2580                         break;
2581                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2582                         size += SZ_NLATTR_NEST + /* na_act_index. */
2583                                 SZ_NLATTR_STRZ_OF("mirred") +
2584                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2585                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2586                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2587                         break;
2588                 case RTE_FLOW_ACTION_TYPE_JUMP:
2589                         size += SZ_NLATTR_NEST + /* na_act_index. */
2590                                 SZ_NLATTR_STRZ_OF("gact") +
2591                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2592                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2593                         flags |= MLX5_FLOW_ACTION_JUMP;
2594                         break;
2595                 case RTE_FLOW_ACTION_TYPE_DROP:
2596                         size += SZ_NLATTR_NEST + /* na_act_index. */
2597                                 SZ_NLATTR_STRZ_OF("gact") +
2598                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2599                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2600                         flags |= MLX5_FLOW_ACTION_DROP;
2601                         break;
2602                 case RTE_FLOW_ACTION_TYPE_COUNT:
2603                         break;
2604                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2605                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2606                         goto action_of_vlan;
2607                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2608                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2609                         goto action_of_vlan;
2610                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2611                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2612                         goto action_of_vlan;
2613                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2614                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2615                         goto action_of_vlan;
2616 action_of_vlan:
2617                         size += SZ_NLATTR_NEST + /* na_act_index. */
2618                                 SZ_NLATTR_STRZ_OF("vlan") +
2619                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2620                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2621                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2622                                 /* VLAN protocol. */
2623                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2624                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2625                         break;
2626                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2627                         size += SZ_NLATTR_NEST + /* na_act_index. */
2628                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2629                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2630                                 SZ_NLATTR_TYPE_OF(uint8_t);
2631                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2632                         size += flow_tcf_vxlan_encap_size(actions) +
2633                                 RTE_ALIGN_CEIL /* preceding encap params. */
2634                                 (sizeof(struct flow_tcf_vxlan_encap),
2635                                 MNL_ALIGNTO);
2636                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2637                         break;
2638                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2639                         size += SZ_NLATTR_NEST + /* na_act_index. */
2640                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2641                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2642                                 SZ_NLATTR_TYPE_OF(uint8_t);
2643                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2644                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2645                                 (sizeof(struct flow_tcf_vxlan_decap),
2646                                 MNL_ALIGNTO);
2647                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2648                         break;
2649                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2650                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2651                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2652                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2653                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2654                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2655                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2656                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2657                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2658                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2659                         size += flow_tcf_get_pedit_actions_size(&actions,
2660                                                                 &flags);
2661                         break;
2662                 default:
2663                         DRV_LOG(WARNING,
2664                                 "unsupported action %p type %d,"
2665                                 " actions must be validated before flow creation",
2666                                 (const void *)actions, actions->type);
2667                         break;
2668                 }
2669         }
2670         *action_flags = flags;
2671         return size;
2672 }
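/*
 * Illustrative example: for the action list drop / end the loop above
 * accounts for one na_act_index nest, the "gact" kind string, one
 * TCA_ACT_OPTIONS nest and a struct tc_gact payload, in addition to the
 * TCA_FLOWER_ACT nest reserved before the loop.
 */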
2673
2674 /**
2675  * Brand rtnetlink buffer with unique handle.
2676  *
2677  * This handle should be unique for a given network interface to avoid
2678  * collisions.
2679  *
2680  * @param nlh
2681  *   Pointer to Netlink message.
2682  * @param handle
2683  *   Unique 32-bit handle to use.
2684  */
2685 static void
2686 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2687 {
2688         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2689
2690         tcm->tcm_handle = handle;
2691         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2692                 (void *)nlh, handle);
2693 }
2694
2695 /**
2696  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2697  * memory required, allocates the memory, initializes Netlink message headers
2698  * and sets a unique TC message handle.
2699  *
2700  * @param[in] attr
2701  *   Pointer to the flow attributes.
2702  * @param[in] items
2703  *   Pointer to the list of items.
2704  * @param[in] actions
2705  *   Pointer to the list of actions.
2706  * @param[out] error
2707  *   Pointer to the error structure.
2708  *
2709  * @return
2710  *   Pointer to mlx5_flow object on success,
2711  *   otherwise NULL and rte_errno is set.
2712  */
2713 static struct mlx5_flow *
2714 flow_tcf_prepare(const struct rte_flow_attr *attr,
2715                  const struct rte_flow_item items[],
2716                  const struct rte_flow_action actions[],
2717                  struct rte_flow_error *error)
2718 {
2719         size_t size = RTE_ALIGN_CEIL
2720                         (sizeof(struct mlx5_flow),
2721                          alignof(struct flow_tcf_tunnel_hdr)) +
2722                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2723                       MNL_ALIGN(sizeof(struct tcmsg));
2724         struct mlx5_flow *dev_flow;
2725         uint64_t action_flags = 0;
2726         struct nlmsghdr *nlh;
2727         struct tcmsg *tcm;
2728         uint8_t *sp, *tun = NULL;
2729
2730         size += flow_tcf_get_items_size(attr, items, &action_flags);
2731         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2732         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2733         if (!dev_flow) {
2734                 rte_flow_error_set(error, ENOMEM,
2735                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2736                                    "not enough memory to create E-Switch flow");
2737                 return NULL;
2738         }
2739         sp = (uint8_t *)(dev_flow + 1);
2740         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2741                 sp = RTE_PTR_ALIGN
2742                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2743                 tun = sp;
2744                 sp += RTE_ALIGN_CEIL
2745                         (sizeof(struct flow_tcf_vxlan_encap),
2746                         MNL_ALIGNTO);
2747 #ifndef NDEBUG
2748                 size -= RTE_ALIGN_CEIL
2749                         (sizeof(struct flow_tcf_vxlan_encap),
2750                         MNL_ALIGNTO);
2751 #endif
2752         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2753                 sp = RTE_PTR_ALIGN
2754                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2755                 tun = sp;
2756                 sp += RTE_ALIGN_CEIL
2757                         (sizeof(struct flow_tcf_vxlan_decap),
2758                         MNL_ALIGNTO);
2759 #ifndef NDEBUG
2760                 size -= RTE_ALIGN_CEIL
2761                         (sizeof(struct flow_tcf_vxlan_decap),
2762                         MNL_ALIGNTO);
2763 #endif
2764         } else {
2765                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2766         }
2767         nlh = mnl_nlmsg_put_header(sp);
2768         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2769         *dev_flow = (struct mlx5_flow){
2770                 .tcf = (struct mlx5_flow_tcf){
2771 #ifndef NDEBUG
2772                         .nlsize = size - RTE_ALIGN_CEIL
2773                                 (sizeof(struct mlx5_flow),
2774                                  alignof(struct flow_tcf_tunnel_hdr)),
2775 #endif
2776                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2777                         .nlh = nlh,
2778                         .tcm = tcm,
2779                 },
2780         };
2781         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2782                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2783         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2784                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2785         /*
2786          * Generate a reasonably unique handle based on the address of the
2787          * target buffer.
2788          *
2789          * This is straightforward on 32-bit systems where the flow pointer can
2790          * be used directly. Otherwise, its least significant part is taken
2791          * after shifting it right by the base-2 logarithm of the previous
2792          * power of two of the pointed buffer size.
2793          */
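	/*
	 * Worked example (illustrative numbers): with size == 1400,
	 * rte_align32prevpow2(1400) == 1024 and rte_log2_u32(1024) == 10,
	 * so on a 64-bit system the dev_flow pointer is shifted right by
	 * ten bits before being used as the 32-bit handle.
	 */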
2794         if (sizeof(dev_flow) <= 4)
2795                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2796         else
2797                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2798                                        rte_log2_u32(rte_align32prevpow2(size)));
2799         return dev_flow;
2800 }
2801
2802 /**
2803  * Make adjustments for supporting count actions.
2804  *
2805  * @param[in] dev
2806  *   Pointer to the Ethernet device structure.
2807  * @param[in] dev_flow
2808  *   Pointer to mlx5_flow.
2809  * @param[out] error
2810  *   Pointer to error structure.
2811  *
2812  * @return
2813  *   0 on success, otherwise a negative errno value is returned and rte_errno is set.
2814  */
2815 static int
2816 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2817                                   struct mlx5_flow *dev_flow,
2818                                   struct rte_flow_error *error)
2819 {
2820         struct rte_flow *flow = dev_flow->flow;
2821
2822         if (!flow->counter) {
2823                 flow->counter = flow_tcf_counter_new();
2824                 if (!flow->counter)
2825                         return rte_flow_error_set(error, rte_errno,
2826                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2827                                                   NULL,
2828                                                   "cannot get counter"
2829                                                   " context.");
2830         }
2831         return 0;
2832 }
2833
2834 /**
2835  * Convert VXLAN VNI to 32-bit integer.
2836  *
2837  * @param[in] vni
2838  *   VXLAN VNI in 24-bit wire format.
2839  *
2840  * @return
2841  *   VXLAN VNI as a 32-bit integer value in network endian.
2842  */
2843 static inline rte_be32_t
2844 vxlan_vni_as_be32(const uint8_t vni[3])
2845 {
2846         union {
2847                 uint8_t vni[4];
2848                 rte_be32_t dword;
2849         } ret = {
2850                 .vni = { 0, vni[0], vni[1], vni[2] },
2851         };
2852         return ret.dword;
2853 }
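/*
 * For example, VNI bytes {0x12, 0x34, 0x56} yield the in-memory byte
 * sequence {0x00, 0x12, 0x34, 0x56}, i.e. 0x123456 in network byte
 * order, ready to be put as the 32-bit TCA_FLOWER_KEY_ENC_KEY_ID
 * attribute.
 */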
2854
2855 /**
2856  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2857  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2858  * in the encapsulation parameters structure. The item must be prevalidated;
2859  * no validation checks are performed by this function.
2860  *
2861  * @param[in] spec
2862  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2863  * @param[in] mask
2864  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2865  * @param[out] encap
2866  *   Structure to fill the gathered MAC address data.
2867  */
2868 static void
2869 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2870                                const struct rte_flow_item_eth *mask,
2871                                struct flow_tcf_vxlan_encap *encap)
2872 {
2873         /* Item must be validated before. No redundant checks. */
2874         assert(spec);
2875         if (!mask || !memcmp(&mask->dst,
2876                              &rte_flow_item_eth_mask.dst,
2877                              sizeof(rte_flow_item_eth_mask.dst))) {
2878                 /*
2879                  * Ethernet addresses are not supported by
2880                  * tc as tunnel_key parameters. Destination
2881                  * address is needed to form encap packet
2882                  * header and retrieved by kernel from
2883                  * implicit sources (ARP table, etc),
2884                  * address masks are not supported at all.
2885                  */
2886                 encap->eth.dst = spec->dst;
2887                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2888         }
2889         if (!mask || !memcmp(&mask->src,
2890                              &rte_flow_item_eth_mask.src,
2891                              sizeof(rte_flow_item_eth_mask.src))) {
2892                 /*
2893                  * Ethernet addresses are not supported by
2894                  * tc as tunnel_key parameters. Source ethernet
2895                  * address is ignored anyway.
2896                  */
2897                 encap->eth.src = spec->src;
2898                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2899         }
2900 }
2901
2902 /**
2903  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2904  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2905  * in the encapsulation parameters structure. The item must be prevalidated;
2906  * no validation checks are performed by this function.
2907  *
2908  * @param[in] spec
2909  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2910  * @param[out] encap
2911  *   Structure to fill the gathered IPV4 address data.
2912  */
2913 static void
2914 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2915                                 struct flow_tcf_vxlan_encap *encap)
2916 {
2917         /* Item must be validated before. No redundant checks. */
2918         assert(spec);
2919         encap->ipv4.dst = spec->hdr.dst_addr;
2920         encap->ipv4.src = spec->hdr.src_addr;
2921         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2922                        FLOW_TCF_ENCAP_IPV4_DST;
2923 }
2924
2925 /**
2926  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2927  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2928  * in the encapsulation parameters structure. The item must be prevalidated;
2929  * no validation checks are performed by this function.
2930  *
2931  * @param[in] spec
2932  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2933  * @param[out] encap
2934  *   Structure to fill the gathered IPV6 address data.
2935  */
2936 static void
2937 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2938                                 struct flow_tcf_vxlan_encap *encap)
2939 {
2940         /* Item must be validated before. No redundant checks. */
2941         assert(spec);
2942         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2943         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2944         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2945                        FLOW_TCF_ENCAP_IPV6_DST;
2946 }
2947
2948 /**
2949  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2950  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2951  * in the encapsulation parameters structure. The item must be prevalidated;
2952  * no validation checks are performed by this function.
2953  *
2954  * @param[in] spec
2955  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2956  * @param[in] mask
2957  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2958  * @param[out] encap
2959  *   Structure to fill the gathered UDP port data.
2960  */
2961 static void
2962 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2963                                const struct rte_flow_item_udp *mask,
2964                                struct flow_tcf_vxlan_encap *encap)
2965 {
2966         assert(spec);
2967         encap->udp.dst = spec->hdr.dst_port;
2968         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2969         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2970                 encap->udp.src = spec->hdr.src_port;
2971                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
2972         }
2973 }
2974
2975 /**
2976  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2977  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2978  * in the encapsulation parameters structure. The item must be prevalidated;
2979  * no validation checks are performed by this function.
2980  *
2981  * @param[in] spec
2982  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2983  * @param[out] encap
2984  *   Structure to fill the gathered VNI address data.
2985  */
2986 static void
2987 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2988                                struct flow_tcf_vxlan_encap *encap)
2989 {
2990         /* Item must be validated before. No redundant checks. */
2991         assert(spec);
2992         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2993         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2994 }
2995
2996 /**
2997  * Populate consolidated encapsulation object from list of pattern items.
2998  *
2999  * Helper function to process configuration of action such as
3000  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3001  * validated; there is no way to return a meaningful error.
3002  *
3003  * @param[in] action
3004  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3005  *   List of pattern items to gather data from.
3006  * @param[out] encap
3007  *   Structure to fill gathered data.
3008  */
3009 static void
3010 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3011                            struct flow_tcf_vxlan_encap *encap)
3012 {
3013         union {
3014                 const struct rte_flow_item_eth *eth;
3015                 const struct rte_flow_item_ipv4 *ipv4;
3016                 const struct rte_flow_item_ipv6 *ipv6;
3017                 const struct rte_flow_item_udp *udp;
3018                 const struct rte_flow_item_vxlan *vxlan;
3019         } spec, mask;
3020         const struct rte_flow_item *items;
3021
3022         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3023         assert(action->conf);
3024
3025         items = ((const struct rte_flow_action_vxlan_encap *)
3026                                         action->conf)->definition;
3027         assert(items);
3028         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3029                 switch (items->type) {
3030                 case RTE_FLOW_ITEM_TYPE_VOID:
3031                         break;
3032                 case RTE_FLOW_ITEM_TYPE_ETH:
3033                         mask.eth = items->mask;
3034                         spec.eth = items->spec;
3035                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3036                                                        encap);
3037                         break;
3038                 case RTE_FLOW_ITEM_TYPE_IPV4:
3039                         spec.ipv4 = items->spec;
3040                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3041                         break;
3042                 case RTE_FLOW_ITEM_TYPE_IPV6:
3043                         spec.ipv6 = items->spec;
3044                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3045                         break;
3046                 case RTE_FLOW_ITEM_TYPE_UDP:
3047                         mask.udp = items->mask;
3048                         spec.udp = items->spec;
3049                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3050                                                        encap);
3051                         break;
3052                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3053                         spec.vxlan = items->spec;
3054                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3055                         break;
3056                 default:
3057                         assert(false);
3058                         DRV_LOG(WARNING,
3059                                 "unsupported item %p type %d,"
3060                                 " items must be validated"
3061                                 " before flow creation",
3062                                 (const void *)items, items->type);
3063                         encap->mask = 0;
3064                         return;
3065                 }
3066         }
3067 }
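/*
 * Illustrative sketch (hypothetical application-side definition, not
 * taken from this file): an encap action built over
 *
 *   struct rte_flow_item definition[] = {
 *           { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan_spec },
 *           { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *
 * with NULL (default) masks leaves encap->mask with the ETH_DST/SRC,
 * IPV4_SRC/DST, UDP_SRC/DST and VXLAN_VNI bits set by the helpers
 * above.
 */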
3068
3069 /**
3070  * Translate flow for Linux TC flower and construct Netlink message.
3071  *
3072  * @param[in] priv
3073  *   Pointer to the priv structure.
3074  * @param[in, out] flow
3075  *   Pointer to the sub flow.
3076  * @param[in] attr
3077  *   Pointer to the flow attributes.
3078  * @param[in] items
3079  *   Pointer to the list of items.
3080  * @param[in] actions
3081  *   Pointer to the list of actions.
3082  * @param[out] error
3083  *   Pointer to the error structure.
3084  *
3085  * @return
3086  *   0 on success, a negative errno value otherwise and rte_errno is set.
3087  */
3088 static int
3089 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3090                    const struct rte_flow_attr *attr,
3091                    const struct rte_flow_item items[],
3092                    const struct rte_flow_action actions[],
3093                    struct rte_flow_error *error)
3094 {
3095         union {
3096                 const struct rte_flow_item_port_id *port_id;
3097                 const struct rte_flow_item_eth *eth;
3098                 const struct rte_flow_item_vlan *vlan;
3099                 const struct rte_flow_item_ipv4 *ipv4;
3100                 const struct rte_flow_item_ipv6 *ipv6;
3101                 const struct rte_flow_item_tcp *tcp;
3102                 const struct rte_flow_item_udp *udp;
3103                 const struct rte_flow_item_vxlan *vxlan;
3104         } spec, mask;
3105         union {
3106                 const struct rte_flow_action_port_id *port_id;
3107                 const struct rte_flow_action_jump *jump;
3108                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3109                 const struct rte_flow_action_of_set_vlan_vid *
3110                         of_set_vlan_vid;
3111                 const struct rte_flow_action_of_set_vlan_pcp *
3112                         of_set_vlan_pcp;
3113         } conf;
3114         union {
3115                 struct flow_tcf_tunnel_hdr *hdr;
3116                 struct flow_tcf_vxlan_decap *vxlan;
3117         } decap = {
3118                 .hdr = NULL,
3119         };
3120         union {
3121                 struct flow_tcf_tunnel_hdr *hdr;
3122                 struct flow_tcf_vxlan_encap *vxlan;
3123         } encap = {
3124                 .hdr = NULL,
3125         };
3126         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3127         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3128         struct tcmsg *tcm = dev_flow->tcf.tcm;
3129         uint32_t na_act_index_cur;
3130         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3131         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3132         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3133         bool ip_proto_set = 0;
3134         bool tunnel_outer = 0;
3135         struct nlattr *na_flower;
3136         struct nlattr *na_flower_act;
3137         struct nlattr *na_vlan_id = NULL;
3138         struct nlattr *na_vlan_priority = NULL;
3139         uint64_t item_flags = 0;
3140         int ret;
3141
3142         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3143                                                 PTOI_TABLE_SZ_MAX(dev)));
3144         if (dev_flow->tcf.tunnel) {
3145                 switch (dev_flow->tcf.tunnel->type) {
3146                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3147                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3148                         tunnel_outer = 1;
3149                         break;
3150                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3151                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3152                         break;
3153                 /* New tunnel actions can be added here. */
3154                 default:
3155                         assert(false);
3156                         break;
3157                 }
3158         }
3159         nlh = dev_flow->tcf.nlh;
3160         tcm = dev_flow->tcf.tcm;
3161         /* Prepare API must have been called beforehand. */
3162         assert(nlh != NULL && tcm != NULL);
3163         tcm->tcm_family = AF_UNSPEC;
3164         tcm->tcm_ifindex = ptoi[0].ifindex;
3165         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3166         /*
3167          * Priority cannot be zero to prevent the kernel from picking one
3168          * automatically.
3169          */
3170         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3171         if (attr->group > 0)
3172                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3173         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3174         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3175         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3176                 unsigned int i;
3177
3178                 switch (items->type) {
3179                 case RTE_FLOW_ITEM_TYPE_VOID:
3180                         break;
3181                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3182                         mask.port_id = flow_tcf_item_mask
3183                                 (items, &rte_flow_item_port_id_mask,
3184                                  &flow_tcf_mask_supported.port_id,
3185                                  &flow_tcf_mask_empty.port_id,
3186                                  sizeof(flow_tcf_mask_supported.port_id),
3187                                  error);
3188                         assert(mask.port_id);
3189                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3190                                 break;
3191                         spec.port_id = items->spec;
3192                         if (!mask.port_id->id)
3193                                 i = 0;
3194                         else
3195                                 for (i = 0; ptoi[i].ifindex; ++i)
3196                                         if (ptoi[i].port_id == spec.port_id->id)
3197                                                 break;
3198                         assert(ptoi[i].ifindex);
3199                         tcm->tcm_ifindex = ptoi[i].ifindex;
3200                         break;
3201                 case RTE_FLOW_ITEM_TYPE_ETH:
3202                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3203                                       MLX5_FLOW_LAYER_INNER_L2 :
3204                                       MLX5_FLOW_LAYER_OUTER_L2;
3205                         mask.eth = flow_tcf_item_mask
3206                                 (items, &rte_flow_item_eth_mask,
3207                                  &flow_tcf_mask_supported.eth,
3208                                  &flow_tcf_mask_empty.eth,
3209                                  sizeof(flow_tcf_mask_supported.eth),
3210                                  error);
3211                         assert(mask.eth);
3212                         if (mask.eth == &flow_tcf_mask_empty.eth)
3213                                 break;
3214                         spec.eth = items->spec;
3215                         if (mask.eth->type) {
3216                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3217                                         inner_etype = spec.eth->type;
3218                                 else
3219                                         outer_etype = spec.eth->type;
3220                         }
3221                         if (tunnel_outer) {
3222                                 DRV_LOG(WARNING,
3223                                         "outer L2 addresses cannot be"
3224                                         " forced for tunnel flows,"
3225                                         " parameter is ignored");
3226                                 break;
3227                         }
3228                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3229                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3230                                              ETHER_ADDR_LEN,
3231                                              spec.eth->dst.addr_bytes);
3232                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3233                                              ETHER_ADDR_LEN,
3234                                              mask.eth->dst.addr_bytes);
3235                         }
3236                         if (!is_zero_ether_addr(&mask.eth->src)) {
3237                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3238                                              ETHER_ADDR_LEN,
3239                                              spec.eth->src.addr_bytes);
3240                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3241                                              ETHER_ADDR_LEN,
3242                                              mask.eth->src.addr_bytes);
3243                         }
3244                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3245                         break;
3246                 case RTE_FLOW_ITEM_TYPE_VLAN:
3247                         assert(!encap.hdr);
3248                         assert(!decap.hdr);
3249                         assert(!tunnel_outer);
3250                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3251                         mask.vlan = flow_tcf_item_mask
3252                                 (items, &rte_flow_item_vlan_mask,
3253                                  &flow_tcf_mask_supported.vlan,
3254                                  &flow_tcf_mask_empty.vlan,
3255                                  sizeof(flow_tcf_mask_supported.vlan),
3256                                  error);
3257                         assert(mask.vlan);
3258                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3259                                 break;
3260                         spec.vlan = items->spec;
3261                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3262                                outer_etype == RTE_BE16(ETH_P_8021Q));
3263                         outer_etype = RTE_BE16(ETH_P_8021Q);
3264                         if (mask.vlan->inner_type)
3265                                 vlan_etype = spec.vlan->inner_type;
3266                         if (mask.vlan->tci & RTE_BE16(0xe000))
3267                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3268                                                 (rte_be_to_cpu_16
3269                                                  (spec.vlan->tci) >> 13) & 0x7);
3270                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3271                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3272                                                  rte_be_to_cpu_16
3273                                                  (spec.vlan->tci &
3274                                                   RTE_BE16(0x0fff)));
3275                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3276                         break;
3277                 case RTE_FLOW_ITEM_TYPE_IPV4:
3278                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3279                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3280                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3281                         mask.ipv4 = flow_tcf_item_mask
3282                                 (items, &rte_flow_item_ipv4_mask,
3283                                  &flow_tcf_mask_supported.ipv4,
3284                                  &flow_tcf_mask_empty.ipv4,
3285                                  sizeof(flow_tcf_mask_supported.ipv4),
3286                                  error);
3287                         assert(mask.ipv4);
3288                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3289                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3290                                        inner_etype == RTE_BE16(ETH_P_IP));
3291                                 inner_etype = RTE_BE16(ETH_P_IP);
3292                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3293                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3294                                        vlan_etype == RTE_BE16(ETH_P_IP));
3295                                 vlan_etype = RTE_BE16(ETH_P_IP);
3296                         } else {
3297                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3298                                        outer_etype == RTE_BE16(ETH_P_IP));
3299                                 outer_etype = RTE_BE16(ETH_P_IP);
3300                         }
3301                         spec.ipv4 = items->spec;
3302                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3303                                 /*
3304                                  * No way to set IP protocol for outer tunnel
3305                                  * layers. Usually it is fixed, for example,
3306                                  * to UDP for VXLAN/GPE.
3307                                  */
3308                                 assert(spec.ipv4); /* Mask is not empty. */
3309                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3310                                                 spec.ipv4->hdr.next_proto_id);
3311                                 ip_proto_set = 1;
3312                         }
3313                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3314                              (!mask.ipv4->hdr.src_addr &&
3315                               !mask.ipv4->hdr.dst_addr)) {
3316                                 if (!tunnel_outer)
3317                                         break;
3318                                 /*
3319                                  * For tunnel outer we must set outer IP key
3320                                  * anyway, even if the specification/mask is
3321                                  * empty. There is no other way to tell the
3322                                  * kernel about the outer layer protocol.
3323                                  */
3324                                 mnl_attr_put_u32
3325                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3326                                          mask.ipv4->hdr.src_addr);
3327                                 mnl_attr_put_u32
3328                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3329                                          mask.ipv4->hdr.src_addr);
3330                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3331                                 break;
3332                         }
3333                         if (mask.ipv4->hdr.src_addr) {
3334                                 mnl_attr_put_u32
3335                                         (nlh, tunnel_outer ?
3336                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3337                                          TCA_FLOWER_KEY_IPV4_SRC,
3338                                          spec.ipv4->hdr.src_addr);
3339                                 mnl_attr_put_u32
3340                                         (nlh, tunnel_outer ?
3341                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3342                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3343                                          mask.ipv4->hdr.src_addr);
3344                         }
3345                         if (mask.ipv4->hdr.dst_addr) {
3346                                 mnl_attr_put_u32
3347                                         (nlh, tunnel_outer ?
3348                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3349                                          TCA_FLOWER_KEY_IPV4_DST,
3350                                          spec.ipv4->hdr.dst_addr);
3351                                 mnl_attr_put_u32
3352                                         (nlh, tunnel_outer ?
3353                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3354                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3355                                          mask.ipv4->hdr.dst_addr);
3356                         }
3357                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3358                         break;
3359                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3360                         bool ipv6_src, ipv6_dst;
3361
3362                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3363                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3364                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3365                         mask.ipv6 = flow_tcf_item_mask
3366                                 (items, &rte_flow_item_ipv6_mask,
3367                                  &flow_tcf_mask_supported.ipv6,
3368                                  &flow_tcf_mask_empty.ipv6,
3369                                  sizeof(flow_tcf_mask_supported.ipv6),
3370                                  error);
3371                         assert(mask.ipv6);
3372                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3373                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3374                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3375                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3376                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3377                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3378                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3379                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3380                         } else {
3381                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3382                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3383                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3384                         }
3385                         spec.ipv6 = items->spec;
3386                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3387                                 /*
3388                                  * No way to set IP protocol for outer tunnel
3389                                  * layers. Usually it is fixed, for example,
3390                                  * to UDP for VXLAN/GPE.
3391                                  */
3392                                 assert(spec.ipv6); /* Mask is not empty. */
3393                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3394                                                 spec.ipv6->hdr.proto);
3395                                 ip_proto_set = 1;
3396                         }
3397                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3398                                                 (mask.ipv6->hdr.dst_addr);
3399                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3400                                                 (mask.ipv6->hdr.src_addr);
3401                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3402                              (!ipv6_dst && !ipv6_src)) {
3403                                 if (!tunnel_outer)
3404                                         break;
3405                                 /*
3406                                  * For tunnel outer we must set outer IP key
3407                                  * anyway, even if the specification/mask is
3408                                  * empty. There is no other way to tell the
3409                                  * kernel about the outer layer protocol.
3410                                  */
3411                                 mnl_attr_put(nlh,
3412                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3413                                              IPV6_ADDR_LEN,
3414                                              mask.ipv6->hdr.src_addr);
3415                                 mnl_attr_put(nlh,
3416                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3417                                              IPV6_ADDR_LEN,
3418                                              mask.ipv6->hdr.src_addr);
3419                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3420                                 break;
3421                         }
3422                         if (ipv6_src) {
3423                                 mnl_attr_put(nlh, tunnel_outer ?
3424                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3425                                              TCA_FLOWER_KEY_IPV6_SRC,
3426                                              IPV6_ADDR_LEN,
3427                                              spec.ipv6->hdr.src_addr);
3428                                 mnl_attr_put(nlh, tunnel_outer ?
3429                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3430                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3431                                              IPV6_ADDR_LEN,
3432                                              mask.ipv6->hdr.src_addr);
3433                         }
3434                         if (ipv6_dst) {
3435                                 mnl_attr_put(nlh, tunnel_outer ?
3436                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3437                                              TCA_FLOWER_KEY_IPV6_DST,
3438                                              IPV6_ADDR_LEN,
3439                                              spec.ipv6->hdr.dst_addr);
3440                                 mnl_attr_put(nlh, tunnel_outer ?
3441                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3442                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3443                                              IPV6_ADDR_LEN,
3444                                              mask.ipv6->hdr.dst_addr);
3445                         }
3446                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3447                         break;
3448                 }
3449                 case RTE_FLOW_ITEM_TYPE_UDP:
3450                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3451                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3452                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3453                         mask.udp = flow_tcf_item_mask
3454                                 (items, &rte_flow_item_udp_mask,
3455                                  &flow_tcf_mask_supported.udp,
3456                                  &flow_tcf_mask_empty.udp,
3457                                  sizeof(flow_tcf_mask_supported.udp),
3458                                  error);
3459                         assert(mask.udp);
3460                         spec.udp = items->spec;
3461                         if (!tunnel_outer) {
3462                                 if (!ip_proto_set)
3463                                         mnl_attr_put_u8
3464                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3465                                                 IPPROTO_UDP);
3466                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3467                                         break;
3468                         } else {
3469                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3470                                 decap.vxlan->udp_port =
3471                                         rte_be_to_cpu_16
3472                                                 (spec.udp->hdr.dst_port);
3473                         }
3474                         if (mask.udp->hdr.src_port) {
3475                                 mnl_attr_put_u16
3476                                         (nlh, tunnel_outer ?
3477                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3478                                          TCA_FLOWER_KEY_UDP_SRC,
3479                                          spec.udp->hdr.src_port);
3480                                 mnl_attr_put_u16
3481                                         (nlh, tunnel_outer ?
3482                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3483                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3484                                          mask.udp->hdr.src_port);
3485                         }
3486                         if (mask.udp->hdr.dst_port) {
3487                                 mnl_attr_put_u16
3488                                         (nlh, tunnel_outer ?
3489                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3490                                          TCA_FLOWER_KEY_UDP_DST,
3491                                          spec.udp->hdr.dst_port);
3492                                 mnl_attr_put_u16
3493                                         (nlh, tunnel_outer ?
3494                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3495                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3496                                          mask.udp->hdr.dst_port);
3497                         }
3498                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3499                         break;
3500                 case RTE_FLOW_ITEM_TYPE_TCP:
3501                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3502                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3503                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3504                         mask.tcp = flow_tcf_item_mask
3505                                 (items, &rte_flow_item_tcp_mask,
3506                                  &flow_tcf_mask_supported.tcp,
3507                                  &flow_tcf_mask_empty.tcp,
3508                                  sizeof(flow_tcf_mask_supported.tcp),
3509                                  error);
3510                         assert(mask.tcp);
3511                         if (!ip_proto_set)
3512                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3513                                                 IPPROTO_TCP);
3514                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3515                                 break;
3516                         spec.tcp = items->spec;
3517                         if (mask.tcp->hdr.src_port) {
3518                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3519                                                  spec.tcp->hdr.src_port);
3520                                 mnl_attr_put_u16(nlh,
3521                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3522                                                  mask.tcp->hdr.src_port);
3523                         }
3524                         if (mask.tcp->hdr.dst_port) {
3525                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3526                                                  spec.tcp->hdr.dst_port);
3527                                 mnl_attr_put_u16(nlh,
3528                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3529                                                  mask.tcp->hdr.dst_port);
3530                         }
3531                         if (mask.tcp->hdr.tcp_flags) {
3532                                 mnl_attr_put_u16
3533                                         (nlh,
3534                                          TCA_FLOWER_KEY_TCP_FLAGS,
3535                                          rte_cpu_to_be_16
3536                                                 (spec.tcp->hdr.tcp_flags));
3537                                 mnl_attr_put_u16
3538                                         (nlh,
3539                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3540                                          rte_cpu_to_be_16
3541                                                 (mask.tcp->hdr.tcp_flags));
3542                         }
3543                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3544                         break;
3545                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3546                         assert(decap.vxlan);
3547                         tunnel_outer = 0;
3548                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3549                         spec.vxlan = items->spec;
3550                         mnl_attr_put_u32(nlh,
3551                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3552                                          vxlan_vni_as_be32(spec.vxlan->vni));
3553                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3554                         break;
3555                 default:
3556                         return rte_flow_error_set(error, ENOTSUP,
3557                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3558                                                   NULL, "item not supported");
3559                 }
3560         }
3561         /*
3562          * Set the ether_type flower key and tc rule protocol:
3563          * - if there is neither VLAN nor VXLAN, the key is taken from
3564          *   the eth item directly or deduced from the L3 items.
3565          * - if there is a vlan item, the key is fixed to 802.1q.
3566          * - if there is a vxlan item, the key is the inner tunnel type.
3567          * - simultaneous vlan and vxlan items are prohibited.
3568          */
3569         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3570                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3571                                            outer_etype);
3572                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3573                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3574                                 mnl_attr_put_u16(nlh,
3575                                                  TCA_FLOWER_KEY_ETH_TYPE,
3576                                                  inner_etype);
3577                 } else {
3578                         mnl_attr_put_u16(nlh,
3579                                          TCA_FLOWER_KEY_ETH_TYPE,
3580                                          outer_etype);
3581                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3582                             vlan_etype != RTE_BE16(ETH_P_ALL))
3583                                 mnl_attr_put_u16(nlh,
3584                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3585                                                  vlan_etype);
3586                 }
3587                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3588         }
3589         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3590         na_act_index_cur = 1;
3591         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3592                 struct nlattr *na_act_index;
3593                 struct nlattr *na_act;
3594                 unsigned int vlan_act;
3595                 unsigned int i;
3596
3597                 switch (actions->type) {
3598                 case RTE_FLOW_ACTION_TYPE_VOID:
3599                         break;
3600                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3601                         conf.port_id = actions->conf;
3602                         if (conf.port_id->original)
3603                                 i = 0;
3604                         else
3605                                 for (i = 0; ptoi[i].ifindex; ++i)
3606                                         if (ptoi[i].port_id == conf.port_id->id)
3607                                                 break;
3608                         assert(ptoi[i].ifindex);
3609                         na_act_index =
3610                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3611                         assert(na_act_index);
3612                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3613                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3614                         assert(na_act);
3615                         if (encap.hdr) {
3616                                 assert(dev_flow->tcf.tunnel);
3617                                 dev_flow->tcf.tunnel->ifindex_ptr =
3618                                         &((struct tc_mirred *)
3619                                         mnl_attr_get_payload
3620                                         (mnl_nlmsg_get_payload_tail
3621                                                 (nlh)))->ifindex;
3622                         }
3623                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3624                                      sizeof(struct tc_mirred),
3625                                      &(struct tc_mirred){
3626                                         .action = TC_ACT_STOLEN,
3627                                         .eaction = TCA_EGRESS_REDIR,
3628                                         .ifindex = ptoi[i].ifindex,
3629                                      });
3630                         mnl_attr_nest_end(nlh, na_act);
3631                         mnl_attr_nest_end(nlh, na_act_index);
3632                         break;
3633                 case RTE_FLOW_ACTION_TYPE_JUMP:
3634                         conf.jump = actions->conf;
3635                         na_act_index =
3636                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3637                         assert(na_act_index);
3638                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3639                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3640                         assert(na_act);
3641                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3642                                      sizeof(struct tc_gact),
3643                                      &(struct tc_gact){
3644                                         .action = TC_ACT_GOTO_CHAIN |
3645                                                   conf.jump->group,
3646                                      });
3647                         mnl_attr_nest_end(nlh, na_act);
3648                         mnl_attr_nest_end(nlh, na_act_index);
3649                         break;
3650                 case RTE_FLOW_ACTION_TYPE_DROP:
3651                         na_act_index =
3652                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3653                         assert(na_act_index);
3654                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3655                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3656                         assert(na_act);
3657                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3658                                      sizeof(struct tc_gact),
3659                                      &(struct tc_gact){
3660                                         .action = TC_ACT_SHOT,
3661                                      });
3662                         mnl_attr_nest_end(nlh, na_act);
3663                         mnl_attr_nest_end(nlh, na_act_index);
3664                         break;
3665                 case RTE_FLOW_ACTION_TYPE_COUNT:
3666                         /*
3667                          * Driver adds the count action implicitly for
3668                          * each rule it creates.
3669                          */
3670                         ret = flow_tcf_translate_action_count(dev,
3671                                                               dev_flow, error);
3672                         if (ret < 0)
3673                                 return ret;
3674                         break;
3675                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3676                         conf.of_push_vlan = NULL;
3677                         vlan_act = TCA_VLAN_ACT_POP;
3678                         goto action_of_vlan;
3679                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3680                         conf.of_push_vlan = actions->conf;
3681                         vlan_act = TCA_VLAN_ACT_PUSH;
3682                         goto action_of_vlan;
3683                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3684                         conf.of_set_vlan_vid = actions->conf;
3685                         if (na_vlan_id)
3686                                 goto override_na_vlan_id;
3687                         vlan_act = TCA_VLAN_ACT_MODIFY;
3688                         goto action_of_vlan;
3689                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3690                         conf.of_set_vlan_pcp = actions->conf;
3691                         if (na_vlan_priority)
3692                                 goto override_na_vlan_priority;
3693                         vlan_act = TCA_VLAN_ACT_MODIFY;
3694                         goto action_of_vlan;
3695 action_of_vlan:
3696                         na_act_index =
3697                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3698                         assert(na_act_index);
3699                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3700                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3701                         assert(na_act);
3702                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3703                                      sizeof(struct tc_vlan),
3704                                      &(struct tc_vlan){
3705                                         .action = TC_ACT_PIPE,
3706                                         .v_action = vlan_act,
3707                                      });
3708                         if (vlan_act == TCA_VLAN_ACT_POP) {
3709                                 mnl_attr_nest_end(nlh, na_act);
3710                                 mnl_attr_nest_end(nlh, na_act_index);
3711                                 break;
3712                         }
3713                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3714                                 mnl_attr_put_u16(nlh,
3715                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3716                                                  conf.of_push_vlan->ethertype);
3717                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3718                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3719                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3720                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3721                         mnl_attr_nest_end(nlh, na_act);
3722                         mnl_attr_nest_end(nlh, na_act_index);
3723                         if (actions->type ==
3724                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3725 override_na_vlan_id:
3726                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3727                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3728                                         rte_be_to_cpu_16
3729                                         (conf.of_set_vlan_vid->vlan_vid);
3730                         } else if (actions->type ==
3731                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3732 override_na_vlan_priority:
3733                                 na_vlan_priority->nla_type =
3734                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3735                                 *(uint8_t *)mnl_attr_get_payload
3736                                         (na_vlan_priority) =
3737                                         conf.of_set_vlan_pcp->vlan_pcp;
3738                         }
3739                         break;
3740                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3741                         assert(decap.vxlan);
3742                         assert(dev_flow->tcf.tunnel);
3743                         dev_flow->tcf.tunnel->ifindex_ptr =
3744                                 (unsigned int *)&tcm->tcm_ifindex;
3745                         na_act_index =
3746                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3747                         assert(na_act_index);
3748                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3749                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3750                         assert(na_act);
3751                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3752                                 sizeof(struct tc_tunnel_key),
3753                                 &(struct tc_tunnel_key){
3754                                         .action = TC_ACT_PIPE,
3755                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3756                                         });
3757                         mnl_attr_nest_end(nlh, na_act);
3758                         mnl_attr_nest_end(nlh, na_act_index);
3759                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3760                         break;
3761                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3762                         assert(encap.vxlan);
3763                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3764                         na_act_index =
3765                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3766                         assert(na_act_index);
3767                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3768                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3769                         assert(na_act);
3770                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3771                                 sizeof(struct tc_tunnel_key),
3772                                 &(struct tc_tunnel_key){
3773                                         .action = TC_ACT_PIPE,
3774                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3775                                         });
3776                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3777                                 mnl_attr_put_u16(nlh,
3778                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3779                                          encap.vxlan->udp.dst);
3780                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3781                                 mnl_attr_put_u32(nlh,
3782                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3783                                          encap.vxlan->ipv4.src);
3784                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3785                                 mnl_attr_put_u32(nlh,
3786                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3787                                          encap.vxlan->ipv4.dst);
3788                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3789                                 mnl_attr_put(nlh,
3790                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3791                                          sizeof(encap.vxlan->ipv6.src),
3792                                          &encap.vxlan->ipv6.src);
3793                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3794                                 mnl_attr_put(nlh,
3795                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3796                                          sizeof(encap.vxlan->ipv6.dst),
3797                                          &encap.vxlan->ipv6.dst);
3798                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3799                                 mnl_attr_put_u32(nlh,
3800                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3801                                          vxlan_vni_as_be32
3802                                                 (encap.vxlan->vxlan.vni));
3803                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3804                         mnl_attr_nest_end(nlh, na_act);
3805                         mnl_attr_nest_end(nlh, na_act_index);
3806                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3807                         break;
3808                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3809                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3810                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3811                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3812                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3813                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3814                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3815                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3816                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3817                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3818                         na_act_index =
3819                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3820                         flow_tcf_create_pedit_mnl_msg(nlh,
3821                                                       &actions, item_flags);
3822                         mnl_attr_nest_end(nlh, na_act_index);
3823                         break;
3824                 default:
3825                         return rte_flow_error_set(error, ENOTSUP,
3826                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3827                                                   actions,
3828                                                   "action not supported");
3829                 }
3830         }
3831         assert(na_flower);
3832         assert(na_flower_act);
3833         mnl_attr_nest_end(nlh, na_flower_act);
3834         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3835                                         (mnl_nlmsg_get_payload_tail(nlh));
3836         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3837                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3838         mnl_attr_nest_end(nlh, na_flower);
3839         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3840                 dev_flow->tcf.tunnel->ifindex_org =
3841                         *dev_flow->tcf.tunnel->ifindex_ptr;
3842         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3843         return 0;
3844 }
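
/*
 * Illustrative sketch (not part of the driver): assuming a single
 * PORT_ID action and an outer UDP/VXLAN match, the message assembled
 * above corresponds roughly to the following tc(8) command, with the
 * interface names being placeholders:
 *
 *   tc filter add dev <ifin> parent ffff: protocol ip prio 1 \
 *      flower ip_proto udp enc_dst_port 4789 \
 *      action tunnel_key unset action mirred egress redirect dev <ifout>
 *
 * The actual attribute set depends on the pattern items and actions of
 * the rte_flow rule being translated.
 */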
3845
3846 /**
3847  * Send Netlink message with acknowledgment.
3848  *
3849  * @param tcf
3850  *   Flow context to use.
3851  * @param nlh
3852  *   Message to send. This function always raises the NLM_F_ACK flag before
3853  *   sending.
3854  * @param[in] cb
3855  *   Callback handler for received message.
3856  * @param[in] arg
3857  *   Context pointer for callback handler.
3858  *
3859  * @return
3860  *   0 on success, a negative errno value otherwise and rte_errno is set.
3861  */
3862 static int
3863 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3864                 struct nlmsghdr *nlh,
3865                 mnl_cb_t cb, void *arg)
3866 {
3867         unsigned int portid = mnl_socket_get_portid(tcf->nl);
3868         uint32_t seq = tcf->seq++;
3869         int ret, err = 0;
3870
3871         assert(tcf->nl);
3872         assert(tcf->buf);
3873         if (!seq) {
3874                 /* seq 0 is reserved for kernel event-driven notifications. */
3875                 seq = tcf->seq++;
3876         }
3877         nlh->nlmsg_seq = seq;
3878         nlh->nlmsg_flags |= NLM_F_ACK;
3879         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3880         if (ret <= 0) {
3881                 /* Message send error occurred. */
3882                 rte_errno = errno;
3883                 return -rte_errno;
3884         }
3885         nlh = (struct nlmsghdr *)(tcf->buf);
3886         /*
3887          * The following loop postpones non-fatal errors until multipart
3888          * messages are complete.
3889          */
3890         while (true) {
3891                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3892                 if (ret < 0) {
3893                         err = errno;
3894                         /*
3895                          * In case of overflow, keep receiving until the
3896                          * end of the multipart message. Part of the reply
3897                          * may be lost, so mark it and return an error.
3898                          */
3899                         if (err != ENOSPC ||
3900                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3901                             nlh->nlmsg_type == NLMSG_DONE)
3902                                 break;
3903                 } else {
3904                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3905                         if (!ret) {
3906                                 /*
3907                                  * libmnl returns 0 when a DONE or a
3908                                  * successful ACK message is found.
3909                                  */
3910                                 break;
3911                         }
3912                         if (ret < 0) {
3913                                 /*
3914                                  * An ACK message carrying an error was
3915                                  * found or some other error occurred.
3916                                  */
3917                                 err = errno;
3918                                 break;
3919                         }
3920                         /* We should continue receiving. */
3921                 }
3922         }
3923         if (!err)
3924                 return 0;
3925         rte_errno = err;
3926         return -err;
3927 }
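
/*
 * Usage sketch (the callback identifiers are placeholders local to
 * this example): a one-shot request with synchronous acknowledgment
 * amounts to building the message in tcf->buf and letting
 * flow_tcf_nl_ack() manage the sequence number and the ACK loop:
 *
 *   struct nlmsghdr *nlh = mnl_nlmsg_put_header(tcf->buf);
 *
 *   nlh->nlmsg_type = RTM_GETLINK;
 *   nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *   if (flow_tcf_nl_ack(tcf, nlh, reply_cb, &reply_ctx))
 *           DRV_LOG(WARNING, "netlink: request failed %d", rte_errno);
 */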
3928
3929 #define MNL_BUF_EXTRA_SPACE 16
3930 #define MNL_REQUEST_SIZE_MIN 256
3931 #define MNL_REQUEST_SIZE_MAX 2048
3932 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3933                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
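
/*
 * A worked example: with the common 4 KiB page size, MNL_REQUEST_SIZE
 * evaluates to RTE_MIN(RTE_MAX(4096, 256), 2048) = 2048 bytes.
 */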
3934
3935 /* Data structures used by flow_tcf_xxx_cb() routines. */
3936 struct tcf_nlcb_buf {
3937         LIST_ENTRY(tcf_nlcb_buf) next;
3938         uint32_t size;
3939         alignas(struct nlmsghdr)
3940         uint8_t msg[]; /**< Netlink message data. */
3941 };
3942
3943 struct tcf_nlcb_context {
3944         unsigned int ifindex; /**< Base interface index. */
3945         uint32_t bufsize;
3946         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3947 };
3948
3949 /**
3950  * Allocate space for a netlink command in the buffer list.
3951  *
3952  * @param[in, out] ctx
3953  *   Pointer to callback context with command buffers list.
3954  * @param[in] size
3955  *   Required size of data buffer to be allocated.
3956  *
3957  * @return
3958  *   Pointer to allocated memory, aligned as message header.
3959  *   NULL if some error occurred.
3960  */
3961 static struct nlmsghdr *
3962 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3963 {
3964         struct tcf_nlcb_buf *buf;
3965         struct nlmsghdr *nlh;
3966
3967         size = NLMSG_ALIGN(size);
3968         buf = LIST_FIRST(&ctx->nlbuf);
3969         if (buf && (buf->size + size) <= ctx->bufsize) {
3970                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
3971                 buf->size += size;
3972                 return nlh;
3973         }
3974         if (size > ctx->bufsize) {
3975                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
3976                 return NULL;
3977         }
3978         buf = rte_malloc(__func__,
3979                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
3980                         alignof(struct tcf_nlcb_buf));
3981         if (!buf) {
3982                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
3983                 return NULL;
3984         }
3985         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
3986         buf->size = size;
3987         nlh = (struct nlmsghdr *)&buf->msg[0];
3988         return nlh;
3989 }
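
/*
 * A minimal usage sketch (the RTM_DELADDR value is just an example):
 * commands are bump-allocated back to back within the current buffer
 * and a fresh buffer is prepended to the list once it is full, so the
 * caller must compute the required size up front:
 *
 *   cmd = flow_tcf_alloc_nlcmd(ctx, size);
 *   if (!cmd) {
 *           rte_errno = ENOMEM;
 *           return -rte_errno;
 *   }
 *   cmd = mnl_nlmsg_put_header(cmd);
 *   cmd->nlmsg_type = RTM_DELADDR;
 */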
3990
3991 /**
3992  * Send the buffers with prepared netlink commands. Scans the list and
3993  * sends all found buffers. Buffers are sent and freed regardless of
3994  * send errors in order to prevent memory leaks.
3995  *
3996  * @param[in] tcf
3997  *   Context object initialized by mlx5_flow_tcf_context_create().
3998  * @param[in, out] ctx
3999  *   Pointer to callback context with command buffers list.
4000  *
4001  * @return
4002  *   Zero value on success, negative errno value otherwise
4003  *   and rte_errno is set.
4004  */
4005 static int
4006 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4007                     struct tcf_nlcb_context *ctx)
4008 {
4009         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4010         int ret = 0;
4011
4012         while (bc) {
4013                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4014                 struct nlmsghdr *nlh;
4015                 uint32_t msg = 0;
4016                 int rc;
4017
4018                 while (msg < bc->size) {
4019                         /*
4020                          * Send the Netlink commands from the buffer one by
4021                          * one. If multiple rule deletion commands were sent
4022                          * in a single Netlink message and an error occurred,
4023                          * it could cause multiple ACK error messages and
4024                          * break the Netlink sequence numbering, because
4025                          * only one ACK reply is expected.
4026                          */
4027                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4028                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4029                         assert((bc->size - msg) >= nlh->nlmsg_len);
4030                         msg += nlh->nlmsg_len;
4031                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4032                         if (rc) {
4033                                 DRV_LOG(WARNING,
4034                                         "netlink: cleanup error %d", rc);
4035                                 if (!ret)
4036                                         ret = rc;
4037                         }
4038                 }
4039                 rte_free(bc);
4040                 bc = bn;
4041         }
4042         LIST_INIT(&ctx->nlbuf);
4043         return ret;
4044 }
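
/*
 * The collect/send pair above is driven as sketched below; this is the
 * pattern used by the cleanup routines later in this file:
 *
 *   struct tcf_nlcb_context ctx = {
 *           .ifindex = ifindex,
 *           .bufsize = MNL_REQUEST_SIZE,
 *           .nlbuf = LIST_HEAD_INITIALIZER(),
 *   };
 *
 *   ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
 *   ret = flow_tcf_send_nlcmd(tcf, &ctx);
 */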
4045
4046 /**
4047  * Collect local IP address rules with the scope link attribute on the
4048  * specified network device. This is a callback routine called by libmnl
4049  * mnl_cb_run() in a loop for every message in the received packet.
4050  *
4051  * @param[in] nlh
4052  *   Pointer to reply header.
4053  * @param[in, out] arg
4054  *   Opaque data pointer for this callback.
4055  *
4056  * @return
4057  *   A positive, nonzero value on success, negative errno value otherwise
4058  *   and rte_errno is set.
4059  */
4060 static int
4061 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4062 {
4063         struct tcf_nlcb_context *ctx = arg;
4064         struct nlmsghdr *cmd;
4065         struct ifaddrmsg *ifa;
4066         struct nlattr *na;
4067         struct nlattr *na_local = NULL;
4068         struct nlattr *na_peer = NULL;
4069         unsigned char family;
4070         uint32_t size;
4071
4072         if (nlh->nlmsg_type != RTM_NEWADDR) {
4073                 rte_errno = EINVAL;
4074                 return -rte_errno;
4075         }
4076         ifa = mnl_nlmsg_get_payload(nlh);
4077         family = ifa->ifa_family;
4078         if (ifa->ifa_index != ctx->ifindex ||
4079             ifa->ifa_scope != RT_SCOPE_LINK ||
4080             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4081             (family != AF_INET && family != AF_INET6))
4082                 return 1;
4083         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4084                 switch (mnl_attr_get_type(na)) {
4085                 case IFA_LOCAL:
4086                         na_local = na;
4087                         break;
4088                 case IFA_ADDRESS:
4089                         na_peer = na;
4090                         break;
4091                 }
4092                 if (na_local && na_peer)
4093                         break;
4094         }
4095         if (!na_local || !na_peer)
4096                 return 1;
4097         /* Local rule found with scope link, permanent and assigned peer. */
4098         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4099                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4100                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4101                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4102         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4103         if (!cmd) {
4104                 rte_errno = ENOMEM;
4105                 return -rte_errno;
4106         }
4107         cmd = mnl_nlmsg_put_header(cmd);
4108         cmd->nlmsg_type = RTM_DELADDR;
4109         cmd->nlmsg_flags = NLM_F_REQUEST;
4110         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4111         ifa->ifa_flags = IFA_F_PERMANENT;
4112         ifa->ifa_scope = RT_SCOPE_LINK;
4113         ifa->ifa_index = ctx->ifindex;
4114         if (family == AF_INET) {
4115                 ifa->ifa_family = AF_INET;
4116                 ifa->ifa_prefixlen = 32;
4117                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4118                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4119         } else {
4120                 ifa->ifa_family = AF_INET6;
4121                 ifa->ifa_prefixlen = 128;
4122                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4123                         mnl_attr_get_payload(na_local));
4124                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4125                         mnl_attr_get_payload(na_peer));
4126         }
4127         assert(size == cmd->nlmsg_len);
4128         return 1;
4129 }
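
/*
 * For reference, each RTM_DELADDR command stored above corresponds
 * roughly to the following, with the addresses being placeholders
 * and /128 replacing /32 for the AF_INET6 case:
 *
 *   ip addr del <local>/32 peer <peer> scope link dev <ifouter>
 */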
4130
4131 /**
4132  * Cleanup the local IP addresses on the outer interface.
4133  *
4134  * @param[in] tcf
4135  *   Context object initialized by mlx5_flow_tcf_context_create().
4136  * @param[in] ifindex
4137  *   Network interface index to perform cleanup.
4138  */
4139 static void
4140 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4141                             unsigned int ifindex)
4142 {
4143         struct nlmsghdr *nlh;
4144         struct ifaddrmsg *ifa;
4145         struct tcf_nlcb_context ctx = {
4146                 .ifindex = ifindex,
4147                 .bufsize = MNL_REQUEST_SIZE,
4148                 .nlbuf = LIST_HEAD_INITIALIZER(),
4149         };
4150         int ret;
4151
4152         assert(ifindex);
4153         /*
4154          * Seek and destroy leftover local IP addresses with the
4155          * matching "scope link" property.
4156          */
4157         nlh = mnl_nlmsg_put_header(tcf->buf);
4158         nlh->nlmsg_type = RTM_GETADDR;
4159         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4160         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4161         ifa->ifa_family = AF_UNSPEC;
4162         ifa->ifa_index = ifindex;
4163         ifa->ifa_scope = RT_SCOPE_LINK;
4164         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4165         if (ret)
4166                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4167         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4168         if (ret)
4169                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4170 }
4171
4172 /**
4173  * Collect permanent neigh rules on the specified network device.
4174  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4175  * for every message in the received packet.
4176  *
4177  * @param[in] nlh
4178  *   Pointer to reply header.
4179  * @param[in, out] arg
4180  *   Opaque data pointer for this callback.
4181  *
4182  * @return
4183  *   A positive, nonzero value on success, negative errno value otherwise
4184  *   and rte_errno is set.
4185  */
4186 static int
4187 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4188 {
4189         struct tcf_nlcb_context *ctx = arg;
4190         struct nlmsghdr *cmd;
4191         struct ndmsg *ndm;
4192         struct nlattr *na;
4193         struct nlattr *na_ip = NULL;
4194         struct nlattr *na_mac = NULL;
4195         unsigned char family;
4196         uint32_t size;
4197
4198         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4199                 rte_errno = EINVAL;
4200                 return -rte_errno;
4201         }
4202         ndm = mnl_nlmsg_get_payload(nlh);
4203         family = ndm->ndm_family;
4204         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4205            !(ndm->ndm_state & NUD_PERMANENT) ||
4206            (family != AF_INET && family != AF_INET6))
4207                 return 1;
4208         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4209                 switch (mnl_attr_get_type(na)) {
4210                 case NDA_DST:
4211                         na_ip = na;
4212                         break;
4213                 case NDA_LLADDR:
4214                         na_mac = na;
4215                         break;
4216                 }
4217                 if (na_mac && na_ip)
4218                         break;
4219         }
4220         if (!na_mac || !na_ip)
4221                 return 1;
4222         /* Neigh rule with permanent attribute found. */
4223         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4224                MNL_ALIGN(sizeof(struct ndmsg)) +
4225                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4226                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4227                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4228         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4229         if (!cmd) {
4230                 rte_errno = ENOMEM;
4231                 return -rte_errno;
4232         }
4233         cmd = mnl_nlmsg_put_header(cmd);
4234         cmd->nlmsg_type = RTM_DELNEIGH;
4235         cmd->nlmsg_flags = NLM_F_REQUEST;
4236         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4237         ndm->ndm_ifindex = ctx->ifindex;
4238         ndm->ndm_state = NUD_PERMANENT;
4239         ndm->ndm_flags = 0;
4240         ndm->ndm_type = 0;
4241         if (family == AF_INET) {
4242                 ndm->ndm_family = AF_INET;
4243                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4244         } else {
4245                 ndm->ndm_family = AF_INET6;
4246                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4247                              mnl_attr_get_payload(na_ip));
4248         }
4249         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4250                      mnl_attr_get_payload(na_mac));
4251         assert(size == cmd->nlmsg_len);
4252         return 1;
4253 }
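
/*
 * For reference, each RTM_DELNEIGH command stored above corresponds
 * roughly to the following, with the addresses being placeholders:
 *
 *   ip neigh del <dst_ip> lladdr <dst_mac> dev <ifouter> nud permanent
 */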
4254
4255 /**
4256  * Cleanup the neigh rules on the outer interface.
4257  *
4258  * @param[in] tcf
4259  *   Context object initialized by mlx5_flow_tcf_context_create().
4260  * @param[in] ifindex
4261  *   Network interface index to perform cleanup.
4262  */
4263 static void
4264 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4265                             unsigned int ifindex)
4266 {
4267         struct nlmsghdr *nlh;
4268         struct ndmsg *ndm;
4269         struct tcf_nlcb_context ctx = {
4270                 .ifindex = ifindex,
4271                 .bufsize = MNL_REQUEST_SIZE,
4272                 .nlbuf = LIST_HEAD_INITIALIZER(),
4273         };
4274         int ret;
4275
4276         assert(ifindex);
4277         /* Seek and destroy leftovers of neigh rules. */
4278         nlh = mnl_nlmsg_put_header(tcf->buf);
4279         nlh->nlmsg_type = RTM_GETNEIGH;
4280         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4281         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4282         ndm->ndm_family = AF_UNSPEC;
4283         ndm->ndm_ifindex = ifindex;
4284         ndm->ndm_state = NUD_PERMANENT;
4285         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4286         if (ret)
4287                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4288         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4289         if (ret)
4290                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4291 }
4292
4293 /**
4294  * Collect indices of VXLAN encap/decap interfaces associated with the
4295  * device. This is a callback routine called by libmnl mnl_cb_run() in
4296  * a loop for every message in the received packet.
4297  *
4298  * @param[in] nlh
4299  *   Pointer to reply header.
4300  * @param[in, out] arg
4301  *   Opaque data pointer for this callback.
4302  *
4303  * @return
4304  *   A positive, nonzero value on success, negative errno value otherwise
4305  *   and rte_errno is set.
4306  */
4307 static int
4308 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4309 {
4310         struct tcf_nlcb_context *ctx = arg;
4311         struct nlmsghdr *cmd;
4312         struct ifinfomsg *ifm;
4313         struct nlattr *na;
4314         struct nlattr *na_info = NULL;
4315         struct nlattr *na_vxlan = NULL;
4316         bool found = false;
4317         unsigned int vxindex;
4318         uint32_t size;
4319
4320         if (nlh->nlmsg_type != RTM_NEWLINK) {
4321                 rte_errno = EINVAL;
4322                 return -rte_errno;
4323         }
4324         ifm = mnl_nlmsg_get_payload(nlh);
4325         if (!ifm->ifi_index) {
4326                 rte_errno = EINVAL;
4327                 return -rte_errno;
4328         }
4329         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4330                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4331                         na_info = na;
4332                         break;
4333                 }
4334         if (!na_info)
4335                 return 1;
4336         mnl_attr_for_each_nested(na, na_info) {
4337                 switch (mnl_attr_get_type(na)) {
4338                 case IFLA_INFO_KIND:
4339                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4340                                      mnl_attr_get_len(na)))
4341                                 found = true;
4342                         break;
4343                 case IFLA_INFO_DATA:
4344                         na_vxlan = na;
4345                         break;
4346                 }
4347                 if (found && na_vxlan)
4348                         break;
4349         }
4350         if (!found || !na_vxlan)
4351                 return 1;
4352         found = false;
4353         mnl_attr_for_each_nested(na, na_vxlan) {
4354                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4355                     mnl_attr_get_u32(na) == ctx->ifindex) {
4356                         found = true;
4357                         break;
4358                 }
4359         }
4360         if (!found)
4361                 return 1;
4362         /* Attached VXLAN device found, store the command to delete. */
4363         vxindex = ifm->ifi_index;
4364         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4365                MNL_ALIGN(sizeof(struct ifinfomsg));
4366         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4367         if (!cmd) {
4368                 rte_errno = ENOMEM;
4369                 return -rte_errno;
4370         }
4371         cmd = mnl_nlmsg_put_header(cmd);
4372         cmd->nlmsg_type = RTM_DELLINK;
4373         cmd->nlmsg_flags = NLM_F_REQUEST;
4374         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4375         ifm->ifi_family = AF_UNSPEC;
4376         ifm->ifi_index = vxindex;
4377         assert(size == cmd->nlmsg_len);
4378         return 1;
4379 }
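
/*
 * For reference, a matching leftover device would typically appear in
 * "ip -d link show" output as a vxlan netdev bound to <ifouter>, and
 * each RTM_DELLINK command stored above corresponds roughly to:
 *
 *   ip link del <vxlan_device>
 */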
4380
4381 /**
4382  * Cleanup the outer interface. Removes all found VXLAN devices
4383  * attached to the specified index and flushes the neigh and local
4384  * IP databases.
4385  *
4386  * @param[in] tcf
4387  *   Context object initialized by mlx5_flow_tcf_context_create().
4388  * @param[in] ifindex
4389  *   Network interface index to perform cleanup.
4390  */
4391 static void
4392 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4393                             unsigned int ifindex)
4394 {
4395         struct nlmsghdr *nlh;
4396         struct ifinfomsg *ifm;
4397         struct tcf_nlcb_context ctx = {
4398                 .ifindex = ifindex,
4399                 .bufsize = MNL_REQUEST_SIZE,
4400                 .nlbuf = LIST_HEAD_INITIALIZER(),
4401         };
4402         int ret;
4403
4404         assert(ifindex);
4405         /*
4406          * Seek and destroy leftover VXLAN encap/decap interfaces with
4407          * matching properties.
4408          */
4409         nlh = mnl_nlmsg_put_header(tcf->buf);
4410         nlh->nlmsg_type = RTM_GETLINK;
4411         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4412         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4413         ifm->ifi_family = AF_UNSPEC;
4414         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4415         if (ret)
4416                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4417         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4418         if (ret)
4419                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4420 }
4421
4422 /**
4423  * Emit a Netlink message to add/remove a local address on the outer device.
4424  * The address being added is visible within the link only (scope link).
4425  *
4426  * Note that an implicit route is maintained by the kernel due to the
4427  * presence of a peer address (IFA_ADDRESS).
4428  *
4429  * These rules are used for encapsulation only and allow assigning
4430  * the outer tunnel source IP address.
4431  *
4432  * @param[in] tcf
4433  *   Libmnl socket context object.
4434  * @param[in] encap
4435  *   Encapsulation properties (source address and its peer).
4436  * @param[in] ifindex
4437  *   Network interface to apply rule.
4438  * @param[in] enable
4439  *   Toggle between add and remove.
4440  * @param[out] error
4441  *   Perform verbose error reporting if not NULL.
4442  *
4443  * @return
4444  *   0 on success, a negative errno value otherwise and rte_errno is set.
4445  */
4446 static int
4447 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4448                     const struct flow_tcf_vxlan_encap *encap,
4449                     unsigned int ifindex,
4450                     bool enable,
4451                     struct rte_flow_error *error)
4452 {
4453         struct nlmsghdr *nlh;
4454         struct ifaddrmsg *ifa;
4455         alignas(struct nlmsghdr)
4456         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4457
4458         nlh = mnl_nlmsg_put_header(buf);
4459         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4460         nlh->nlmsg_flags =
4461                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4462         nlh->nlmsg_seq = 0;
4463         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4464         ifa->ifa_flags = IFA_F_PERMANENT;
4465         ifa->ifa_scope = RT_SCOPE_LINK;
4466         ifa->ifa_index = ifindex;
4467         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4468                 ifa->ifa_family = AF_INET;
4469                 ifa->ifa_prefixlen = 32;
4470                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4471                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4472                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4473                                               encap->ipv4.dst);
4474         } else {
4475                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4476                 ifa->ifa_family = AF_INET6;
4477                 ifa->ifa_prefixlen = 128;
4478                 mnl_attr_put(nlh, IFA_LOCAL,
4479                                   sizeof(encap->ipv6.src),
4480                                   &encap->ipv6.src);
4481                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4482                         mnl_attr_put(nlh, IFA_ADDRESS,
4483                                           sizeof(encap->ipv6.dst),
4484                                           &encap->ipv6.dst);
4485         }
4486         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4487                 return 0;
4488         return rte_flow_error_set(error, rte_errno,
4489                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4490                                   "netlink: cannot complete IFA request"
4491                                   " (ip addr add)");
4492 }
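
/*
 * Note: the NLM_F_CREATE | NLM_F_REPLACE combination on the enable
 * path makes the request behave like "ip addr replace" rather than
 * plain "ip addr add", so re-adding an existing address does not
 * fail with EEXIST:
 *
 *   ip addr replace <src_ip> peer <dst_ip> scope link dev <ifouter>
 */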
4493
4494 /**
4495  * Emit a Netlink message to add/remove a neighbor.
4496  *
4497  * @param[in] tcf
4498  *   Libmnl socket context object.
4499  * @param[in] encap
4500  *   Encapsulation properties (destination address).
4501  * @param[in] ifindex
4502  *   Network interface.
4503  * @param[in] enable
4504  *   Toggle between add and remove.
4505  * @param[out] error
4506  *   Perform verbose error reporting if not NULL.
4507  *
4508  * @return
4509  *   0 on success, a negative errno value otherwise and rte_errno is set.
4510  */
4511 static int
4512 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4513                      const struct flow_tcf_vxlan_encap *encap,
4514                      unsigned int ifindex,
4515                      bool enable,
4516                      struct rte_flow_error *error)
4517 {
4518         struct nlmsghdr *nlh;
4519         struct ndmsg *ndm;
4520         alignas(struct nlmsghdr)
4521         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4522
4523         nlh = mnl_nlmsg_put_header(buf);
4524         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4525         nlh->nlmsg_flags =
4526                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4527         nlh->nlmsg_seq = 0;
4528         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4529         ndm->ndm_ifindex = ifindex;
4530         ndm->ndm_state = NUD_PERMANENT;
4531         ndm->ndm_flags = 0;
4532         ndm->ndm_type = 0;
4533         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4534                 ndm->ndm_family = AF_INET;
4535                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4536         } else {
4537                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4538                 ndm->ndm_family = AF_INET6;
4539                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4540                                                  &encap->ipv6.dst);
4541         }
4542         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4543                 DRV_LOG(WARNING,
4544                         "outer ethernet source address cannot be "
4545                         "forced for VXLAN encapsulation");
4546         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4547                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4548                                                     &encap->eth.dst);
4549         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4550                 return 0;
4551         return rte_flow_error_set(error, rte_errno,
4552                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4553                                   "netlink: cannot complete ND request"
4554                                   " (ip neigh)");
4555 }
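
/*
 * Similarly to the local address rule above, the enable path behaves
 * like "ip neigh replace" thanks to NLM_F_CREATE | NLM_F_REPLACE:
 *
 *   ip neigh replace <dst_ip> lladdr <dst_mac> dev <ifouter> nud permanent
 */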
4556
4557 /**
4558  * Manage the local IP addresses and their peer IP addresses on the
4559  * outer interface for encapsulation purposes. The kernel searches
4560  * for the appropriate device for tunnel egress traffic using the
4561  * outer source IP; this IP must be assigned to the outer network
4562  * device, otherwise the kernel rejects the rule.
4563  *
4564  * Adds or removes the addresses using a Netlink command like this:
4565  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4566  *
4567  * The addresses are local to the netdev ("scope link"), which reduces
4568  * the risk of conflicts. Note that an implicit route is maintained by
4569  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4570  *
4571  * @param[in] tcf
4572  *   Libmnl socket context object.
4573  * @param[in] iface
4574  *   Object containing the rule database and ifouter index.
4575  * @param[in] dev_flow
4576  *   Flow object, contains the tunnel parameters (for encap only).
4577  * @param[in] enable
4578  *   Toggle between add and remove.
4579  * @param[out] error
4580  *   Perform verbose error reporting if not NULL.
4581  *
4582  * @return
4583  *   0 on success, a negative errno value otherwise and rte_errno is set.
4584  */
4585 static int
4586 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4587                      struct tcf_irule *iface,
4588                      struct mlx5_flow *dev_flow,
4589                      bool enable,
4590                      struct rte_flow_error *error)
4591 {
4592         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4593         struct tcf_local_rule *rule = NULL;
4594         int ret;
4595
4596         assert(encap);
4597         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4598         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4599                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4600                 LIST_FOREACH(rule, &iface->local, next) {
4601                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4602                             encap->ipv4.src == rule->ipv4.src &&
4603                             encap->ipv4.dst == rule->ipv4.dst) {
4604                                 break;
4605                         }
4606                 }
4607         } else {
4608                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4609                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4610                 LIST_FOREACH(rule, &iface->local, next) {
4611                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4612                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4613                                             sizeof(encap->ipv6.src)) &&
4614                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4615                                             sizeof(encap->ipv6.dst))) {
4616                                 break;
4617                         }
4618                 }
4619         }
4620         if (rule) {
4621                 if (enable) {
4622                         rule->refcnt++;
4623                         return 0;
4624                 }
4625                 if (!rule->refcnt || !--rule->refcnt) {
4626                         LIST_REMOVE(rule, next);
4627                         return flow_tcf_rule_local(tcf, encap,
4628                                         iface->ifouter, false, error);
4629                 }
4630                 return 0;
4631         }
4632         if (!enable) {
4633                 DRV_LOG(WARNING, "disabling not existing local rule");
4634                 rte_flow_error_set(error, ENOENT,
4635                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4636                                    "disabling not existing local rule");
4637                 return -ENOENT;
4638         }
4639         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4640                                 alignof(struct tcf_local_rule));
4641         if (!rule) {
4642                 rte_flow_error_set(error, ENOMEM,
4643                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4644                                    "unable to allocate memory for local rule");
4645                 return -rte_errno;
4646         }
4647         *rule = (struct tcf_local_rule){.refcnt = 0,
4648                                         .mask = 0,
4649                                         };
4650         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4651                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4652                            | FLOW_TCF_ENCAP_IPV4_DST;
4653                 rule->ipv4.src = encap->ipv4.src;
4654                 rule->ipv4.dst = encap->ipv4.dst;
4655         } else {
4656                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4657                            | FLOW_TCF_ENCAP_IPV6_DST;
4658                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4659                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4660         }
4661         ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4662         if (ret) {
4663                 rte_free(rule);
4664                 return ret;
4665         }
4666         rule->refcnt++;
4667         LIST_INSERT_HEAD(&iface->local, rule, next);
4668         return 0;
4669 }
4670
4671 /**
4672  * Manage the destination MAC/IP address neigh database; the kernel uses
4673  * it to determine the destination MAC address within the encapsulation
4674  * header. Adds or removes the entries using a Netlink command like this:
4675  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4676  *
4677  * @param[in] tcf
4678  *   Libmnl socket context object.
4679  * @param[in] iface
4680  *   Object containing the rule database and ifouter index.
4681  * @param[in] dev_flow
4682  *   Flow object, contains the tunnel parameters (for encap only).
4683  * @param[in] enable
4684  *   Toggle between add and remove.
4685  * @param[out] error
4686  *   Perform verbose error reporting if not NULL.
4687  *
4688  * @return
4689  *   0 on success, a negative errno value otherwise and rte_errno is set.
4690  */
4691 static int
4692 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4693                      struct tcf_irule *iface,
4694                      struct mlx5_flow *dev_flow,
4695                      bool enable,
4696                      struct rte_flow_error *error)
4697 {
4698         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4699         struct tcf_neigh_rule *rule = NULL;
4700         int ret;
4701
4702         assert(encap);
4703         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4704         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4705                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4706                 LIST_FOREACH(rule, &iface->neigh, next) {
4707                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4708                             encap->ipv4.dst == rule->ipv4.dst) {
4709                                 break;
4710                         }
4711                 }
4712         } else {
4713                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4714                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4715                 LIST_FOREACH(rule, &iface->neigh, next) {
4716                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4717                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4718                                                 sizeof(encap->ipv6.dst))) {
4719                                 break;
4720                         }
4721                 }
4722         }
4723         if (rule) {
4724                 if (memcmp(&encap->eth.dst, &rule->eth,
4725                            sizeof(encap->eth.dst))) {
4726                         DRV_LOG(WARNING, "Destination MAC differs"
4727                                          " in neigh rule");
4728                         rte_flow_error_set(error, EEXIST,
4729                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4730                                            NULL, "Different MAC address"
4731                                            " neigh rule for the same"
4732                                            " destination IP");
4733                         return -EEXIST;
4734                 }
4735                 if (enable) {
4736                         rule->refcnt++;
4737                         return 0;
4738                 }
4739                 if (!rule->refcnt || !--rule->refcnt) {
4740                         LIST_REMOVE(rule, next);
4741                         return flow_tcf_rule_neigh(tcf, encap,
4742                                                    iface->ifouter,
4743                                                    false, error);
4744                 }
4745                 return 0;
4746         }
4747         if (!enable) {
4748                 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4749                 rte_flow_error_set(error, ENOENT,
4750                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4751                                    "disabling non-existent neigh rule");
4752                 return -ENOENT;
4753         }
4754         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4755                                 alignof(struct tcf_neigh_rule));
4756         if (!rule) {
4757                 rte_flow_error_set(error, ENOMEM,
4758                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4759                                    "unable to allocate memory for neigh rule");
4760                 return -rte_errno;
4761         }
4762         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4763                                         .mask = 0,
4764                                         };
4765         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4766                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4767                 rule->ipv4.dst = encap->ipv4.dst;
4768         } else {
4769                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4770                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4771         }
4772         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4773         ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4774         if (ret) {
4775                 rte_free(rule);
4776                 return ret;
4777         }
4778         rule->refcnt++;
4779         LIST_INSERT_HEAD(&iface->neigh, rule, next);
4780         return 0;
4781 }
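
/*
 * Illustrative sketch only (not part of the original code flow): the
 * rule management above corresponds roughly to the following iproute2
 * commands, with a hypothetical interface and addresses:
 *
 *   # enable == true
 *   ip neigh add dev eth0 lladdr 00:11:22:33:44:55 \
 *           to 192.168.100.1 nud permanent
 *   # enable == false (last reference dropped)
 *   ip neigh del dev eth0 to 192.168.100.1
 *
 * The in-memory list only tracks reference counts so that a shared
 * kernel entry is removed when the last flow using it is destroyed.
 */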
4782
4783 /* VXLAN encap rule database for outer interfaces. */
4784 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4785
4786 /* VTEP device list is shared between PMD port instances. */
4787 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4788 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4789
4790 /**
4791  * Acquire the VXLAN encap rules container for specified interface.
4792  * First looks the container up in the list of existing ones;
4793  * creates and initializes a new container if none is found.
4794  *
4795  * @param[in] tcf
4796  *   Context object initialized by mlx5_flow_tcf_context_create().
4797  * @param[in] ifouter
4798  *   Network interface index to create VXLAN encap rules on.
4799  * @param[out] error
4800  *   Perform verbose error reporting if not NULL.
4801  * @return
4802  *   Rule container pointer on success,
4803  *   NULL otherwise and rte_errno is set.
4804  */
4805 static struct tcf_irule*
4806 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4807                              unsigned int ifouter,
4808                              struct rte_flow_error *error)
4809 {
4810         struct tcf_irule *iface;
4811
4812         /* Look whether the container for encap rules is created. */
4813         assert(ifouter);
4814         LIST_FOREACH(iface, &iface_list_vxlan, next) {
4815                 if (iface->ifouter == ifouter)
4816                         break;
4817         }
4818         if (iface) {
4819                 /* Container already exists, just increment the reference. */
4820                 iface->refcnt++;
4821                 return iface;
4822         }
4823         /* Not found, we should create the new container. */
4824         iface = rte_zmalloc(__func__, sizeof(*iface),
4825                             alignof(struct tcf_irule));
4826         if (!iface) {
4827                 rte_flow_error_set(error, ENOMEM,
4828                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4829                                    "unable to allocate memory for container");
4830                 return NULL;
4831         }
4832         *iface = (struct tcf_irule){
4833                         .local = LIST_HEAD_INITIALIZER(),
4834                         .neigh = LIST_HEAD_INITIALIZER(),
4835                         .ifouter = ifouter,
4836                         .refcnt = 1,
4837         };
4838         /* Clean up the interface rules for the newly created container. */
4839         flow_tcf_encap_iface_cleanup(tcf, ifouter);
4840         flow_tcf_encap_local_cleanup(tcf, ifouter);
4841         flow_tcf_encap_neigh_cleanup(tcf, ifouter);
4842         LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
4843         return iface;
4844 }
4845
4846 /**
4847  * Releases the VXLAN encap rules container by pointer. Decrements the
4848  * reference counter and deletes the container if the counter reaches zero.
4849  *
4850  * @param[in] irule
4851  *   VXLAN rule container pointer to release.
4852  */
4853 static void
4854 flow_tcf_encap_irule_release(struct tcf_irule *iface)
4855 {
4856         assert(iface->refcnt);
4857         if (--iface->refcnt == 0) {
4858                 /* Reference counter is zero, delete the container. */
4859                 assert(LIST_EMPTY(&iface->local));
4860                 assert(LIST_EMPTY(&iface->neigh));
4861                 LIST_REMOVE(iface, next);
4862                 rte_free(iface);
4863         }
4864 }
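
/*
 * A minimal usage sketch for the container helpers above (assumed
 * caller pattern, the variables are hypothetical): every successful
 * acquire must be paired with exactly one release.
 *
 *   struct tcf_irule *iface;
 *
 *   iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
 *   if (!iface)
 *           return -rte_errno;
 *   ... add local and neigh rules bound to iface ...
 *   flow_tcf_encap_irule_release(iface);
 */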
4865
4866 /**
4867  * Deletes VTEP network device.
4868  *
4869  * @param[in] tcf
4870  *   Context object initialized by mlx5_flow_tcf_context_create().
4871  * @param[in] vtep
4872  *   Object representing the network device to delete. Memory
4873  *   allocated for this object is freed by this routine.
4874  */
4875 static void
4876 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4877                      struct tcf_vtep *vtep)
4878 {
4879         struct nlmsghdr *nlh;
4880         struct ifinfomsg *ifm;
4881         alignas(struct nlmsghdr)
4882         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4883                     MNL_BUF_EXTRA_SPACE];
4884         int ret;
4885
4886         assert(!vtep->refcnt);
4887         /* Delete only interfaces that we actually created. */
4888         if (vtep->created && vtep->ifindex) {
4889                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4890                 nlh = mnl_nlmsg_put_header(buf);
4891                 nlh->nlmsg_type = RTM_DELLINK;
4892                 nlh->nlmsg_flags = NLM_F_REQUEST;
4893                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4894                 ifm->ifi_family = AF_UNSPEC;
4895                 ifm->ifi_index = vtep->ifindex;
4896                 assert(sizeof(buf) >= nlh->nlmsg_len);
4897                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4898                 if (ret)
4899                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
4900                                          " encap/decap ifindex %u",
4901                                          ifm->ifi_index);
4902         }
4903         rte_free(vtep);
4904 }
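
/*
 * Illustrative sketch: the RTM_DELLINK request above is roughly
 * equivalent to the following command (device name hypothetical):
 *
 *   ip link del dev vmlx_4789
 *
 * except that the device is addressed by ifindex, which remains
 * valid even if the interface has been renamed meanwhile.
 */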
4905
4906 /**
4907  * Creates VTEP network device.
4908  *
4909  * @param[in] tcf
4910  *   Context object initialized by mlx5_flow_tcf_context_create().
4911  * @param[in] port
4912  *   UDP port of created VTEP device.
4913  * @param[out] error
4914  *   Perform verbose error reporting if not NULL.
4915  *
4916  * @return
4917  * Pointer to created device structure on success,
4918  * NULL otherwise and rte_errno is set.
4919  */
4920 static struct tcf_vtep*
4921 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4922                      uint16_t port, struct rte_flow_error *error)
4923 {
4924         struct tcf_vtep *vtep;
4925         struct nlmsghdr *nlh;
4926         struct ifinfomsg *ifm;
4927         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4928         alignas(struct nlmsghdr)
4929         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4930                     SZ_NLATTR_DATA_OF(sizeof(name)) +
4931                     SZ_NLATTR_NEST * 2 +
4932                     SZ_NLATTR_STRZ_OF("vxlan") +
4933                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4934                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4935                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4936                     MNL_BUF_EXTRA_SPACE];
4937         struct nlattr *na_info;
4938         struct nlattr *na_vxlan;
4939         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4940         int ret;
4941
4942         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4943         if (!vtep) {
4944                 rte_flow_error_set(error, ENOMEM,
4945                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4946                                    "unable to allocate memory for VTEP");
4947                 return NULL;
4948         }
4949         *vtep = (struct tcf_vtep){
4950                         .port = port,
4951         };
4952         memset(buf, 0, sizeof(buf));
4953         nlh = mnl_nlmsg_put_header(buf);
4954         nlh->nlmsg_type = RTM_NEWLINK;
4955         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4956         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4957         ifm->ifi_family = AF_UNSPEC;
4958         ifm->ifi_type = 0;
4959         ifm->ifi_index = 0;
4960         ifm->ifi_flags = IFF_UP;
4961         ifm->ifi_change = 0xffffffff;
4962         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4963         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4964         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
4965         assert(na_info);
4966         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
4967         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
4968         assert(na_vxlan);
4969 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
4970         /*
4971          * RHEL 7.2 does not support metadata for the tunnel device.
4972          * It does not matter because we are going to use the
4973          * hardware offload provided by the mlx5 driver.
4974          */
4975         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
4976 #endif
4977         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
4978         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
4979         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
4980 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
4981         /*
4982          * We must specify the VNI explicitly if metadata is not
4983          * supported. Note, the VNI is transferred in native endianness.
4984          */
4985         mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
4986 #endif
4987         mnl_attr_nest_end(nlh, na_vxlan);
4988         mnl_attr_nest_end(nlh, na_info);
4989         assert(sizeof(buf) >= nlh->nlmsg_len);
4990         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4991         if (ret) {
4992                 DRV_LOG(WARNING,
4993                         "netlink: VTEP %s create failure (%d)",
4994                         name, rte_errno);
4995                 if (rte_errno != EEXIST)
4996                         /*
4997                          * Some unhandled error occurred or device is
4998                          * for encapsulation and cannot be shared.
4999                          */
5000                         goto error;
5001         } else {
5002                 /*
5003                  * Mark the device we actually created.
5004                  * We should explicitly delete it
5005                  * when we do not need it anymore.
5006                  */
5007                 vtep->created = 1;
5008         }
5009         /* Try to get the ifindex of the created or pre-existing device. */
5010         ret = if_nametoindex(name);
5011         if (!ret) {
5012                 DRV_LOG(WARNING,
5013                         "VTEP %s failed to get index (%d)", name, errno);
5014                 rte_flow_error_set
5015                         (error, errno,
5016                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5017                          "netlink: failed to retrieve VTEP ifindex");
5018                 goto error;
5019         }
5020         vtep->ifindex = ret;
5021         memset(buf, 0, sizeof(buf));
5022         nlh = mnl_nlmsg_put_header(buf);
5023         nlh->nlmsg_type = RTM_NEWLINK;
5024         nlh->nlmsg_flags = NLM_F_REQUEST;
5025         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5026         ifm->ifi_family = AF_UNSPEC;
5027         ifm->ifi_type = 0;
5028         ifm->ifi_index = vtep->ifindex;
5029         ifm->ifi_flags = IFF_UP;
5030         ifm->ifi_change = IFF_UP;
5031         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5032         if (ret) {
5033                 rte_flow_error_set(error, rte_errno,
5034                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5035                                    "netlink: failed to set VTEP link up");
5036                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5037                         name, rte_errno);
5038                 goto clean;
5039         }
5040         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5041         if (ret) {
5042                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5043                 goto clean;
5044         }
5045         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5046         vtep->refcnt = 1;
5047         return vtep;
5048 clean:
5049         flow_tcf_vtep_delete(tcf, vtep);
5050         return NULL;
5051 error:
5052         rte_free(vtep);
5053         return NULL;
5054 }
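
/*
 * Illustrative sketch: assuming MLX5_VXLAN_DEVICE_PFX expands to
 * "vmlx_" and port is 4789, the Netlink requests above are roughly
 * equivalent to:
 *
 *   ip link add dev vmlx_4789 type vxlan external nolearning \
 *           udp6zerocsumrx dstport 4789
 *   ip link set dev vmlx_4789 up
 *
 * where "external" stands for IFLA_VXLAN_COLLECT_METADATA. On kernels
 * lacking this attribute (e.g. RHEL 7.2) it is omitted and a fixed
 * default VNI is configured instead.
 */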
5055
5056 /**
5057  * Acquire target interface index for VXLAN tunneling decapsulation.
5058  * In order to share the UDP port with other interfaces, the VXLAN
5059  * device is created without attachment to any interface (if created).
5060  *
5061  * @param[in] tcf
5062  *   Context object initialized by mlx5_flow_tcf_context_create().
5063  * @param[in] dev_flow
5064  *   Flow tcf object with tunnel structure pointer set.
5065  * @param[out] error
5066  *   Perform verbose error reporting if not NULL.
5067  * @return
5068  *   Interface descriptor pointer on success,
5069  *   NULL otherwise and rte_errno is set.
5070  */
5071 static struct tcf_vtep*
5072 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5073                             struct mlx5_flow *dev_flow,
5074                             struct rte_flow_error *error)
5075 {
5076         struct tcf_vtep *vtep;
5077         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5078
5079         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5080                 if (vtep->port == port)
5081                         break;
5082         }
5083         if (vtep) {
5084                 /* Device exists, just increment the reference counter. */
5085                 vtep->refcnt++;
5086                 assert(vtep->ifindex);
5087                 return vtep;
5088         }
5089         /* No decapsulation device exists, try to create the new one. */
5090         vtep = flow_tcf_vtep_create(tcf, port, error);
5091         if (vtep)
5092                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5093         return vtep;
5094 }
5095
5096 /**
5097  * Acquire target interface index for VXLAN tunneling encapsulation.
5098  *
5099  * @param[in] tcf
5100  *   Context object initialized by mlx5_flow_tcf_context_create().
5101  * @param[in] ifouter
5102  *   Network interface index to attach VXLAN encap device to.
5103  * @param[in] dev_flow
5104  *   Flow tcf object with tunnel structure pointer set.
5105  * @param[out] error
5106  *   Perform verbose error reporting if not NULL.
5107  * @return
5108  *   Interface descriptor pointer on success,
5109  *   NULL otherwise and rte_errno is set.
5110  */
5111 static struct tcf_vtep*
5112 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5113                             unsigned int ifouter,
5114                             struct mlx5_flow *dev_flow,
5115                             struct rte_flow_error *error)
5116 {
5117         uint16_t port;
5118         struct tcf_vtep *vtep;
5119         struct tcf_irule *iface;
5120         int ret;
5121
5122         assert(ifouter);
5123         /* Look whether the VTEP for specified port is created. */
5124         port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5125         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5126                 if (vtep->port == port)
5127                         break;
5128         }
5129         if (vtep) {
5130                 /* VTEP already exists, just increment the reference. */
5131                 vtep->refcnt++;
5132         } else {
5133                 /* Not found, we should create the new VTEP. */
5134                 vtep = flow_tcf_vtep_create(tcf, port, error);
5135                 if (!vtep)
5136                         return NULL;
5137                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5138         }
5139         assert(vtep->ifindex);
5140         iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5141         if (!iface) {
5142                 if (--vtep->refcnt == 0) {
                             LIST_REMOVE(vtep, next);
5143                         flow_tcf_vtep_delete(tcf, vtep);
                     }
5144                 return NULL;
5145         }
5146         dev_flow->tcf.vxlan_encap->iface = iface;
5147         /* Create local ipaddr with peer to specify the outer IPs. */
5148         ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5149         if (!ret) {
5150                 /* Create neigh rule to specify outer destination MAC. */
5151                 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5152                 if (ret)
5153                         flow_tcf_encap_local(tcf, iface,
5154                                              dev_flow, false, error);
5155         }
5156         if (ret) {
5157                 dev_flow->tcf.vxlan_encap->iface = NULL;
5158                 flow_tcf_encap_irule_release(iface);
5159                 if (--vtep->refcnt == 0) {
                             LIST_REMOVE(vtep, next);
5160                         flow_tcf_vtep_delete(tcf, vtep);
                     }
5161                 return NULL;
5162         }
5163         return vtep;
5164 }
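
/*
 * The acquisition above builds a small dependency chain; on any
 * failure the already acquired resources are rolled back in reverse
 * order before returning NULL:
 *
 *   VTEP (shared by UDP destination port)
 *     -> irule container (per outer interface)
 *       -> local address rule
 *         -> neigh rule
 */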
5165
5166 /**
5167  * Acquires target interface index for tunneling of any type.
5168  * Creates the new VTEP if needed.
5169  *
5170  * @param[in] tcf
5171  *   Context object initialized by mlx5_flow_tcf_context_create().
5172  * @param[in] ifouter
5173  *   Network interface index to create VXLAN encap rules on.
5174  * @param[in] dev_flow
5175  *   Flow tcf object with tunnel structure pointer set.
5176  * @param[out] error
5177  *   Perform verbose error reporting if not NULL.
5178  * @return
5179  *   Interface descriptor pointer on success,
5180  *   NULL otherwise and rte_errno is set.
5181  */
5182 static struct tcf_vtep*
5183 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5184                       unsigned int ifouter,
5185                       struct mlx5_flow *dev_flow,
5186                       struct rte_flow_error *error)
5187 {
5188         struct tcf_vtep *vtep = NULL;
5189
5190         assert(dev_flow->tcf.tunnel);
5191         pthread_mutex_lock(&vtep_list_mutex);
5192         switch (dev_flow->tcf.tunnel->type) {
5193         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5194                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5195                                                   dev_flow, error);
5196                 break;
5197         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5198                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5199                 break;
5200         default:
5201                 rte_flow_error_set(error, ENOTSUP,
5202                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5203                                    "unsupported tunnel type");
5204                 break;
5205         }
5206         pthread_mutex_unlock(&vtep_list_mutex);
5207         return vtep;
5208 }
5209
5210 /**
5211  * Release tunneling interface by ifindex. Decrements reference
5212  * counter and actually removes the device if counter is zero.
5213  *
5214  * @param[in] tcf
5215  *   Context object initialized by mlx5_flow_tcf_context_create().
5216  * @param[in] vtep
5217  *   VTEP device descriptor structure.
5218  * @param[in] dev_flow
5219  *   Flow tcf object with tunnel structure pointer set.
5220  */
5221 static void
5222 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5223                       struct tcf_vtep *vtep,
5224                       struct mlx5_flow *dev_flow)
5225 {
5226         assert(dev_flow->tcf.tunnel);
5227         pthread_mutex_lock(&vtep_list_mutex);
5228         switch (dev_flow->tcf.tunnel->type) {
5229         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5230                 break;
5231         case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5232                 struct tcf_irule *iface;
5233
5234                 /* Remove the encap ancillary rules first. */
5235                 iface = dev_flow->tcf.vxlan_encap->iface;
5236                 assert(iface);
5237                 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5238                 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5239                 flow_tcf_encap_irule_release(iface);
5240                 dev_flow->tcf.vxlan_encap->iface = NULL;
5241                 break;
5242         }
5243         default:
5244                 assert(false);
5245                 DRV_LOG(WARNING, "Unsupported tunnel type");
5246                 break;
5247         }
5248         assert(vtep->refcnt);
5249         if (--vtep->refcnt == 0) {
5250                 LIST_REMOVE(vtep, next);
5251                 flow_tcf_vtep_delete(tcf, vtep);
5252         }
5253         pthread_mutex_unlock(&vtep_list_mutex);
5254 }
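
/*
 * A minimal usage sketch for the VTEP helpers (assumed pattern,
 * mirroring flow_tcf_apply() and flow_tcf_remove() below): acquire
 * on rule apply, release on rule removal. Both helpers serialize
 * internally on the shared vtep_list_mutex.
 *
 *   vtep = flow_tcf_vtep_acquire(tcf, ifouter, dev_flow, error);
 *   if (!vtep)
 *           return -rte_errno;
 *   ... apply the TC rule referencing vtep->ifindex ...
 *   flow_tcf_vtep_release(tcf, vtep, dev_flow);
 */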
5255
5256 struct tcf_nlcb_query {
5257         uint32_t handle;
5258         uint32_t tc_flags;
5259         uint32_t flags_valid:1;
5260 };
5261
5262 /**
5263  * Collect queried rule attributes. This is callback routine called by
5264  * libmnl mnl_cb_run() in loop for every message in received packet.
5265  * Current implementation collects the flower flags only.
5266  *
5267  * @param[in] nlh
5268  *   Pointer to reply header.
5269  * @param[in, out] arg
5270  *   Context pointer for this callback.
5271  *
5272  * @return
5273  *   A positive, nonzero value on success (required by libmnl
5274  *   to continue message processing).
5275  */
5276 static int
5277 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5278 {
5279         struct tcf_nlcb_query *query = arg;
5280         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5281         struct nlattr *na, *na_opt;
5282         bool flower = false;
5283
5284         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5285             tcm->tcm_handle != query->handle)
5286                 return 1;
5287         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5288                 switch (mnl_attr_get_type(na)) {
5289                 case TCA_KIND:
5290                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5291                                 /* Not flower filter, drop entire message. */
5292                                 return 1;
5293                         }
5294                         flower = true;
5295                         break;
5296                 case TCA_OPTIONS:
5297                         if (!flower) {
5298                                 /* Not flower options, drop entire message. */
5299                                 return 1;
5300                         }
5301                         /* Check nested flower options. */
5302                         mnl_attr_for_each_nested(na_opt, na) {
5303                                 switch (mnl_attr_get_type(na_opt)) {
5304                                 case TCA_FLOWER_FLAGS:
5305                                         query->flags_valid = 1;
5306                                         query->tc_flags =
5307                                                 mnl_attr_get_u32(na_opt);
5308                                         break;
5309                                 }
5310                         }
5311                         break;
5312                 }
5313         }
5314         return 1;
5315 }
5316
5317 /**
5318  * Query a TC flower rule flags via netlink.
5319  *
5320  * @param[in] tcf
5321  *   Context object initialized by mlx5_flow_tcf_context_create().
5322  * @param[in] dev_flow
5323  *   Pointer to the flow.
5324  * @param[out] pflags
5325  *   Pointer to the data retrieved by the query.
5326  *
5327  * @return
5328  *   0 on success, a negative errno value otherwise.
5329  */
5330 static int
5331 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5332                      struct mlx5_flow *dev_flow,
5333                      uint32_t *pflags)
5334 {
5335         struct nlmsghdr *nlh;
5336         struct tcmsg *tcm;
5337         struct tcf_nlcb_query query = {
5338                 .handle = dev_flow->tcf.tcm->tcm_handle,
5339         };
5340
5341         nlh = mnl_nlmsg_put_header(tcf->buf);
5342         nlh->nlmsg_type = RTM_GETTFILTER;
5343         nlh->nlmsg_flags = NLM_F_REQUEST;
5344         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5345         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5346         /*
5347          * Ignore the Netlink error for filter query operations.
5348          * The reply length is reported by the kernel as an errno.
5349          * Just check that we got the flags option.
5350          */
5351         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5352         if (!query.flags_valid) {
5353                 *pflags = 0;
5354                 return -ENOENT;
5355         }
5356         *pflags = query.tc_flags;
5357         return 0;
5358 }
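
/*
 * Illustrative sketch: the query above retrieves the same flags that
 * iproute2 displays per flower rule (device name hypothetical):
 *
 *   tc -s filter show dev vmlx_4789 ingress
 *
 * where the "in_hw"/"not_in_hw" markers reflect TCA_FLOWER_FLAGS.
 */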
5359
5360 /**
5361  * Query and check the in_hw set for specified rule.
5362  *
5363  * @param[in] tcf
5364  *   Context object initialized by mlx5_flow_tcf_context_create().
5365  * @param[in] dev_flow
5366  *   Pointer to the flow to check.
5367  *
5368  * @return
5369  *   0 on success, a negative errno value otherwise.
5370  */
5371 static int
5372 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5373                     struct mlx5_flow *dev_flow)
5374 {
5375         uint32_t flags;
5376         int ret;
5377
5378         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5379         if (ret)
5380                 return ret;
5381         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5382 }
5383
5384 /**
5385  * Remove flow from E-Switch by sending Netlink message.
5386  *
5387  * @param[in] dev
5388  *   Pointer to Ethernet device.
5389  * @param[in, out] flow
5390  *   Pointer to the sub flow.
5391  */
5392 static void
5393 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5394 {
5395         struct priv *priv = dev->data->dev_private;
5396         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5397         struct mlx5_flow *dev_flow;
5398         struct nlmsghdr *nlh;
5399
5400         if (!flow)
5401                 return;
5402         dev_flow = LIST_FIRST(&flow->dev_flows);
5403         if (!dev_flow)
5404                 return;
5405         /* E-Switch flow can't be expanded. */
5406         assert(!LIST_NEXT(dev_flow, next));
5407         if (dev_flow->tcf.applied) {
5408                 nlh = dev_flow->tcf.nlh;
5409                 nlh->nlmsg_type = RTM_DELTFILTER;
5410                 nlh->nlmsg_flags = NLM_F_REQUEST;
5411                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5412                 if (dev_flow->tcf.tunnel) {
5413                         assert(dev_flow->tcf.tunnel->vtep);
5414                         flow_tcf_vtep_release(ctx,
5415                                 dev_flow->tcf.tunnel->vtep,
5416                                 dev_flow);
5417                         dev_flow->tcf.tunnel->vtep = NULL;
5418                 }
5419                 dev_flow->tcf.applied = 0;
5420         }
5421 }
5422
5423 /**
5424  * Apply flow to E-Switch by sending Netlink message.
5425  *
5426  * @param[in] dev
5427  *   Pointer to Ethernet device.
5428  * @param[in, out] flow
5429  *   Pointer to the sub flow.
5430  * @param[out] error
5431  *   Pointer to the error structure.
5432  *
5433  * @return
5434  *   0 on success, a negative errno value otherwise and rte_errno is set.
5435  */
5436 static int
5437 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5438                struct rte_flow_error *error)
5439 {
5440         struct priv *priv = dev->data->dev_private;
5441         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5442         struct mlx5_flow *dev_flow;
5443         struct nlmsghdr *nlh;
5444
5445         dev_flow = LIST_FIRST(&flow->dev_flows);
5446         /* E-Switch flow can't be expanded. */
5447         assert(!LIST_NEXT(dev_flow, next));
5448         if (dev_flow->tcf.applied)
5449                 return 0;
5450         nlh = dev_flow->tcf.nlh;
5451         nlh->nlmsg_type = RTM_NEWTFILTER;
5452         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5453         if (dev_flow->tcf.tunnel) {
5454                 /*
5455                  * Replace the interface index, target for
5456                  * encapsulation, source for decapsulation.
5457                  */
5458                 assert(!dev_flow->tcf.tunnel->vtep);
5459                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5460                 /* Acquire actual VTEP device when rule is being applied. */
5461                 dev_flow->tcf.tunnel->vtep =
5462                         flow_tcf_vtep_acquire(ctx,
5463                                         dev_flow->tcf.tunnel->ifindex_org,
5464                                         dev_flow, error);
5465                 if (!dev_flow->tcf.tunnel->vtep)
5466                         return -rte_errno;
5467                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5468                                 dev_flow->tcf.tunnel->vtep->ifindex,
5469                                 dev_flow->tcf.tunnel->ifindex_org);
5470                 *dev_flow->tcf.tunnel->ifindex_ptr =
5471                         dev_flow->tcf.tunnel->vtep->ifindex;
5472         }
5473         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5474                 dev_flow->tcf.applied = 1;
5475                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5476                         return 0;
5477                 /*
5478                  * Rule was applied without skip_sw flag set.
5479                  * We should check whether the rule was actually
5480                  * accepted by hardware (check the in_hw flag).
5481                  */
5482                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5483                         flow_tcf_remove(dev, flow);
5484                         return rte_flow_error_set
5485                                 (error, ENOENT,
5486                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5487                                  "netlink: rule has no in_hw flag set");
5488                 }
5489                 return 0;
5490         }
5491         if (dev_flow->tcf.tunnel) {
5492                 /* Rollback the VTEP configuration if rule apply failed. */
5493                 assert(dev_flow->tcf.tunnel->vtep);
5494                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5495                                       dev_flow);
5496                 dev_flow->tcf.tunnel->vtep = NULL;
5497         }
5498         return rte_flow_error_set(error, rte_errno,
5499                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5500                                   "netlink: failed to create TC flow rule");
5501 }
5502
5503 /**
5504  * Remove flow from E-Switch and release resources of the device flow.
5505  *
5506  * @param[in] dev
5507  *   Pointer to Ethernet device.
5508  * @param[in, out] flow
5509  *   Pointer to the sub flow.
5510  */
5511 static void
5512 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5513 {
5514         struct mlx5_flow *dev_flow;
5515
5516         if (!flow)
5517                 return;
5518         flow_tcf_remove(dev, flow);
5519         if (flow->counter) {
5520                 if (--flow->counter->ref_cnt == 0) {
5521                         rte_free(flow->counter);
5522                         flow->counter = NULL;
5523                 }
5524         }
5525         dev_flow = LIST_FIRST(&flow->dev_flows);
5526         if (!dev_flow)
5527                 return;
5528         /* E-Switch flow can't be expanded. */
5529         assert(!LIST_NEXT(dev_flow, next));
5530         LIST_REMOVE(dev_flow, next);
5531         rte_free(dev_flow);
5532 }
5533
5534 /**
5535  * Helper routine for figuring the space size required for a parse buffer.
5536  *
5537  * @param array
5538  *   Array of values to use.
5539  * @param idx
5540  *   Current location in array.
5541  * @param value
5542  *   Value to compare with.
5543  *
5544  * @return
5545  *   The maximum between the given value and the array value on index.
5546  */
5547 static uint16_t
5548 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5549 {
5550         return idx < 0 ? value : RTE_MAX(array[idx], value);
5551 }
5552
5553 /**
5554  * Parse rtnetlink message attributes filling the attribute table with the info
5555  * retrieved.
5556  *
5557  * @param tb
5558  *   Attribute table to be filled.
5559  * @param[out] max
5560  *   Maximum entry in the attribute table.
5561  * @param rta
5562  *   The attributes section in the message to be parsed.
5563  * @param len
5564  *   The length of the attributes section in the message.
5565  */
5566 static void
5567 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5568                          struct rtattr *rta, int len)
5569 {
5570         unsigned short type;
5571         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5572         while (RTA_OK(rta, len)) {
5573                 type = rta->rta_type;
5574                 if (type <= max && !tb[type])
5575                         tb[type] = rta;
5576                 rta = RTA_NEXT(rta, len);
5577         }
5578 }
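
/*
 * A minimal usage sketch (hypothetical variables t, len and kind),
 * matching how the helpers below index the table by attribute type
 * after parsing:
 *
 *   struct rtattr *tb[TCA_MAX + 1];
 *
 *   flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
 *   if (tb[TCA_KIND])
 *           kind = RTA_DATA(tb[TCA_KIND]);
 */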
5579
5580 /**
5581  * Extract flow counters from flower action.
5582  *
5583  * @param rta
5584  *   flower action stats properties in the Netlink message received.
5585  * @param rta_type
5586  *   The backward sequence of rta_types, as written in the attribute table,
5587  *   we need to traverse in order to get to the requested object.
5588  * @param idx
5589  *   Current location in rta_type table.
5590  * @param[out] data
5591  *   data holding the count statistics of the rte_flow retrieved from
5592  *   the message.
5593  *
5594  * @return
5595  *   0 if data was found and retrieved, -1 otherwise.
5596  */
5597 static int
5598 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5599                                        uint16_t rta_type[], int idx,
5600                                        struct gnet_stats_basic *data)
5601 {
5602         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5603                                                  TCA_STATS_BASIC);
5604         struct rtattr *tbs[tca_stats_max + 1];
5605
5606         if (rta == NULL || idx < 0)
5607                 return -1;
5608         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5609                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5610         switch (rta_type[idx]) {
5611         case TCA_STATS_BASIC:
5612                 if (tbs[TCA_STATS_BASIC]) {
5613                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5614                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5615                                sizeof(*data)));
5616                         return 0;
5617                 }
5618                 break;
5619         default:
5620                 break;
5621         }
5622         return -1;
5623 }
5624
5625 /**
5626  * Parse flower single action retrieving the requested action attribute,
5627  * if found.
5628  *
5629  * @param arg
5630  *   flower action properties in the Netlink message received.
5631  * @param rta_type
5632  *   The backward sequence of rta_types, as written in the attribute table,
5633  *   we need to traverse in order to get to the requested object.
5634  * @param idx
5635  *   Current location in rta_type table.
5636  * @param[out] data
5637  *   Count statistics retrieved from the message query.
5638  *
5639  * @return
5640  *   0 if data was found and retrieved, -1 otherwise.
5641  */
5642 static int
5643 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5644                                      uint16_t rta_type[], int idx, void *data)
5645 {
5646         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5647         struct rtattr *tb[tca_act_max + 1];
5648
5649         if (arg == NULL || idx < 0)
5650                 return -1;
5651         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5652                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5653         if (tb[TCA_ACT_KIND] == NULL)
5654                 return -1;
5655         switch (rta_type[idx]) {
5656         case TCA_ACT_STATS:
5657                 if (tb[TCA_ACT_STATS])
5658                         return flow_tcf_nl_action_stats_parse_and_get
5659                                         (tb[TCA_ACT_STATS],
5660                                          rta_type, --idx,
5661                                          (struct gnet_stats_basic *)data);
5662                 break;
5663         default:
5664                 break;
5665         }
5666         return -1;
5667 }
5668
5669 /**
5670  * Parse flower action section in the message retrieving the requested
5671  * attribute from the first action that provides it.
5672  *
5673  * @param arg
5674  *   flower section in the Netlink message received.
5675  * @param rta_type
5676  *   The backward sequence of rta_types, as written in the attribute table,
5677  *   we need to traverse in order to get to the requested object.
5678  * @param idx
5679  *   Current location in rta_type table.
5680  * @param[out] data
5681  *   data retrieved from the message query.
5682  *
5683  * @return
5684  *   0 if data was found and retrieved, -1 otherwise.
5685  */
5686 static int
5687 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5688                                  uint16_t rta_type[], int idx, void *data)
5689 {
5690         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5691         int i;
5692
5693         if (arg == NULL || idx < 0)
5694                 return -1;
5695         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5696                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5697         switch (rta_type[idx]) {
5698         /*
5699          * Flow counters are stored in the actions defined by the flow
5700          * and not in the flow itself, therefore we need to traverse the
5701          * flower chain of actions in search of them.
5702          *
5703          * Note that the index is not decremented here.
5704          */
5705         case TCA_ACT_STATS:
5706                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5707                         if (tb[i] &&
5708                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5709                                                               rta_type,
5710                                                               idx, data))
5711                                 return 0;
5712                 }
5713                 break;
5714         default:
5715                 break;
5716         }
5717         return -1;
5718 }
5719
5720 /**
5721  * Parse flower classifier options in the message, retrieving the requested
5722  * attribute if found.
5723  *
5724  * @param opt
5725  *   flower section in the Netlink message received.
5726  * @param rta_type
5727  *   The backward sequence of rta_types, as written in the attribute table,
5728  *   we need to traverse in order to get to the requested object.
5729  * @param idx
5730  *   Current location in rta_type table.
5731  * @param[out] data
5732  *   data retrieved from the message query.
5733  *
5734  * @return
5735  *   0 if data was found and retrieved, -1 otherwise.
5736  */
5737 static int
5738 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5739                                uint16_t rta_type[], int idx, void *data)
5740 {
5741         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5742                                                   TCA_FLOWER_ACT);
5743         struct rtattr *tb[tca_flower_max + 1];
5744
5745         if (!opt || idx < 0)
5746                 return -1;
5747         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5748                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5749         switch (rta_type[idx]) {
5750         case TCA_FLOWER_ACT:
5751                 if (tb[TCA_FLOWER_ACT])
5752                         return flow_tcf_nl_action_parse_and_get
5753                                                         (tb[TCA_FLOWER_ACT],
5754                                                          rta_type, --idx, data);
5755                 break;
5756         default:
5757                 break;
5758         }
5759         return -1;
5760 }
5761
5762 /**
5763  * Parse Netlink reply on filter query, retrieving the flow counters.
5764  *
5765  * @param cnlh
5766  *   Message received from Netlink.
5767  * @param rta_type
5768  *   The backward sequence of rta_types, as written in the attribute table,
5769  *   we need to traverse in order to get to the requested object.
5770  * @param idx
5771  *   Current location in rta_type table.
5772  * @param[out] data
5773  *   data retrieved from the message query.
5774  *
5775  * @return
5776  *   0 if data was found and retrieved, -1 otherwise.
5777  */
5778 static int
5779 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5780                                  uint16_t rta_type[], int idx, void *data)
5781 {
5782         struct nlmsghdr *nlh = cnlh;
5783         struct tcmsg *t = NLMSG_DATA(nlh);
5784         int len = nlh->nlmsg_len;
5785         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5786         struct rtattr *tb[tca_max + 1];
5787
5788         if (idx < 0)
5789                 return -1;
5790         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5791             nlh->nlmsg_type != RTM_GETTFILTER &&
5792             nlh->nlmsg_type != RTM_DELTFILTER)
5793                 return -1;
5794         len -= NLMSG_LENGTH(sizeof(*t));
5795         if (len < 0)
5796                 return -1;
5797         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5798         /* Not a TC flower flow - bail out */
5799         if (!tb[TCA_KIND] ||
5800             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5801                 return -1;
5802         switch (rta_type[idx]) {
5803         case TCA_OPTIONS:
5804                 if (tb[TCA_OPTIONS])
5805                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5806                                                               rta_type,
5807                                                               --idx, data);
5808                 break;
5809         default:
5810                 break;
5811         }
5812         return -1;
5813 }
5814
5815 /**
5816  * A callback to parse Netlink reply on TC flower query.
5817  *
5818  * @param nlh
5819  *   Message received from Netlink.
5820  * @param[out] data
5821  *   Pointer to data area to be filled by the parsing routine.
5822  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
5823  *
5824  * @return
5825  *   MNL_CB_OK value.
5826  */
5827 static int
5828 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5829 {
5830         /*
5831          * The backward sequence of rta_types to pass in order to get
5832          * to the counters.
5833          */
5834         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5835                                 TCA_FLOWER_ACT, TCA_OPTIONS };
5836         struct flow_tcf_stats_basic *sb_data = data;
5837         union {
5838                 const struct nlmsghdr *c;
5839                 struct nlmsghdr *nc;
5840         } tnlh = { .c = nlh };
5841
5842         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5843                                               RTE_DIM(rta_type) - 1,
5844                                               (void *)&sb_data->counters))
5845                 sb_data->valid = true;
5846         return MNL_CB_OK;
5847 }
5848
5849 /**
5850  * Query a TC flower rule for its statistics via netlink.
5851  *
5852  * @param[in] dev
5853  *   Pointer to Ethernet device.
5854  * @param[in] flow
5855  *   Pointer to the sub flow.
5856  * @param[out] data
5857  *   data retrieved by the query.
5858  * @param[out] error
5859  *   Perform verbose error reporting if not NULL.
5860  *
5861  * @return
5862  *   0 on success, a negative errno value otherwise and rte_errno is set.
5863  */
5864 static int
5865 flow_tcf_query_count(struct rte_eth_dev *dev,
5866                      struct rte_flow *flow,
5867                      void *data,
5868                      struct rte_flow_error *error)
5869 {
5870         struct flow_tcf_stats_basic sb_data;
5871         struct rte_flow_query_count *qc = data;
5872         struct priv *priv = dev->data->dev_private;
5873         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5874         struct mnl_socket *nl = ctx->nl;
5875         struct mlx5_flow *dev_flow;
5876         struct nlmsghdr *nlh;
5877         uint32_t seq = priv->tcf_context->seq++;
5878         ssize_t ret;
5879         assert(qc);
5880
5881         memset(&sb_data, 0, sizeof(sb_data));
5882         dev_flow = LIST_FIRST(&flow->dev_flows);
5883         /* E-Switch flow can't be expanded. */
5884         assert(!LIST_NEXT(dev_flow, next));
5885         if (!dev_flow->flow->counter)
5886                 goto notsup_exit;
5887         nlh = dev_flow->tcf.nlh;
5888         nlh->nlmsg_type = RTM_GETTFILTER;
5889         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5890         nlh->nlmsg_seq = seq;
5891         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5892                 goto error_exit;
5893         do {
5894                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5895                 if (ret <= 0)
5896                         break;
5897                 ret = mnl_cb_run(ctx->buf, ret, seq,
5898                                  mnl_socket_get_portid(nl),
5899                                  flow_tcf_nl_message_get_stats_basic,
5900                                  (void *)&sb_data);
5901         } while (ret > 0);
5903         if (sb_data.valid) {
5904                 /* Return the delta from last reset. */
5905                 qc->hits_set = 1;
5906                 qc->bytes_set = 1;
5907                 qc->hits = sb_data.counters.packets - flow->counter->hits;
5908                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5909                 if (qc->reset) {
5910                         flow->counter->hits = sb_data.counters.packets;
5911                         flow->counter->bytes = sb_data.counters.bytes;
5912                 }
5913                 return 0;
5914         }
5915         return rte_flow_error_set(error, EINVAL,
5916                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5917                                   NULL,
5918                                   "flow does not have counter");
5919 error_exit:
5920         return rte_flow_error_set
5921                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5922                          NULL, "netlink: failed to read flow rule counters");
5923 notsup_exit:
5924         return rte_flow_error_set
5925                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5926                          NULL, "counters are not available");
5927 }
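
/*
 * A minimal application-side sketch (assumed usage via the generic
 * rte_flow API; port_id and flow are hypothetical):
 *
 *   struct rte_flow_query_count qc = { .reset = 1 };
 *   struct rte_flow_action actions[] = {
 *           { .type = RTE_FLOW_ACTION_TYPE_COUNT },
 *           { .type = RTE_FLOW_ACTION_TYPE_END },
 *   };
 *   struct rte_flow_error err;
 *
 *   if (!rte_flow_query(port_id, flow, actions, &qc, &err))
 *           printf("hits=%" PRIu64 " bytes=%" PRIu64 "\n",
 *                  qc.hits, qc.bytes);
 */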
5928
5929 /**
5930  * Query a flow.
5931  *
5932  * @see rte_flow_query()
5933  * @see rte_flow_ops
5934  */
5935 static int
5936 flow_tcf_query(struct rte_eth_dev *dev,
5937                struct rte_flow *flow,
5938                const struct rte_flow_action *actions,
5939                void *data,
5940                struct rte_flow_error *error)
5941 {
5942         int ret = -EINVAL;
5943
5944         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5945                 switch (actions->type) {
5946                 case RTE_FLOW_ACTION_TYPE_VOID:
5947                         break;
5948                 case RTE_FLOW_ACTION_TYPE_COUNT:
5949                         ret = flow_tcf_query_count(dev, flow, data, error);
5950                         break;
5951                 default:
5952                         return rte_flow_error_set(error, ENOTSUP,
5953                                                   RTE_FLOW_ERROR_TYPE_ACTION,
5954                                                   actions,
5955                                                   "action not supported");
5956                 }
5957         }
5958         return ret;
5959 }
5960
5961 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5962         .validate = flow_tcf_validate,
5963         .prepare = flow_tcf_prepare,
5964         .translate = flow_tcf_translate,
5965         .apply = flow_tcf_apply,
5966         .remove = flow_tcf_remove,
5967         .destroy = flow_tcf_destroy,
5968         .query = flow_tcf_query,
5969 };
5970
5971 /**
5972  * Create and configure a libmnl socket for Netlink flow rules.
5973  *
5974  * @return
5975  *   A valid libmnl socket object pointer on success, NULL otherwise and
5976  *   rte_errno is set.
5977  */
5978 static struct mnl_socket *
5979 flow_tcf_mnl_socket_create(void)
5980 {
5981         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
5982
5983         if (nl) {
5984                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
5985                                       sizeof(int));
5986                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
5987                         return nl;
5988         }
5989         rte_errno = errno;
5990         if (nl)
5991                 mnl_socket_close(nl);
5992         return NULL;
5993 }
5994
5995 /**
5996  * Destroy a libmnl socket.
5997  *
5998  * @param nl
5999  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6000  */
6001 static void
6002 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6003 {
6004         if (nl)
6005                 mnl_socket_close(nl);
6006 }
6007
6008 /**
6009  * Initialize ingress qdisc of a given network interface.
6010  *
6011  * @param ctx
6012  *   Pointer to tc-flower context to use.
6013  * @param ifindex
6014  *   Index of network interface to initialize.
6015  * @param[out] error
6016  *   Perform verbose error reporting if not NULL.
6017  *
6018  * @return
6019  *   0 on success, a negative errno value otherwise and rte_errno is set.
6020  */
6021 int
6022 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6023                    unsigned int ifindex, struct rte_flow_error *error)
6024 {
6025         struct nlmsghdr *nlh;
6026         struct tcmsg *tcm;
6027         alignas(struct nlmsghdr)
6028         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6029                     SZ_NLATTR_STRZ_OF("ingress") +
6030                     MNL_BUF_EXTRA_SPACE];
6031
6032         /* Destroy existing ingress qdisc and everything attached to it. */
6033         nlh = mnl_nlmsg_put_header(buf);
6034         nlh->nlmsg_type = RTM_DELQDISC;
6035         nlh->nlmsg_flags = NLM_F_REQUEST;
6036         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6037         tcm->tcm_family = AF_UNSPEC;
6038         tcm->tcm_ifindex = ifindex;
6039         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6040         tcm->tcm_parent = TC_H_INGRESS;
6041         assert(sizeof(buf) >= nlh->nlmsg_len);
6042         /* Ignore errors when qdisc is already absent. */
6043         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6044             rte_errno != EINVAL && rte_errno != ENOENT)
6045                 return rte_flow_error_set(error, rte_errno,
6046                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6047                                           "netlink: failed to remove ingress"
6048                                           " qdisc");
6049         /* Create fresh ingress qdisc. */
6050         nlh = mnl_nlmsg_put_header(buf);
6051         nlh->nlmsg_type = RTM_NEWQDISC;
6052         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6053         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6054         tcm->tcm_family = AF_UNSPEC;
6055         tcm->tcm_ifindex = ifindex;
6056         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6057         tcm->tcm_parent = TC_H_INGRESS;
6058         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6059         assert(sizeof(buf) >= nlh->nlmsg_len);
6060         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6061                 return rte_flow_error_set(error, rte_errno,
6062                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6063                                           "netlink: failed to create ingress"
6064                                           " qdisc");
6065         return 0;
6066 }
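
/*
 * Illustrative sketch: the two requests above correspond roughly to
 * the following commands (interface name hypothetical), with errors
 * from the delete step ignored when no qdisc exists yet:
 *
 *   tc qdisc del dev eth0 ingress
 *   tc qdisc add dev eth0 ingress
 */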
6067
6068 /**
6069  * Create libmnl context for Netlink flow rules.
6070  *
6071  * @return
6072  *   A valid tc-flower context object pointer on success, NULL otherwise
6073  *   and rte_errno is set.
6074  */
6075 struct mlx5_flow_tcf_context *
6076 mlx5_flow_tcf_context_create(void)
6077 {
6078         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6079                                                         sizeof(*ctx),
6080                                                         sizeof(uint32_t));
6081         if (!ctx)
6082                 goto error;
6083         ctx->nl = flow_tcf_mnl_socket_create();
6084         if (!ctx->nl)
6085                 goto error;
6086         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6087         ctx->buf = rte_zmalloc(__func__,
6088                                ctx->buf_size, sizeof(uint32_t));
6089         if (!ctx->buf)
6090                 goto error;
6091         ctx->seq = random();
6092         return ctx;
6093 error:
6094         mlx5_flow_tcf_context_destroy(ctx);
6095         return NULL;
6096 }
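
/*
 * A minimal lifecycle sketch (assumed caller pattern): the context is
 * created once per device private structure and must be released with
 * mlx5_flow_tcf_context_destroy() when no longer needed.
 *
 *   struct mlx5_flow_tcf_context *ctx;
 *
 *   ctx = mlx5_flow_tcf_context_create();
 *   if (!ctx)
 *           return -rte_errno;
 *   ... use ctx with mlx5_flow_tcf_init() and the flow ops ...
 *   mlx5_flow_tcf_context_destroy(ctx);
 */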
6097
6098 /**
6099  * Destroy a libmnl context.
6100  *
6101  * @param ctx
6102  *   Context object to destroy (may be NULL).
6103  */
6104 void
6105 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6106 {
6107         if (!ctx)
6108                 return;
6109         flow_tcf_mnl_socket_destroy(ctx->nl);
6110         rte_free(ctx->buf);
6111         rte_free(ctx);
6112 }