net/mlx5: fix TC rule handle assignment
dpdk.git: drivers/net/mlx5/mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
18 #include <stdalign.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <sys/socket.h>
24
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
28 #include <rte_flow.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31
32 #include "mlx5.h"
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
35
36 #ifdef HAVE_TC_ACT_VLAN
37
38 #include <linux/tc_act/tc_vlan.h>
39
40 #else /* HAVE_TC_ACT_VLAN */
41
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
50
51 struct tc_vlan {
52         tc_gen;
53         int v_action;
54 };
55
56 #endif /* HAVE_TC_ACT_VLAN */
57
58 #ifdef HAVE_TC_ACT_PEDIT
59
60 #include <linux/tc_act/tc_pedit.h>
61
62 #else /* HAVE_TC_ACT_PEDIT */
63
64 enum {
65         TCA_PEDIT_UNSPEC,
66         TCA_PEDIT_TM,
67         TCA_PEDIT_PARMS,
68         TCA_PEDIT_PAD,
69         TCA_PEDIT_PARMS_EX,
70         TCA_PEDIT_KEYS_EX,
71         TCA_PEDIT_KEY_EX,
72         __TCA_PEDIT_MAX
73 };
74
75 enum {
76         TCA_PEDIT_KEY_EX_HTYPE = 1,
77         TCA_PEDIT_KEY_EX_CMD = 2,
78         __TCA_PEDIT_KEY_EX_MAX
79 };
80
81 enum pedit_header_type {
82         TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83         TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84         TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85         TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86         TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87         TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
88         __PEDIT_HDR_TYPE_MAX,
89 };
90
91 enum pedit_cmd {
92         TCA_PEDIT_KEY_EX_CMD_SET = 0,
93         TCA_PEDIT_KEY_EX_CMD_ADD = 1,
94         __PEDIT_CMD_MAX,
95 };
96
97 struct tc_pedit_key {
98         __u32 mask; /* AND */
99         __u32 val; /* XOR */
100         __u32 off; /* offset */
101         __u32 at;
102         __u32 offmask;
103         __u32 shift;
104 };
105
106 __extension__
107 struct tc_pedit_sel {
108         tc_gen;
109         unsigned char nkeys;
110         unsigned char flags;
111         struct tc_pedit_key keys[0];
112 };
113
114 #endif /* HAVE_TC_ACT_PEDIT */
115
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
117
118 #include <linux/tc_act/tc_tunnel_key.h>
119
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
122 #endif
123
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
126 #endif
127
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
130 #endif
131
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
134 #endif
135
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
137
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
151
152 struct tc_tunnel_key {
153         tc_gen;
154         int t_action;
155 };
156
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
158
159 /* Normally found in linux/netlink.h. */
160 #ifndef NETLINK_CAP_ACK
161 #define NETLINK_CAP_ACK 10
162 #endif
163
164 /* Normally found in linux/pkt_sched.h. */
165 #ifndef TC_H_MIN_INGRESS
166 #define TC_H_MIN_INGRESS 0xfff2u
167 #endif
168
169 /* Normally found in linux/pkt_cls.h. */
170 #ifndef TCA_CLS_FLAGS_SKIP_SW
171 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
172 #endif
173 #ifndef TCA_CLS_FLAGS_IN_HW
174 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
175 #endif
176 #ifndef HAVE_TCA_CHAIN
177 #define TCA_CHAIN 11
178 #endif
179 #ifndef HAVE_TCA_FLOWER_ACT
180 #define TCA_FLOWER_ACT 3
181 #endif
182 #ifndef HAVE_TCA_FLOWER_FLAGS
183 #define TCA_FLOWER_FLAGS 22
184 #endif
185 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
186 #define TCA_FLOWER_KEY_ETH_TYPE 8
187 #endif
188 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
189 #define TCA_FLOWER_KEY_ETH_DST 4
190 #endif
191 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
192 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
193 #endif
194 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
195 #define TCA_FLOWER_KEY_ETH_SRC 6
196 #endif
197 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
198 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
199 #endif
200 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
201 #define TCA_FLOWER_KEY_IP_PROTO 9
202 #endif
203 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
204 #define TCA_FLOWER_KEY_IPV4_SRC 10
205 #endif
206 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
207 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
208 #endif
209 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
210 #define TCA_FLOWER_KEY_IPV4_DST 12
211 #endif
212 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
213 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
214 #endif
215 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
216 #define TCA_FLOWER_KEY_IPV6_SRC 14
217 #endif
218 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
219 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
220 #endif
221 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
222 #define TCA_FLOWER_KEY_IPV6_DST 16
223 #endif
224 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
225 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
226 #endif
227 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
228 #define TCA_FLOWER_KEY_TCP_SRC 18
229 #endif
230 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
231 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
232 #endif
233 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
234 #define TCA_FLOWER_KEY_TCP_DST 19
235 #endif
236 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
237 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
238 #endif
239 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
240 #define TCA_FLOWER_KEY_UDP_SRC 20
241 #endif
242 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
243 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
244 #endif
245 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
246 #define TCA_FLOWER_KEY_UDP_DST 21
247 #endif
248 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
249 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
250 #endif
251 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
252 #define TCA_FLOWER_KEY_VLAN_ID 23
253 #endif
254 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
255 #define TCA_FLOWER_KEY_VLAN_PRIO 24
256 #endif
257 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
258 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
259 #endif
260 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
261 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
262 #endif
263 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
264 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
265 #endif
266 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
267 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
268 #endif
269 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
270 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
271 #endif
272 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
273 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
274 #endif
275 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
276 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
277 #endif
278 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
279 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
280 #endif
281 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
282 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
283 #endif
284 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
285 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
286 #endif
287 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
288 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
289 #endif
290 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
291 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
292 #endif
293 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
294 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
295 #endif
296 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
297 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
298 #endif
299 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
300 #define TCA_FLOWER_KEY_TCP_FLAGS 71
301 #endif
302 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
303 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
304 #endif
305 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
306 #define TCA_FLOWER_KEY_IP_TOS 73
307 #endif
308 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
309 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
310 #endif
311 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
312 #define TCA_FLOWER_KEY_IP_TTL 75
313 #endif
314 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
315 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
316 #endif
317 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
318 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
319 #endif
320 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
321 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
322 #endif
323 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
324 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
325 #endif
326 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
327 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
328 #endif
329
330 #ifndef HAVE_TC_ACT_GOTO_CHAIN
331 #define TC_ACT_GOTO_CHAIN 0x20000000
332 #endif
333
334 #ifndef IPV6_ADDR_LEN
335 #define IPV6_ADDR_LEN 16
336 #endif
337
338 #ifndef IPV4_ADDR_LEN
339 #define IPV4_ADDR_LEN 4
340 #endif
341
342 #ifndef TP_PORT_LEN
343 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
344 #endif
345
346 #ifndef TTL_LEN
347 #define TTL_LEN 1
348 #endif
349
350 #ifndef TCA_ACT_MAX_PRIO
351 #define TCA_ACT_MAX_PRIO 32
352 #endif
353
354 /** Parameters of VXLAN devices created by driver. */
355 #define MLX5_VXLAN_DEFAULT_VNI  1
356 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
357
358 /** Tunnel action type, used for @p type in header structure. */
359 enum flow_tcf_tunact_type {
360         FLOW_TCF_TUNACT_VXLAN_DECAP,
361         FLOW_TCF_TUNACT_VXLAN_ENCAP,
362 };
363
364 /** Flags used for @p mask in tunnel action encap descriptors. */
365 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
366 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
367 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
368 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
369 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
370 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
371 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
372 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
373 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
374 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
375 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
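
/*
 * Illustration: the @p mask field of struct flow_tcf_vxlan_encap (defined
 * below) accumulates these bits as encapsulation items are parsed. For
 * example, a fully specified outer IPv4/UDP header would yield a mask such
 * as:
 *
 *   FLOW_TCF_ENCAP_ETH_DST | FLOW_TCF_ENCAP_IPV4_SRC |
 *   FLOW_TCF_ENCAP_IPV4_DST | FLOW_TCF_ENCAP_UDP_DST |
 *   FLOW_TCF_ENCAP_VXLAN_VNI
 */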
376
377 /**
378  * Structure for holding netlink context.
379  * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
380  * Using this (8KB) buffer size ensures that netlink messages will never be
381  * truncated.
382  */
383 struct mlx5_flow_tcf_context {
384         struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
385         uint32_t seq; /* Message sequence number. */
386         uint32_t buf_size; /* Message buffer size. */
387         uint8_t *buf; /* Message buffer. */
388 };
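
/*
 * A minimal initialization sketch for the context above (hypothetical
 * snippet for illustration only, error handling omitted; the real driver
 * performs the equivalent during context creation):
 *
 *   struct mlx5_flow_tcf_context *ctx =
 *           rte_zmalloc(__func__, sizeof(*ctx), 0);
 *
 *   ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
 *   ctx->buf = rte_zmalloc(__func__, ctx->buf_size, 0);
 *   ctx->seq = random();
 *   ctx->nl = mnl_socket_open(NETLINK_ROUTE);
 *   mnl_socket_bind(ctx->nl, 0, MNL_SOCKET_AUTOPID);
 */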
389
390 /**
391  * Neigh rule structure. The neigh rule is applied via Netlink to
392  * outer tunnel iface in order to provide destination MAC address
393  * for the VXLAN encapsulation. The neigh rule is implicitly related
394  * to the Flow itself and can be shared by multiple Flows.
395  */
396 struct tcf_neigh_rule {
397         LIST_ENTRY(tcf_neigh_rule) next;
398         uint32_t refcnt;
399         struct ether_addr eth;
400         uint16_t mask;
401         union {
402                 struct {
403                         rte_be32_t dst;
404                 } ipv4;
405                 struct {
406                         uint8_t dst[IPV6_ADDR_LEN];
407                 } ipv6;
408         };
409 };
410
411 /**
412  * Local rule structure. The local rule is applied via Netlink to
413  * outer tunnel iface in order to provide local and peer IP addresses
414  * of the VXLAN tunnel for encapsulation. The local rule is implicitly
415  * related to the Flow itself and can be shared by multiple Flows.
416  */
417 struct tcf_local_rule {
418         LIST_ENTRY(tcf_local_rule) next;
419         uint32_t refcnt;
420         uint16_t mask;
421         union {
422                 struct {
423                         rte_be32_t dst;
424                         rte_be32_t src;
425                 } ipv4;
426                 struct {
427                         uint8_t dst[IPV6_ADDR_LEN];
428                         uint8_t src[IPV6_ADDR_LEN];
429                 } ipv6;
430         };
431 };
432
433 /** Outer interface VXLAN encapsulation rules container. */
434 struct tcf_irule {
435         LIST_ENTRY(tcf_irule) next;
436         LIST_HEAD(, tcf_neigh_rule) neigh;
437         LIST_HEAD(, tcf_local_rule) local;
438         uint32_t refcnt;
439         unsigned int ifouter; /**< Own interface index. */
440 };
441
442 /** VXLAN virtual netdev. */
443 struct tcf_vtep {
444         LIST_ENTRY(tcf_vtep) next;
445         uint32_t refcnt;
446         unsigned int ifindex; /**< Own interface index. */
447         uint16_t port;
448         uint8_t created;
449 };
450
451 /** Tunnel descriptor header, common for all tunnel types. */
452 struct flow_tcf_tunnel_hdr {
453         uint32_t type; /**< Tunnel action type. */
454         struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
455         unsigned int ifindex_org; /**< Original dst/src interface */
456         unsigned int *ifindex_ptr; /**< Interface ptr in message. */
457 };
458
459 struct flow_tcf_vxlan_decap {
460         struct flow_tcf_tunnel_hdr hdr;
461         uint16_t udp_port;
462 };
463
464 struct flow_tcf_vxlan_encap {
465         struct flow_tcf_tunnel_hdr hdr;
466         struct tcf_irule *iface;
467         uint32_t mask;
468         uint8_t ip_tos;
469         uint8_t ip_ttl_hop;
470         struct {
471                 struct ether_addr dst;
472                 struct ether_addr src;
473         } eth;
474         union {
475                 struct {
476                         rte_be32_t dst;
477                         rte_be32_t src;
478                 } ipv4;
479                 struct {
480                         uint8_t dst[IPV6_ADDR_LEN];
481                         uint8_t src[IPV6_ADDR_LEN];
482                 } ipv6;
483         };
484         struct {
485                 rte_be16_t src;
486                 rte_be16_t dst;
487         } udp;
488         struct {
489                 uint8_t vni[3];
490         } vxlan;
491 };
492
493 /** Structure used when extracting the values of a flow counters
494  * from a netlink message.
495  */
496 struct flow_tcf_stats_basic {
497         bool valid;
498         struct gnet_stats_basic counters;
499 };
500
501 /** Empty masks for known item types. */
502 static const union {
503         struct rte_flow_item_port_id port_id;
504         struct rte_flow_item_eth eth;
505         struct rte_flow_item_vlan vlan;
506         struct rte_flow_item_ipv4 ipv4;
507         struct rte_flow_item_ipv6 ipv6;
508         struct rte_flow_item_tcp tcp;
509         struct rte_flow_item_udp udp;
510         struct rte_flow_item_vxlan vxlan;
511 } flow_tcf_mask_empty = {
512         {0},
513 };
514
515 /** Supported masks for known item types. */
516 static const struct {
517         struct rte_flow_item_port_id port_id;
518         struct rte_flow_item_eth eth;
519         struct rte_flow_item_vlan vlan;
520         struct rte_flow_item_ipv4 ipv4;
521         struct rte_flow_item_ipv6 ipv6;
522         struct rte_flow_item_tcp tcp;
523         struct rte_flow_item_udp udp;
524         struct rte_flow_item_vxlan vxlan;
525 } flow_tcf_mask_supported = {
526         .port_id = {
527                 .id = 0xffffffff,
528         },
529         .eth = {
530                 .type = RTE_BE16(0xffff),
531                 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
532                 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
533         },
534         .vlan = {
535                 /* PCP and VID only, no DEI. */
536                 .tci = RTE_BE16(0xefff),
537                 .inner_type = RTE_BE16(0xffff),
538         },
539         .ipv4.hdr = {
540                 .next_proto_id = 0xff,
541                 .time_to_live = 0xff,
542                 .type_of_service = 0xff,
543                 .src_addr = RTE_BE32(0xffffffff),
544                 .dst_addr = RTE_BE32(0xffffffff),
545         },
546         .ipv6.hdr = {
547                 .proto = 0xff,
548                 .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_TC_SHIFT),
549                 .hop_limits = 0xff,
550                 .src_addr =
551                         "\xff\xff\xff\xff\xff\xff\xff\xff"
552                         "\xff\xff\xff\xff\xff\xff\xff\xff",
553                 .dst_addr =
554                         "\xff\xff\xff\xff\xff\xff\xff\xff"
555                         "\xff\xff\xff\xff\xff\xff\xff\xff",
556         },
557         .tcp.hdr = {
558                 .src_port = RTE_BE16(0xffff),
559                 .dst_port = RTE_BE16(0xffff),
560                 .tcp_flags = 0xff,
561         },
562         .udp.hdr = {
563                 .src_port = RTE_BE16(0xffff),
564                 .dst_port = RTE_BE16(0xffff),
565         },
566         .vxlan = {
567                 .vni = "\xff\xff\xff",
568         },
569 };
570
571 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
572 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
573 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
574 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
575 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
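
/*
 * A worked example of the size helpers above: with a 4-byte struct nlattr,
 * SZ_NLATTR_TYPE_OF(uint16_t) == MNL_ALIGN(4 + 2) == 8 bytes and
 * SZ_NLATTR_STRZ_OF("pedit") == MNL_ALIGN(4 + 6) == 12 bytes, i.e. each
 * attribute occupies its aligned header plus its aligned payload.
 */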
576
577 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
578
579 /** DPDK port to network interface index (ifindex) conversion. */
580 struct flow_tcf_ptoi {
581         uint16_t port_id; /**< DPDK port ID. */
582         unsigned int ifindex; /**< Network interface index. */
583 };
584
585 /* Due to a driver/FW limitation. */
586 #define MLX5_TCF_GROUP_ID_MAX 3
587
588 /*
589  * Due to a driver/FW limitation, priority ranges from 1 to 16 in the kernel.
590  * Priority in the rte_flow attribute starts from 0 and is incremented by 1
591  * in translation, e.g. rte_flow priority 0 maps to kernel priority 1. This
592  * may be changed to determine the maximum priority by trial and error, as
593  * the Verbs driver does, once the restriction is lifted or the range extended.
594  */
595 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
596
597 #define MLX5_TCF_FATE_ACTIONS \
598         (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
599          MLX5_FLOW_ACTION_JUMP)
600
601 #define MLX5_TCF_VLAN_ACTIONS \
602         (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
603          MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
604
605 #define MLX5_TCF_VXLAN_ACTIONS \
606         (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
607
608 #define MLX5_TCF_PEDIT_ACTIONS \
609         (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
610          MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
611          MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
612          MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
613          MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
614
615 #define MLX5_TCF_CONFIG_ACTIONS \
616         (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
617          MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
618          MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
619          (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
620
621 #define MAX_PEDIT_KEYS 128
622 #define SZ_PEDIT_KEY_VAL 4
623
624 #define NUM_OF_PEDIT_KEYS(sz) \
625         (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
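
/*
 * A worked example for the macro above: a 6-byte MAC address needs
 * NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) == 6 / 4 + 1 == 2 keys of
 * SZ_PEDIT_KEY_VAL (4) bytes each, while a 16-byte IPv6 address needs
 * exactly 16 / 4 == 4 keys.
 */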
626
627 struct pedit_key_ex {
628         enum pedit_header_type htype;
629         enum pedit_cmd cmd;
630 };
631
632 struct pedit_parser {
633         struct tc_pedit_sel sel;
634         struct tc_pedit_key keys[MAX_PEDIT_KEYS];
635         struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
636 };
637
638 /**
639  * Create space for using the implicitly created TC flow counter.
640  *
644  * @return
645  *   A pointer to the counter data structure, NULL otherwise and
646  *   rte_errno is set.
647  */
648 static struct mlx5_flow_counter *
649 flow_tcf_counter_new(void)
650 {
651         struct mlx5_flow_counter *cnt;
652
653         /*
654          * E-Switch counters cannot be shared and their IDs are unknown,
655          * so all are currently returned with ID 0. Switching to unique
656          * IDs may be better in the future.
657          */
658         struct mlx5_flow_counter tmpl = {
659                 .ref_cnt = 1,
660         };
661         cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
662         if (!cnt) {
663                 rte_errno = ENOMEM;
664                 return NULL;
665         }
666         *cnt = tmpl;
667         /* Implicit counter, do not add to list. */
668         return cnt;
669 }
670
671 /**
672  * Set pedit key of MAC address
673  *
674  * @param[in] actions
675  *   pointer to action specification
676  * @param[in,out] p_parser
677  *   pointer to pedit_parser
678  */
679 static void
680 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
681                            struct pedit_parser *p_parser)
682 {
683         int idx = p_parser->sel.nkeys;
684         uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
685                                         offsetof(struct ether_hdr, s_addr) :
686                                         offsetof(struct ether_hdr, d_addr);
687         const struct rte_flow_action_set_mac *conf =
688                 (const struct rte_flow_action_set_mac *)actions->conf;
689
690         p_parser->keys[idx].off = off;
691         p_parser->keys[idx].mask = ~UINT32_MAX;
692         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
693         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
694         memcpy(&p_parser->keys[idx].val,
695                 conf->mac_addr, SZ_PEDIT_KEY_VAL);
696         idx++;
697         p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
698         p_parser->keys[idx].mask = 0xFFFF0000;
699         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
700         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
701         memcpy(&p_parser->keys[idx].val,
702                 conf->mac_addr + SZ_PEDIT_KEY_VAL,
703                 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
704         p_parser->sel.nkeys = (++idx);
705 }
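
/*
 * A note on the mask arithmetic above: for the SET command, kernel pedit
 * keeps the bits selected by mask and injects val into the rest
 * (new = (old & mask) ^ (val & ~mask) per 32-bit word). Hence ~UINT32_MAX
 * (== 0) means "replace all four bytes", while 0xFFFF0000 preserves the
 * upper half of the second word so only the remaining two MAC bytes are
 * rewritten. The constant is in host byte order and matches the wire
 * layout on little-endian hosts.
 */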
706
707 /**
708  * Set pedit key of decrease/set ttl
709  *
710  * @param[in] actions
711  *   pointer to action specification
712  * @param[in,out] p_parser
713  *   pointer to pedit_parser
714  * @param[in] item_flags
715  *   flags of all items presented
716  */
717 static void
718 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
719                                 struct pedit_parser *p_parser,
720                                 uint64_t item_flags)
721 {
722         int idx = p_parser->sel.nkeys;
723
724         p_parser->keys[idx].mask = 0xFFFFFF00;
725         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
726                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
727                 p_parser->keys[idx].off =
728                         offsetof(struct ipv4_hdr, time_to_live);
729         }
730         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
731                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
732                 p_parser->keys[idx].off =
733                         offsetof(struct ipv6_hdr, hop_limits);
734         }
735         if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
736                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
737                 p_parser->keys[idx].val = 0x000000FF;
738         } else {
739                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
740                 p_parser->keys[idx].val =
741                         (__u32)((const struct rte_flow_action_set_ttl *)
742                          actions->conf)->ttl_value;
743         }
744         p_parser->sel.nkeys = (++idx);
745 }
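
/*
 * A note on the DEC_TTL encoding above: pedit has no subtract command, so
 * TCA_PEDIT_KEY_EX_CMD_ADD is issued with val 0x000000FF, i.e. 255 is
 * added to the 8-bit TTL/hop-limit byte, which wraps around to TTL - 1.
 * The 0xFFFFFF00 mask keeps the neighboring header bytes intact on
 * little-endian hosts.
 */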
746
747 /**
748  * Set pedit key of transport (TCP/UDP) port value
749  *
750  * @param[in] actions
751  *   pointer to action specification
752  * @param[in,out] p_parser
753  *   pointer to pedit_parser
754  * @param[in] item_flags
755  *   flags of all items presented
756  */
757 static void
758 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
759                                 struct pedit_parser *p_parser,
760                                 uint64_t item_flags)
761 {
762         int idx = p_parser->sel.nkeys;
763
764         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
765                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
766         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
767                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
768         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
769         /* Offset of src/dst port is the same for TCP and UDP. */
770         p_parser->keys[idx].off =
771                 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
772                 offsetof(struct tcp_hdr, src_port) :
773                 offsetof(struct tcp_hdr, dst_port);
774         p_parser->keys[idx].mask = 0xFFFF0000;
775         p_parser->keys[idx].val =
776                 (__u32)((const struct rte_flow_action_set_tp *)
777                                 actions->conf)->port;
778         p_parser->sel.nkeys = (++idx);
779 }
780
781 /**
782  * Set pedit key of ipv6 address
783  *
784  * @param[in] actions
785  *   pointer to action specification
786  * @param[in,out] p_parser
787  *   pointer to pedit_parser
788  */
789 static void
790 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
791                                  struct pedit_parser *p_parser)
792 {
793         int idx = p_parser->sel.nkeys;
794         int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
795         int off_base =
796                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
797                 offsetof(struct ipv6_hdr, src_addr) :
798                 offsetof(struct ipv6_hdr, dst_addr);
799         const struct rte_flow_action_set_ipv6 *conf =
800                 (const struct rte_flow_action_set_ipv6 *)actions->conf;
801
802         for (int i = 0; i < keys; i++, idx++) {
803                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
804                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
805                 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
806                 p_parser->keys[idx].mask = ~UINT32_MAX;
807                 memcpy(&p_parser->keys[idx].val,
808                         conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
809                         SZ_PEDIT_KEY_VAL);
810         }
811         p_parser->sel.nkeys += keys;
812 }
813
814 /**
815  * Set pedit key of ipv4 address
816  *
817  * @param[in] actions
818  *   pointer to action specification
819  * @param[in,out] p_parser
820  *   pointer to pedit_parser
821  */
822 static void
823 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
824                                  struct pedit_parser *p_parser)
825 {
826         int idx = p_parser->sel.nkeys;
827
828         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
829         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
830         p_parser->keys[idx].off =
831                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
832                 offsetof(struct ipv4_hdr, src_addr) :
833                 offsetof(struct ipv4_hdr, dst_addr);
834         p_parser->keys[idx].mask = ~UINT32_MAX;
835         p_parser->keys[idx].val =
836                 ((const struct rte_flow_action_set_ipv4 *)
837                  actions->conf)->ipv4_addr;
838         p_parser->sel.nkeys = (++idx);
839 }
840
841 /**
842  * Create the pedit's netlink attributes in a netlink message
843  * on the pre-allocated message buffer.
844  *
845  * @param[in,out] nl
846  *   Pointer to the pre-allocated netlink message buffer.
847  * @param[in,out] actions
848  *   Pointer to a pointer of the actions specification.
849  * @param[in] item_flags
850  *   Flags of all items presented.
853  */
854 static void
855 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
856                               const struct rte_flow_action **actions,
857                               uint64_t item_flags)
858 {
859         struct pedit_parser p_parser;
860         struct nlattr *na_act_options;
861         struct nlattr *na_pedit_keys;
862
863         memset(&p_parser, 0, sizeof(p_parser));
864         mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
865         na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
866         /* All modify header actions should be in one tc-pedit action. */
867         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
868                 switch ((*actions)->type) {
869                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
870                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
871                         flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
872                         break;
873                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
874                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
875                         flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
876                         break;
877                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
878                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
879                         flow_tcf_pedit_key_set_tp_port(*actions,
880                                                         &p_parser, item_flags);
881                         break;
882                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
883                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
884                         flow_tcf_pedit_key_set_dec_ttl(*actions,
885                                                         &p_parser, item_flags);
886                         break;
887                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
888                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
889                         flow_tcf_pedit_key_set_mac(*actions, &p_parser);
890                         break;
891                 default:
892                         goto pedit_mnl_msg_done;
893                 }
894         }
895 pedit_mnl_msg_done:
896         p_parser.sel.action = TC_ACT_PIPE;
897         mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
898                      sizeof(p_parser.sel) +
899                      p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
900                      &p_parser);
901         na_pedit_keys =
902                 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
903         for (int i = 0; i < p_parser.sel.nkeys; i++) {
904                 struct nlattr *na_pedit_key =
905                         mnl_attr_nest_start(nl,
906                                             TCA_PEDIT_KEY_EX | NLA_F_NESTED);
907                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
908                                  p_parser.keys_ex[i].htype);
909                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
910                                  p_parser.keys_ex[i].cmd);
911                 mnl_attr_nest_end(nl, na_pedit_key);
912         }
913         mnl_attr_nest_end(nl, na_pedit_keys);
914         mnl_attr_nest_end(nl, na_act_options);
915         (*actions)--;
916 }
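
/*
 * A sketch of the attribute layout the routine above emits, reconstructed
 * from its mnl_attr_* calls (indentation denotes nesting):
 *
 *   TCA_ACT_KIND = "pedit"
 *   TCA_ACT_OPTIONS (nest)
 *     TCA_PEDIT_PARMS_EX = struct tc_pedit_sel + nkeys * struct tc_pedit_key
 *     TCA_PEDIT_KEYS_EX (nest)
 *       TCA_PEDIT_KEY_EX (nest, repeated nkeys times)
 *         TCA_PEDIT_KEY_EX_HTYPE = u16
 *         TCA_PEDIT_KEY_EX_CMD = u16
 */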
917
918 /**
919  * Calculate the maximum memory size of one TC-pedit action.
920  * One TC-pedit action can contain a set of keys, each defining
921  * a rewrite element (rte_flow action).
922  *
923  * @param[in,out] actions
924  *   Actions specification.
925  * @param[in,out] action_flags
926  *   Actions flags.
927  * @return
928  *   Maximum memory size of one TC-pedit action.
931  */
932 static int
933 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
934                                 uint64_t *action_flags)
935 {
936         int pedit_size = 0;
937         int keys = 0;
938         uint64_t flags = 0;
939
940         pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
941                       SZ_NLATTR_STRZ_OF("pedit") +
942                       SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
943         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
944                 switch ((*actions)->type) {
945                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
946                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
947                         flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
948                         break;
949                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
950                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
951                         flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
952                         break;
953                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
954                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
955                         flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
956                         break;
957                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
958                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
959                         flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
960                         break;
961                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
962                         /* TCP is the same as UDP. */
963                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
964                         flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
965                         break;
966                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
967                         /* TCP is the same as UDP. */
968                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
969                         flags |= MLX5_FLOW_ACTION_SET_TP_DST;
970                         break;
971                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
972                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
973                         flags |= MLX5_FLOW_ACTION_SET_TTL;
974                         break;
975                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
976                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
977                         flags |= MLX5_FLOW_ACTION_DEC_TTL;
978                         break;
979                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
980                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
981                         flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
982                         break;
983                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
984                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
985                         flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
986                         break;
987                 default:
988                         goto get_pedit_action_size_done;
989                 }
990         }
991 get_pedit_action_size_done:
992         /* TCA_PEDIT_PARMS_EX */
993         pedit_size +=
994                 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
995                                   keys * sizeof(struct tc_pedit_key));
996         pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
997         pedit_size += keys *
998                       /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
999                       (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
1000                        SZ_NLATTR_DATA_OF(2));
1001         (*action_flags) |= flags;
1002         (*actions)--;
1003         return pedit_size;
1004 }
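
/*
 * A worked example of the sizing above (a sketch; exact figures depend on
 * MNL_ALIGN): a lone SET_TTL action contributes a single key, so the
 * routine returns
 *
 *   SZ_NLATTR_NEST + SZ_NLATTR_STRZ_OF("pedit") + SZ_NLATTR_NEST
 *   + SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel)
 *                       + 1 * sizeof(struct tc_pedit_key))
 *   + SZ_NLATTR_NEST + 1 * (SZ_NLATTR_NEST + 2 * SZ_NLATTR_DATA_OF(2))
 *
 * and sets MLX5_FLOW_ACTION_SET_TTL in *action_flags.
 */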
1005
1006 /**
1007  * Retrieve mask for pattern item.
1008  *
1009  * This function does basic sanity checks on a pattern item in order to
1010  * return the most appropriate mask for it.
1011  *
1012  * @param[in] item
1013  *   Item specification.
1014  * @param[in] mask_default
1015  *   Default mask for pattern item as specified by the flow API.
1016  * @param[in] mask_supported
1017  *   Mask fields supported by the implementation.
1018  * @param[in] mask_empty
1019  *   Empty mask to return when there is no specification.
1020  * @param[out] error
1021  *   Perform verbose error reporting if not NULL.
1022  *
1023  * @return
1024  *   Either @p item->mask or one of the mask parameters on success, NULL
1025  *   otherwise and rte_errno is set.
1026  */
1027 static const void *
1028 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1029                    const void *mask_supported, const void *mask_empty,
1030                    size_t mask_size, struct rte_flow_error *error)
1031 {
1032         const uint8_t *mask;
1033         size_t i;
1034
1035         /* item->last and item->mask cannot exist without item->spec. */
1036         if (!item->spec && (item->mask || item->last)) {
1037                 rte_flow_error_set(error, EINVAL,
1038                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
1039                                    "\"mask\" or \"last\" field provided without"
1040                                    " a corresponding \"spec\"");
1041                 return NULL;
1042         }
1043         /* No spec, no mask, no problem. */
1044         if (!item->spec)
1045                 return mask_empty;
1046         mask = item->mask ? item->mask : mask_default;
1047         assert(mask);
1048         /*
1049          * Single-pass check to make sure that:
1050          * - Mask is supported, no bits are set outside mask_supported.
1051          * - Both item->spec and item->last are included in mask.
1052          */
1053         for (i = 0; i != mask_size; ++i) {
1054                 if (!mask[i])
1055                         continue;
1056                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1057                     ((const uint8_t *)mask_supported)[i]) {
1058                         rte_flow_error_set(error, ENOTSUP,
1059                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1060                                            "unsupported field found"
1061                                            " in \"mask\"");
1062                         return NULL;
1063                 }
1064                 if (item->last &&
1065                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
1066                     (((const uint8_t *)item->last)[i] & mask[i])) {
1067                         rte_flow_error_set(error, EINVAL,
1068                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1069                                            item->last,
1070                                            "range between \"spec\" and \"last\""
1071                                            " not comprised in \"mask\"");
1072                         return NULL;
1073                 }
1074         }
1075         return mask;
1076 }
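
/*
 * A typical call pattern for flow_tcf_item_mask() above (a sketch modeled
 * on the translation paths; "items" and "error" are assumed locals):
 *
 *   const struct rte_flow_item_udp *mask;
 *
 *   mask = flow_tcf_item_mask(items, &rte_flow_item_udp_mask,
 *                             &flow_tcf_mask_supported.udp,
 *                             &flow_tcf_mask_empty.udp,
 *                             sizeof(flow_tcf_mask_supported.udp),
 *                             error);
 *   if (!mask)
 *           return -rte_errno;
 */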
1077
1078 /**
1079  * Build a conversion table between port ID and ifindex.
1080  *
1081  * @param[in] dev
1082  *   Pointer to Ethernet device.
1083  * @param[out] ptoi
1084  *   Pointer to ptoi table.
1085  * @param[in] len
1086  *   Size of ptoi table provided.
1087  *
1088  * @return
1089  *   Size of ptoi table filled.
1090  */
1091 static unsigned int
1092 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1093                           unsigned int len)
1094 {
1095         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1096         uint16_t port_id[n + 1];
1097         unsigned int i;
1098         unsigned int own = 0;
1099
1100         /* At least one port is needed when no switch domain is present. */
1101         if (!n) {
1102                 n = 1;
1103                 port_id[0] = dev->data->port_id;
1104         } else {
1105                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1106         }
1107         if (n > len)
1108                 return 0;
1109         for (i = 0; i != n; ++i) {
1110                 struct rte_eth_dev_info dev_info;
1111
1112                 rte_eth_dev_info_get(port_id[i], &dev_info);
1113                 if (port_id[i] == dev->data->port_id)
1114                         own = i;
1115                 ptoi[i].port_id = port_id[i];
1116                 ptoi[i].ifindex = dev_info.if_index;
1117         }
1118         /* Ensure first entry of ptoi[] is the current device. */
1119         if (own) {
1120                 ptoi[n] = ptoi[0];
1121                 ptoi[0] = ptoi[own];
1122                 ptoi[own] = ptoi[n];
1123         }
1124         /* An entry with zero ifindex terminates ptoi[]. */
1125         ptoi[n].port_id = 0;
1126         ptoi[n].ifindex = 0;
1127         return n;
1128 }
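
/*
 * Usage sketch for flow_tcf_build_ptoi_table() above ("dev" and
 * "target_port_id" are assumed locals): entry 0 is always the caller's own
 * device and a zero ifindex terminates the table, so a port-to-ifindex
 * lookup reduces to a linear scan:
 *
 *   struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
 *   unsigned int i;
 *
 *   if (!flow_tcf_build_ptoi_table(dev, ptoi, PTOI_TABLE_SZ_MAX(dev)))
 *           return;
 *   for (i = 0; ptoi[i].ifindex; ++i)
 *           if (ptoi[i].port_id == target_port_id)
 *                   break;
 */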
1129
1130 /**
1131  * Verify the @p attr will be correctly understood by the E-switch.
1132  *
1133  * @param[in] attr
1134  *   Pointer to flow attributes
1135  * @param[out] error
1136  *   Pointer to error structure.
1137  *
1138  * @return
1139  *   0 on success, a negative errno value otherwise and rte_errno is set.
1140  */
1141 static int
1142 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1143                              struct rte_flow_error *error)
1144 {
1145         /*
1146          * Supported attributes: groups, some priorities and ingress only.
1147          * group is supported only if kernel supports chain. Don't care about
1148          * transfer as it is the caller's problem.
1149          */
1150         if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1151                 return rte_flow_error_set(error, ENOTSUP,
1152                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1153                                           "group ID larger than "
1154                                           RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1155                                           " isn't supported");
1156         else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1157                 return rte_flow_error_set(error, ENOTSUP,
1158                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1159                                           attr,
1160                                           "priority more than "
1161                                           RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1162                                           " is not supported");
1163         if (!attr->ingress)
1164                 return rte_flow_error_set(error, EINVAL,
1165                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1166                                           attr, "only ingress is supported");
1167         if (attr->egress)
1168                 return rte_flow_error_set(error, ENOTSUP,
1169                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1170                                           attr, "egress is not supported");
1171         return 0;
1172 }
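
/*
 * An attribute set accepted by the validator above (illustrative values
 * within the documented limits):
 *
 *   struct rte_flow_attr attr = {
 *           .group = MLX5_TCF_GROUP_ID_MAX,
 *           .priority = MLX5_TCF_GROUP_PRIORITY_MAX,
 *           .ingress = 1,
 *           .egress = 0,
 *   };
 */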
1173
1174 /**
1175  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1176  * The routine checks the L2 fields to be used in encapsulation header.
1177  *
1178  * @param[in] item
1179  *   Pointer to the item structure.
1180  * @param[out] error
1181  *   Pointer to the error structure.
1182  *
1183  * @return
1184  *   0 on success, a negative errno value otherwise and rte_errno is set.
1185  **/
1186 static int
1187 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1188                                   struct rte_flow_error *error)
1189 {
1190         const struct rte_flow_item_eth *spec = item->spec;
1191         const struct rte_flow_item_eth *mask = item->mask;
1192
1193         if (!spec) {
1194                 /*
1195                  * Specification for L2 addresses can be empty
1196                  * because they are optional and not required
1197                  * directly by the tc rule. The kernel tries to
1198                  * resolve them on its own.
1199                  */
1200                 return 0;
1201         }
1202         if (!mask) {
1203                 /* If mask is not specified use the default one. */
1204                 mask = &rte_flow_item_eth_mask;
1205         }
1206         if (memcmp(&mask->dst,
1207                    &flow_tcf_mask_empty.eth.dst,
1208                    sizeof(flow_tcf_mask_empty.eth.dst))) {
1209                 if (memcmp(&mask->dst,
1210                            &rte_flow_item_eth_mask.dst,
1211                            sizeof(rte_flow_item_eth_mask.dst)))
1212                         return rte_flow_error_set
1213                                 (error, ENOTSUP,
1214                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1215                                  "no support for partial mask on"
1216                                  " \"eth.dst\" field");
1217         }
1218         if (memcmp(&mask->src,
1219                    &flow_tcf_mask_empty.eth.src,
1220                    sizeof(flow_tcf_mask_empty.eth.src))) {
1221                 if (memcmp(&mask->src,
1222                            &rte_flow_item_eth_mask.src,
1223                            sizeof(rte_flow_item_eth_mask.src)))
1224                         return rte_flow_error_set
1225                                 (error, ENOTSUP,
1226                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1227                                  "no support for partial mask on"
1228                                  " \"eth.src\" field");
1229         }
1230         if (mask->type != RTE_BE16(0x0000)) {
1231                 if (mask->type != RTE_BE16(0xffff))
1232                         return rte_flow_error_set
1233                                 (error, ENOTSUP,
1234                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1235                                  "no support for partial mask on"
1236                                  " \"eth.type\" field");
1237                 DRV_LOG(WARNING,
1238                         "outer ethernet type field"
1239                         " cannot be forced for vxlan"
1240                         " encapsulation, parameter ignored");
1241         }
1242         return 0;
1243 }
1244
1245 /**
1246  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1247  * The routine checks the IPv4 fields to be used in encapsulation header.
1248  *
1249  * @param[in] item
1250  *   Pointer to the item structure.
1251  * @param[out] error
1252  *   Pointer to the error structure.
1253  *
1254  * @return
1255  *   0 on success, a negative errno value otherwise and rte_errno is set.
1256  **/
1257 static int
1258 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1259                                    struct rte_flow_error *error)
1260 {
1261         const struct rte_flow_item_ipv4 *spec = item->spec;
1262         const struct rte_flow_item_ipv4 *mask = item->mask;
1263
1264         if (!spec) {
1265                 /*
1266                  * Specification for IP addresses cannot be empty
1267                  * because it is required by tunnel_key parameter.
1268                  */
1269                 return rte_flow_error_set(error, EINVAL,
1270                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1271                                           "NULL outer ipv4 address"
1272                                           " specification for vxlan"
1273                                           " encapsulation");
1274         }
1275         if (!mask)
1276                 mask = &rte_flow_item_ipv4_mask;
1277         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1278                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1279                         return rte_flow_error_set
1280                                 (error, ENOTSUP,
1281                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1282                                  "no support for partial mask on"
1283                                  " \"ipv4.hdr.dst_addr\" field"
1284                                  " for vxlan encapsulation");
1285                 /* More IPv4 address validations can be put here. */
1286         } else {
1287                 /*
1288                  * Kernel uses the destination IP address to determine
1289                  * the routing path and obtain the MAC destination
1290                  * address, so IP destination address must be
1291                  * specified in the tc rule.
1292                  */
1293                 return rte_flow_error_set(error, EINVAL,
1294                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1295                                           "outer ipv4 destination address"
1296                                           " must be specified for"
1297                                           " vxlan encapsulation");
1298         }
1299         if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1300                 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1301                         return rte_flow_error_set
1302                                 (error, ENOTSUP,
1303                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1304                                  "no support for partial mask on"
1305                                  " \"ipv4.hdr.src_addr\" field"
1306                                  " for vxlan encapsulation");
1307                 /* More IPv4 address validations can be put here. */
1308         } else {
1309                 /*
1310                  * Kernel uses the source IP address to select the
1311                  * interface for egress encapsulated traffic, so
1312                  * it must be specified in the tc rule.
1313                  */
1314                 return rte_flow_error_set(error, EINVAL,
1315                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1316                                           "outer ipv4 source address"
1317                                           " must be specified for"
1318                                           " vxlan encapsulation");
1319         }
1320         if (mask->hdr.type_of_service &&
1321             mask->hdr.type_of_service != 0xff)
1322                 return rte_flow_error_set(error, ENOTSUP,
1323                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1324                                           "no support for partial mask on"
1325                                           " \"ipv4.hdr.type_of_service\" field"
1326                                           " for vxlan encapsulation");
1327         if (mask->hdr.time_to_live &&
1328             mask->hdr.time_to_live != 0xff)
1329                 return rte_flow_error_set(error, ENOTSUP,
1330                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1331                                           "no support for partial mask on"
1332                                           " \"ipv4.hdr.time_to_live\" field"
1333                                           " for vxlan encapsulation");
1334         return 0;
1335 }
1336
1337 /**
1338  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1339  * The routine checks the IPv6 fields to be used in encapsulation header.
1340  *
1341  * @param[in] item
1342  *   Pointer to the item structure.
1343  * @param[out] error
1344  *   Pointer to the error structure.
1345  *
1346  * @return
1347  *   0 on success, a negative errno value otherwise and rte_errno is set.
1348  **/
1349 static int
1350 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1351                                    struct rte_flow_error *error)
1352 {
1353         const struct rte_flow_item_ipv6 *spec = item->spec;
1354         const struct rte_flow_item_ipv6 *mask = item->mask;
1355         uint8_t msk6;
1356
1357         if (!spec) {
1358                 /*
1359                  * Specification for IP addresses cannot be empty
1360                  * because it is required by tunnel_key parameter.
1361                  */
1362                 return rte_flow_error_set(error, EINVAL,
1363                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1364                                           "NULL outer ipv6 address"
1365                                           " specification for"
1366                                           " vxlan encapsulation");
1367         }
1368         if (!mask)
1369                 mask = &rte_flow_item_ipv6_mask;
1370         if (memcmp(&mask->hdr.dst_addr,
1371                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1372                    IPV6_ADDR_LEN)) {
1373                 if (memcmp(&mask->hdr.dst_addr,
1374                            &rte_flow_item_ipv6_mask.hdr.dst_addr,
1375                            IPV6_ADDR_LEN))
1376                         return rte_flow_error_set
1377                                         (error, ENOTSUP,
1378                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1379                                          "no support for partial mask on"
1380                                          " \"ipv6.hdr.dst_addr\" field"
1381                                          " for vxlan encapsulation");
1382                 /* More IPv6 address validations can be put here. */
1383         } else {
1384                 /*
1385                  * Kernel uses the destination IP address to determine
1386                  * the routing path and obtain the MAC destination
1387                  * address (neighbor or gateway), so IP destination address
1388                  * must be specified within the tc rule.
1389                  */
1390                 return rte_flow_error_set(error, EINVAL,
1391                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1392                                           "outer ipv6 destination address"
1393                                           " must be specified for"
1394                                           " vxlan encapsulation");
1395         }
1396         if (memcmp(&mask->hdr.src_addr,
1397                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1398                    IPV6_ADDR_LEN)) {
1399                 if (memcmp(&mask->hdr.src_addr,
1400                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1401                            IPV6_ADDR_LEN))
1402                         return rte_flow_error_set
1403                                         (error, ENOTSUP,
1404                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1405                                          "no support for partial mask on"
1406                                          " \"ipv6.hdr.src_addr\" field"
1407                                          " for vxlan encapsulation");
1408                 /* More L3 address validation can be put here. */
1409         } else {
1410                 /*
1411                  * Kernel uses the source IP address to select the
1412                  * interface for egress encapsulated traffic, so
1413                  * it must be specified in the tc rule.
1414                  */
1415                 return rte_flow_error_set(error, EINVAL,
1416                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1417                                           "outer L3 source address"
1418                                           " must be specified for"
1419                                           " vxlan encapsulation");
1420         }
1421         msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1422                 IPV6_HDR_TC_SHIFT) & 0xff;
1423         if (msk6 && msk6 != 0xff)
1424                 return rte_flow_error_set(error, ENOTSUP,
1425                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1426                                           "no support for partial mask on"
1427                                           " \"ipv6.hdr.vtc_flow.tos\" field"
1428                                           " for vxlan encapsulation");
1429         if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1430                 return rte_flow_error_set(error, ENOTSUP,
1431                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1432                                           "no support for partial mask on"
1433                                           " \"ipv6.hdr.hop_limits\" field"
1434                                           " for vxlan encapsulation");
1435         return 0;
1436 }
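
/*
 * Illustrative sketch only (not used by the driver): an application-side
 * IPv6 spec/mask pair that the validator above accepts, with both addresses
 * fully specified under all-ones masks and TOS and hop limit wildcarded.
 * The RFC 3849 documentation addresses and the flow_tcf_example_* names
 * are assumptions made for this example.
 */
static const struct rte_flow_item_ipv6 flow_tcf_example_encap_ipv6_spec
__rte_unused = {
        .hdr = {
                /* 2001:db8::1, assumed outer source address. */
                .src_addr = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0x01 },
                /* 2001:db8::2, assumed outer destination address. */
                .dst_addr = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0x02 },
        },
};
static const struct rte_flow_item_ipv6 flow_tcf_example_encap_ipv6_mask
__rte_unused = {
        .hdr = {
                .src_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
                .dst_addr = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                              0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
        },
};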
1437
1438 /**
1439  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
 * The routine checks the UDP fields to be used in the encapsulation header.
1441  *
1442  * @param[in] item
1443  *   Pointer to the item structure.
1444  * @param[out] error
1445  *   Pointer to the error structure.
1446  *
1447  * @return
1448  *   0 on success, a negative errno value otherwise and rte_errno is set.
1449  **/
1450 static int
1451 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1452                                   struct rte_flow_error *error)
1453 {
1454         const struct rte_flow_item_udp *spec = item->spec;
1455         const struct rte_flow_item_udp *mask = item->mask;
1456
1457         if (!spec) {
1458                 /*
                 * The specification for UDP ports cannot be empty
                 * because it is required by the tunnel_key parameter.
1461                  */
1462                 return rte_flow_error_set(error, EINVAL,
1463                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL UDP port specification"
1465                                           " for vxlan encapsulation");
1466         }
1467         if (!mask)
1468                 mask = &rte_flow_item_udp_mask;
1469         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1470                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1471                         return rte_flow_error_set
1472                                         (error, ENOTSUP,
1473                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1474                                          "no support for partial mask on"
1475                                          " \"udp.hdr.dst_port\" field"
1476                                          " for vxlan encapsulation");
1477                 if (!spec->hdr.dst_port)
1478                         return rte_flow_error_set
1479                                         (error, EINVAL,
1480                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1481                                          "outer UDP remote port cannot be"
1482                                          " 0 for vxlan encapsulation");
1483         } else {
1484                 return rte_flow_error_set(error, EINVAL,
1485                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1486                                           "outer UDP remote port"
1487                                           " must be specified for"
1488                                           " vxlan encapsulation");
1489         }
1490         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1491                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1492                         return rte_flow_error_set
1493                                         (error, ENOTSUP,
1494                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1495                                          "no support for partial mask on"
1496                                          " \"udp.hdr.src_port\" field"
1497                                          " for vxlan encapsulation");
1498                 DRV_LOG(WARNING,
1499                         "outer UDP source port cannot be"
1500                         " forced for vxlan encapsulation,"
1501                         " parameter ignored");
1502         }
1503         return 0;
1504 }
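
/*
 * Illustrative sketch only (not used by the driver): the UDP item shape
 * accepted by the validator above, with the destination port non-zero and
 * exactly masked while the source port is left to the kernel (a forced
 * value would only trigger the warning and be ignored). Port 4789 (the
 * IANA VXLAN port) and the flow_tcf_example_* names are assumptions made
 * for this example.
 */
static const struct rte_flow_item_udp flow_tcf_example_encap_udp_spec
__rte_unused = {
        .hdr = {
                .dst_port = RTE_BE16(4789),
        },
};
static const struct rte_flow_item_udp flow_tcf_example_encap_udp_mask
__rte_unused = {
        .hdr = {
                .dst_port = RTE_BE16(0xffff),
        },
};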
1505
1506 /**
1507  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
 * The routine checks the VNI field to be used in the encapsulation header.
1509  *
1510  * @param[in] item
1511  *   Pointer to the item structure.
1512  * @param[out] error
1513  *   Pointer to the error structure.
1514  *
1515  * @return
1516  *   0 on success, a negative errno value otherwise and rte_errno is set.
1517  **/
1518 static int
1519 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1520                                   struct rte_flow_error *error)
1521 {
1522         const struct rte_flow_item_vxlan *spec = item->spec;
1523         const struct rte_flow_item_vxlan *mask = item->mask;
1524
1525         if (!spec) {
                /* Outer VNI is required by the tunnel_key parameter. */
1527                 return rte_flow_error_set(error, EINVAL,
1528                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1529                                           "NULL VNI specification"
1530                                           " for vxlan encapsulation");
1531         }
1532         if (!mask)
1533                 mask = &rte_flow_item_vxlan_mask;
1534         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1535                 return rte_flow_error_set(error, EINVAL,
1536                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1537                                           "outer VNI must be specified "
1538                                           "for vxlan encapsulation");
1539         if (mask->vni[0] != 0xff ||
1540             mask->vni[1] != 0xff ||
1541             mask->vni[2] != 0xff)
1542                 return rte_flow_error_set(error, ENOTSUP,
1543                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1544                                           "no support for partial mask on"
1545                                           " \"vxlan.vni\" field");
1546
1547         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1548                 return rte_flow_error_set(error, EINVAL,
1549                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1550                                           "vxlan vni cannot be 0");
1551         return 0;
1552 }
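
/*
 * Illustrative sketch only (not used by the driver): a VNI item passing
 * the validator above, with the 24-bit VNI non-zero and covered by an
 * all-ones mask (partial and empty masks are rejected). VNI 42 and the
 * flow_tcf_example_* names are assumptions made for this example.
 */
static const struct rte_flow_item_vxlan flow_tcf_example_encap_vni_spec
__rte_unused = {
        .vni = { 0x00, 0x00, 0x2a },
};
static const struct rte_flow_item_vxlan flow_tcf_example_encap_vni_mask
__rte_unused = {
        .vni = { 0xff, 0xff, 0xff },
};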
1553
1554 /**
1555  * Validate VXLAN_ENCAP action item list for E-Switch.
 * The routine checks the items to be used in the encapsulation header.
1557  *
1558  * @param[in] action
1559  *   Pointer to the VXLAN_ENCAP action structure.
1560  * @param[out] error
1561  *   Pointer to the error structure.
1562  *
1563  * @return
1564  *   0 on success, a negative errno value otherwise and rte_errno is set.
1565  **/
1566 static int
1567 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1568                               struct rte_flow_error *error)
1569 {
1570         const struct rte_flow_item *items;
1571         int ret;
1572         uint32_t item_flags = 0;
1573
1574         if (!action->conf)
1575                 return rte_flow_error_set(error, EINVAL,
1576                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1577                                           "Missing vxlan tunnel"
1578                                           " action configuration");
1579         items = ((const struct rte_flow_action_vxlan_encap *)
1580                                         action->conf)->definition;
1581         if (!items)
1582                 return rte_flow_error_set(error, EINVAL,
1583                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1584                                           "Missing vxlan tunnel"
1585                                           " encapsulation parameters");
1586         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1587                 switch (items->type) {
1588                 case RTE_FLOW_ITEM_TYPE_VOID:
1589                         break;
1590                 case RTE_FLOW_ITEM_TYPE_ETH:
1591                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1592                                                           error);
1593                         if (ret < 0)
1594                                 return ret;
1595                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1596                         if (ret < 0)
1597                                 return ret;
1598                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
                        break;
1601                 case RTE_FLOW_ITEM_TYPE_IPV4:
1602                         ret = mlx5_flow_validate_item_ipv4
1603                                         (items, item_flags,
1604                                          &flow_tcf_mask_supported.ipv4, error);
1605                         if (ret < 0)
1606                                 return ret;
1607                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1608                         if (ret < 0)
1609                                 return ret;
1610                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1611                         break;
1612                 case RTE_FLOW_ITEM_TYPE_IPV6:
1613                         ret = mlx5_flow_validate_item_ipv6
1614                                         (items, item_flags,
1615                                          &flow_tcf_mask_supported.ipv6, error);
1616                         if (ret < 0)
1617                                 return ret;
1618                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1619                         if (ret < 0)
1620                                 return ret;
1621                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1622                         break;
1623                 case RTE_FLOW_ITEM_TYPE_UDP:
1624                         ret = mlx5_flow_validate_item_udp(items, item_flags,
                                                          0xff, error);
1626                         if (ret < 0)
1627                                 return ret;
1628                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1629                         if (ret < 0)
1630                                 return ret;
1631                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1632                         break;
1633                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1634                         ret = mlx5_flow_validate_item_vxlan(items,
1635                                                             item_flags, error);
1636                         if (ret < 0)
1637                                 return ret;
1638                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1639                         if (ret < 0)
1640                                 return ret;
1641                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1642                         break;
1643                 default:
1644                         return rte_flow_error_set
1645                                         (error, ENOTSUP,
1646                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1647                                          "vxlan encap item not supported");
1648                 }
1649         }
1650         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1651                 return rte_flow_error_set(error, EINVAL,
1652                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1653                                           "no outer IP layer found"
1654                                           " for vxlan encapsulation");
1655         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1656                 return rte_flow_error_set(error, EINVAL,
1657                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1658                                           "no outer UDP layer found"
1659                                           " for vxlan encapsulation");
1660         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1661                 return rte_flow_error_set(error, EINVAL,
1662                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1663                                           "no VXLAN VNI found"
1664                                           " for vxlan encapsulation");
1665         return 0;
1666 }
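
/*
 * Illustrative sketch only (not used by the driver): a minimal VXLAN_ENCAP
 * definition list that satisfies the validator above, with outer ETH,
 * L3 (IPv4 here), UDP and VXLAN items terminated by END. NULL masks fall
 * back to the rte_flow default masks, which fully cover the fields set
 * below. All spec values and the flow_tcf_example_* names are assumptions
 * made for this example.
 */
static const struct rte_flow_item_eth flow_tcf_example_encap_eth = {
        .dst.addr_bytes = "\x00\x16\x3e\x2a\x4b\x6c", /* Assumed MACs. */
        .src.addr_bytes = "\x00\x16\x3e\x1d\x2c\x3b",
};
static const struct rte_flow_item_ipv4 flow_tcf_example_encap_ipv4 = {
        .hdr = {
                .src_addr = RTE_BE32(0xc0a80001), /* 192.168.0.1, assumed. */
                .dst_addr = RTE_BE32(0xc0a80002), /* 192.168.0.2, assumed. */
        },
};
static const struct rte_flow_item_udp flow_tcf_example_encap_udp = {
        .hdr = {
                .dst_port = RTE_BE16(4789), /* IANA VXLAN port, assumed. */
        },
};
static const struct rte_flow_item_vxlan flow_tcf_example_encap_vxlan = {
        .vni = { 0x00, 0x00, 0x2a }, /* VNI 42, assumed. */
};
static const struct rte_flow_item flow_tcf_example_encap_definition[]
__rte_unused = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH,
          .spec = &flow_tcf_example_encap_eth },
        { .type = RTE_FLOW_ITEM_TYPE_IPV4,
          .spec = &flow_tcf_example_encap_ipv4 },
        { .type = RTE_FLOW_ITEM_TYPE_UDP,
          .spec = &flow_tcf_example_encap_udp },
        { .type = RTE_FLOW_ITEM_TYPE_VXLAN,
          .spec = &flow_tcf_example_encap_vxlan },
        { .type = RTE_FLOW_ITEM_TYPE_END },
};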
1667
1668 /**
1669  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1670  * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1671  *
1672  * @param[in] udp
1673  *   Outer UDP layer item (if any, NULL otherwise).
1674  * @param[out] error
1675  *   Pointer to the error structure.
1676  *
1677  * @return
1678  *   0 on success, a negative errno value otherwise and rte_errno is set.
1679  **/
1680 static int
1681 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1682                                   struct rte_flow_error *error)
1683 {
1684         const struct rte_flow_item_udp *spec = udp->spec;
1685         const struct rte_flow_item_udp *mask = udp->mask;
1686
1687         if (!spec)
1688                 /*
                 * The specification for UDP ports cannot be empty
                 * because it is required as a decap parameter.
1691                  */
1692                 return rte_flow_error_set(error, EINVAL,
1693                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1694                                           "NULL UDP port specification"
1695                                           " for VXLAN decapsulation");
1696         if (!mask)
1697                 mask = &rte_flow_item_udp_mask;
1698         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1699                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1700                         return rte_flow_error_set
1701                                         (error, ENOTSUP,
1702                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1703                                          "no support for partial mask on"
1704                                          " \"udp.hdr.dst_port\" field");
1705                 if (!spec->hdr.dst_port)
1706                         return rte_flow_error_set
1707                                         (error, EINVAL,
1708                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1709                                          "zero decap local UDP port");
1710         } else {
1711                 return rte_flow_error_set(error, EINVAL,
1712                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1713                                           "outer UDP destination port must be "
1714                                           "specified for vxlan decapsulation");
1715         }
1716         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1717                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1718                         return rte_flow_error_set
1719                                         (error, ENOTSUP,
1720                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1721                                          "no support for partial mask on"
1722                                          " \"udp.hdr.src_port\" field");
1723                 DRV_LOG(WARNING,
1724                         "outer UDP local port cannot be "
                        "forced for VXLAN decapsulation, "
1726                         "parameter ignored");
1727         }
1728         return 0;
1729 }
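
/*
 * Illustrative sketch only (not used by the driver): the outer UDP item
 * shape accepted by the decap validator above, with the local
 * (destination) port non-zero and exactly masked so the kernel can bind
 * the VXLAN device to it, and the remote (source) port wildcarded.
 * Port 4789 and the flow_tcf_example_* names are assumptions made for
 * this example.
 */
static const struct rte_flow_item_udp flow_tcf_example_decap_udp_spec
__rte_unused = {
        .hdr = {
                .dst_port = RTE_BE16(4789),
        },
};
static const struct rte_flow_item_udp flow_tcf_example_decap_udp_mask
__rte_unused = {
        .hdr = {
                .dst_port = RTE_BE16(0xffff),
        },
};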
1730
1731 /**
1732  * Validate flow for E-Switch.
1733  *
 * @param[in] dev
 *   Pointer to the Ethernet device structure.
1736  * @param[in] attr
1737  *   Pointer to the flow attributes.
1738  * @param[in] items
1739  *   Pointer to the list of items.
1740  * @param[in] actions
1741  *   Pointer to the list of actions.
1742  * @param[out] error
1743  *   Pointer to the error structure.
1744  *
1745  * @return
1746  *   0 on success, a negative errno value otherwise and rte_errno is set.
1747  */
1748 static int
1749 flow_tcf_validate(struct rte_eth_dev *dev,
1750                   const struct rte_flow_attr *attr,
1751                   const struct rte_flow_item items[],
1752                   const struct rte_flow_action actions[],
1753                   struct rte_flow_error *error)
1754 {
1755         union {
1756                 const struct rte_flow_item_port_id *port_id;
1757                 const struct rte_flow_item_eth *eth;
1758                 const struct rte_flow_item_vlan *vlan;
1759                 const struct rte_flow_item_ipv4 *ipv4;
1760                 const struct rte_flow_item_ipv6 *ipv6;
1761                 const struct rte_flow_item_tcp *tcp;
1762                 const struct rte_flow_item_udp *udp;
1763                 const struct rte_flow_item_vxlan *vxlan;
1764         } spec, mask;
1765         union {
1766                 const struct rte_flow_action_port_id *port_id;
1767                 const struct rte_flow_action_jump *jump;
1768                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1769                 const struct rte_flow_action_of_set_vlan_vid *
1770                         of_set_vlan_vid;
1771                 const struct rte_flow_action_of_set_vlan_pcp *
1772                         of_set_vlan_pcp;
1773                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1774                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1775                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1776         } conf;
1777         const struct rte_flow_item *outer_udp = NULL;
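        /*
         * Expected EtherType trackers: ETH_P_ALL means "not constrained
         * yet". ETH, VLAN and L3 items narrow these values and any
         * conflict between the items is rejected below.
         */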
1778         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1779         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1780         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1781         uint64_t item_flags = 0;
1782         uint64_t action_flags = 0;
1783         uint8_t next_protocol = 0xff;
1784         unsigned int tcm_ifindex = 0;
1785         uint8_t pedit_validated = 0;
1786         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1787         struct rte_eth_dev *port_id_dev = NULL;
        bool in_port_id_set = false;
1789         int ret;
1790
1791         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1792                                                 PTOI_TABLE_SZ_MAX(dev)));
1793         ret = flow_tcf_validate_attributes(attr, error);
1794         if (ret < 0)
1795                 return ret;
1796         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1797                 unsigned int i;
1798                 uint64_t current_action_flag = 0;
1799
1800                 switch (actions->type) {
1801                 case RTE_FLOW_ACTION_TYPE_VOID:
1802                         break;
1803                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1804                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1805                         if (!actions->conf)
1806                                 break;
1807                         conf.port_id = actions->conf;
1808                         if (conf.port_id->original)
1809                                 i = 0;
1810                         else
1811                                 for (i = 0; ptoi[i].ifindex; ++i)
1812                                         if (ptoi[i].port_id == conf.port_id->id)
1813                                                 break;
1814                         if (!ptoi[i].ifindex)
1815                                 return rte_flow_error_set
1816                                         (error, ENODEV,
1817                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1818                                          conf.port_id,
1819                                          "missing data to convert port ID to"
1820                                          " ifindex");
1821                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1822                         break;
1823                 case RTE_FLOW_ACTION_TYPE_JUMP:
1824                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1825                         if (!actions->conf)
1826                                 break;
1827                         conf.jump = actions->conf;
1828                         if (attr->group >= conf.jump->group)
1829                                 return rte_flow_error_set
1830                                         (error, ENOTSUP,
1831                                          RTE_FLOW_ERROR_TYPE_ACTION,
1832                                          actions,
                                          "can jump only to a higher group");
1834                         break;
1835                 case RTE_FLOW_ACTION_TYPE_DROP:
1836                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1837                         break;
1838                 case RTE_FLOW_ACTION_TYPE_COUNT:
1839                         break;
1840                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1841                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1842                         break;
1843                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1844                         rte_be16_t ethertype;
1845
1846                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1847                         if (!actions->conf)
1848                                 break;
1849                         conf.of_push_vlan = actions->conf;
1850                         ethertype = conf.of_push_vlan->ethertype;
1851                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1852                             ethertype != RTE_BE16(ETH_P_8021AD))
1853                                 return rte_flow_error_set
1854                                         (error, EINVAL,
1855                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1856                                          "vlan push TPID must be "
1857                                          "802.1Q or 802.1AD");
1858                         break;
1859                 }
1860                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1861                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1862                                 return rte_flow_error_set
1863                                         (error, ENOTSUP,
1864                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1865                                          "vlan modify is not supported,"
1866                                          " set action must follow push action");
1867                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1868                         break;
1869                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1870                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1871                                 return rte_flow_error_set
1872                                         (error, ENOTSUP,
1873                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1874                                          "vlan modify is not supported,"
1875                                          " set action must follow push action");
1876                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1877                         break;
1878                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1879                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1880                         break;
1881                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1882                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1883                         if (ret < 0)
1884                                 return ret;
1885                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1886                         break;
1887                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1888                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1889                         break;
1890                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1891                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1892                         break;
1893                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1894                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1895                         break;
1896                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1897                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1898                         break;
1899                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1900                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1901                         break;
1902                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1903                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1904                         break;
1905                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1906                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1907                         break;
1908                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1909                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1910                         break;
1911                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1912                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1913                         break;
1914                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1915                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1916                         break;
1917                 default:
1918                         return rte_flow_error_set(error, ENOTSUP,
1919                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1920                                                   actions,
1921                                                   "action not supported");
1922                 }
1923                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1924                         if (!actions->conf)
1925                                 return rte_flow_error_set
1926                                         (error, EINVAL,
1927                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1928                                          actions,
1929                                          "action configuration not set");
1930                 }
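                /*
                 * Pedit (set_*) actions must be listed as one contiguous
                 * group: pedit_validated latches once a non-pedit action
                 * follows a pedit one, so any later pedit action is
                 * rejected below.
                 */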
1931                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1932                     pedit_validated)
1933                         return rte_flow_error_set(error, ENOTSUP,
1934                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1935                                                   actions,
1936                                                   "set actions should be "
1937                                                   "listed successively");
1938                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1939                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1940                         pedit_validated = 1;
1941                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1942                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1943                         return rte_flow_error_set(error, EINVAL,
1944                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1945                                                   actions,
1946                                                   "can't have multiple fate"
1947                                                   " actions");
1948                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1949                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1950                         return rte_flow_error_set(error, EINVAL,
1951                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1952                                                   actions,
1953                                                   "can't have multiple vxlan"
1954                                                   " actions");
1955                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1956                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1957                         return rte_flow_error_set(error, ENOTSUP,
1958                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1959                                                   actions,
1960                                                   "can't have vxlan and vlan"
1961                                                   " actions in the same rule");
1962                 action_flags |= current_action_flag;
1963         }
1964         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1965                 unsigned int i;
1966
1967                 switch (items->type) {
1968                 case RTE_FLOW_ITEM_TYPE_VOID:
1969                         break;
1970                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1971                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1972                                 return rte_flow_error_set
1973                                         (error, ENOTSUP,
1974                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1975                                          "inner tunnel port id"
1976                                          " item is not supported");
1977                         mask.port_id = flow_tcf_item_mask
1978                                 (items, &rte_flow_item_port_id_mask,
1979                                  &flow_tcf_mask_supported.port_id,
1980                                  &flow_tcf_mask_empty.port_id,
1981                                  sizeof(flow_tcf_mask_supported.port_id),
1982                                  error);
1983                         if (!mask.port_id)
1984                                 return -rte_errno;
1985                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1986                                 in_port_id_set = 1;
1987                                 break;
1988                         }
1989                         spec.port_id = items->spec;
1990                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1991                                 return rte_flow_error_set
1992                                         (error, ENOTSUP,
1993                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1994                                          mask.port_id,
1995                                          "no support for partial mask on"
1996                                          " \"id\" field");
1997                         if (!mask.port_id->id)
1998                                 i = 0;
1999                         else
2000                                 for (i = 0; ptoi[i].ifindex; ++i)
2001                                         if (ptoi[i].port_id == spec.port_id->id)
2002                                                 break;
2003                         if (!ptoi[i].ifindex)
2004                                 return rte_flow_error_set
2005                                         (error, ENODEV,
2006                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2007                                          spec.port_id,
2008                                          "missing data to convert port ID to"
2009                                          " ifindex");
2010                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2011                                 return rte_flow_error_set
2012                                         (error, ENOTSUP,
2013                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2014                                          spec.port_id,
2015                                          "cannot match traffic for"
2016                                          " several port IDs through"
2017                                          " a single flow rule");
2018                         tcm_ifindex = ptoi[i].ifindex;
2019                         in_port_id_set = 1;
2020                         break;
2021                 case RTE_FLOW_ITEM_TYPE_ETH:
2022                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2023                                                           error);
2024                         if (ret < 0)
2025                                 return ret;
2026                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2027                                       MLX5_FLOW_LAYER_INNER_L2 :
2028                                       MLX5_FLOW_LAYER_OUTER_L2;
                        /*
                         * TODO: redundant check due to the different
                         * supported masks. The same applies to the rest
                         * of the items.
                         */
2033                         mask.eth = flow_tcf_item_mask
2034                                 (items, &rte_flow_item_eth_mask,
2035                                  &flow_tcf_mask_supported.eth,
2036                                  &flow_tcf_mask_empty.eth,
2037                                  sizeof(flow_tcf_mask_supported.eth),
2038                                  error);
2039                         if (!mask.eth)
2040                                 return -rte_errno;
2041                         if (mask.eth->type && mask.eth->type !=
2042                             RTE_BE16(0xffff))
2043                                 return rte_flow_error_set
2044                                         (error, ENOTSUP,
2045                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2046                                          mask.eth,
2047                                          "no support for partial mask on"
2048                                          " \"type\" field");
2049                         assert(items->spec);
2050                         spec.eth = items->spec;
2051                         if (mask.eth->type &&
2052                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2053                             inner_etype != RTE_BE16(ETH_P_ALL) &&
2054                             inner_etype != spec.eth->type)
2055                                 return rte_flow_error_set
2056                                         (error, EINVAL,
2057                                          RTE_FLOW_ERROR_TYPE_ITEM,
2058                                          items,
2059                                          "inner eth_type conflict");
2060                         if (mask.eth->type &&
2061                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2062                             outer_etype != RTE_BE16(ETH_P_ALL) &&
2063                             outer_etype != spec.eth->type)
2064                                 return rte_flow_error_set
2065                                         (error, EINVAL,
2066                                          RTE_FLOW_ERROR_TYPE_ITEM,
2067                                          items,
2068                                          "outer eth_type conflict");
2069                         if (mask.eth->type) {
2070                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2071                                         inner_etype = spec.eth->type;
2072                                 else
2073                                         outer_etype = spec.eth->type;
2074                         }
2075                         break;
2076                 case RTE_FLOW_ITEM_TYPE_VLAN:
2077                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2078                                 return rte_flow_error_set
2079                                         (error, ENOTSUP,
2080                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2081                                          "inner tunnel VLAN"
2082                                          " is not supported");
2083                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2084                                                            error);
2085                         if (ret < 0)
2086                                 return ret;
2087                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2088                         mask.vlan = flow_tcf_item_mask
2089                                 (items, &rte_flow_item_vlan_mask,
2090                                  &flow_tcf_mask_supported.vlan,
2091                                  &flow_tcf_mask_empty.vlan,
2092                                  sizeof(flow_tcf_mask_supported.vlan),
2093                                  error);
2094                         if (!mask.vlan)
2095                                 return -rte_errno;
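                        /*
                         * The PCP (0xe000) and VID (0x0fff) parts of "tci"
                         * must each be either fully masked or left empty:
                         * tc flower matches vlan_prio and vlan_id as whole
                         * fields, so partial masks cannot be translated.
                         */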
2096                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2097                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2098                               RTE_BE16(0xe000)) ||
2099                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2100                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2101                               RTE_BE16(0x0fff)) ||
2102                             (mask.vlan->inner_type &&
2103                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2104                                 return rte_flow_error_set
2105                                         (error, ENOTSUP,
2106                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2107                                          mask.vlan,
2108                                          "no support for partial masks on"
2109                                          " \"tci\" (PCP and VID parts) and"
2110                                          " \"inner_type\" fields");
2111                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2112                             outer_etype != RTE_BE16(ETH_P_8021Q))
2113                                 return rte_flow_error_set
2114                                         (error, EINVAL,
2115                                          RTE_FLOW_ERROR_TYPE_ITEM,
2116                                          items,
2117                                          "outer eth_type conflict,"
2118                                          " must be 802.1Q");
2119                         outer_etype = RTE_BE16(ETH_P_8021Q);
2120                         assert(items->spec);
2121                         spec.vlan = items->spec;
2122                         if (mask.vlan->inner_type &&
2123                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2124                             vlan_etype != spec.vlan->inner_type)
2125                                 return rte_flow_error_set
2126                                         (error, EINVAL,
2127                                          RTE_FLOW_ERROR_TYPE_ITEM,
2128                                          items,
2129                                          "vlan eth_type conflict");
2130                         if (mask.vlan->inner_type)
2131                                 vlan_etype = spec.vlan->inner_type;
2132                         break;
2133                 case RTE_FLOW_ITEM_TYPE_IPV4:
2134                         ret = mlx5_flow_validate_item_ipv4
2135                                         (items, item_flags,
2136                                          &flow_tcf_mask_supported.ipv4, error);
2137                         if (ret < 0)
2138                                 return ret;
2139                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2140                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2141                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2142                         mask.ipv4 = flow_tcf_item_mask
2143                                 (items, &rte_flow_item_ipv4_mask,
2144                                  &flow_tcf_mask_supported.ipv4,
2145                                  &flow_tcf_mask_empty.ipv4,
2146                                  sizeof(flow_tcf_mask_supported.ipv4),
2147                                  error);
2148                         if (!mask.ipv4)
2149                                 return -rte_errno;
2150                         if (mask.ipv4->hdr.next_proto_id &&
2151                             mask.ipv4->hdr.next_proto_id != 0xff)
2152                                 return rte_flow_error_set
2153                                         (error, ENOTSUP,
2154                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2155                                          mask.ipv4,
2156                                          "no support for partial mask on"
2157                                          " \"hdr.next_proto_id\" field");
2158                         else if (mask.ipv4->hdr.next_proto_id)
2159                                 next_protocol =
2160                                         ((const struct rte_flow_item_ipv4 *)
2161                                          (items->spec))->hdr.next_proto_id;
2162                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2163                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2164                                     inner_etype != RTE_BE16(ETH_P_IP))
2165                                         return rte_flow_error_set
2166                                                 (error, EINVAL,
2167                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2168                                                  items,
2169                                                  "inner eth_type conflict,"
2170                                                  " IPv4 is required");
2171                                 inner_etype = RTE_BE16(ETH_P_IP);
2172                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2173                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2174                                     vlan_etype != RTE_BE16(ETH_P_IP))
2175                                         return rte_flow_error_set
2176                                                 (error, EINVAL,
2177                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2178                                                  items,
2179                                                  "vlan eth_type conflict,"
2180                                                  " IPv4 is required");
2181                                 vlan_etype = RTE_BE16(ETH_P_IP);
2182                         } else {
2183                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2184                                     outer_etype != RTE_BE16(ETH_P_IP))
2185                                         return rte_flow_error_set
2186                                                 (error, EINVAL,
2187                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2188                                                  items,
2189                                                  "eth_type conflict,"
2190                                                  " IPv4 is required");
2191                                 outer_etype = RTE_BE16(ETH_P_IP);
2192                         }
2193                         break;
2194                 case RTE_FLOW_ITEM_TYPE_IPV6:
2195                         ret = mlx5_flow_validate_item_ipv6
2196                                         (items, item_flags,
2197                                          &flow_tcf_mask_supported.ipv6, error);
2198                         if (ret < 0)
2199                                 return ret;
2200                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2201                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2202                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2203                         mask.ipv6 = flow_tcf_item_mask
2204                                 (items, &rte_flow_item_ipv6_mask,
2205                                  &flow_tcf_mask_supported.ipv6,
2206                                  &flow_tcf_mask_empty.ipv6,
2207                                  sizeof(flow_tcf_mask_supported.ipv6),
2208                                  error);
2209                         if (!mask.ipv6)
2210                                 return -rte_errno;
2211                         if (mask.ipv6->hdr.proto &&
2212                             mask.ipv6->hdr.proto != 0xff)
2213                                 return rte_flow_error_set
2214                                         (error, ENOTSUP,
2215                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2216                                          mask.ipv6,
2217                                          "no support for partial mask on"
2218                                          " \"hdr.proto\" field");
2219                         else if (mask.ipv6->hdr.proto)
2220                                 next_protocol =
2221                                         ((const struct rte_flow_item_ipv6 *)
2222                                          (items->spec))->hdr.proto;
2223                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2224                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2225                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2226                                         return rte_flow_error_set
2227                                                 (error, EINVAL,
2228                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2229                                                  items,
2230                                                  "inner eth_type conflict,"
2231                                                  " IPv6 is required");
2232                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2233                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2234                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2235                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2236                                         return rte_flow_error_set
2237                                                 (error, EINVAL,
2238                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2239                                                  items,
2240                                                  "vlan eth_type conflict,"
2241                                                  " IPv6 is required");
2242                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2243                         } else {
2244                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2245                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2246                                         return rte_flow_error_set
2247                                                 (error, EINVAL,
2248                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2249                                                  items,
2250                                                  "eth_type conflict,"
2251                                                  " IPv6 is required");
2252                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2253                         }
2254                         break;
2255                 case RTE_FLOW_ITEM_TYPE_UDP:
2256                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2257                                                           next_protocol, error);
2258                         if (ret < 0)
2259                                 return ret;
2260                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2261                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2262                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2263                         mask.udp = flow_tcf_item_mask
2264                                 (items, &rte_flow_item_udp_mask,
2265                                  &flow_tcf_mask_supported.udp,
2266                                  &flow_tcf_mask_empty.udp,
2267                                  sizeof(flow_tcf_mask_supported.udp),
2268                                  error);
2269                         if (!mask.udp)
2270                                 return -rte_errno;
2271                         /*
                         * Save the presumed outer UDP item for an extra check
                         * in case a tunnel item is found later in the list.
2274                          */
2275                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2276                                 outer_udp = items;
2277                         break;
2278                 case RTE_FLOW_ITEM_TYPE_TCP:
2279                         ret = mlx5_flow_validate_item_tcp
2280                                              (items, item_flags,
2281                                               next_protocol,
2282                                               &flow_tcf_mask_supported.tcp,
2283                                               error);
2284                         if (ret < 0)
2285                                 return ret;
2286                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2287                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2288                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2289                         mask.tcp = flow_tcf_item_mask
2290                                 (items, &rte_flow_item_tcp_mask,
2291                                  &flow_tcf_mask_supported.tcp,
2292                                  &flow_tcf_mask_empty.tcp,
2293                                  sizeof(flow_tcf_mask_supported.tcp),
2294                                  error);
2295                         if (!mask.tcp)
2296                                 return -rte_errno;
2297                         break;
2298                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2299                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2300                                 return rte_flow_error_set
2301                                         (error, ENOTSUP,
2302                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2303                                          "vxlan tunnel over vlan"
2304                                          " is not supported");
2305                         ret = mlx5_flow_validate_item_vxlan(items,
2306                                                             item_flags, error);
2307                         if (ret < 0)
2308                                 return ret;
2309                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2310                         mask.vxlan = flow_tcf_item_mask
2311                                 (items, &rte_flow_item_vxlan_mask,
2312                                  &flow_tcf_mask_supported.vxlan,
2313                                  &flow_tcf_mask_empty.vxlan,
2314                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2315                         if (!mask.vxlan)
2316                                 return -rte_errno;
2317                         if (mask.vxlan->vni[0] != 0xff ||
2318                             mask.vxlan->vni[1] != 0xff ||
2319                             mask.vxlan->vni[2] != 0xff)
2320                                 return rte_flow_error_set
2321                                         (error, ENOTSUP,
2322                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2323                                          mask.vxlan,
2324                                          "no support for partial or "
2325                                          "empty mask on \"vxlan.vni\" field");
2326                         /*
                         * The VNI item implies a VXLAN tunnel. At least the
                         * outer destination UDP port must be specified without
                         * wildcards to allow the kernel to select the virtual
                         * VXLAN device by port. An outer IPv4 or IPv6 item must
                         * also be present (wildcards or even a zero mask are
                         * allowed) to let the driver know the tunnel IP version
                         * and process the UDP traffic correctly.
2334                          */
2335                         if (!(item_flags &
2336                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2337                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2338                                 return rte_flow_error_set
2339                                                  (error, EINVAL,
2340                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2341                                                   NULL,
2342                                                   "no outer IP pattern found"
2343                                                   " for vxlan tunnel");
2344                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2345                                 return rte_flow_error_set
2346                                                  (error, EINVAL,
2347                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2348                                                   NULL,
2349                                                   "no outer UDP pattern found"
2350                                                   " for vxlan tunnel");
2351                         /*
                         * All items preceding the tunnel item become outer
                         * ones and need extra validation due to tc limitations
                         * on tunnel outer parameters. Currently only the outer
                         * UDP item requires an extra check; use the saved
                         * pointer instead of rescanning the item list.
2357                          */
2358                         assert(outer_udp);
2359                         ret = flow_tcf_validate_vxlan_decap_udp
2360                                                 (outer_udp, error);
2361                         if (ret < 0)
2362                                 return ret;
2363                         /* Reset L4 protocol for inner parameters. */
2364                         next_protocol = 0xff;
2365                         break;
2366                 default:
2367                         return rte_flow_error_set(error, ENOTSUP,
2368                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2369                                                   items, "item not supported");
2370                 }
2371         }
2372         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2373             (action_flags & MLX5_FLOW_ACTION_DROP))
2374                 return rte_flow_error_set(error, ENOTSUP,
2375                                           RTE_FLOW_ERROR_TYPE_ACTION,
2376                                           actions,
2377                                           "set action is not compatible with "
2378                                           "drop action");
2379         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2380             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2381                 return rte_flow_error_set(error, ENOTSUP,
2382                                           RTE_FLOW_ERROR_TYPE_ACTION,
2383                                           actions,
2384                                           "set action must be followed by "
2385                                           "port_id action");
2386         if (action_flags &
2387            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2388                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2389                         return rte_flow_error_set(error, EINVAL,
2390                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2391                                                   actions,
2392                                                   "no ipv4 item found in"
2393                                                   " pattern");
2394         }
2395         if (action_flags &
2396            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2397                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2398                         return rte_flow_error_set(error, EINVAL,
2399                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2400                                                   actions,
2401                                                   "no ipv6 item found in"
2402                                                   " pattern");
2403         }
2404         if (action_flags &
2405            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2406                 if (!(item_flags &
2407                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2408                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2409                         return rte_flow_error_set(error, EINVAL,
2410                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2411                                                   actions,
2412                                                   "no TCP/UDP item found in"
2413                                                   " pattern");
2414         }
2415         /*
2416          * FW syndrome (0xA9C090):
2417          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2418  *     forwarded to the uplink.
2419          */
2420         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2421             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2422             ((struct priv *)port_id_dev->data->dev_private)->representor)
2423                 return rte_flow_error_set(error, ENOTSUP,
2424                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2425                                           "vlan push can only be applied"
2426                                           " when forwarding to uplink port");
2427         /*
2428          * FW syndrome (0x294609):
2429          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2430          *     are supported only while forwarding to vport.
2431          */
2432         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2433             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2434                 return rte_flow_error_set(error, ENOTSUP,
2435                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2436                                           "vlan actions are supported"
2437                                           " only with port_id action");
2438         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2439             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2440                 return rte_flow_error_set(error, ENOTSUP,
2441                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2442                                           "vxlan actions are supported"
2443                                           " only with port_id action");
2444         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2445                 return rte_flow_error_set(error, EINVAL,
2446                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2447                                           "no fate action is found");
2448         if (action_flags &
2449            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2450                 if (!(item_flags &
2451                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2452                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2453                         return rte_flow_error_set(error, EINVAL,
2454                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2455                                                   actions,
2456                                                   "no IP found in pattern");
2457         }
2458         if (action_flags &
2459             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2460                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2461                         return rte_flow_error_set(error, ENOTSUP,
2462                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2463                                                   actions,
2464                                                   "no ethernet found in"
2465                                                   " pattern");
2466         }
2467         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2468             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2469                 return rte_flow_error_set(error, EINVAL,
2470                                           RTE_FLOW_ERROR_TYPE_ACTION,
2471                                           NULL,
2472                                           "no VNI pattern found"
2473                                           " for vxlan decap action");
2474         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2475             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2476                 return rte_flow_error_set(error, EINVAL,
2477                                           RTE_FLOW_ERROR_TYPE_ACTION,
2478                                           NULL,
2479                                           "vxlan encap not supported"
2480                                           " for tunneled traffic");
2481         return 0;
2482 }
2483
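/*
 * Editor's illustration (not part of the driver): a minimal action list
 * sketch that satisfies the ordering checks above -- a pedit "set" action
 * paired with a port_id fate action rather than drop (an IPv4 item would
 * also have to be present in the pattern). The port id value (1) and the
 * address are hypothetical:
 *
 *     struct rte_flow_action_set_ipv4 set_src = {
 *             .ipv4_addr = rte_cpu_to_be_32(0xc0a80001), (192.168.0.1)
 *     };
 *     struct rte_flow_action_port_id peer = { .id = 1 };
 *     struct rte_flow_action actions[] = {
 *             { .type = RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, .conf = &set_src },
 *             { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &peer },
 *             { .type = RTE_FLOW_ACTION_TYPE_END },
 *     };
 */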
2484 /**
2485  * Calculate maximum size of memory for flow items of Linux TC flower.
2486  *
2487  * @param[in] attr
2488  *   Pointer to the flow attributes.
2489  * @param[in] items
2490  *   Pointer to the list of items.
2491  * @param[out] action_flags
2492  *   Pointer to the detected actions.
2493  *
2494  * @return
2495  *   Maximum size of memory for items.
2496  */
2497 static int
2498 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2499                         const struct rte_flow_item items[],
2500                         uint64_t *action_flags)
2501 {
2502         int size = 0;
2503
2504         size += SZ_NLATTR_STRZ_OF("flower") +
2505                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2506                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2507                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2508         if (attr->group > 0)
2509                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2510         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2511                 switch (items->type) {
2512                 case RTE_FLOW_ITEM_TYPE_VOID:
2513                         break;
2514                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2515                         break;
2516                 case RTE_FLOW_ITEM_TYPE_ETH:
2517                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2518                                 /* dst/src MAC addr and mask. */
2519                         break;
2520                 case RTE_FLOW_ITEM_TYPE_VLAN:
2521                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2522                                 /* VLAN Ether type. */
2523                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2524                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2525                         break;
2526                 case RTE_FLOW_ITEM_TYPE_IPV4: {
2527                         const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2528
2529                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2530                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2531                                 /* dst/src IP addr and mask. */
2532                         if (ipv4 && ipv4->hdr.time_to_live)
2533                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2534                         if (ipv4 && ipv4->hdr.type_of_service)
2535                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2536                         break;
2537                 }
2538                 case RTE_FLOW_ITEM_TYPE_IPV6: {
2539                         const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2540
2541                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2542                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2543                                 /* dst/src IP addr and mask. */
2544                         if (ipv6 && ipv6->hdr.hop_limits)
2545                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2546                         if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2547                                      (0xfful << IPV6_HDR_TC_SHIFT)))
2548                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2549                         break;
2550                 }
2551                 case RTE_FLOW_ITEM_TYPE_UDP:
2552                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2553                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2554                                 /* dst/src port and mask. */
2555                         break;
2556                 case RTE_FLOW_ITEM_TYPE_TCP:
2557                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2558                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2559                                 /* dst/src port and mask. */
2560                         break;
2561                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2562                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2563                         /*
2564                          * There might be no VXLAN decap action in the action
2565                          * list, nonetheless the VXLAN tunnel flow requires
2566                          * list; nonetheless, the VXLAN tunnel flow requires
2567                          * the decap structure to be correctly applied to
2568                          * the VXLAN device, so set the flag to create it.
2569                          * The translation routine will not put the decap action
2570                          * in the Netlink message if there is no actual action
2571                          */
2572                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2573                         break;
2574                 default:
2575                         DRV_LOG(WARNING,
2576                                 "unsupported item %p type %d,"
2577                                 " items must be validated before flow creation",
2578                                 (const void *)items, items->type);
2579                         break;
2580                 }
2581         }
2582         return size;
2583 }
2584
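/*
 * Editor's note: the SZ_NLATTR_* helpers used above implement the usual
 * Netlink TLV accounting (see their definitions earlier in this file),
 * approximately:
 *
 *     SZ_NLATTR_HDR          = MNL_ALIGN(sizeof(struct nlattr)) = 4
 *     SZ_NLATTR_DATA_OF(len) = MNL_ALIGN(SZ_NLATTR_HDR + (len))
 *     SZ_NLATTR_TYPE_OF(typ) = SZ_NLATTR_DATA_OF(sizeof(typ))
 *
 * so e.g. the UDP item above reserves MNL_ALIGN(4 + 1) = 8 bytes for the
 * IP protocol attribute plus 4 * MNL_ALIGN(4 + 2) = 32 bytes for the
 * source/destination ports and their masks, 40 bytes in total.
 */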
2585 /**
2586  * Calculate size of memory to store the VXLAN encapsulation
2587  * related items in the Netlink message buffer. The item list
2588  * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2589  * The item list should be validated.
2590  *
2591  * @param[in] action
2592  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2593  *   List of pattern items to scan data from.
2594  *
2595  * @return
2596  *   The size of the part of the Netlink message buffer needed to store the
2597  *   VXLAN encapsulation item attributes.
2598  */
2599 static int
2600 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2601 {
2602         const struct rte_flow_item *items;
2603         int size = 0;
2604
2605         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2606         assert(action->conf);
2607
2608         items = ((const struct rte_flow_action_vxlan_encap *)
2609                                         action->conf)->definition;
2610         assert(items);
2611         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2612                 switch (items->type) {
2613                 case RTE_FLOW_ITEM_TYPE_VOID:
2614                         break;
2615                 case RTE_FLOW_ITEM_TYPE_ETH:
2616                         /* This item does not require a message buffer. */
2617                         break;
2618                 case RTE_FLOW_ITEM_TYPE_IPV4: {
2619                         const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2620
2621                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2622                         if (ipv4 && ipv4->hdr.time_to_live)
2623                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2624                         if (ipv4 && ipv4->hdr.type_of_service)
2625                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2626                         break;
2627                 }
2628                 case RTE_FLOW_ITEM_TYPE_IPV6: {
2629                         const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2630
2631                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2632                         if (ipv6 && ipv6->hdr.hop_limits)
2633                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2634                         if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2635                                      (0xfful << IPV6_HDR_TC_SHIFT)))
2636                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2637                         break;
2638                 }
2639                 case RTE_FLOW_ITEM_TYPE_UDP: {
2640                         const struct rte_flow_item_udp *udp = items->mask;
2641
2642                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2643                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2644                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2645                         break;
2646                 }
2647                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2648                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2649                         break;
2650                 default:
2651                         assert(false);
2652                         DRV_LOG(WARNING,
2653                                 "unsupported item %p type %d,"
2654                                 " items must be validated"
2655                                 " before flow creation",
2656                                 (const void *)items, items->type);
2657                         return 0;
2658                 }
2659         }
2660         return size;
2661 }
2662
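/*
 * Editor's note: for a typical eth/ipv4/udp/vxlan "definition" list with
 * no TTL/TOS in the IPv4 mask this adds up to roughly
 * SZ_NLATTR_DATA_OF(4) * 2 (IP source/destination addresses) +
 * SZ_NLATTR_TYPE_OF(uint16_t) * 2 (UDP destination and source ports) +
 * SZ_NLATTR_TYPE_OF(uint32_t) (VNI); the Ethernet item contributes
 * nothing because the kernel derives the outer MAC addresses itself.
 */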
2663 /**
2664  * Calculate maximum size of memory for flow actions of Linux TC flower and
2665  * extract specified actions.
2666  *
2667  * @param[in] actions
2668  *   Pointer to the list of actions.
2669  * @param[out] action_flags
2670  *   Pointer to the detected actions.
2671  *
2672  * @return
2673  *   Maximum size of memory for actions.
2674  */
2675 static int
2676 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2677                               uint64_t *action_flags)
2678 {
2679         int size = 0;
2680         uint64_t flags = 0;
2681
2682         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2683         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2684                 switch (actions->type) {
2685                 case RTE_FLOW_ACTION_TYPE_VOID:
2686                         break;
2687                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2688                         size += SZ_NLATTR_NEST + /* na_act_index. */
2689                                 SZ_NLATTR_STRZ_OF("mirred") +
2690                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2691                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2692                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2693                         break;
2694                 case RTE_FLOW_ACTION_TYPE_JUMP:
2695                         size += SZ_NLATTR_NEST + /* na_act_index. */
2696                                 SZ_NLATTR_STRZ_OF("gact") +
2697                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2698                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2699                         flags |= MLX5_FLOW_ACTION_JUMP;
2700                         break;
2701                 case RTE_FLOW_ACTION_TYPE_DROP:
2702                         size += SZ_NLATTR_NEST + /* na_act_index. */
2703                                 SZ_NLATTR_STRZ_OF("gact") +
2704                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2705                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2706                         flags |= MLX5_FLOW_ACTION_DROP;
2707                         break;
2708                 case RTE_FLOW_ACTION_TYPE_COUNT:
2709                         break;
2710                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2711                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2712                         goto action_of_vlan;
2713                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2714                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2715                         goto action_of_vlan;
2716                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2717                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2718                         goto action_of_vlan;
2719                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2720                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2721                         goto action_of_vlan;
2722 action_of_vlan:
2723                         size += SZ_NLATTR_NEST + /* na_act_index. */
2724                                 SZ_NLATTR_STRZ_OF("vlan") +
2725                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2726                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2727                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2728                                 /* VLAN protocol. */
2729                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2730                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2731                         break;
2732                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2733                         size += SZ_NLATTR_NEST + /* na_act_index. */
2734                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2735                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2736                                 SZ_NLATTR_TYPE_OF(uint8_t);
2737                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2738                         size += flow_tcf_vxlan_encap_size(actions) +
2739                                 RTE_ALIGN_CEIL /* preceding encap params. */
2740                                 (sizeof(struct flow_tcf_vxlan_encap),
2741                                 MNL_ALIGNTO);
2742                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2743                         break;
2744                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2745                         size += SZ_NLATTR_NEST + /* na_act_index. */
2746                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2747                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2748                                 SZ_NLATTR_TYPE_OF(uint8_t);
2749                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2750                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2751                                 (sizeof(struct flow_tcf_vxlan_decap),
2752                                 MNL_ALIGNTO);
2753                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2754                         break;
2755                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2756                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2757                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2758                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2759                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2760                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2761                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2762                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2763                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2764                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2765                         size += flow_tcf_get_pedit_actions_size(&actions,
2766                                                                 &flags);
2767                         break;
2768                 default:
2769                         DRV_LOG(WARNING,
2770                                 "unsupported action %p type %d,"
2771                                 " actions must be validated before flow creation",
2772                                 (const void *)actions, actions->type);
2773                         break;
2774                 }
2775         }
2776         *action_flags = flags;
2777         return size;
2778 }
2779
2780 /**
2781  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2782  * memory required, allocates the memory, and initializes Netlink message
2783  * headers; the unique TC message handle is assigned when the rule is applied.
2784  *
2785  * @param[in] attr
2786  *   Pointer to the flow attributes.
2787  * @param[in] items
2788  *   Pointer to the list of items.
2789  * @param[in] actions
2790  *   Pointer to the list of actions.
2791  * @param[out] error
2792  *   Pointer to the error structure.
2793  *
2794  * @return
2795  *   Pointer to mlx5_flow object on success,
2796  *   otherwise NULL and rte_errno is set.
2797  */
2798 static struct mlx5_flow *
2799 flow_tcf_prepare(const struct rte_flow_attr *attr,
2800                  const struct rte_flow_item items[],
2801                  const struct rte_flow_action actions[],
2802                  struct rte_flow_error *error)
2803 {
2804         size_t size = RTE_ALIGN_CEIL
2805                         (sizeof(struct mlx5_flow),
2806                          alignof(struct flow_tcf_tunnel_hdr)) +
2807                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2808                       MNL_ALIGN(sizeof(struct tcmsg));
2809         struct mlx5_flow *dev_flow;
2810         uint64_t action_flags = 0;
2811         struct nlmsghdr *nlh;
2812         struct tcmsg *tcm;
2813         uint8_t *sp, *tun = NULL;
2814
2815         size += flow_tcf_get_items_size(attr, items, &action_flags);
2816         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2817         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2818         if (!dev_flow) {
2819                 rte_flow_error_set(error, ENOMEM,
2820                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2821                                    "not enough memory to create E-Switch flow");
2822                 return NULL;
2823         }
2824         sp = (uint8_t *)(dev_flow + 1);
2825         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2826                 sp = RTE_PTR_ALIGN
2827                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2828                 tun = sp;
2829                 sp += RTE_ALIGN_CEIL
2830                         (sizeof(struct flow_tcf_vxlan_encap),
2831                         MNL_ALIGNTO);
2832 #ifndef NDEBUG
2833                 size -= RTE_ALIGN_CEIL
2834                         (sizeof(struct flow_tcf_vxlan_encap),
2835                         MNL_ALIGNTO);
2836 #endif
2837         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2838                 sp = RTE_PTR_ALIGN
2839                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2840                 tun = sp;
2841                 sp += RTE_ALIGN_CEIL
2842                         (sizeof(struct flow_tcf_vxlan_decap),
2843                         MNL_ALIGNTO);
2844 #ifndef NDEBUG
2845                 size -= RTE_ALIGN_CEIL
2846                         (sizeof(struct flow_tcf_vxlan_decap),
2847                         MNL_ALIGNTO);
2848 #endif
2849         } else {
2850                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2851         }
2852         nlh = mnl_nlmsg_put_header(sp);
2853         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2854         *dev_flow = (struct mlx5_flow){
2855                 .tcf = (struct mlx5_flow_tcf){
2856 #ifndef NDEBUG
2857                         .nlsize = size - RTE_ALIGN_CEIL
2858                                 (sizeof(struct mlx5_flow),
2859                                  alignof(struct flow_tcf_tunnel_hdr)),
2860 #endif
2861                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2862                         .nlh = nlh,
2863                         .tcm = tcm,
2864                 },
2865         };
2866         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2867                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2868         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2869                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2870         return dev_flow;
2871 }
2872
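/*
 * Editor's sketch of the buffer produced by flow_tcf_prepare() (exact
 * offsets depend on the alignment macros):
 *
 *     +--------------------------------+ <- rte_zmalloc(, size, MNL_ALIGNTO)
 *     | struct mlx5_flow               |
 *     +--------------------------------+ <- alignof(flow_tcf_tunnel_hdr)
 *     | flow_tcf_vxlan_encap/decap     |    (VXLAN tunnel flows only)
 *     +--------------------------------+ <- MNL_ALIGNTO
 *     | struct nlmsghdr                |
 *     | struct tcmsg                   |
 *     | flower attributes (items and   |
 *     | actions, sized above)          |
 *     +--------------------------------+
 */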
2873 /**
2874  * Make adjustments for supporting count actions.
2875  *
2876  * @param[in] dev
2877  *   Pointer to the Ethernet device structure.
2878  * @param[in] dev_flow
2879  *   Pointer to mlx5_flow.
2880  * @param[out] error
2881  *   Pointer to error structure.
2882  *
2883  * @return
2884  *   0 on success, else a negative errno value is returned and rte_errno is set.
2885  */
2886 static int
2887 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2888                                   struct mlx5_flow *dev_flow,
2889                                   struct rte_flow_error *error)
2890 {
2891         struct rte_flow *flow = dev_flow->flow;
2892
2893         if (!flow->counter) {
2894                 flow->counter = flow_tcf_counter_new();
2895                 if (!flow->counter)
2896                         return rte_flow_error_set(error, rte_errno,
2897                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2898                                                   NULL,
2899                                                   "cannot get counter"
2900                                                   " context.");
2901         }
2902         return 0;
2903 }
2904
2905 /**
2906  * Convert VXLAN VNI to 32-bit integer.
2907  *
2908  * @param[in] vni
2909  *   VXLAN VNI in 24-bit wire format.
2910  *
2911  * @return
2912  *   VXLAN VNI as a 32-bit integer value in network byte order.
2913  */
2914 static inline rte_be32_t
2915 vxlan_vni_as_be32(const uint8_t vni[3])
2916 {
2917         union {
2918                 uint8_t vni[4];
2919                 rte_be32_t dword;
2920         } ret = {
2921                 .vni = { 0, vni[0], vni[1], vni[2] },
2922         };
2923         return ret.dword;
2924 }
2925
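/*
 * Editor's note: for example, a wire-format VNI of { 0x12, 0x34, 0x56 }
 * becomes the bytes { 0x00, 0x12, 0x34, 0x56 }, i.e. 0x00123456 when read
 * as a network-order 32-bit value, matching what the flower
 * TCA_FLOWER_KEY_ENC_KEY_ID attribute expects.
 */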
2926 /**
2927  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2928  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2929  * in the encapsulation parameters structure. The item must be prevalidated,
2930  * no validation checks are performed by this function.
2931  *
2932  * @param[in] spec
2933  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2934  * @param[in] mask
2935  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2936  * @param[out] encap
2937  *   Structure to fill the gathered MAC address data.
2938  */
2939 static void
2940 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2941                                const struct rte_flow_item_eth *mask,
2942                                struct flow_tcf_vxlan_encap *encap)
2943 {
2944         /* Item must be validated before. No redundant checks. */
2945         assert(spec);
2946         if (!mask || !memcmp(&mask->dst,
2947                              &rte_flow_item_eth_mask.dst,
2948                              sizeof(rte_flow_item_eth_mask.dst))) {
2949                 /*
2950                  * Ethernet addresses are not supported by
2951                  * tc as tunnel_key parameters. The destination
2952                  * address is needed to form the encap packet
2953                  * header and is retrieved by the kernel from
2954                  * implicit sources (ARP table, etc.);
2955                  * address masks are not supported at all.
2956                  */
2957                 encap->eth.dst = spec->dst;
2958                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2959         }
2960         if (!mask || !memcmp(&mask->src,
2961                              &rte_flow_item_eth_mask.src,
2962                              sizeof(rte_flow_item_eth_mask.src))) {
2963                 /*
2964                  * Ethernet addresses are not supported by
2965                  * tc as tunnel_key parameters. Source ethernet
2966                  * address is ignored anyway.
2967                  */
2968                 encap->eth.src = spec->src;
2969                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2970         }
2971 }
2972
2973 /**
2974  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2975  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2976  * in the encapsulation parameters structure. The item must be prevalidated,
2977  * no validation checks are performed by this function.
2978  *
2979  * @param[in] spec
2980  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2981  * @param[in] mask
2982  *   RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
2983  * @param[out] encap
2984  *   Structure to fill the gathered IPV4 address data.
2985  */
2986 static void
2987 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2988                                 const struct rte_flow_item_ipv4 *mask,
2989                                 struct flow_tcf_vxlan_encap *encap)
2990 {
2991         /* Item must be validated before. No redundant checks. */
2992         assert(spec);
2993         encap->ipv4.dst = spec->hdr.dst_addr;
2994         encap->ipv4.src = spec->hdr.src_addr;
2995         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2996                        FLOW_TCF_ENCAP_IPV4_DST;
2997         if (mask && mask->hdr.type_of_service) {
2998                 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
2999                 encap->ip_tos = spec->hdr.type_of_service;
3000         }
3001         if (mask && mask->hdr.time_to_live) {
3002                 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3003                 encap->ip_ttl_hop = spec->hdr.time_to_live;
3004         }
3005 }
3006
3007 /**
3008  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3009  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3010  * in the encapsulation parameters structure. The item must be prevalidated,
3011  * no validation checks are performed by this function.
3012  *
3013  * @param[in] spec
3014  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3015  * @param[in] mask
3016  *   RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3017  * @param[out] encap
3018  *   Structure to fill the gathered IPV6 address data.
3019  */
3020 static void
3021 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3022                                 const struct rte_flow_item_ipv6 *mask,
3023                                 struct flow_tcf_vxlan_encap *encap)
3024 {
3025         /* Item must be validated before. No redundant checks. */
3026         assert(spec);
3027         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3028         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3029         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3030                        FLOW_TCF_ENCAP_IPV6_DST;
3031         if (mask) {
3032                 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3033                     IPV6_HDR_TC_SHIFT) & 0xff) {
3034                         encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3035                         encap->ip_tos = (rte_be_to_cpu_32
3036                                                 (spec->hdr.vtc_flow) >>
3037                                                  IPV6_HDR_TC_SHIFT) & 0xff;
3038                 }
3039                 if (mask->hdr.hop_limits) {
3040                         encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3041                         encap->ip_ttl_hop = spec->hdr.hop_limits;
3042                 }
3043         }
3044 }
3045
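/*
 * Editor's note: vtc_flow packs version(4) | traffic class(8) |
 * flow label(20), and IPV6_HDR_TC_SHIFT is 20. For instance, a host-order
 * vtc_flow of 0x60d00000 (version 6, TC 0x0d) gives
 * (0x60d00000 >> IPV6_HDR_TC_SHIFT) & 0xff == 0x0d.
 */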
3046 /**
3047  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3048  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3049  * in the encapsulation parameters structure. The item must be prevalidated,
3050  * no validation checks are performed by this function.
3051  *
3052  * @param[in] spec
3053  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
3054  * @param[in] mask
3055  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
3056  * @param[out] encap
3057  *   Structure to fill the gathered UDP port data.
3058  */
3059 static void
3060 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3061                                const struct rte_flow_item_udp *mask,
3062                                struct flow_tcf_vxlan_encap *encap)
3063 {
3064         assert(spec);
3065         encap->udp.dst = spec->hdr.dst_port;
3066         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3067         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3068                 encap->udp.src = spec->hdr.src_port;
3069                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
3070         }
3071 }
3072
3073 /**
3074  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3075  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3076  * in the encapsulation parameters structure. The item must be prevalidated,
3077  * no validation checks are performed by this function.
3078  *
3079  * @param[in] spec
3080  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3081  * @param[out] encap
3082  *   Structure to fill the gathered VNI address data.
3083  */
3084 static void
3085 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3086                                struct flow_tcf_vxlan_encap *encap)
3087 {
3088         /* Item must be validated before. No redundant checks. */
3089         assert(spec);
3090         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3091         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3092 }
3093
3094 /**
3095  * Populate consolidated encapsulation object from list of pattern items.
3096  *
3097  * Helper function to process configuration of action such as
3098  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3099  * validated, since there is no way to return a meaningful error.
3100  *
3101  * @param[in] action
3102  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3103  *   List of pattern items to gather data from.
3104  * @param[out] encap
3105  *   Structure to fill gathered data.
3106  */
3107 static void
3108 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3109                            struct flow_tcf_vxlan_encap *encap)
3110 {
3111         union {
3112                 const struct rte_flow_item_eth *eth;
3113                 const struct rte_flow_item_ipv4 *ipv4;
3114                 const struct rte_flow_item_ipv6 *ipv6;
3115                 const struct rte_flow_item_udp *udp;
3116                 const struct rte_flow_item_vxlan *vxlan;
3117         } spec, mask;
3118         const struct rte_flow_item *items;
3119
3120         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3121         assert(action->conf);
3122
3123         items = ((const struct rte_flow_action_vxlan_encap *)
3124                                         action->conf)->definition;
3125         assert(items);
3126         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3127                 switch (items->type) {
3128                 case RTE_FLOW_ITEM_TYPE_VOID:
3129                         break;
3130                 case RTE_FLOW_ITEM_TYPE_ETH:
3131                         mask.eth = items->mask;
3132                         spec.eth = items->spec;
3133                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3134                                                        encap);
3135                         break;
3136                 case RTE_FLOW_ITEM_TYPE_IPV4:
3137                         spec.ipv4 = items->spec;
3138                         mask.ipv4 = items->mask;
3139                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3140                                                         encap);
3141                         break;
3142                 case RTE_FLOW_ITEM_TYPE_IPV6:
3143                         spec.ipv6 = items->spec;
3144                         mask.ipv6 = items->mask;
3145                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3146                                                         encap);
3147                         break;
3148                 case RTE_FLOW_ITEM_TYPE_UDP:
3149                         mask.udp = items->mask;
3150                         spec.udp = items->spec;
3151                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3152                                                        encap);
3153                         break;
3154                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3155                         spec.vxlan = items->spec;
3156                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3157                         break;
3158                 default:
3159                         assert(false);
3160                         DRV_LOG(WARNING,
3161                                 "unsupported item %p type %d,"
3162                                 " items must be validated"
3163                                 " before flow creation",
3164                                 (const void *)items, items->type);
3165                         encap->mask = 0;
3166                         return;
3167                 }
3168         }
3169 }
3170
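/*
 * Editor's illustration (not part of the driver): a "definition" list of
 * the shape this parser consumes; all addresses, ports and the VNI are
 * hypothetical and the list must pass validation first:
 *
 *     struct rte_flow_item_eth eth = {
 *             .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
 *     };
 *     struct rte_flow_item_ipv4 ipv4 = { .hdr = {
 *             .src_addr = rte_cpu_to_be_32(0x0a000001),
 *             .dst_addr = rte_cpu_to_be_32(0x0a000002),
 *     } };
 *     struct rte_flow_item_udp udp = { .hdr = {
 *             .dst_port = rte_cpu_to_be_16(4789),
 *     } };
 *     struct rte_flow_item_vxlan vxlan = { .vni = { 0x00, 0x12, 0x34 } };
 *     struct rte_flow_item def[] = {
 *             { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
 *             { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
 *             { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
 *             { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
 *             { .type = RTE_FLOW_ITEM_TYPE_END },
 *     };
 *     struct rte_flow_action_vxlan_encap conf = { .definition = def };
 */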
3171 /**
3172  * Translate flow for Linux TC flower and construct Netlink message.
3173  *
3174  * @param[in] priv
3175  *   Pointer to the priv structure.
3176  * @param[in, out] flow
3177  *   Pointer to the sub flow.
3178  * @param[in] attr
3179  *   Pointer to the flow attributes.
3180  * @param[in] items
3181  *   Pointer to the list of items.
3182  * @param[in] actions
3183  *   Pointer to the list of actions.
3184  * @param[out] error
3185  *   Pointer to the error structure.
3186  *
3187  * @return
3188  *   0 on success, a negative errno value otherwise and rte_errno is set.
3189  */
3190 static int
3191 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3192                    const struct rte_flow_attr *attr,
3193                    const struct rte_flow_item items[],
3194                    const struct rte_flow_action actions[],
3195                    struct rte_flow_error *error)
3196 {
3197         union {
3198                 const struct rte_flow_item_port_id *port_id;
3199                 const struct rte_flow_item_eth *eth;
3200                 const struct rte_flow_item_vlan *vlan;
3201                 const struct rte_flow_item_ipv4 *ipv4;
3202                 const struct rte_flow_item_ipv6 *ipv6;
3203                 const struct rte_flow_item_tcp *tcp;
3204                 const struct rte_flow_item_udp *udp;
3205                 const struct rte_flow_item_vxlan *vxlan;
3206         } spec, mask;
3207         union {
3208                 const struct rte_flow_action_port_id *port_id;
3209                 const struct rte_flow_action_jump *jump;
3210                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3211                 const struct rte_flow_action_of_set_vlan_vid *
3212                         of_set_vlan_vid;
3213                 const struct rte_flow_action_of_set_vlan_pcp *
3214                         of_set_vlan_pcp;
3215         } conf;
3216         union {
3217                 struct flow_tcf_tunnel_hdr *hdr;
3218                 struct flow_tcf_vxlan_decap *vxlan;
3219         } decap = {
3220                 .hdr = NULL,
3221         };
3222         union {
3223                 struct flow_tcf_tunnel_hdr *hdr;
3224                 struct flow_tcf_vxlan_encap *vxlan;
3225         } encap = {
3226                 .hdr = NULL,
3227         };
3228         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3229         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3230         struct tcmsg *tcm = dev_flow->tcf.tcm;
3231         uint32_t na_act_index_cur;
3232         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3233         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3234         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3235         bool ip_proto_set = 0;
3236         bool tunnel_outer = 0;
3237         struct nlattr *na_flower;
3238         struct nlattr *na_flower_act;
3239         struct nlattr *na_vlan_id = NULL;
3240         struct nlattr *na_vlan_priority = NULL;
3241         uint64_t item_flags = 0;
3242         int ret;
3243
3244         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3245                                                 PTOI_TABLE_SZ_MAX(dev)));
3246         if (dev_flow->tcf.tunnel) {
3247                 switch (dev_flow->tcf.tunnel->type) {
3248                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3249                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3250                         tunnel_outer = 1;
3251                         break;
3252                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3253                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3254                         break;
3255                 /* New tunnel actions can be added here. */
3256                 default:
3257                         assert(false);
3258                         break;
3259                 }
3260         }
3261         nlh = dev_flow->tcf.nlh;
3262         tcm = dev_flow->tcf.tcm;
3263         /* Prepare API must have been called beforehand. */
3264         assert(nlh != NULL && tcm != NULL);
3265         tcm->tcm_family = AF_UNSPEC;
3266         tcm->tcm_ifindex = ptoi[0].ifindex;
3267         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3268         /*
3269          * Priority cannot be zero to prevent the kernel from picking one
3270          * automatically.
3271          */
3272         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
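        /*
         * Editor's note: tcm_info packs the 16-bit filter priority in its
         * upper half and the matched protocol (network byte order) in its
         * lower half; outer_etype is still ETH_P_ALL at this point.
         */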
3273         if (attr->group > 0)
3274                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3275         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3276         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3277         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3278                 unsigned int i;
3279
3280                 switch (items->type) {
3281                 case RTE_FLOW_ITEM_TYPE_VOID:
3282                         break;
3283                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3284                         mask.port_id = flow_tcf_item_mask
3285                                 (items, &rte_flow_item_port_id_mask,
3286                                  &flow_tcf_mask_supported.port_id,
3287                                  &flow_tcf_mask_empty.port_id,
3288                                  sizeof(flow_tcf_mask_supported.port_id),
3289                                  error);
3290                         assert(mask.port_id);
3291                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3292                                 break;
3293                         spec.port_id = items->spec;
3294                         if (!mask.port_id->id)
3295                                 i = 0;
3296                         else
3297                                 for (i = 0; ptoi[i].ifindex; ++i)
3298                                         if (ptoi[i].port_id == spec.port_id->id)
3299                                                 break;
3300                         assert(ptoi[i].ifindex);
3301                         tcm->tcm_ifindex = ptoi[i].ifindex;
3302                         break;
3303                 case RTE_FLOW_ITEM_TYPE_ETH:
3304                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3305                                       MLX5_FLOW_LAYER_INNER_L2 :
3306                                       MLX5_FLOW_LAYER_OUTER_L2;
3307                         mask.eth = flow_tcf_item_mask
3308                                 (items, &rte_flow_item_eth_mask,
3309                                  &flow_tcf_mask_supported.eth,
3310                                  &flow_tcf_mask_empty.eth,
3311                                  sizeof(flow_tcf_mask_supported.eth),
3312                                  error);
3313                         assert(mask.eth);
3314                         if (mask.eth == &flow_tcf_mask_empty.eth)
3315                                 break;
3316                         spec.eth = items->spec;
3317                         if (mask.eth->type) {
3318                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3319                                         inner_etype = spec.eth->type;
3320                                 else
3321                                         outer_etype = spec.eth->type;
3322                         }
3323                         if (tunnel_outer) {
3324                                 DRV_LOG(WARNING,
3325                                         "outer L2 addresses cannot be"
3326                                         " forced as outer ones for tunnel,"
3327                                         " parameter is ignored");
3328                                 break;
3329                         }
3330                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3331                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3332                                              ETHER_ADDR_LEN,
3333                                              spec.eth->dst.addr_bytes);
3334                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3335                                              ETHER_ADDR_LEN,
3336                                              mask.eth->dst.addr_bytes);
3337                         }
3338                         if (!is_zero_ether_addr(&mask.eth->src)) {
3339                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3340                                              ETHER_ADDR_LEN,
3341                                              spec.eth->src.addr_bytes);
3342                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3343                                              ETHER_ADDR_LEN,
3344                                              mask.eth->src.addr_bytes);
3345                         }
3346                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3347                         break;
3348                 case RTE_FLOW_ITEM_TYPE_VLAN:
3349                         assert(!encap.hdr);
3350                         assert(!decap.hdr);
3351                         assert(!tunnel_outer);
3352                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3353                         mask.vlan = flow_tcf_item_mask
3354                                 (items, &rte_flow_item_vlan_mask,
3355                                  &flow_tcf_mask_supported.vlan,
3356                                  &flow_tcf_mask_empty.vlan,
3357                                  sizeof(flow_tcf_mask_supported.vlan),
3358                                  error);
3359                         assert(mask.vlan);
3360                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3361                                 break;
3362                         spec.vlan = items->spec;
3363                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3364                                outer_etype == RTE_BE16(ETH_P_8021Q));
3365                         outer_etype = RTE_BE16(ETH_P_8021Q);
3366                         if (mask.vlan->inner_type)
3367                                 vlan_etype = spec.vlan->inner_type;
3368                         if (mask.vlan->tci & RTE_BE16(0xe000))
3369                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3370                                                 (rte_be_to_cpu_16
3371                                                  (spec.vlan->tci) >> 13) & 0x7);
3372                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3373                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3374                                                  rte_be_to_cpu_16
3375                                                  (spec.vlan->tci &
3376                                                   RTE_BE16(0x0fff)));
3377                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3378                         break;
3379                 case RTE_FLOW_ITEM_TYPE_IPV4:
3380                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3381                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3382                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3383                         mask.ipv4 = flow_tcf_item_mask
3384                                 (items, &rte_flow_item_ipv4_mask,
3385                                  &flow_tcf_mask_supported.ipv4,
3386                                  &flow_tcf_mask_empty.ipv4,
3387                                  sizeof(flow_tcf_mask_supported.ipv4),
3388                                  error);
3389                         assert(mask.ipv4);
3390                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3391                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3392                                        inner_etype == RTE_BE16(ETH_P_IP));
3393                                 inner_etype = RTE_BE16(ETH_P_IP);
3394                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3395                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3396                                        vlan_etype == RTE_BE16(ETH_P_IP));
3397                                 vlan_etype = RTE_BE16(ETH_P_IP);
3398                         } else {
3399                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3400                                        outer_etype == RTE_BE16(ETH_P_IP));
3401                                 outer_etype = RTE_BE16(ETH_P_IP);
3402                         }
3403                         spec.ipv4 = items->spec;
3404                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3405                                 /*
3406                                  * No way to set IP protocol for outer tunnel
3407                                  * layers. Usually it is fixed, for example,
3408                                  * to UDP for VXLAN/GPE.
3409                                  */
3410                                 assert(spec.ipv4); /* Mask is not empty. */
3411                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3412                                                 spec.ipv4->hdr.next_proto_id);
3413                                 ip_proto_set = 1;
3414                         }
3415                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3416                              (!mask.ipv4->hdr.src_addr &&
3417                               !mask.ipv4->hdr.dst_addr)) {
3418                                 if (!tunnel_outer)
3419                                         break;
3420                                 /*
3421                                  * For tunnel outer we must set outer IP key
3422                                  * anyway, even if the specification/mask is
3423                                  * empty. There is no other way to tell the
3424                                  * kernel about the outer layer protocol.
3425                                  */
3426                                 mnl_attr_put_u32
3427                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3428                                          mask.ipv4->hdr.src_addr);
3429                                 mnl_attr_put_u32
3430                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3431                                          mask.ipv4->hdr.src_addr);
3432                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3433                                 break;
3434                         }
3435                         if (mask.ipv4->hdr.src_addr) {
3436                                 mnl_attr_put_u32
3437                                         (nlh, tunnel_outer ?
3438                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3439                                          TCA_FLOWER_KEY_IPV4_SRC,
3440                                          spec.ipv4->hdr.src_addr);
3441                                 mnl_attr_put_u32
3442                                         (nlh, tunnel_outer ?
3443                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3444                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3445                                          mask.ipv4->hdr.src_addr);
3446                         }
3447                         if (mask.ipv4->hdr.dst_addr) {
3448                                 mnl_attr_put_u32
3449                                         (nlh, tunnel_outer ?
3450                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3451                                          TCA_FLOWER_KEY_IPV4_DST,
3452                                          spec.ipv4->hdr.dst_addr);
3453                                 mnl_attr_put_u32
3454                                         (nlh, tunnel_outer ?
3455                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3456                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3457                                          mask.ipv4->hdr.dst_addr);
3458                         }
3459                         if (mask.ipv4->hdr.time_to_live) {
3460                                 mnl_attr_put_u8
3461                                         (nlh, tunnel_outer ?
3462                                          TCA_FLOWER_KEY_ENC_IP_TTL :
3463                                          TCA_FLOWER_KEY_IP_TTL,
3464                                          spec.ipv4->hdr.time_to_live);
3465                                 mnl_attr_put_u8
3466                                         (nlh, tunnel_outer ?
3467                                          TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3468                                          TCA_FLOWER_KEY_IP_TTL_MASK,
3469                                          mask.ipv4->hdr.time_to_live);
3470                         }
3471                         if (mask.ipv4->hdr.type_of_service) {
3472                                 mnl_attr_put_u8
3473                                         (nlh, tunnel_outer ?
3474                                          TCA_FLOWER_KEY_ENC_IP_TOS :
3475                                          TCA_FLOWER_KEY_IP_TOS,
3476                                          spec.ipv4->hdr.type_of_service);
3477                                 mnl_attr_put_u8
3478                                         (nlh, tunnel_outer ?
3479                                          TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3480                                          TCA_FLOWER_KEY_IP_TOS_MASK,
3481                                          mask.ipv4->hdr.type_of_service);
3482                         }
3483                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3484                         break;
3485                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3486                         bool ipv6_src, ipv6_dst;
3487                         uint8_t msk6, tos6;
3488
3489                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3490                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3491                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3492                         mask.ipv6 = flow_tcf_item_mask
3493                                 (items, &rte_flow_item_ipv6_mask,
3494                                  &flow_tcf_mask_supported.ipv6,
3495                                  &flow_tcf_mask_empty.ipv6,
3496                                  sizeof(flow_tcf_mask_supported.ipv6),
3497                                  error);
3498                         assert(mask.ipv6);
3499                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3500                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3501                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3502                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3503                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3504                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3505                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3506                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3507                         } else {
3508                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3509                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3510                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3511                         }
3512                         spec.ipv6 = items->spec;
3513                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3514                                 /*
3515                                  * There is no way to specify the IP protocol
3516                                  * key for outer tunnel layers; it is usually
3517                                  * fixed anyway, e.g. to UDP for VXLAN/VXLAN-GPE.
3518                                  */
3519                                 assert(spec.ipv6); /* Mask is not empty. */
3520                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3521                                                 spec.ipv6->hdr.proto);
3522                                 ip_proto_set = 1;
3523                         }
3524                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3525                                                 (mask.ipv6->hdr.dst_addr);
3526                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3527                                                 (mask.ipv6->hdr.src_addr);
3528                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3529                              (!ipv6_dst && !ipv6_src)) {
3530                                 if (!tunnel_outer)
3531                                         break;
3532                                 /*
3533                                  * For the tunnel outer layer the outer IP key
3534                                  * must be set anyway, even if the spec/mask is
3535                                  * empty. There is no other way to tell the
3536                                  * kernel about the outer layer protocol.
3537                                  */
3538                                 mnl_attr_put(nlh,
3539                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3540                                              IPV6_ADDR_LEN,
3541                                              mask.ipv6->hdr.src_addr);
3542                                 mnl_attr_put(nlh,
3543                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3544                                              IPV6_ADDR_LEN,
3545                                              mask.ipv6->hdr.src_addr);
3546                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3547                                 break;
3548                         }
3549                         if (ipv6_src) {
3550                                 mnl_attr_put(nlh, tunnel_outer ?
3551                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3552                                              TCA_FLOWER_KEY_IPV6_SRC,
3553                                              IPV6_ADDR_LEN,
3554                                              spec.ipv6->hdr.src_addr);
3555                                 mnl_attr_put(nlh, tunnel_outer ?
3556                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3557                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3558                                              IPV6_ADDR_LEN,
3559                                              mask.ipv6->hdr.src_addr);
3560                         }
3561                         if (ipv6_dst) {
3562                                 mnl_attr_put(nlh, tunnel_outer ?
3563                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3564                                              TCA_FLOWER_KEY_IPV6_DST,
3565                                              IPV6_ADDR_LEN,
3566                                              spec.ipv6->hdr.dst_addr);
3567                                 mnl_attr_put(nlh, tunnel_outer ?
3568                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3569                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3570                                              IPV6_ADDR_LEN,
3571                                              mask.ipv6->hdr.dst_addr);
3572                         }
3573                         if (mask.ipv6->hdr.hop_limits) {
3574                                 mnl_attr_put_u8
3575                                         (nlh, tunnel_outer ?
3576                                          TCA_FLOWER_KEY_ENC_IP_TTL :
3577                                          TCA_FLOWER_KEY_IP_TTL,
3578                                          spec.ipv6->hdr.hop_limits);
3579                                 mnl_attr_put_u8
3580                                         (nlh, tunnel_outer ?
3581                                          TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3582                                          TCA_FLOWER_KEY_IP_TTL_MASK,
3583                                          mask.ipv6->hdr.hop_limits);
3584                         }
3585                         msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3586                                 IPV6_HDR_TC_SHIFT) & 0xff;
3587                         if (msk6) {
3588                                 tos6 = (rte_be_to_cpu_32
3589                                         (spec.ipv6->hdr.vtc_flow) >>
3590                                                 IPV6_HDR_TC_SHIFT) & 0xff;
3591                                 mnl_attr_put_u8
3592                                         (nlh, tunnel_outer ?
3593                                          TCA_FLOWER_KEY_ENC_IP_TOS :
3594                                          TCA_FLOWER_KEY_IP_TOS, tos6);
3595                                 mnl_attr_put_u8
3596                                         (nlh, tunnel_outer ?
3597                                          TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3598                                          TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3599                         }
3600                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3601                         break;
3602                 }
3603                 case RTE_FLOW_ITEM_TYPE_UDP:
3604                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3605                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3606                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3607                         mask.udp = flow_tcf_item_mask
3608                                 (items, &rte_flow_item_udp_mask,
3609                                  &flow_tcf_mask_supported.udp,
3610                                  &flow_tcf_mask_empty.udp,
3611                                  sizeof(flow_tcf_mask_supported.udp),
3612                                  error);
3613                         assert(mask.udp);
3614                         spec.udp = items->spec;
3615                         if (!tunnel_outer) {
3616                                 if (!ip_proto_set)
3617                                         mnl_attr_put_u8
3618                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3619                                                 IPPROTO_UDP);
3620                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3621                                         break;
3622                         } else {
3623                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3624                                 decap.vxlan->udp_port =
3625                                         rte_be_to_cpu_16
3626                                                 (spec.udp->hdr.dst_port);
3627                         }
3628                         if (mask.udp->hdr.src_port) {
3629                                 mnl_attr_put_u16
3630                                         (nlh, tunnel_outer ?
3631                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3632                                          TCA_FLOWER_KEY_UDP_SRC,
3633                                          spec.udp->hdr.src_port);
3634                                 mnl_attr_put_u16
3635                                         (nlh, tunnel_outer ?
3636                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3637                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3638                                          mask.udp->hdr.src_port);
3639                         }
3640                         if (mask.udp->hdr.dst_port) {
3641                                 mnl_attr_put_u16
3642                                         (nlh, tunnel_outer ?
3643                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3644                                          TCA_FLOWER_KEY_UDP_DST,
3645                                          spec.udp->hdr.dst_port);
3646                                 mnl_attr_put_u16
3647                                         (nlh, tunnel_outer ?
3648                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3649                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3650                                          mask.udp->hdr.dst_port);
3651                         }
3652                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3653                         break;
3654                 case RTE_FLOW_ITEM_TYPE_TCP:
3655                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3656                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3657                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3658                         mask.tcp = flow_tcf_item_mask
3659                                 (items, &rte_flow_item_tcp_mask,
3660                                  &flow_tcf_mask_supported.tcp,
3661                                  &flow_tcf_mask_empty.tcp,
3662                                  sizeof(flow_tcf_mask_supported.tcp),
3663                                  error);
3664                         assert(mask.tcp);
3665                         if (!ip_proto_set)
3666                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3667                                                 IPPROTO_TCP);
3668                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3669                                 break;
3670                         spec.tcp = items->spec;
3671                         if (mask.tcp->hdr.src_port) {
3672                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3673                                                  spec.tcp->hdr.src_port);
3674                                 mnl_attr_put_u16(nlh,
3675                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3676                                                  mask.tcp->hdr.src_port);
3677                         }
3678                         if (mask.tcp->hdr.dst_port) {
3679                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3680                                                  spec.tcp->hdr.dst_port);
3681                                 mnl_attr_put_u16(nlh,
3682                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3683                                                  mask.tcp->hdr.dst_port);
3684                         }
3685                         if (mask.tcp->hdr.tcp_flags) {
3686                                 mnl_attr_put_u16
3687                                         (nlh,
3688                                          TCA_FLOWER_KEY_TCP_FLAGS,
3689                                          rte_cpu_to_be_16
3690                                                 (spec.tcp->hdr.tcp_flags));
3691                                 mnl_attr_put_u16
3692                                         (nlh,
3693                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3694                                          rte_cpu_to_be_16
3695                                                 (mask.tcp->hdr.tcp_flags));
3696                         }
3697                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3698                         break;
3699                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3700                         assert(decap.vxlan);
3701                         tunnel_outer = 0;
3702                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3703                         spec.vxlan = items->spec;
3704                         mnl_attr_put_u32(nlh,
3705                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3706                                          vxlan_vni_as_be32(spec.vxlan->vni));
3707                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3708                         break;
3709                 default:
3710                         return rte_flow_error_set(error, ENOTSUP,
3711                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3712                                                   NULL, "item not supported");
3713                 }
3714         }
3715         /*
3716          * Set the ether_type flower key and tc rule protocol:
3717          * - if there is neither a VLAN nor a VXLAN item, the key is
3718          *   taken from the eth item directly or deduced from L3 items;
3719          * - if there is a vlan item, the key is fixed to 802.1q;
3720          * - if there is a vxlan item, the key is set to the inner
3721          *   tunnel type; simultaneous vlan and vxlan are prohibited.
3722          */
3723         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3724                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3725                                            outer_etype);
3726                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3727                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3728                                 mnl_attr_put_u16(nlh,
3729                                                  TCA_FLOWER_KEY_ETH_TYPE,
3730                                                  inner_etype);
3731                 } else {
3732                         mnl_attr_put_u16(nlh,
3733                                          TCA_FLOWER_KEY_ETH_TYPE,
3734                                          outer_etype);
3735                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3736                             vlan_etype != RTE_BE16(ETH_P_ALL))
3737                                 mnl_attr_put_u16(nlh,
3738                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3739                                                  vlan_etype);
3740                 }
3741                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3742         }
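        /*
         * Illustrative example (added annotation, not upstream code): for a
         * pattern like eth / vlan / ipv4 the keys resolve to
         * outer_etype == ETH_P_8021Q and vlan_etype == ETH_P_IP, roughly
         * the tc(8) counterpart of:
         *
         *   tc filter add dev <ifouter> ... protocol 802.1q \
         *       flower vlan_ethtype ipv4 ...
         */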
3743         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3744         na_act_index_cur = 1;
3745         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3746                 struct nlattr *na_act_index;
3747                 struct nlattr *na_act;
3748                 unsigned int vlan_act;
3749                 unsigned int i;
3750
3751                 switch (actions->type) {
3752                 case RTE_FLOW_ACTION_TYPE_VOID:
3753                         break;
3754                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3755                         conf.port_id = actions->conf;
3756                         if (conf.port_id->original)
3757                                 i = 0;
3758                         else
3759                                 for (i = 0; ptoi[i].ifindex; ++i)
3760                                         if (ptoi[i].port_id == conf.port_id->id)
3761                                                 break;
3762                         assert(ptoi[i].ifindex);
3763                         na_act_index =
3764                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3765                         assert(na_act_index);
3766                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3767                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3768                         assert(na_act);
3769                         if (encap.hdr) {
3770                                 assert(dev_flow->tcf.tunnel);
3771                                 dev_flow->tcf.tunnel->ifindex_ptr =
3772                                         &((struct tc_mirred *)
3773                                         mnl_attr_get_payload
3774                                         (mnl_nlmsg_get_payload_tail
3775                                                 (nlh)))->ifindex;
3776                         }
3777                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3778                                      sizeof(struct tc_mirred),
3779                                      &(struct tc_mirred){
3780                                         .action = TC_ACT_STOLEN,
3781                                         .eaction = TCA_EGRESS_REDIR,
3782                                         .ifindex = ptoi[i].ifindex,
3783                                      });
3784                         mnl_attr_nest_end(nlh, na_act);
3785                         mnl_attr_nest_end(nlh, na_act_index);
3786                         break;
3787                 case RTE_FLOW_ACTION_TYPE_JUMP:
3788                         conf.jump = actions->conf;
3789                         na_act_index =
3790                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3791                         assert(na_act_index);
3792                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3793                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3794                         assert(na_act);
3795                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3796                                      sizeof(struct tc_gact),
3797                                      &(struct tc_gact){
3798                                         .action = TC_ACT_GOTO_CHAIN |
3799                                                   conf.jump->group,
3800                                      });
3801                         mnl_attr_nest_end(nlh, na_act);
3802                         mnl_attr_nest_end(nlh, na_act_index);
3803                         break;
3804                 case RTE_FLOW_ACTION_TYPE_DROP:
3805                         na_act_index =
3806                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3807                         assert(na_act_index);
3808                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3809                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3810                         assert(na_act);
3811                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3812                                      sizeof(struct tc_gact),
3813                                      &(struct tc_gact){
3814                                         .action = TC_ACT_SHOT,
3815                                      });
3816                         mnl_attr_nest_end(nlh, na_act);
3817                         mnl_attr_nest_end(nlh, na_act_index);
3818                         break;
3819                 case RTE_FLOW_ACTION_TYPE_COUNT:
3820                         /*
3821                          * The driver adds the count action implicitly
3822                          * for each rule it creates.
3823                          */
3824                         ret = flow_tcf_translate_action_count(dev,
3825                                                               dev_flow, error);
3826                         if (ret < 0)
3827                                 return ret;
3828                         break;
3829                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3830                         conf.of_push_vlan = NULL;
3831                         vlan_act = TCA_VLAN_ACT_POP;
3832                         goto action_of_vlan;
3833                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3834                         conf.of_push_vlan = actions->conf;
3835                         vlan_act = TCA_VLAN_ACT_PUSH;
3836                         goto action_of_vlan;
3837                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3838                         conf.of_set_vlan_vid = actions->conf;
3839                         if (na_vlan_id)
3840                                 goto override_na_vlan_id;
3841                         vlan_act = TCA_VLAN_ACT_MODIFY;
3842                         goto action_of_vlan;
3843                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3844                         conf.of_set_vlan_pcp = actions->conf;
3845                         if (na_vlan_priority)
3846                                 goto override_na_vlan_priority;
3847                         vlan_act = TCA_VLAN_ACT_MODIFY;
3848                         goto action_of_vlan;
3849 action_of_vlan:
3850                         na_act_index =
3851                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3852                         assert(na_act_index);
3853                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3854                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3855                         assert(na_act);
3856                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3857                                      sizeof(struct tc_vlan),
3858                                      &(struct tc_vlan){
3859                                         .action = TC_ACT_PIPE,
3860                                         .v_action = vlan_act,
3861                                      });
3862                         if (vlan_act == TCA_VLAN_ACT_POP) {
3863                                 mnl_attr_nest_end(nlh, na_act);
3864                                 mnl_attr_nest_end(nlh, na_act_index);
3865                                 break;
3866                         }
3867                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3868                                 mnl_attr_put_u16(nlh,
3869                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3870                                                  conf.of_push_vlan->ethertype);
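                        /*
                         * Reserve placeholder attributes (TCA_VLAN_PAD) for
                         * the VLAN id and priority; a subsequent
                         * SET_VLAN_VID/PCP action rewrites their type and
                         * payload in place (see the override labels below).
                         */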
3871                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3872                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3873                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3874                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3875                         mnl_attr_nest_end(nlh, na_act);
3876                         mnl_attr_nest_end(nlh, na_act_index);
3877                         if (actions->type ==
3878                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3879 override_na_vlan_id:
3880                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3881                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3882                                         rte_be_to_cpu_16
3883                                         (conf.of_set_vlan_vid->vlan_vid);
3884                         } else if (actions->type ==
3885                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3886 override_na_vlan_priority:
3887                                 na_vlan_priority->nla_type =
3888                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3889                                 *(uint8_t *)mnl_attr_get_payload
3890                                         (na_vlan_priority) =
3891                                         conf.of_set_vlan_pcp->vlan_pcp;
3892                         }
3893                         break;
3894                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3895                         assert(decap.vxlan);
3896                         assert(dev_flow->tcf.tunnel);
3897                         dev_flow->tcf.tunnel->ifindex_ptr =
3898                                 (unsigned int *)&tcm->tcm_ifindex;
3899                         na_act_index =
3900                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3901                         assert(na_act_index);
3902                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3903                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3904                         assert(na_act);
3905                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3906                                 sizeof(struct tc_tunnel_key),
3907                                 &(struct tc_tunnel_key){
3908                                         .action = TC_ACT_PIPE,
3909                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3910                                         });
3911                         mnl_attr_nest_end(nlh, na_act);
3912                         mnl_attr_nest_end(nlh, na_act_index);
3913                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3914                         break;
3915                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3916                         assert(encap.vxlan);
3917                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3918                         na_act_index =
3919                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3920                         assert(na_act_index);
3921                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3922                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3923                         assert(na_act);
3924                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3925                                 sizeof(struct tc_tunnel_key),
3926                                 &(struct tc_tunnel_key){
3927                                         .action = TC_ACT_PIPE,
3928                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3929                                         });
3930                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3931                                 mnl_attr_put_u16(nlh,
3932                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3933                                          encap.vxlan->udp.dst);
3934                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3935                                 mnl_attr_put_u32(nlh,
3936                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3937                                          encap.vxlan->ipv4.src);
3938                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3939                                 mnl_attr_put_u32(nlh,
3940                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3941                                          encap.vxlan->ipv4.dst);
3942                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3943                                 mnl_attr_put(nlh,
3944                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3945                                          sizeof(encap.vxlan->ipv6.src),
3946                                          &encap.vxlan->ipv6.src);
3947                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3948                                 mnl_attr_put(nlh,
3949                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3950                                          sizeof(encap.vxlan->ipv6.dst),
3951                                          &encap.vxlan->ipv6.dst);
3952                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3953                                 mnl_attr_put_u8(nlh,
3954                                          TCA_TUNNEL_KEY_ENC_TTL,
3955                                          encap.vxlan->ip_ttl_hop);
3956                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3957                                 mnl_attr_put_u8(nlh,
3958                                          TCA_TUNNEL_KEY_ENC_TOS,
3959                                          encap.vxlan->ip_tos);
3960                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3961                                 mnl_attr_put_u32(nlh,
3962                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3963                                          vxlan_vni_as_be32
3964                                                 (encap.vxlan->vxlan.vni));
3965                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3966                         mnl_attr_nest_end(nlh, na_act);
3967                         mnl_attr_nest_end(nlh, na_act_index);
3968                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3969                         break;
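                        /*
                         * Illustrative tc(8) counterpart of the tunnel_key
                         * action built above (annotation, values
                         * hypothetical):
                         *
                         *   tc ... action tunnel_key set src_ip 1.1.1.1 \
                         *       dst_ip 2.2.2.2 dst_port 4789 id 42 nocsum
                         */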
3970                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3971                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3972                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3973                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3974                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3975                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3976                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3977                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3978                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3979                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3980                         na_act_index =
3981                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3982                         flow_tcf_create_pedit_mnl_msg(nlh,
3983                                                       &actions, item_flags);
3984                         mnl_attr_nest_end(nlh, na_act_index);
3985                         break;
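                        /*
                         * The header rewrite actions are translated into a
                         * single "pedit" action; an illustrative tc(8)
                         * counterpart (annotation, values hypothetical):
                         *
                         *   tc ... action pedit ex munge ip ttl set 64 pipe
                         */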
3986                 default:
3987                         return rte_flow_error_set(error, ENOTSUP,
3988                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3989                                                   actions,
3990                                                   "action not supported");
3991                 }
3992         }
3993         assert(na_flower);
3994         assert(na_flower_act);
3995         mnl_attr_nest_end(nlh, na_flower_act);
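        /*
         * Remember where the TCA_FLOWER_FLAGS value will be stored so the
         * classifier flags can be adjusted after the message is composed.
         */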
3996         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3997                                         (mnl_nlmsg_get_payload_tail(nlh));
3998         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3999                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
4000         mnl_attr_nest_end(nlh, na_flower);
4001         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4002                 dev_flow->tcf.tunnel->ifindex_org =
4003                         *dev_flow->tcf.tunnel->ifindex_ptr;
4004         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4005         return 0;
4006 }
4007
4008 /**
4009  * Send Netlink message with acknowledgment.
4010  *
4011  * @param tcf
4012  *   Flow context to use.
4013  * @param nlh
4014  *   Message to send. This function always sets the NLM_F_ACK flag before
4015  *   sending.
4016  * @param[in] cb
4017  *   Callback handler for received message.
4018  * @param[in] arg
4019  *   Context pointer for callback handler.
4020  *
4021  * @return
4022  *   0 on success, a negative errno value otherwise and rte_errno is set.
4023  */
4024 static int
4025 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4026                 struct nlmsghdr *nlh,
4027                 mnl_cb_t cb, void *arg)
4028 {
4029         unsigned int portid = mnl_socket_get_portid(tcf->nl);
4030         uint32_t seq = tcf->seq++;
4031         int ret, err = 0;
4032
4033         assert(tcf->nl);
4034         assert(tcf->buf);
4035         if (!seq) {
4036                 /* seq 0 is reserved for kernel event-driven notifications. */
4037                 seq = tcf->seq++;
4038         }
4039         nlh->nlmsg_seq = seq;
4040         nlh->nlmsg_flags |= NLM_F_ACK;
4041         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4042         if (ret <= 0) {
4043                 /* Message send error occurred. */
4044                 rte_errno = errno;
4045                 return -rte_errno;
4046         }
4047         nlh = (struct nlmsghdr *)(tcf->buf);
4048         /*
4049          * The following loop postpones non-fatal errors until multipart
4050          * messages are complete.
4051          */
4052         while (true) {
4053                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4054                 if (ret < 0) {
4055                         err = errno;
4056                         /*
4057                          * In case of overflow keep receiving until the
4058                          * end of the multipart message. Part of the reply
4059                          * may be lost, so record and return an error.
4060                          */
4061                         if (err != ENOSPC ||
4062                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4063                             nlh->nlmsg_type == NLMSG_DONE)
4064                                 break;
4065                 } else {
4066                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4067                         if (!ret) {
4068                                 /*
4069                                  * libmnl returns 0 if a DONE or
4070                                  * successful ACK message is found.
4071                                  */
4072                                 break;
4073                         }
4074                         if (ret < 0) {
4075                                 /*
4076                                  * An ACK message carrying an error
4077                                  * was found or some error occurred.
4078                                  */
4079                                 err = errno;
4080                                 break;
4081                         }
4082                         /* We should continue receiving. */
4083                 }
4084         }
4085         if (!err)
4086                 return 0;
4087         rte_errno = err;
4088         return -err;
4089 }
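/*
 * Usage sketch (added annotation, not upstream code), assuming the rule
 * message was composed by the translate routine above into
 * dev_flow->tcf.nlh:
 *
 *	struct nlmsghdr *nlh = dev_flow->tcf.nlh;
 *
 *	nlh->nlmsg_type = RTM_NEWTFILTER;
 *	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
 *	if (flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
 *		... rte_errno holds the failure reason ...
 */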
4090
4091 #define MNL_BUF_EXTRA_SPACE 16
4092 #define MNL_REQUEST_SIZE_MIN 256
4093 #define MNL_REQUEST_SIZE_MAX 2048
4094 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4095                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
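/*
 * Worked example (annotation): with the common 4096-byte page size,
 * MNL_REQUEST_SIZE = RTE_MIN(RTE_MAX(4096, 256), 2048) = 2048 bytes;
 * a hypothetical 512-byte page would give 512 bytes.
 */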
4096
4097 /* Data structures used by flow_tcf_xxx_cb() routines. */
4098 struct tcf_nlcb_buf {
4099         LIST_ENTRY(tcf_nlcb_buf) next;
4100         uint32_t size;
4101         alignas(struct nlmsghdr)
4102         uint8_t msg[]; /**< Netlink message data. */
4103 };
4104
4105 struct tcf_nlcb_context {
4106         unsigned int ifindex; /**< Base interface index. */
4107         uint32_t bufsize;
4108         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4109 };
4110
4111 /**
4112  * Allocate space for a netlink command in the buffer list.
4113  *
4114  * @param[in, out] ctx
4115  *   Pointer to callback context with command buffers list.
4116  * @param[in] size
4117  *   Required size of data buffer to be allocated.
4118  *
4119  * @return
4120  *   Pointer to allocated memory, aligned as message header.
4121  *   NULL if some error occurred.
4122  */
4123 static struct nlmsghdr *
4124 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4125 {
4126         struct tcf_nlcb_buf *buf;
4127         struct nlmsghdr *nlh;
4128
4129         size = NLMSG_ALIGN(size);
4130         buf = LIST_FIRST(&ctx->nlbuf);
4131         if (buf && (buf->size + size) <= ctx->bufsize) {
4132                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4133                 buf->size += size;
4134                 return nlh;
4135         }
4136         if (size > ctx->bufsize) {
4137                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4138                 return NULL;
4139         }
4140         buf = rte_malloc(__func__,
4141                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4142                         alignof(struct tcf_nlcb_buf));
4143         if (!buf) {
4144                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4145                 return NULL;
4146         }
4147         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4148         buf->size = size;
4149         nlh = (struct nlmsghdr *)&buf->msg[0];
4150         return nlh;
4151 }
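/*
 * Buffer layout sketch (annotation): commands are packed back to back
 * into the head buffer until ctx->bufsize is exhausted:
 *
 *	buf->msg: [nlmsghdr + cmd 0][nlmsghdr + cmd 1] ...  <- buf->size
 *
 * A fresh tcf_nlcb_buf is prepended to the list only when the requested
 * size does not fit into the remaining space.
 */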
4152
4153 /**
4154  * Send the buffers with prepared netlink commands. Scans the list and
4155  * sends all found buffers. Buffers are sent and freed regardless of
4156  * outcome, in order to prevent memory leakage if an error occurs.
4157  *
4158  * @param[in] tcf
4159  *   Context object initialized by mlx5_flow_tcf_context_create().
4160  * @param[in, out] ctx
4161  *   Pointer to callback context with command buffers list.
4162  *
4163  * @return
4164  *   Zero value on success, negative errno value otherwise
4165  *   and rte_errno is set.
4166  */
4167 static int
4168 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4169                     struct tcf_nlcb_context *ctx)
4170 {
4171         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4172         int ret = 0;
4173
4174         while (bc) {
4175                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4176                 struct nlmsghdr *nlh;
4177                 uint32_t msg = 0;
4178                 int rc;
4179
4180                 while (msg < bc->size) {
4181                         /*
4182                          * Send the Netlink commands from the buffer one by
4183                          * one. If multiple rule deletion commands were sent
4184                          * in one Netlink message and an error occurred, it
4185                          * could cause multiple ACK error messages and break
4186                          * the sequence numbering of the Netlink exchange,
4187                          * because only a single ACK reply is expected.
4188                          */
4189                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4190                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4191                         assert((bc->size - msg) >= nlh->nlmsg_len);
4192                         msg += nlh->nlmsg_len;
4193                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4194                         if (rc) {
4195                                 DRV_LOG(WARNING,
4196                                         "netlink: cleanup error %d", rc);
4197                                 if (!ret)
4198                                         ret = rc;
4199                         }
4200                 }
4201                 rte_free(bc);
4202                 bc = bn;
4203         }
4204         LIST_INIT(&ctx->nlbuf);
4205         return ret;
4206 }
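/*
 * Typical collect-then-flush pattern (annotation, mirrored by the
 * cleanup routines below; <obj> is a placeholder):
 *
 *	ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_<obj>_cb, &ctx);
 *	...
 *	ret = flow_tcf_send_nlcmd(tcf, &ctx);
 */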
4207
4208 /**
4209  * Collect local IP address rules with the scope link attribute on the
4210  * specified network device. This callback routine is called by libmnl
4211  * mnl_cb_run() in a loop for every message in a received packet.
4212  *
4213  * @param[in] nlh
4214  *   Pointer to reply header.
4215  * @param[in, out] arg
4216  *   Opaque data pointer for this callback.
4217  *
4218  * @return
4219  *   A positive, nonzero value on success, negative errno value otherwise
4220  *   and rte_errno is set.
4221  */
4222 static int
4223 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4224 {
4225         struct tcf_nlcb_context *ctx = arg;
4226         struct nlmsghdr *cmd;
4227         struct ifaddrmsg *ifa;
4228         struct nlattr *na;
4229         struct nlattr *na_local = NULL;
4230         struct nlattr *na_peer = NULL;
4231         unsigned char family;
4232         uint32_t size;
4233
4234         if (nlh->nlmsg_type != RTM_NEWADDR) {
4235                 rte_errno = EINVAL;
4236                 return -rte_errno;
4237         }
4238         ifa = mnl_nlmsg_get_payload(nlh);
4239         family = ifa->ifa_family;
4240         if (ifa->ifa_index != ctx->ifindex ||
4241             ifa->ifa_scope != RT_SCOPE_LINK ||
4242             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4243             (family != AF_INET && family != AF_INET6))
4244                 return 1;
4245         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4246                 switch (mnl_attr_get_type(na)) {
4247                 case IFA_LOCAL:
4248                         na_local = na;
4249                         break;
4250                 case IFA_ADDRESS:
4251                         na_peer = na;
4252                         break;
4253                 }
4254                 if (na_local && na_peer)
4255                         break;
4256         }
4257         if (!na_local || !na_peer)
4258                 return 1;
4259         /* Local rule found: scope link, permanent, with assigned peer. */
4260         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4261                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4262                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4263                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4264         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4265         if (!cmd) {
4266                 rte_errno = ENOMEM;
4267                 return -rte_errno;
4268         }
4269         cmd = mnl_nlmsg_put_header(cmd);
4270         cmd->nlmsg_type = RTM_DELADDR;
4271         cmd->nlmsg_flags = NLM_F_REQUEST;
4272         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4273         ifa->ifa_flags = IFA_F_PERMANENT;
4274         ifa->ifa_scope = RT_SCOPE_LINK;
4275         ifa->ifa_index = ctx->ifindex;
4276         if (family == AF_INET) {
4277                 ifa->ifa_family = AF_INET;
4278                 ifa->ifa_prefixlen = 32;
4279                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4280                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4281         } else {
4282                 ifa->ifa_family = AF_INET6;
4283                 ifa->ifa_prefixlen = 128;
4284                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4285                         mnl_attr_get_payload(na_local));
4286                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4287                         mnl_attr_get_payload(na_peer));
4288         }
4289         assert(size == cmd->nlmsg_len);
4290         return 1;
4291 }
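/*
 * The RTM_DELADDR command assembled above is roughly the Netlink
 * counterpart of (annotation, addresses taken from the dumped rule):
 *
 *	ip addr del <local>/32 peer <peer> dev <ifouter> scope link
 *
 * with a /128 prefix length in the AF_INET6 case.
 */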
4292
4293 /**
4294  * Clean up the local IP addresses on the outer interface.
4295  *
4296  * @param[in] tcf
4297  *   Context object initialized by mlx5_flow_tcf_context_create().
4298  * @param[in] ifindex
4299  *   Network interface index to perform the cleanup on.
4300  */
4301 static void
4302 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4303                             unsigned int ifindex)
4304 {
4305         struct nlmsghdr *nlh;
4306         struct ifaddrmsg *ifa;
4307         struct tcf_nlcb_context ctx = {
4308                 .ifindex = ifindex,
4309                 .bufsize = MNL_REQUEST_SIZE,
4310                 .nlbuf = LIST_HEAD_INITIALIZER(),
4311         };
4312         int ret;
4313
4314         assert(ifindex);
4315         /*
4316          * Seek and destroy leftover local IP addresses with
4317          * the matching "scope link" property.
4318          */
4319         nlh = mnl_nlmsg_put_header(tcf->buf);
4320         nlh->nlmsg_type = RTM_GETADDR;
4321         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4322         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4323         ifa->ifa_family = AF_UNSPEC;
4324         ifa->ifa_index = ifindex;
4325         ifa->ifa_scope = RT_SCOPE_LINK;
4326         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4327         if (ret)
4328                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4329         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4330         if (ret)
4331                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4332 }
4333
4334 /**
4335  * Collect permanent neigh rules on the specified network device.
4336  * This callback routine is called by libmnl mnl_cb_run() in a loop
4337  * for every message in a received packet.
4338  *
4339  * @param[in] nlh
4340  *   Pointer to reply header.
4341  * @param[in, out] arg
4342  *   Opaque data pointer for this callback.
4343  *
4344  * @return
4345  *   A positive, nonzero value on success, negative errno value otherwise
4346  *   and rte_errno is set.
4347  */
4348 static int
4349 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4350 {
4351         struct tcf_nlcb_context *ctx = arg;
4352         struct nlmsghdr *cmd;
4353         struct ndmsg *ndm;
4354         struct nlattr *na;
4355         struct nlattr *na_ip = NULL;
4356         struct nlattr *na_mac = NULL;
4357         unsigned char family;
4358         uint32_t size;
4359
4360         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4361                 rte_errno = EINVAL;
4362                 return -rte_errno;
4363         }
4364         ndm = mnl_nlmsg_get_payload(nlh);
4365         family = ndm->ndm_family;
4366         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4367            !(ndm->ndm_state & NUD_PERMANENT) ||
4368            (family != AF_INET && family != AF_INET6))
4369                 return 1;
4370         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4371                 switch (mnl_attr_get_type(na)) {
4372                 case NDA_DST:
4373                         na_ip = na;
4374                         break;
4375                 case NDA_LLADDR:
4376                         na_mac = na;
4377                         break;
4378                 }
4379                 if (na_mac && na_ip)
4380                         break;
4381         }
4382         if (!na_mac || !na_ip)
4383                 return 1;
4384         /* Neigh rule with the permanent attribute found. */
4385         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4386                MNL_ALIGN(sizeof(struct ndmsg)) +
4387                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4388                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4389                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4390         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4391         if (!cmd) {
4392                 rte_errno = ENOMEM;
4393                 return -rte_errno;
4394         }
4395         cmd = mnl_nlmsg_put_header(cmd);
4396         cmd->nlmsg_type = RTM_DELNEIGH;
4397         cmd->nlmsg_flags = NLM_F_REQUEST;
4398         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4399         ndm->ndm_ifindex = ctx->ifindex;
4400         ndm->ndm_state = NUD_PERMANENT;
4401         ndm->ndm_flags = 0;
4402         ndm->ndm_type = 0;
4403         if (family == AF_INET) {
4404                 ndm->ndm_family = AF_INET;
4405                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4406         } else {
4407                 ndm->ndm_family = AF_INET6;
4408                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4409                              mnl_attr_get_payload(na_ip));
4410         }
4411         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4412                      mnl_attr_get_payload(na_mac));
4413         assert(size == cmd->nlmsg_len);
4414         return 1;
4415 }
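/*
 * The RTM_DELNEIGH command assembled above is roughly the Netlink
 * counterpart of (annotation):
 *
 *	ip neigh del <dst_ip> lladdr <mac> dev <ifouter>
 */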
4416
4417 /**
4418  * Clean up the neigh rules on the outer interface.
4419  *
4420  * @param[in] tcf
4421  *   Context object initialized by mlx5_flow_tcf_context_create().
4422  * @param[in] ifindex
4423  *   Network interface index to perform the cleanup on.
4424  */
4425 static void
4426 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4427                             unsigned int ifindex)
4428 {
4429         struct nlmsghdr *nlh;
4430         struct ndmsg *ndm;
4431         struct tcf_nlcb_context ctx = {
4432                 .ifindex = ifindex,
4433                 .bufsize = MNL_REQUEST_SIZE,
4434                 .nlbuf = LIST_HEAD_INITIALIZER(),
4435         };
4436         int ret;
4437
4438         assert(ifindex);
4439         /* Seek and destroy leftover neigh rules. */
4440         nlh = mnl_nlmsg_put_header(tcf->buf);
4441         nlh->nlmsg_type = RTM_GETNEIGH;
4442         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4443         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4444         ndm->ndm_family = AF_UNSPEC;
4445         ndm->ndm_ifindex = ifindex;
4446         ndm->ndm_state = NUD_PERMANENT;
4447         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4448         if (ret)
4449                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4450         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4451         if (ret)
4452                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4453 }
4454
4455 /**
4456  * Collect indices of VXLAN encap/decap interfaces associated with the
4457  * device. This callback routine is called by libmnl mnl_cb_run() in a
4458  * loop for every message in a received packet.
4459  *
4460  * @param[in] nlh
4461  *   Pointer to reply header.
4462  * @param[in, out] arg
4463  *   Opaque data pointer for this callback.
4464  *
4465  * @return
4466  *   A positive, nonzero value on success, negative errno value otherwise
4467  *   and rte_errno is set.
4468  */
4469 static int
4470 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4471 {
4472         struct tcf_nlcb_context *ctx = arg;
4473         struct nlmsghdr *cmd;
4474         struct ifinfomsg *ifm;
4475         struct nlattr *na;
4476         struct nlattr *na_info = NULL;
4477         struct nlattr *na_vxlan = NULL;
4478         bool found = false;
4479         unsigned int vxindex;
4480         uint32_t size;
4481
4482         if (nlh->nlmsg_type != RTM_NEWLINK) {
4483                 rte_errno = EINVAL;
4484                 return -rte_errno;
4485         }
4486         ifm = mnl_nlmsg_get_payload(nlh);
4487         if (!ifm->ifi_index) {
4488                 rte_errno = EINVAL;
4489                 return -rte_errno;
4490         }
4491         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4492                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4493                         na_info = na;
4494                         break;
4495                 }
4496         if (!na_info)
4497                 return 1;
4498         mnl_attr_for_each_nested(na, na_info) {
4499                 switch (mnl_attr_get_type(na)) {
4500                 case IFLA_INFO_KIND:
4501                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4502                                      mnl_attr_get_len(na)))
4503                                 found = true;
4504                         break;
4505                 case IFLA_INFO_DATA:
4506                         na_vxlan = na;
4507                         break;
4508                 }
4509                 if (found && na_vxlan)
4510                         break;
4511         }
4512         if (!found || !na_vxlan)
4513                 return 1;
4514         found = false;
4515         mnl_attr_for_each_nested(na, na_vxlan) {
4516                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4517                     mnl_attr_get_u32(na) == ctx->ifindex) {
4518                         found = true;
4519                         break;
4520                 }
4521         }
4522         if (!found)
4523                 return 1;
4524         /* Attached VXLAN device found; store the command to delete it. */
4525         vxindex = ifm->ifi_index;
4526         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4527                MNL_ALIGN(sizeof(struct ifinfomsg));
4528         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4529         if (!cmd) {
4530                 rte_errno = ENOMEM;
4531                 return -rte_errno;
4532         }
4533         cmd = mnl_nlmsg_put_header(cmd);
4534         cmd->nlmsg_type = RTM_DELLINK;
4535         cmd->nlmsg_flags = NLM_F_REQUEST;
4536         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4537         ifm->ifi_family = AF_UNSPEC;
4538         ifm->ifi_index = vxindex;
4539         assert(size == cmd->nlmsg_len);
4540         return 1;
4541 }
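/*
 * The stored RTM_DELLINK command is roughly the Netlink counterpart of
 * (annotation): `ip link del <vxlan-dev>` for every VXLAN device whose
 * IFLA_VXLAN_LINK attribute points at ctx->ifindex.
 */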
4542
4543 /**
4544  * Clean up the outer interface. Removes all VXLAN devices found
4545  * attached to the specified index and flushes the neigh and local
4546  * IP databases.
4547  *
4548  * @param[in] tcf
4549  *   Context object initialized by mlx5_flow_tcf_context_create().
4550  * @param[in] ifindex
4551  *   Network interface index to perform the cleanup on.
4552  */
4553 static void
4554 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4555                             unsigned int ifindex)
4556 {
4557         struct nlmsghdr *nlh;
4558         struct ifinfomsg *ifm;
4559         struct tcf_nlcb_context ctx = {
4560                 .ifindex = ifindex,
4561                 .bufsize = MNL_REQUEST_SIZE,
4562                 .nlbuf = LIST_HEAD_INITIALIZER(),
4563         };
4564         int ret;
4565
4566         assert(ifindex);
4567         /*
4568          * Seek and destroy leftover VXLAN encap/decap interfaces with
4569          * matching properties.
4570          */
4571         nlh = mnl_nlmsg_put_header(tcf->buf);
4572         nlh->nlmsg_type = RTM_GETLINK;
4573         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4574         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4575         ifm->ifi_family = AF_UNSPEC;
4576         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4577         if (ret)
4578                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4579         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4580         if (ret)
4581                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4582 }
4583
4584 /**
4585  * Emit a Netlink message to add/remove a local address on the outer device.
4586  * The address being added is visible within the link only (scope link).
4587  *
4588  * Note that an implicit route is maintained by the kernel due to the
4589  * presence of a peer address (IFA_ADDRESS).
4590  *
4591  * These rules are used for encapsulation only and allow assigning
4592  * the outer tunnel source IP address.
4593  *
4594  * @param[in] tcf
4595  *   Libmnl socket context object.
4596  * @param[in] encap
4597  *   Encapsulation properties (source address and its peer).
4598  * @param[in] ifindex
4599  *   Network interface to apply rule.
4600  * @param[in] enable
4601  *   Toggle between add and remove.
4602  * @param[out] error
4603  *   Perform verbose error reporting if not NULL.
4604  *
4605  * @return
4606  *   0 on success, a negative errno value otherwise and rte_errno is set.
4607  */
4608 static int
4609 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4610                     const struct flow_tcf_vxlan_encap *encap,
4611                     unsigned int ifindex,
4612                     bool enable,
4613                     struct rte_flow_error *error)
4614 {
4615         struct nlmsghdr *nlh;
4616         struct ifaddrmsg *ifa;
4617         alignas(struct nlmsghdr)
4618         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4619
4620         nlh = mnl_nlmsg_put_header(buf);
4621         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4622         nlh->nlmsg_flags =
4623                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4624         nlh->nlmsg_seq = 0;
4625         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4626         ifa->ifa_flags = IFA_F_PERMANENT;
4627         ifa->ifa_scope = RT_SCOPE_LINK;
4628         ifa->ifa_index = ifindex;
4629         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4630                 ifa->ifa_family = AF_INET;
4631                 ifa->ifa_prefixlen = 32;
4632                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4633                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4634                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4635                                               encap->ipv4.dst);
4636         } else {
4637                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4638                 ifa->ifa_family = AF_INET6;
4639                 ifa->ifa_prefixlen = 128;
4640                 mnl_attr_put(nlh, IFA_LOCAL,
4641                                   sizeof(encap->ipv6.src),
4642                                   &encap->ipv6.src);
4643                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4644                         mnl_attr_put(nlh, IFA_ADDRESS,
4645                                           sizeof(encap->ipv6.dst),
4646                                           &encap->ipv6.dst);
4647         }
4648         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4649                 return 0;
4650         return rte_flow_error_set(error, rte_errno,
4651                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4652                                   "netlink: cannot complete IFA request"
4653                                   " (ip addr add)");
4654 }
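
/*
 * Illustration only (not compiled): the IFA request built above is
 * roughly equivalent to these iproute2 commands for the IPv4 case
 * (prefix length 128 for IPv6):
 *
 *   ip addr add <src_ip>/32 peer <dst_ip> scope link dev <ifouter>
 *   ip addr del <src_ip>/32 peer <dst_ip> scope link dev <ifouter>
 */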
4655
4656 /**
4657  * Emit Netlink message to add/remove neighbor.
4658  *
4659  * @param[in] tcf
4660  *   Libmnl socket context object.
4661  * @param[in] encap
4662  *   Encapsulation properties (destination address).
4663  * @param[in] ifindex
4664  *   Network interface.
4665  * @param[in] enable
4666  *   Toggle between add and remove.
4667  * @param[out] error
4668  *   Perform verbose error reporting if not NULL.
4669  *
4670  * @return
4671  *   0 on success, a negative errno value otherwise and rte_errno is set.
4672  */
4673 static int
4674 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4675                     const struct flow_tcf_vxlan_encap *encap,
4676                     unsigned int ifindex,
4677                     bool enable,
4678                     struct rte_flow_error *error)
4679 {
4680         struct nlmsghdr *nlh;
4681         struct ndmsg *ndm;
4682         alignas(struct nlmsghdr)
4683         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4684
4685         nlh = mnl_nlmsg_put_header(buf);
4686         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4687         nlh->nlmsg_flags =
4688                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4689         nlh->nlmsg_seq = 0;
4690         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4691         ndm->ndm_ifindex = ifindex;
4692         ndm->ndm_state = NUD_PERMANENT;
4693         ndm->ndm_flags = 0;
4694         ndm->ndm_type = 0;
4695         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4696                 ndm->ndm_family = AF_INET;
4697                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4698         } else {
4699                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4700                 ndm->ndm_family = AF_INET6;
4701                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4702                                                  &encap->ipv6.dst);
4703         }
4704         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4705                 DRV_LOG(WARNING,
4706                         "outer ethernet source address cannot be "
4707                         "forced for VXLAN encapsulation");
4708         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4709                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4710                                                     &encap->eth.dst);
4711         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4712                 return 0;
4713         return rte_flow_error_set(error, rte_errno,
4714                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4715                                   "netlink: cannot complete ND request"
4716                                   " (ip neigh)");
4717 }
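
/*
 * Illustration only (not compiled): the ND request built above is
 * roughly equivalent to these iproute2 commands:
 *
 *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
 *   ip neigh del dev <ifouter> to <dst_ip>
 */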
4718
4719 /**
4720  * Manage the local IP addresses and their peer IP addresses on the
4721  * outer interface for encapsulation purposes. The kernel looks up
4722  * the egress device for tunnel traffic using the outer source IP;
4723  * this IP should be assigned to the outer network device, otherwise
4724  * the kernel rejects the rule.
4725  *
4726  * Adds or removes the addresses using a Netlink command equivalent to:
4727  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4728  *
4729  * The addresses are local to the netdev ("scope link"); this reduces
4730  * the risk of conflicts. Note that an implicit route is maintained by
4731  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4732  *
4733  * @param[in] tcf
4734  *   Libmnl socket context object.
4735  * @param[in] iface
4736  *   Object, contains rule database and ifouter index.
4737  * @param[in] dev_flow
4738  *   Flow object, contains the tunnel parameters (for encap only).
4739  * @param[in] enable
4740  *   Toggle between add and remove.
4741  * @param[out] error
4742  *   Perform verbose error reporting if not NULL.
4743  *
4744  * @return
4745  *   0 on success, a negative errno value otherwise and rte_errno is set.
4746  */
4747 static int
4748 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4749                      struct tcf_irule *iface,
4750                      struct mlx5_flow *dev_flow,
4751                      bool enable,
4752                      struct rte_flow_error *error)
4753 {
4754         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4755         struct tcf_local_rule *rule = NULL;
4756         int ret;
4757
4758         assert(encap);
4759         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4760         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4761                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4762                 LIST_FOREACH(rule, &iface->local, next) {
4763                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4764                             encap->ipv4.src == rule->ipv4.src &&
4765                             encap->ipv4.dst == rule->ipv4.dst) {
4766                                 break;
4767                         }
4768                 }
4769         } else {
4770                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4771                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4772                 LIST_FOREACH(rule, &iface->local, next) {
4773                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4774                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4775                                             sizeof(encap->ipv6.src)) &&
4776                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4777                                             sizeof(encap->ipv6.dst))) {
4778                                 break;
4779                         }
4780                 }
4781         }
4782         if (rule) {
4783                 if (enable) {
4784                         rule->refcnt++;
4785                         return 0;
4786                 }
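                /* Disable: drop a reference, delete the rule on the last one. */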
4787                 if (!rule->refcnt || !--rule->refcnt) {
4788                         LIST_REMOVE(rule, next);
4789                         return flow_tcf_rule_local(tcf, encap,
4790                                         iface->ifouter, false, error);
4791                 }
4792                 return 0;
4793         }
4794         if (!enable) {
4795                 DRV_LOG(WARNING, "disabling non-existent local rule");
4796                 rte_flow_error_set(error, ENOENT,
4797                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4798                                    "disabling non-existent local rule");
4799                 return -ENOENT;
4800         }
4801         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4802                                 alignof(struct tcf_local_rule));
4803         if (!rule) {
4804                 rte_flow_error_set(error, ENOMEM,
4805                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4806                                    "unable to allocate memory for local rule");
4807                 return -rte_errno;
4808         }
4809         *rule = (struct tcf_local_rule){.refcnt = 0,
4810                                         .mask = 0,
4811                                         };
4812         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4813                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4814                            | FLOW_TCF_ENCAP_IPV4_DST;
4815                 rule->ipv4.src = encap->ipv4.src;
4816                 rule->ipv4.dst = encap->ipv4.dst;
4817         } else {
4818                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4819                            | FLOW_TCF_ENCAP_IPV6_DST;
4820                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4821                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4822         }
4823         ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4824         if (ret) {
4825                 rte_free(rule);
4826                 return ret;
4827         }
4828         rule->refcnt++;
4829         LIST_INSERT_HEAD(&iface->local, rule, next);
4830         return 0;
4831 }
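
/*
 * Sharing sketch (illustration only, flow_a/flow_b are hypothetical):
 * flows encapsulating with the same outer source/destination pair
 * reuse a single local rule through the reference counter:
 *
 *   flow_tcf_encap_local(tcf, iface, flow_a, true, err);   refcnt 0 -> 1
 *   flow_tcf_encap_local(tcf, iface, flow_b, true, err);   refcnt 1 -> 2
 *   flow_tcf_encap_local(tcf, iface, flow_b, false, err);  refcnt 2 -> 1
 *   flow_tcf_encap_local(tcf, iface, flow_a, false, err);  rule removed
 */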
4832
4833 /**
4834  * Manage the neigh database of destination MAC/IP addresses; the kernel
4835  * uses it to determine the destination MAC address for the encapsulation
4836  * header. Adds or removes entries using a Netlink command equivalent to:
4837  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4838  *
4839  * @param[in] tcf
4840  *   Libmnl socket context object.
4841  * @param[in] iface
4842  *   Object, contains rule database and ifouter index.
4843  * @param[in] dev_flow
4844  *   Flow object, contains the tunnel parameters (for encap only).
4845  * @param[in] enable
4846  *   Toggle between add and remove.
4847  * @param[out] error
4848  *   Perform verbose error reporting if not NULL.
4849  *
4850  * @return
4851  *   0 on success, a negative errno value otherwise and rte_errno is set.
4852  */
4853 static int
4854 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4855                      struct tcf_irule *iface,
4856                      struct mlx5_flow *dev_flow,
4857                      bool enable,
4858                      struct rte_flow_error *error)
4859 {
4860         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4861         struct tcf_neigh_rule *rule = NULL;
4862         int ret;
4863
4864         assert(encap);
4865         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4866         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4867                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4868                 LIST_FOREACH(rule, &iface->neigh, next) {
4869                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4870                             encap->ipv4.dst == rule->ipv4.dst) {
4871                                 break;
4872                         }
4873                 }
4874         } else {
4875                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4876                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4877                 LIST_FOREACH(rule, &iface->neigh, next) {
4878                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4879                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4880                                                 sizeof(encap->ipv6.dst))) {
4881                                 break;
4882                         }
4883                 }
4884         }
4885         if (rule) {
4886                 if (memcmp(&encap->eth.dst, &rule->eth,
4887                            sizeof(encap->eth.dst))) {
4888                         DRV_LOG(WARNING, "Destination MAC differs"
4889                                          " in neigh rule");
4890                         rte_flow_error_set(error, EEXIST,
4891                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4892                                            NULL, "Different MAC address"
4893                                            " neigh rule for the same"
4894                                            " destination IP");
4895                         return -EEXIST;
4896                 }
4897                 if (enable) {
4898                         rule->refcnt++;
4899                         return 0;
4900                 }
4901                 if (!rule->refcnt || !--rule->refcnt) {
4902                         LIST_REMOVE(rule, next);
4903                         return flow_tcf_rule_neigh(tcf, encap,
4904                                                    iface->ifouter,
4905                                                    false, error);
4906                 }
4907                 return 0;
4908         }
4909         if (!enable) {
4910                 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4911                 rte_flow_error_set(error, ENOENT,
4912                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4913                                    "disabling non-existent neigh rule");
4914                 return -ENOENT;
4915         }
4916         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4917                                 alignof(struct tcf_neigh_rule));
4918         if (!rule) {
4919                 rte_flow_error_set(error, ENOMEM,
4920                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4921                                    "unable to allocate memory for neigh rule");
4922                 return -rte_errno;
4923         }
4924         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4925                                         .mask = 0,
4926                                         };
4927         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4928                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4929                 rule->ipv4.dst = encap->ipv4.dst;
4930         } else {
4931                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4932                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4933         }
4934         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4935         ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4936         if (ret) {
4937                 rte_free(rule);
4938                 return ret;
4939         }
4940         rule->refcnt++;
4941         LIST_INSERT_HEAD(&iface->neigh, rule, next);
4942         return 0;
4943 }
4944
4945 /* VXLAN encap rule database for outer interfaces. */
4946 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4947
4948 /* VTEP device list is shared between PMD port instances. */
4949 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4950 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4951
4952 /**
4953  * Acquire the VXLAN encap rules container for specified interface.
4954  * First looks for the container in the list of existing ones;
4955  * creates and initializes a new container if none is found.
4956  *
4957  * @param[in] tcf
4958  *   Context object initialized by mlx5_flow_tcf_context_create().
4959  * @param[in] ifouter
4960  *   Network interface index to create VXLAN encap rules on.
4961  * @param[out] error
4962  *   Perform verbose error reporting if not NULL.
4963  * @return
4964  *   Rule container pointer on success,
4965  *   NULL otherwise and rte_errno is set.
4966  */
4967 static struct tcf_irule*
4968 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4969                              unsigned int ifouter,
4970                              struct rte_flow_error *error)
4971 {
4972         struct tcf_irule *iface;
4973
4974         /* Look whether the container for encap rules is created. */
4975         assert(ifouter);
4976         LIST_FOREACH(iface, &iface_list_vxlan, next) {
4977                 if (iface->ifouter == ifouter)
4978                         break;
4979         }
4980         if (iface) {
4981                 /* Container already exists, just increment the reference. */
4982                 iface->refcnt++;
4983                 return iface;
4984         }
4985         /* Not found, we should create the new container. */
4986         iface = rte_zmalloc(__func__, sizeof(*iface),
4987                             alignof(struct tcf_irule));
4988         if (!iface) {
4989                 rte_flow_error_set(error, ENOMEM,
4990                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4991                                    "unable to allocate memory for container");
4992                 return NULL;
4993         }
4994         *iface = (struct tcf_irule){
4995                         .local = LIST_HEAD_INITIALIZER(),
4996                         .neigh = LIST_HEAD_INITIALIZER(),
4997                         .ifouter = ifouter,
4998                         .refcnt = 1,
4999         };
5000         /* Clean up the interface for the newly created container. */
5001         flow_tcf_encap_iface_cleanup(tcf, ifouter);
5002         flow_tcf_encap_local_cleanup(tcf, ifouter);
5003         flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5004         LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5005         return iface;
5006 }
5007
5008 /**
5009  * Releases VXLAN encap rules container by pointer. Decrements the
5010  * reference counter and deletes the container if the counter is zero.
5011  *
5012  * @param[in] irule
5013  *   VXLAN rule container pointer to release.
5014  */
5015 static void
5016 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5017 {
5018         assert(iface->refcnt);
5019         if (--iface->refcnt == 0) {
5020                 /* Reference counter is zero, delete the container. */
5021                 assert(LIST_EMPTY(&iface->local));
5022                 assert(LIST_EMPTY(&iface->neigh));
5023                 LIST_REMOVE(iface, next);
5024                 rte_free(iface);
5025         }
5026 }
5027
5028 /**
5029  * Deletes VTEP network device.
5030  *
5031  * @param[in] tcf
5032  *   Context object initialized by mlx5_flow_tcf_context_create().
5033  * @param[in] vtep
5034  *   Object representing the network device to delete. Memory
5035  *   allocated for this object is freed by the routine.
5036  */
5037 static void
5038 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5039                      struct tcf_vtep *vtep)
5040 {
5041         struct nlmsghdr *nlh;
5042         struct ifinfomsg *ifm;
5043         alignas(struct nlmsghdr)
5044         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5045                     MNL_BUF_EXTRA_SPACE];
5046         int ret;
5047
5048         assert(!vtep->refcnt);
5049         /* Delete only the ifaces we actually created. */
5050         if (vtep->created && vtep->ifindex) {
5051                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5052                 nlh = mnl_nlmsg_put_header(buf);
5053                 nlh->nlmsg_type = RTM_DELLINK;
5054                 nlh->nlmsg_flags = NLM_F_REQUEST;
5055                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5056                 ifm->ifi_family = AF_UNSPEC;
5057                 ifm->ifi_index = vtep->ifindex;
5058                 assert(sizeof(buf) >= nlh->nlmsg_len);
5059                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5060                 if (ret)
5061                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
5062                                          " encap/decap ifindex %u",
5063                                          ifm->ifi_index);
5064         }
5065         rte_free(vtep);
5066 }
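
/*
 * Illustration only (not compiled): the RTM_DELLINK request above is
 * roughly equivalent to:
 *
 *   ip link del dev <name>
 *
 * and is issued only for VTEP devices this driver created itself.
 */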
5067
5068 /**
5069  * Creates VTEP network device.
5070  *
5071  * @param[in] tcf
5072  *   Context object initialized by mlx5_flow_tcf_context_create().
5073  * @param[in] port
5074  *   UDP port of created VTEP device.
5075  * @param[out] error
5076  *   Perform verbose error reporting if not NULL.
5077  *
5078  * @return
5079  *   Pointer to created device structure on success,
5080  *   NULL otherwise and rte_errno is set.
5081  */
5082 static struct tcf_vtep*
5083 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5084                      uint16_t port, struct rte_flow_error *error)
5085 {
5086         struct tcf_vtep *vtep;
5087         struct nlmsghdr *nlh;
5088         struct ifinfomsg *ifm;
5089         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5090         alignas(struct nlmsghdr)
5091         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5092                     SZ_NLATTR_DATA_OF(sizeof(name)) +
5093                     SZ_NLATTR_NEST * 2 +
5094                     SZ_NLATTR_STRZ_OF("vxlan") +
5095                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5096                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5097                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5098                     MNL_BUF_EXTRA_SPACE];
5099         struct nlattr *na_info;
5100         struct nlattr *na_vxlan;
5101         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5102         int ret;
5103
5104         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5105         if (!vtep) {
5106                 rte_flow_error_set(error, ENOMEM,
5107                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5108                                    "unable to allocate memory for VTEP");
5109                 return NULL;
5110         }
5111         *vtep = (struct tcf_vtep){
5112                         .port = port,
5113         };
5114         memset(buf, 0, sizeof(buf));
5115         nlh = mnl_nlmsg_put_header(buf);
5116         nlh->nlmsg_type = RTM_NEWLINK;
5117         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5118         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5119         ifm->ifi_family = AF_UNSPEC;
5120         ifm->ifi_type = 0;
5121         ifm->ifi_index = 0;
5122         ifm->ifi_flags = IFF_UP;
5123         ifm->ifi_change = 0xffffffff;
5124         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5125         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5126         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5127         assert(na_info);
5128         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5129         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5130         assert(na_vxlan);
5131 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5132         /*
5133          * RH 7.2 does not support metadata for the tunnel device.
5134          * It does not matter because we are going to use
5135          * hardware offload via the mlx5 driver.
5136          */
5137         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5138 #endif
5139         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5140         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5141         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5142 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5143         /*
5144          * We must specify the VNI explicitly if metadata is not supported.
5145          * Note, VNI is a 32-bit value transferred in native endianness.
5146          */
5147         mnl_attr_put_u32(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5148 #endif
5149         mnl_attr_nest_end(nlh, na_vxlan);
5150         mnl_attr_nest_end(nlh, na_info);
5151         assert(sizeof(buf) >= nlh->nlmsg_len);
5152         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5153         if (ret) {
5154                 DRV_LOG(WARNING,
5155                         "netlink: VTEP %s create failure (%d)",
5156                         name, rte_errno);
5157                 if (rte_errno != EEXIST)
5158                         /*
5159                          * Some unhandled error occurred or device is
5160                          * for encapsulation and cannot be shared.
5161                          */
5162                         goto error;
5163         } else {
5164                 /*
5165                  * Mark device we actually created.
5166                  * We should explicitly delete
5167                  * when we do not need it anymore.
5168                  */
5169                 vtep->created = 1;
5170         }
5171         /* Try to get ifindex of created or pre-existing device. */
5172         ret = if_nametoindex(name);
5173         if (!ret) {
5174                 DRV_LOG(WARNING,
5175                         "VTEP %s failed to get index (%d)", name, errno);
5176                 rte_flow_error_set
5177                         (error, -errno,
5178                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5179                          "netlink: failed to retrieve VTEP ifindex");
5180                 goto error;
5181         }
5182         vtep->ifindex = ret;
5183         memset(buf, 0, sizeof(buf));
5184         nlh = mnl_nlmsg_put_header(buf);
5185         nlh->nlmsg_type = RTM_NEWLINK;
5186         nlh->nlmsg_flags = NLM_F_REQUEST;
5187         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5188         ifm->ifi_family = AF_UNSPEC;
5189         ifm->ifi_type = 0;
5190         ifm->ifi_index = vtep->ifindex;
5191         ifm->ifi_flags = IFF_UP;
5192         ifm->ifi_change = IFF_UP;
5193         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5194         if (ret) {
5195                 rte_flow_error_set(error, -errno,
5196                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5197                                    "netlink: failed to set VTEP link up");
5198                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5199                         name, rte_errno);
5200                 goto clean;
5201         }
5202         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5203         if (ret) {
5204                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5205                 goto clean;
5206         }
5207         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5208         vtep->refcnt = 1;
5209         return vtep;
5210 clean:
5211         flow_tcf_vtep_delete(tcf, vtep);
5212         return NULL;
5213 error:
5214         rte_free(vtep);
5215         return NULL;
5216 }
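
/*
 * Illustration only (not compiled): the RTM_NEWLINK requests above are
 * roughly equivalent to the following commands, where <name> is the
 * MLX5_VXLAN_DEVICE_PFX prefix followed by the UDP port number:
 *
 *   ip link add <name> type vxlan dstport <port> external \
 *           nolearning udp6zerocsumrx
 *   ip link set <name> up
 *
 * The "external" (collect metadata) mode is requested only when the
 * kernel supports IFLA_VXLAN_COLLECT_METADATA; otherwise the default
 * VNI is specified explicitly.
 */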
5217
5218 /**
5219  * Acquire target interface index for VXLAN tunneling decapsulation.
5220  * In order to share the UDP port with the other interfaces, the VXLAN
5221  * device is created without attachment to any interface (if it is created).
5222  *
5223  * @param[in] tcf
5224  *   Context object initialized by mlx5_flow_tcf_context_create().
5225  * @param[in] dev_flow
5226  *   Flow tcf object with tunnel structure pointer set.
5227  * @param[out] error
5228  *   Perform verbose error reporting if not NULL.
5229  * @return
5230  *   Interface descriptor pointer on success,
5231  *   NULL otherwise and rte_errno is set.
5232  */
5233 static struct tcf_vtep*
5234 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5235                             struct mlx5_flow *dev_flow,
5236                             struct rte_flow_error *error)
5237 {
5238         struct tcf_vtep *vtep;
5239         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5240
5241         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5242                 if (vtep->port == port)
5243                         break;
5244         }
5245         if (vtep) {
5246                 /* Device exists, just increment the reference counter. */
5247                 vtep->refcnt++;
5248                 assert(vtep->ifindex);
5249                 return vtep;
5250         }
5251         /* No decapsulation device exists, try to create the new one. */
5252         vtep = flow_tcf_vtep_create(tcf, port, error);
5253         if (vtep)
5254                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5255         return vtep;
5256 }
5257
5258 /**
5259  * Acquire target interface index for VXLAN tunneling encapsulation.
5260  *
5261  * @param[in] tcf
5262  *   Context object initialized by mlx5_flow_tcf_context_create().
5263  * @param[in] ifouter
5264  *   Network interface index to attach VXLAN encap device to.
5265  * @param[in] dev_flow
5266  *   Flow tcf object with tunnel structure pointer set.
5267  * @param[out] error
5268  *   Perform verbose error reporting if not NULL.
5269  * @return
5270  *   Interface descriptor pointer on success,
5271  *   NULL otherwise and rte_errno is set.
5272  */
5273 static struct tcf_vtep*
5274 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5275                             unsigned int ifouter,
5276                             struct mlx5_flow *dev_flow,
5277                             struct rte_flow_error *error)
5278 {
5279         uint16_t port;
5280         struct tcf_vtep *vtep;
5281         struct tcf_irule *iface;
5282         int ret;
5283
5284         assert(ifouter);
5285         /* Look whether the VTEP for specified port is created. */
5286         port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5287         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5288                 if (vtep->port == port)
5289                         break;
5290         }
5291         if (vtep) {
5292                 /* VTEP already exists, just increment the reference. */
5293                 vtep->refcnt++;
5294         } else {
5295                 /* Not found, we should create the new VTEP. */
5296                 vtep = flow_tcf_vtep_create(tcf, port, error);
5297                 if (!vtep)
5298                         return NULL;
5299                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5300         }
5301         assert(vtep->ifindex);
5302         iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5303         if (!iface) {
5304                 if (--vtep->refcnt == 0)
5305                         flow_tcf_vtep_delete(tcf, vtep);
5306                 return NULL;
5307         }
5308         dev_flow->tcf.vxlan_encap->iface = iface;
5309         /* Create local ipaddr with peer to specify the outer IPs. */
5310         ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5311         if (!ret) {
5312                 /* Create neigh rule to specify outer destination MAC. */
5313                 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5314                 if (ret)
5315                         flow_tcf_encap_local(tcf, iface,
5316                                              dev_flow, false, error);
5317         }
5318         if (ret) {
5319                 dev_flow->tcf.vxlan_encap->iface = NULL;
5320                 flow_tcf_encap_irule_release(iface);
5321                 if (--vtep->refcnt == 0)
5322                         flow_tcf_vtep_delete(tcf, vtep);
5323                 return NULL;
5324         }
5325         return vtep;
5326 }
5327
5328 /**
5329  * Acquires target interface index for tunneling of any type.
5330  * Creates the new VTEP if needed.
5331  *
5332  * @param[in] tcf
5333  *   Context object initialized by mlx5_flow_tcf_context_create().
5334  * @param[in] ifouter
5335  *   Network interface index to create VXLAN encap rules on.
5336  * @param[in] dev_flow
5337  *   Flow tcf object with tunnel structure pointer set.
5338  * @param[out] error
5339  *   Perform verbose error reporting if not NULL.
5340  * @return
5341  *   Interface descriptor pointer on success,
5342  *   NULL otherwise and rte_errno is set.
5343  */
5344 static struct tcf_vtep*
5345 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5346                       unsigned int ifouter,
5347                       struct mlx5_flow *dev_flow,
5348                       struct rte_flow_error *error)
5349 {
5350         struct tcf_vtep *vtep = NULL;
5351
5352         assert(dev_flow->tcf.tunnel);
5353         pthread_mutex_lock(&vtep_list_mutex);
5354         switch (dev_flow->tcf.tunnel->type) {
5355         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5356                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5357                                                   dev_flow, error);
5358                 break;
5359         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5360                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5361                 break;
5362         default:
5363                 rte_flow_error_set(error, ENOTSUP,
5364                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5365                                    "unsupported tunnel type");
5366                 break;
5367         }
5368         pthread_mutex_unlock(&vtep_list_mutex);
5369         return vtep;
5370 }
5371
5372 /**
5373  * Release tunneling interface by ifindex. Decrements reference
5374  * counter and actually removes the device if counter is zero.
5375  *
5376  * @param[in] tcf
5377  *   Context object initialized by mlx5_flow_tcf_context_create().
5378  * @param[in] vtep
5379  *   VTEP device descriptor structure.
5380  * @param[in] dev_flow
5381  *   Flow tcf object with tunnel structure pointer set.
5382  */
5383 static void
5384 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5385                       struct tcf_vtep *vtep,
5386                       struct mlx5_flow *dev_flow)
5387 {
5388         assert(dev_flow->tcf.tunnel);
5389         pthread_mutex_lock(&vtep_list_mutex);
5390         switch (dev_flow->tcf.tunnel->type) {
5391         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5392                 break;
5393         case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5394                 struct tcf_irule *iface;
5395
5396                 /* Remove the encap ancillary rules first. */
5397                 iface = dev_flow->tcf.vxlan_encap->iface;
5398                 assert(iface);
5399                 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5400                 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5401                 flow_tcf_encap_irule_release(iface);
5402                 dev_flow->tcf.vxlan_encap->iface = NULL;
5403                 break;
5404         }
5405         default:
5406                 assert(false);
5407                 DRV_LOG(WARNING, "Unsupported tunnel type");
5408                 break;
5409         }
5410         assert(vtep->refcnt);
5411         if (--vtep->refcnt == 0) {
5412                 LIST_REMOVE(vtep, next);
5413                 flow_tcf_vtep_delete(tcf, vtep);
5414         }
5415         pthread_mutex_unlock(&vtep_list_mutex);
5416 }
5417
5418 struct tcf_nlcb_query {
5419         uint32_t handle;
5420         uint32_t tc_flags;
5421         uint32_t flags_valid:1;
5422 };
5423
5424 /**
5425  * Collect queried rule attributes. This is a callback routine called by
5426  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5427  * The current implementation collects the flower flags only.
5428  *
5429  * @param[in] nlh
5430  *   Pointer to reply header.
5431  * @param[in, out] arg
5432  *   Context pointer for this callback.
5433  *
5434  * @return
5435  *   A positive, nonzero value on success (required by libmnl
5436  *   to continue message processing).
5437  */
5438 static int
5439 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5440 {
5441         struct tcf_nlcb_query *query = arg;
5442         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5443         struct nlattr *na, *na_opt;
5444         bool flower = false;
5445
5446         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5447             tcm->tcm_handle != query->handle)
5448                 return 1;
5449         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5450                 switch (mnl_attr_get_type(na)) {
5451                 case TCA_KIND:
5452                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5453                                 /* Not flower filter, drop entire message. */
5454                                 return 1;
5455                         }
5456                         flower = true;
5457                         break;
5458                 case TCA_OPTIONS:
5459                         if (!flower) {
5460                                 /* Not flower options, drop entire message. */
5461                                 return 1;
5462                         }
5463                         /* Check nested flower options. */
5464                         mnl_attr_for_each_nested(na_opt, na) {
5465                                 switch (mnl_attr_get_type(na_opt)) {
5466                                 case TCA_FLOWER_FLAGS:
5467                                         query->flags_valid = 1;
5468                                         query->tc_flags =
5469                                                 mnl_attr_get_u32(na_opt);
5470                                         break;
5471                                 }
5472                         }
5473                         break;
5474                 }
5475         }
5476         return 1;
5477 }
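
/*
 * For reference, the reply scanned above has this (simplified) nested
 * attribute layout; only TCA_FLOWER_FLAGS is collected:
 *
 *   RTM_NEWTFILTER
 *     TCA_KIND = "flower"
 *     TCA_OPTIONS
 *       TCA_FLOWER_FLAGS (u32, e.g. TCA_CLS_FLAGS_SKIP_SW | ...)
 */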
5478
5479 /**
5480  * Query a TC flower rule flags via netlink.
5481  *
5482  * @param[in] tcf
5483  *   Context object initialized by mlx5_flow_tcf_context_create().
5484  * @param[in] dev_flow
5485  *   Pointer to the flow.
5486  * @param[out] pflags
5487  *   Pointer to the data retrieved by the query.
5488  *
5489  * @return
5490  *   0 on success, a negative errno value otherwise.
5491  */
5492 static int
5493 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5494                      struct mlx5_flow *dev_flow,
5495                      uint32_t *pflags)
5496 {
5497         struct nlmsghdr *nlh;
5498         struct tcmsg *tcm;
5499         struct tcf_nlcb_query query = {
5500                 .handle = dev_flow->tcf.tcm->tcm_handle,
5501         };
5502
5503         nlh = mnl_nlmsg_put_header(tcf->buf);
5504         nlh->nlmsg_type = RTM_GETTFILTER;
5505         nlh->nlmsg_flags = NLM_F_REQUEST;
5506         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5507         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5508         /*
5509          * Ignore the Netlink error for filter query operations.
5510          * The reply length is sent by the kernel as errno.
5511          * Just check that we got the flags option.
5512          */
5513         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5514         if (!query.flags_valid) {
5515                 *pflags = 0;
5516                 return -ENOENT;
5517         }
5518         *pflags = query.tc_flags;
5519         return 0;
5520 }
5521
5522 /**
5523  * Query and check the in_hw flag for the specified rule.
5524  *
5525  * @param[in] tcf
5526  *   Context object initialized by mlx5_flow_tcf_context_create().
5527  * @param[in] dev_flow
5528  *   Pointer to the flow to check.
5529  *
5530  * @return
5531  *   0 on success, a negative errno value otherwise.
5532  */
5533 static int
5534 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5535                     struct mlx5_flow *dev_flow)
5536 {
5537         uint32_t flags;
5538         int ret;
5539
5540         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5541         if (ret)
5542                 return ret;
5543         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5544 }
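
/*
 * Illustration only (not compiled): a rough command-line analogue of
 * this check is listing the filters and looking for the "in_hw" flag
 * the kernel reports for offloaded flower rules:
 *
 *   tc -s filter show dev <ifname> ingress
 */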
5545
5546 /**
5547  * Remove flow from E-Switch by sending Netlink message.
5548  *
5549  * @param[in] dev
5550  *   Pointer to Ethernet device.
5551  * @param[in, out] flow
5552  *   Pointer to the sub flow.
5553  */
5554 static void
5555 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5556 {
5557         struct priv *priv = dev->data->dev_private;
5558         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5559         struct mlx5_flow *dev_flow;
5560         struct nlmsghdr *nlh;
5561         struct tcmsg *tcm;
5562
5563         if (!flow)
5564                 return;
5565         dev_flow = LIST_FIRST(&flow->dev_flows);
5566         if (!dev_flow)
5567                 return;
5568         /* E-Switch flow can't be expanded. */
5569         assert(!LIST_NEXT(dev_flow, next));
5570         if (dev_flow->tcf.applied) {
5571                 nlh = dev_flow->tcf.nlh;
5572                 nlh->nlmsg_type = RTM_DELTFILTER;
5573                 nlh->nlmsg_flags = NLM_F_REQUEST;
5574                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5575                 if (dev_flow->tcf.tunnel) {
5576                         assert(dev_flow->tcf.tunnel->vtep);
5577                         flow_tcf_vtep_release(ctx,
5578                                 dev_flow->tcf.tunnel->vtep,
5579                                 dev_flow);
5580                         dev_flow->tcf.tunnel->vtep = NULL;
5581                 }
5582                 /* Cleanup the rule handle value. */
5583                 tcm = mnl_nlmsg_get_payload(nlh);
5584                 tcm->tcm_handle = 0;
5585                 dev_flow->tcf.applied = 0;
5586         }
5587 }
5588
5589 /**
5590  * Fetch the applied rule handle. This is a callback routine called by
5591  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5592  * When the NLM_F_ECHO flag is specified the kernel sends the created
5593  * rule descriptor back to the application and we can retrieve the
5594  * actual rule handle from the updated descriptor.
5595  *
5596  * @param[in] nlh
5597  *   Pointer to reply header.
5598  * @param[in, out] arg
5599  *   Context pointer for this callback.
5600  *
5601  * @return
5602  *   A positive, nonzero value on success (required by libmnl
5603  *   to continue message processing).
5604  */
5605 static int
5606 flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg)
5607 {
5608         struct nlmsghdr *nlhrq = arg;
5609         struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq);
5610         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5611         struct nlattr *na;
5612
5613         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5614             nlh->nlmsg_seq != nlhrq->nlmsg_seq)
5615                 return 1;
5616         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5617                 switch (mnl_attr_get_type(na)) {
5618                 case TCA_KIND:
5619                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5620                                 /* Not flower filter, drop entire message. */
5621                                 return 1;
5622                         }
5623                         tcmrq->tcm_handle = tcm->tcm_handle;
5624                         return 1;
5625                 }
5626         }
5627         return 1;
5628 }
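
/*
 * Note: the request in flow_tcf_apply() below is sent with NLM_F_ECHO,
 * so the kernel echoes the created filter descriptor back with the
 * assigned tcm_handle. The callback above matches the echo by nlmsg_seq
 * and copies that handle into the request payload for later use.
 */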
5629 /**
5630  * Apply flow to E-Switch by sending Netlink message.
5631  *
5632  * @param[in] dev
5633  *   Pointer to Ethernet device.
5634  * @param[in, out] flow
5635  *   Pointer to the sub flow.
5636  * @param[out] error
5637  *   Pointer to the error structure.
5638  *
5639  * @return
5640  *   0 on success, a negative errno value otherwise and rte_errno is set.
5641  */
5642 static int
5643 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5644                struct rte_flow_error *error)
5645 {
5646         struct priv *priv = dev->data->dev_private;
5647         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5648         struct mlx5_flow *dev_flow;
5649         struct nlmsghdr *nlh;
5650         struct tcmsg *tcm;
5651         int ret;
5652
5653         dev_flow = LIST_FIRST(&flow->dev_flows);
5654         /* E-Switch flow can't be expanded. */
5655         assert(!LIST_NEXT(dev_flow, next));
5656         if (dev_flow->tcf.applied)
5657                 return 0;
5658         nlh = dev_flow->tcf.nlh;
5659         nlh->nlmsg_type = RTM_NEWTFILTER;
5660         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
5661                            NLM_F_EXCL | NLM_F_ECHO;
5662         tcm = mnl_nlmsg_get_payload(nlh);
5663         /* Allow kernel to assign handle on its own. */
5664         tcm->tcm_handle = 0;
5665         if (dev_flow->tcf.tunnel) {
5666                 /*
5667                  * Replace the interface index, target for
5668                  * encapsulation, source for decapsulation.
5669                  */
5670                 assert(!dev_flow->tcf.tunnel->vtep);
5671                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5672                 /* Acquire actual VTEP device when rule is being applied. */
5673                 dev_flow->tcf.tunnel->vtep =
5674                         flow_tcf_vtep_acquire(ctx,
5675                                         dev_flow->tcf.tunnel->ifindex_org,
5676                                         dev_flow, error);
5677                 if (!dev_flow->tcf.tunnel->vtep)
5678                         return -rte_errno;
5679                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5680                                 dev_flow->tcf.tunnel->vtep->ifindex,
5681                                 dev_flow->tcf.tunnel->ifindex_org);
5682                 *dev_flow->tcf.tunnel->ifindex_ptr =
5683                         dev_flow->tcf.tunnel->vtep->ifindex;
5684         }
5685         ret = flow_tcf_nl_ack(ctx, nlh, flow_tcf_collect_apply_cb, nlh);
5686         if (!ret) {
5687                 if (!tcm->tcm_handle) {
5688                         flow_tcf_remove(dev, flow);
5689                         return rte_flow_error_set
5690                                 (error, ENOENT,
5691                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5692                                  "netlink: rule zero handle returned");
5693                 }
5694                 dev_flow->tcf.applied = 1;
5695                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5696                         return 0;
5697                 /*
5698                  * Rule was applied without the skip_sw flag set.
5699                  * We should check whether the rule was actually
5700                  * accepted by hardware (check the in_hw flag).
5701                  */
5702                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5703                         flow_tcf_remove(dev, flow);
5704                         return rte_flow_error_set
5705                                 (error, ENOENT,
5706                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5707                                  "netlink: rule has no in_hw flag set");
5708                 }
5709                 return 0;
5710         }
5711         if (dev_flow->tcf.tunnel) {
5712                 /* Rollback the VTEP configuration if rule apply failed. */
5713                 assert(dev_flow->tcf.tunnel->vtep);
5714                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5715                                       dev_flow);
5716                 dev_flow->tcf.tunnel->vtep = NULL;
5717         }
5718         return rte_flow_error_set(error, rte_errno,
5719                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5720                                   "netlink: failed to create TC flow rule");
5721 }
5722
5723 /**
5724  * Remove flow from E-Switch and release resources of the device flow.
5725  *
5726  * @param[in] dev
5727  *   Pointer to Ethernet device.
5728  * @param[in, out] flow
5729  *   Pointer to the sub flow.
5730  */
5731 static void
5732 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5733 {
5734         struct mlx5_flow *dev_flow;
5735
5736         if (!flow)
5737                 return;
5738         flow_tcf_remove(dev, flow);
5739         if (flow->counter) {
5740                 if (--flow->counter->ref_cnt == 0) {
5741                         rte_free(flow->counter);
5742                         flow->counter = NULL;
5743                 }
5744         }
5745         dev_flow = LIST_FIRST(&flow->dev_flows);
5746         if (!dev_flow)
5747                 return;
5748         /* E-Switch flow can't be expanded. */
5749         assert(!LIST_NEXT(dev_flow, next));
5750         LIST_REMOVE(dev_flow, next);
5751         rte_free(dev_flow);
5752 }
5753
5754 /**
5755  * Helper routine for figuring the space size required for a parse buffer.
5756  *
5757  * @param array
5758  *   Array of values to use.
5759  * @param idx
5760  *   Current location in array.
5761  * @param value
5762  *   Value to compare with.
5763  *
5764  * @return
5765  *   The maximum of the given value and the array value at the index.
5766  */
5767 static uint16_t
5768 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5769 {
5770         return idx < 0 ? value : RTE_MAX(array[idx], value);
5771 }
5772
5773 /**
5774  * Parse rtnetlink message attributes filling the attribute table with the info
5775  * retrieved.
5776  *
5777  * @param tb
5778  *   Attribute table to be filled.
5779  * @param max
5780  *   Maximum entry in the attribute table.
5781  * @param rta
5782  *   The attributes section in the message to be parsed.
5783  * @param len
5784  *   The length of the attributes section in the message.
5785  */
5786 static void
5787 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5788                          struct rtattr *rta, int len)
5789 {
5790         unsigned short type;
5791         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5792         while (RTA_OK(rta, len)) {
5793                 type = rta->rta_type;
5794                 if (type <= max && !tb[type])
5795                         tb[type] = rta;
5796                 rta = RTA_NEXT(rta, len);
5797         }
5798 }
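
/*
 * Usage sketch (illustration only): after parsing, the table is indexed
 * directly by attribute type, e.g. for a filter message payload "t" of
 * length "len":
 *
 *   struct rtattr *tb[TCA_MAX + 1];
 *   const char *kind = NULL;
 *
 *   flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
 *   if (tb[TCA_KIND])
 *           kind = (const char *)RTA_DATA(tb[TCA_KIND]);
 */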
5799
5800 /**
5801  * Extract flow counters from flower action.
5802  *
5803  * @param rta
5804  *   flower action stats properties in the Netlink message received.
5805  * @param rta_type
5806  *   The backward sequence of rta_types, as written in the attribute table,
5807  *   we need to traverse in order to get to the requested object.
5808  * @param idx
5809  *   Current location in rta_type table.
5810  * @param[out] data
5811  *   data holding the count statistics of the rte_flow retrieved from
5812  *   the message.
5813  *
5814  * @return
5815  *   0 if data was found and retrieved, -1 otherwise.
5816  */
5817 static int
5818 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5819                                        uint16_t rta_type[], int idx,
5820                                        struct gnet_stats_basic *data)
5821 {
5822         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5823                                                  TCA_STATS_BASIC);
5824         struct rtattr *tbs[tca_stats_max + 1];
5825
5826         if (rta == NULL || idx < 0)
5827                 return -1;
5828         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5829                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5830         switch (rta_type[idx]) {
5831         case TCA_STATS_BASIC:
5832                 if (tbs[TCA_STATS_BASIC]) {
5833                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5834                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5835                                sizeof(*data)));
5836                         return 0;
5837                 }
5838                 break;
5839         default:
5840                 break;
5841         }
5842         return -1;
5843 }
5844
5845 /**
5846  * Parse flower single action retrieving the requested action attribute,
5847  * if found.
5848  *
5849  * @param arg
5850  *   flower action properties in the Netlink message received.
5851  * @param rta_type
5852  *   The backward sequence of rta_types, as written in the attribute table,
5853  *   we need to traverse in order to get to the requested object.
5854  * @param idx
5855  *   Current location in rta_type table.
5856  * @param[out] data
5857  *   Count statistics retrieved from the message query.
5858  *
5859  * @return
5860  *   0 if data was found and retrieved, -1 otherwise.
5861  */
5862 static int
5863 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5864                                      uint16_t rta_type[], int idx, void *data)
5865 {
5866         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5867         struct rtattr *tb[tca_act_max + 1];
5868
5869         if (arg == NULL || idx < 0)
5870                 return -1;
5871         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5872                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5873         if (tb[TCA_ACT_KIND] == NULL)
5874                 return -1;
5875         switch (rta_type[idx]) {
5876         case TCA_ACT_STATS:
5877                 if (tb[TCA_ACT_STATS])
5878                         return flow_tcf_nl_action_stats_parse_and_get
5879                                         (tb[TCA_ACT_STATS],
5880                                          rta_type, --idx,
5881                                          (struct gnet_stats_basic *)data);
5882                 break;
5883         default:
5884                 break;
5885         }
5886         return -1;
5887 }
5888
5889 /**
5890  * Parse flower action section in the message retrieving the requested
5891  * attribute from the first action that provides it.
5892  *
5893  * @param arg
5894  *   flower section in the Netlink message received.
5895  * @param rta_type
5896  *   The backward sequence of rta_types, as written in the attribute table,
5897  *   we need to traverse in order to get to the requested object.
5898  * @param idx
5899  *   Current location in rta_type table.
5900  * @param[out] data
5901  *   data retrieved from the message query.
5902  *
5903  * @return
5904  *   0 if data was found and retrieved, -1 otherwise.
5905  */
5906 static int
5907 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5908                                  uint16_t rta_type[], int idx, void *data)
5909 {
5910         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5911         int i;
5912
5913         if (arg == NULL || idx < 0)
5914                 return -1;
5915         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5916                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5917         switch (rta_type[idx]) {
5918         /*
5919          * Flow counters are stored in the actions defined by the flow
5920          * and not in the flow itself; therefore we need to traverse the
5921          * flower chain of actions in search of them.
5922          *
5923          * Note that the index is not decremented here.
5924          */
5925         case TCA_ACT_STATS:
5926                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5927                         if (tb[i] &&
5928                             !flow_tcf_nl_parse_one_action_and_get(tb[i],
5929                                                                   rta_type,
5930                                                                   idx, data))
5931                                 return 0;
5932                 }
5933                 break;
5934         default:
5935                 break;
5936         }
5937         return -1;
5938 }
5939
5940 /**
5941  * Parse flower classifier options in the message, retrieving the requested
5942  * attribute if found.
5943  *
5944  * @param opt
5945  *   flower section in the Netlink message received.
5946  * @param rta_type
5947  *   The backward sequence of rta_types, as written in the attribute table,
5948  *   we need to traverse in order to get to the requested object.
5949  * @param idx
5950  *   Current location in rta_type table.
5951  * @param[out] data
5952  *   data retrieved from the message query.
5953  *
5954  * @return
5955  *   0 if data was found and retrieved, -1 otherwise.
5956  */
5957 static int
5958 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5959                                uint16_t rta_type[], int idx, void *data)
5960 {
5961         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5962                                                   TCA_FLOWER_ACT);
5963         struct rtattr *tb[tca_flower_max + 1];
5964
5965         if (!opt || idx < 0)
5966                 return -1;
5967         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5968                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5969         switch (rta_type[idx]) {
5970         case TCA_FLOWER_ACT:
5971                 if (tb[TCA_FLOWER_ACT])
5972                         return flow_tcf_nl_action_parse_and_get
5973                                                         (tb[TCA_FLOWER_ACT],
5974                                                          rta_type, --idx, data);
5975                 break;
5976         default:
5977                 break;
5978         }
5979         return -1;
5980 }
5981
5982 /**
5983  * Parse Netlink reply on filter query, retrieving the flow counters.
5984  *
5985  * @param cnlh
5986  *   Message received from Netlink.
5987  * @param rta_type
5988  *   The backward sequence of rta_types, as written in the attribute table,
5989  *   we need to traverse in order to get to the requested object.
5990  * @param idx
5991  *   Current location in rta_type table.
5992  * @param[out] data
5993  *   data retrieved from the message query.
5994  *
5995  * @return
5996  *   0 if data was found and retrieved, -1 otherwise.
5997  */
5998 static int
5999 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
6000                                  uint16_t rta_type[], int idx, void *data)
6001 {
6002         struct nlmsghdr *nlh = cnlh;
6003         struct tcmsg *t = NLMSG_DATA(nlh);
6004         int len = nlh->nlmsg_len;
6005         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
6006         struct rtattr *tb[tca_max + 1];
6007
6008         if (idx < 0)
6009                 return -1;
6010         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
6011             nlh->nlmsg_type != RTM_GETTFILTER &&
6012             nlh->nlmsg_type != RTM_DELTFILTER)
6013                 return -1;
6014         len -= NLMSG_LENGTH(sizeof(*t));
6015         if (len < 0)
6016                 return -1;
6017         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
6018         /* Not a TC flower flow - bail out */
6019         if (!tb[TCA_KIND] ||
6020             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
6021                 return -1;
6022         switch (rta_type[idx]) {
6023         case TCA_OPTIONS:
6024                 if (tb[TCA_OPTIONS])
6025                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
6026                                                               rta_type,
6027                                                               --idx, data);
6028                 break;
6029         default:
6030                 break;
6031         }
6032         return -1;
6033 }
6034
6035 /**
6036  * A callback to parse Netlink reply on TC flower query.
6037  *
6038  * @param nlh
6039  *   Message received from Netlink.
6040  * @param[out] data
6041  *   Pointer to the data area to be filled by the parsing routine,
6042  *   assumed to be a pointer to struct flow_tcf_stats_basic.
6043  *
6044  * @return
6045  *   MNL_CB_OK value.
6046  */
6047 static int
6048 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6049 {
6050         /*
6051          * The backward sequence of rta_types to pass in order to get
6052          * to the counters.
6053          */
6054         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6055                                 TCA_FLOWER_ACT, TCA_OPTIONS };
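        /*
         * The walk starts at the last element (TCA_OPTIONS) and steps
         * back one element per nesting level, ending at TCA_STATS_BASIC.
         */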
6056         struct flow_tcf_stats_basic *sb_data = data;
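        /*
         * Union used to shed the const qualifier from @p nlh without an
         * explicit cast; the parsing helpers only read the message.
         */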
6057         union {
6058                 const struct nlmsghdr *c;
6059                 struct nlmsghdr *nc;
6060         } tnlh = { .c = nlh };
6061
6062         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6063                                               RTE_DIM(rta_type) - 1,
6064                                               (void *)&sb_data->counters))
6065                 sb_data->valid = true;
6066         return MNL_CB_OK;
6067 }
6068
6069 /**
6070  * Query a TC flower rule for its statistics via netlink.
6071  *
6072  * @param[in] dev
6073  *   Pointer to Ethernet device.
6074  * @param[in] flow
6075  *   Pointer to the flow to query.
6076  * @param[out] data
6077  *   Data retrieved by the query, a struct rte_flow_query_count pointer.
6078  * @param[out] error
6079  *   Perform verbose error reporting if not NULL.
6080  *
6081  * @return
6082  *   0 on success, a negative errno value otherwise and rte_errno is set.
6083  */
6084 static int
6085 flow_tcf_query_count(struct rte_eth_dev *dev,
6086                      struct rte_flow *flow,
6087                      void *data,
6088                      struct rte_flow_error *error)
6089 {
6090         struct flow_tcf_stats_basic sb_data;
6091         struct rte_flow_query_count *qc = data;
6092         struct priv *priv = dev->data->dev_private;
6093         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6094         struct mnl_socket *nl = ctx->nl;
6095         struct mlx5_flow *dev_flow;
6096         struct nlmsghdr *nlh;
6097         uint32_t seq = ctx->seq++;
6098         ssize_t ret;
6099
6100         assert(qc);
6101         memset(&sb_data, 0, sizeof(sb_data));
6102         dev_flow = LIST_FIRST(&flow->dev_flows);
6103         /* E-Switch flow can't be expanded. */
6104         assert(!LIST_NEXT(dev_flow, next));
6105         if (!dev_flow->flow->counter)
6106                 goto notsup_exit;
6107         nlh = dev_flow->tcf.nlh;
6108         nlh->nlmsg_type = RTM_GETTFILTER;
6109         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6110         nlh->nlmsg_seq = seq;
6111         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
6112                 goto error_exit;
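        /*
         * Collect replies until mnl_cb_run() returns MNL_CB_STOP (0) or
         * MNL_CB_ERROR (-1); each reply is parsed by the callback above.
         */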
6113         do {
6114                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6115                 if (ret <= 0)
6116                         break;
6117                 ret = mnl_cb_run(ctx->buf, ret, seq,
6118                                  mnl_socket_get_portid(nl),
6119                                  flow_tcf_nl_message_get_stats_basic,
6120                                  (void *)&sb_data);
6121         } while (ret > 0);
6123         if (sb_data.valid) {
6124                 /* Return the delta from last reset. */
6125                 qc->hits_set = 1;
6126                 qc->bytes_set = 1;
6127                 qc->hits = sb_data.counters.packets - flow->counter->hits;
6128                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
6129                 if (qc->reset) {
6130                         flow->counter->hits = sb_data.counters.packets;
6131                         flow->counter->bytes = sb_data.counters.bytes;
6132                 }
6133                 return 0;
6134         }
6135         return rte_flow_error_set(error, EINVAL,
6136                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6137                                   NULL,
6138                                   "flow does not have counter");
6139 error_exit:
6140         return rte_flow_error_set
6141                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6142                          NULL, "netlink: failed to read flow rule counters");
6143 notsup_exit:
6144         return rte_flow_error_set
6145                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6146                          NULL, "counters are not available");
6147 }
6148
6149 /**
6150  * Query a flow.
6151  *
6152  * @see rte_flow_query()
6153  * @see rte_flow_ops
6154  */
6155 static int
6156 flow_tcf_query(struct rte_eth_dev *dev,
6157                struct rte_flow *flow,
6158                const struct rte_flow_action *actions,
6159                void *data,
6160                struct rte_flow_error *error)
6161 {
6162         int ret = -EINVAL;
6163
6164         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6165                 switch (actions->type) {
6166                 case RTE_FLOW_ACTION_TYPE_VOID:
6167                         break;
6168                 case RTE_FLOW_ACTION_TYPE_COUNT:
6169                         ret = flow_tcf_query_count(dev, flow, data, error);
6170                         break;
6171                 default:
6172                         return rte_flow_error_set(error, ENOTSUP,
6173                                                   RTE_FLOW_ERROR_TYPE_ACTION,
6174                                                   actions,
6175                                                   "action not supported");
6176                 }
6177         }
6178         return ret;
6179 }
6180
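/*
 * Driver callbacks registered with the generic mlx5 flow engine for
 * rules offloaded through TC flower (the E-Switch/transfer path).
 */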
6181 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6182         .validate = flow_tcf_validate,
6183         .prepare = flow_tcf_prepare,
6184         .translate = flow_tcf_translate,
6185         .apply = flow_tcf_apply,
6186         .remove = flow_tcf_remove,
6187         .destroy = flow_tcf_destroy,
6188         .query = flow_tcf_query,
6189 };
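
#ifdef MLX5_FLOW_TCF_USAGE_EXAMPLES /* Hypothetical guard, sketch only. */
#include <inttypes.h>
#include <stdio.h>
/*
 * Minimal sketch, not part of the driver, of how an application reaches
 * flow_tcf_query() above through the generic rte_flow API; the guard
 * macro and the example_* name are illustrative assumptions, and
 * @p port_id / @p flow come from the application's own flow setup.
 */
static int
example_query_flow_counters(uint16_t port_id, struct rte_flow *flow)
{
        struct rte_flow_query_count count = { .reset = 1 };
        const struct rte_flow_action actions[] = {
                { .type = RTE_FLOW_ACTION_TYPE_COUNT },
                { .type = RTE_FLOW_ACTION_TYPE_END },
        };
        struct rte_flow_error error;

        /* Dispatched to mlx5_flow_tcf_drv_ops.query for TCF flows. */
        if (rte_flow_query(port_id, flow, actions, &count, &error))
                return -rte_errno;
        if (count.hits_set && count.bytes_set)
                printf("hits=%" PRIu64 " bytes=%" PRIu64 "\n",
                       count.hits, count.bytes);
        return 0;
}
#endif /* MLX5_FLOW_TCF_USAGE_EXAMPLES */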
6190
6191 /**
6192  * Create and configure a libmnl socket for Netlink flow rules.
6193  *
6194  * @return
6195  *   A valid libmnl socket object pointer on success, NULL otherwise and
6196  *   rte_errno is set.
6197  */
6198 static struct mnl_socket *
6199 flow_tcf_mnl_socket_create(void)
6200 {
6201         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6202
6203         if (nl) {
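                /*
                 * Best effort: ask the kernel for terse ACKs that do not
                 * echo the original request payload, keeping replies
                 * small; the return value is deliberately ignored.
                 */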
6204                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6205                                       sizeof(int));
6206                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6207                         return nl;
6208         }
6209         rte_errno = errno;
6210         if (nl)
6211                 mnl_socket_close(nl);
6212         return NULL;
6213 }
6214
6215 /**
6216  * Destroy a libmnl socket.
6217  *
6218  * @param nl
6219  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6220  */
6221 static void
6222 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6223 {
6224         if (nl)
6225                 mnl_socket_close(nl);
6226 }
6227
6228 /**
6229  * Initialize ingress qdisc of a given network interface.
6230  *
6231  * @param ctx
6232  *   Pointer to tc-flower context to use.
6233  * @param ifindex
6234  *   Index of network interface to initialize.
6235  * @param[out] error
6236  *   Perform verbose error reporting if not NULL.
6237  *
6238  * @return
6239  *   0 on success, a negative errno value otherwise and rte_errno is set.
6240  */
6241 int
6242 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6243                    unsigned int ifindex, struct rte_flow_error *error)
6244 {
6245         struct nlmsghdr *nlh;
6246         struct tcmsg *tcm;
6247         alignas(struct nlmsghdr)
6248         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6249                     SZ_NLATTR_STRZ_OF("ingress") +
6250                     MNL_BUF_EXTRA_SPACE];
6251
6252         /* Destroy existing ingress qdisc and everything attached to it. */
6253         nlh = mnl_nlmsg_put_header(buf);
6254         nlh->nlmsg_type = RTM_DELQDISC;
6255         nlh->nlmsg_flags = NLM_F_REQUEST;
6256         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6257         tcm->tcm_family = AF_UNSPEC;
6258         tcm->tcm_ifindex = ifindex;
6259         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6260         tcm->tcm_parent = TC_H_INGRESS;
6261         assert(sizeof(buf) >= nlh->nlmsg_len);
6262         /* Ignore errors when qdisc is already absent. */
6263         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6264             rte_errno != EINVAL && rte_errno != ENOENT)
6265                 return rte_flow_error_set(error, rte_errno,
6266                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6267                                           "netlink: failed to remove ingress"
6268                                           " qdisc");
6269         /* Create fresh ingress qdisc. */
6270         nlh = mnl_nlmsg_put_header(buf);
6271         nlh->nlmsg_type = RTM_NEWQDISC;
6272         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6273         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6274         tcm->tcm_family = AF_UNSPEC;
6275         tcm->tcm_ifindex = ifindex;
6276         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6277         tcm->tcm_parent = TC_H_INGRESS;
6278         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6279         assert(sizeof(buf) >= nlh->nlmsg_len);
6280         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6281                 return rte_flow_error_set(error, rte_errno,
6282                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6283                                           "netlink: failed to create ingress"
6284                                           " qdisc");
6285         return 0;
6286 }
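
/*
 * For reference, the two Netlink requests above are the equivalent of
 * the following tc(8) commands, with <ifname> resolving to @p ifindex:
 *
 *     tc qdisc del dev <ifname> ingress
 *     tc qdisc add dev <ifname> ingress
 */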
6287
6288 /**
6289  * Create a tc-flower context for Netlink flow rules.
6290  *
6291  * @return
6292  *   A valid tc-flower context object pointer on success, NULL otherwise
6293  *   and rte_errno is set.
6294  */
6295 struct mlx5_flow_tcf_context *
6296 mlx5_flow_tcf_context_create(void)
6297 {
6298         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6299                                                         sizeof(*ctx),
6300                                                         sizeof(uint32_t));
6301         if (!ctx)
6302                 goto error;
6303         ctx->nl = flow_tcf_mnl_socket_create();
6304         if (!ctx->nl)
6305                 goto error;
6306         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6307         ctx->buf = rte_zmalloc(__func__,
6308                                ctx->buf_size, sizeof(uint32_t));
6309         if (!ctx->buf)
6310                 goto error;
6311         ctx->seq = random();
6312         return ctx;
6313 error:
6314         mlx5_flow_tcf_context_destroy(ctx);
6315         return NULL;
6316 }
6317
6318 /**
6319  * Destroy a tc-flower context.
6320  *
6321  * @param ctx
6322  *   Pointer to the tc-flower context to destroy, may be NULL.
6323  */
6324 void
6325 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6326 {
6327         if (!ctx)
6328                 return;
6329         flow_tcf_mnl_socket_destroy(ctx->nl);
6330         rte_free(ctx->buf);
6331         rte_free(ctx);
6332 }
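
#ifdef MLX5_FLOW_TCF_USAGE_EXAMPLES /* Hypothetical guard, sketch only. */
/*
 * Sketch of the context lifecycle implemented above, roughly as driven
 * from the PMD probe path; the guard macro and the example_* name are
 * illustrative assumptions and error handling is shortened.
 */
static int
example_tcf_bringup(unsigned int ifindex, struct rte_flow_error *error)
{
        struct mlx5_flow_tcf_context *tcf = mlx5_flow_tcf_context_create();

        if (!tcf)
                return -rte_errno;
        /* Replace the ingress qdisc so flower filters can be attached. */
        if (mlx5_flow_tcf_init(tcf, ifindex, error)) {
                mlx5_flow_tcf_context_destroy(tcf);
                return -rte_errno;
        }
        /* ... create and apply flow rules via mlx5_flow_tcf_drv_ops ... */
        mlx5_flow_tcf_context_destroy(tcf);
        return 0;
}
#endif /* MLX5_FLOW_TCF_USAGE_EXAMPLES */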