net/mlx5: support TOS and TTL fields on E-Switch
[dpdk.git] / drivers / net / mlx5 / mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
18 #include <stdalign.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <sys/socket.h>
24
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
28 #include <rte_flow.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31
32 #include "mlx5.h"
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
35
36 #ifdef HAVE_TC_ACT_VLAN
37
38 #include <linux/tc_act/tc_vlan.h>
39
40 #else /* HAVE_TC_ACT_VLAN */
41
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
50
51 struct tc_vlan {
52         tc_gen;
53         int v_action;
54 };
55
56 #endif /* HAVE_TC_ACT_VLAN */
57
58 #ifdef HAVE_TC_ACT_PEDIT
59
60 #include <linux/tc_act/tc_pedit.h>
61
62 #else /* HAVE_TC_ACT_VLAN */
63
64 enum {
65         TCA_PEDIT_UNSPEC,
66         TCA_PEDIT_TM,
67         TCA_PEDIT_PARMS,
68         TCA_PEDIT_PAD,
69         TCA_PEDIT_PARMS_EX,
70         TCA_PEDIT_KEYS_EX,
71         TCA_PEDIT_KEY_EX,
72         __TCA_PEDIT_MAX
73 };
74
75 enum {
76         TCA_PEDIT_KEY_EX_HTYPE = 1,
77         TCA_PEDIT_KEY_EX_CMD = 2,
78         __TCA_PEDIT_KEY_EX_MAX
79 };
80
81 enum pedit_header_type {
82         TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83         TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84         TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85         TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86         TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87         TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
88         __PEDIT_HDR_TYPE_MAX,
89 };
90
91 enum pedit_cmd {
92         TCA_PEDIT_KEY_EX_CMD_SET = 0,
93         TCA_PEDIT_KEY_EX_CMD_ADD = 1,
94         __PEDIT_CMD_MAX,
95 };
96
97 struct tc_pedit_key {
98         __u32 mask; /* AND */
99         __u32 val; /*XOR */
100         __u32 off; /*offset */
101         __u32 at;
102         __u32 offmask;
103         __u32 shift;
104 };
105
106 __extension__
107 struct tc_pedit_sel {
108         tc_gen;
109         unsigned char nkeys;
110         unsigned char flags;
111         struct tc_pedit_key keys[0];
112 };
113
114 #endif /* HAVE_TC_ACT_VLAN */
115
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
117
118 #include <linux/tc_act/tc_tunnel_key.h>
119
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
122 #endif
123
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
126 #endif
127
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
130 #endif
131
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
134 #endif
135
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
137
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
151
152 struct tc_tunnel_key {
153         tc_gen;
154         int t_action;
155 };
156
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
158
159 /* Normally found in linux/netlink.h. */
160 #ifndef NETLINK_CAP_ACK
161 #define NETLINK_CAP_ACK 10
162 #endif
163
164 /* Normally found in linux/pkt_sched.h. */
165 #ifndef TC_H_MIN_INGRESS
166 #define TC_H_MIN_INGRESS 0xfff2u
167 #endif
168
169 /* Normally found in linux/pkt_cls.h. */
170 #ifndef TCA_CLS_FLAGS_SKIP_SW
171 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
172 #endif
173 #ifndef TCA_CLS_FLAGS_IN_HW
174 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
175 #endif
176 #ifndef HAVE_TCA_CHAIN
177 #define TCA_CHAIN 11
178 #endif
179 #ifndef HAVE_TCA_FLOWER_ACT
180 #define TCA_FLOWER_ACT 3
181 #endif
182 #ifndef HAVE_TCA_FLOWER_FLAGS
183 #define TCA_FLOWER_FLAGS 22
184 #endif
185 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
186 #define TCA_FLOWER_KEY_ETH_TYPE 8
187 #endif
188 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
189 #define TCA_FLOWER_KEY_ETH_DST 4
190 #endif
191 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
192 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
193 #endif
194 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
195 #define TCA_FLOWER_KEY_ETH_SRC 6
196 #endif
197 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
198 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
199 #endif
200 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
201 #define TCA_FLOWER_KEY_IP_PROTO 9
202 #endif
203 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
204 #define TCA_FLOWER_KEY_IPV4_SRC 10
205 #endif
206 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
207 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
208 #endif
209 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
210 #define TCA_FLOWER_KEY_IPV4_DST 12
211 #endif
212 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
213 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
214 #endif
215 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
216 #define TCA_FLOWER_KEY_IPV6_SRC 14
217 #endif
218 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
219 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
220 #endif
221 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
222 #define TCA_FLOWER_KEY_IPV6_DST 16
223 #endif
224 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
225 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
226 #endif
227 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
228 #define TCA_FLOWER_KEY_TCP_SRC 18
229 #endif
230 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
231 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
232 #endif
233 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
234 #define TCA_FLOWER_KEY_TCP_DST 19
235 #endif
236 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
237 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
238 #endif
239 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
240 #define TCA_FLOWER_KEY_UDP_SRC 20
241 #endif
242 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
243 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
244 #endif
245 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
246 #define TCA_FLOWER_KEY_UDP_DST 21
247 #endif
248 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
249 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
250 #endif
251 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
252 #define TCA_FLOWER_KEY_VLAN_ID 23
253 #endif
254 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
255 #define TCA_FLOWER_KEY_VLAN_PRIO 24
256 #endif
257 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
258 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
259 #endif
260 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
261 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
262 #endif
263 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
264 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
265 #endif
266 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
267 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
268 #endif
269 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
270 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
271 #endif
272 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
273 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
274 #endif
275 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
276 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
277 #endif
278 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
279 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
280 #endif
281 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
282 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
283 #endif
284 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
285 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
286 #endif
287 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
288 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
289 #endif
290 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
291 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
292 #endif
293 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
294 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
295 #endif
296 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
297 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
298 #endif
299 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
300 #define TCA_FLOWER_KEY_TCP_FLAGS 71
301 #endif
302 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
303 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
304 #endif
305 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
306 #define TCA_FLOWER_KEY_IP_TOS 73
307 #endif
308 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
309 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
310 #endif
311 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
312 #define TCA_FLOWER_KEY_IP_TTL 75
313 #endif
314 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
315 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
316 #endif
317 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
318 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
319 #endif
320 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
321 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
322 #endif
323 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
324 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
325 #endif
326 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
327 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
328 #endif
329
330 #ifndef HAVE_TC_ACT_GOTO_CHAIN
331 #define TC_ACT_GOTO_CHAIN 0x20000000
332 #endif
333
334 #ifndef IPV6_ADDR_LEN
335 #define IPV6_ADDR_LEN 16
336 #endif
337
338 #ifndef IPV4_ADDR_LEN
339 #define IPV4_ADDR_LEN 4
340 #endif
341
342 #ifndef TP_PORT_LEN
343 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
344 #endif
345
346 #ifndef TTL_LEN
347 #define TTL_LEN 1
348 #endif
349
350 #ifndef TCA_ACT_MAX_PRIO
351 #define TCA_ACT_MAX_PRIO 32
352 #endif
353
354 /** Parameters of VXLAN devices created by driver. */
355 #define MLX5_VXLAN_DEFAULT_VNI  1
356 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
357
358 /** Tunnel action type, used for @p type in header structure. */
359 enum flow_tcf_tunact_type {
360         FLOW_TCF_TUNACT_VXLAN_DECAP,
361         FLOW_TCF_TUNACT_VXLAN_ENCAP,
362 };
363
364 /** Flags used for @p mask in tunnel action encap descriptors. */
365 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
366 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
367 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
368 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
369 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
370 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
371 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
372 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
373 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
374 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
375 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
376
377 /**
378  * Structure for holding netlink context.
379  * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
380  * Using this (8KB) buffer size ensures that netlink messages will never be
381  * truncated.
382  */
383 struct mlx5_flow_tcf_context {
384         struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
385         uint32_t seq; /* Message sequence number. */
386         uint32_t buf_size; /* Message buffer size. */
387         uint8_t *buf; /* Message buffer. */
388 };
389
390 /**
391  * Neigh rule structure. The neigh rule is applied via Netlink to
392  * outer tunnel iface in order to provide destination MAC address
393  * for the VXLAN encapsultion. The neigh rule is implicitly related
394  * to the Flow itself and can be shared by multiple Flows.
395  */
396 struct tcf_neigh_rule {
397         LIST_ENTRY(tcf_neigh_rule) next;
398         uint32_t refcnt;
399         struct ether_addr eth;
400         uint16_t mask;
401         union {
402                 struct {
403                         rte_be32_t dst;
404                 } ipv4;
405                 struct {
406                         uint8_t dst[IPV6_ADDR_LEN];
407                 } ipv6;
408         };
409 };
410
411 /**
412  * Local rule structure. The local rule is applied via Netlink to
413  * outer tunnel iface in order to provide local and peer IP addresses
414  * of the VXLAN tunnel for encapsulation. The local rule is implicitly
415  * related to the Flow itself and can be shared by multiple Flows.
416  */
417 struct tcf_local_rule {
418         LIST_ENTRY(tcf_local_rule) next;
419         uint32_t refcnt;
420         uint16_t mask;
421         union {
422                 struct {
423                         rte_be32_t dst;
424                         rte_be32_t src;
425                 } ipv4;
426                 struct {
427                         uint8_t dst[IPV6_ADDR_LEN];
428                         uint8_t src[IPV6_ADDR_LEN];
429                 } ipv6;
430         };
431 };
432
433 /** Outer interface VXLAN encapsulation rules container. */
434 struct tcf_irule {
435         LIST_ENTRY(tcf_irule) next;
436         LIST_HEAD(, tcf_neigh_rule) neigh;
437         LIST_HEAD(, tcf_local_rule) local;
438         uint32_t refcnt;
439         unsigned int ifouter; /**< Own interface index. */
440 };
441
442 /** VXLAN virtual netdev. */
443 struct tcf_vtep {
444         LIST_ENTRY(tcf_vtep) next;
445         uint32_t refcnt;
446         unsigned int ifindex; /**< Own interface index. */
447         uint16_t port;
448         uint8_t created;
449 };
450
451 /** Tunnel descriptor header, common for all tunnel types. */
452 struct flow_tcf_tunnel_hdr {
453         uint32_t type; /**< Tunnel action type. */
454         struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
455         unsigned int ifindex_org; /**< Original dst/src interface */
456         unsigned int *ifindex_ptr; /**< Interface ptr in message. */
457 };
458
459 struct flow_tcf_vxlan_decap {
460         struct flow_tcf_tunnel_hdr hdr;
461         uint16_t udp_port;
462 };
463
464 struct flow_tcf_vxlan_encap {
465         struct flow_tcf_tunnel_hdr hdr;
466         struct tcf_irule *iface;
467         uint32_t mask;
468         uint8_t ip_tos;
469         uint8_t ip_ttl_hop;
470         struct {
471                 struct ether_addr dst;
472                 struct ether_addr src;
473         } eth;
474         union {
475                 struct {
476                         rte_be32_t dst;
477                         rte_be32_t src;
478                 } ipv4;
479                 struct {
480                         uint8_t dst[IPV6_ADDR_LEN];
481                         uint8_t src[IPV6_ADDR_LEN];
482                 } ipv6;
483         };
484         struct {
485                 rte_be16_t src;
486                 rte_be16_t dst;
487         } udp;
488         struct {
489                 uint8_t vni[3];
490         } vxlan;
491 };
492
493 /** Structure used when extracting the values of a flow counters
494  * from a netlink message.
495  */
496 struct flow_tcf_stats_basic {
497         bool valid;
498         struct gnet_stats_basic counters;
499 };
500
501 /** Empty masks for known item types. */
502 static const union {
503         struct rte_flow_item_port_id port_id;
504         struct rte_flow_item_eth eth;
505         struct rte_flow_item_vlan vlan;
506         struct rte_flow_item_ipv4 ipv4;
507         struct rte_flow_item_ipv6 ipv6;
508         struct rte_flow_item_tcp tcp;
509         struct rte_flow_item_udp udp;
510         struct rte_flow_item_vxlan vxlan;
511 } flow_tcf_mask_empty = {
512         {0},
513 };
514
515 /** Supported masks for known item types. */
516 static const struct {
517         struct rte_flow_item_port_id port_id;
518         struct rte_flow_item_eth eth;
519         struct rte_flow_item_vlan vlan;
520         struct rte_flow_item_ipv4 ipv4;
521         struct rte_flow_item_ipv6 ipv6;
522         struct rte_flow_item_tcp tcp;
523         struct rte_flow_item_udp udp;
524         struct rte_flow_item_vxlan vxlan;
525 } flow_tcf_mask_supported = {
526         .port_id = {
527                 .id = 0xffffffff,
528         },
529         .eth = {
530                 .type = RTE_BE16(0xffff),
531                 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
532                 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
533         },
534         .vlan = {
535                 /* PCP and VID only, no DEI. */
536                 .tci = RTE_BE16(0xefff),
537                 .inner_type = RTE_BE16(0xffff),
538         },
539         .ipv4.hdr = {
540                 .next_proto_id = 0xff,
541                 .src_addr = RTE_BE32(0xffffffff),
542                 .dst_addr = RTE_BE32(0xffffffff),
543         },
544         .ipv6.hdr = {
545                 .proto = 0xff,
546                 .src_addr =
547                         "\xff\xff\xff\xff\xff\xff\xff\xff"
548                         "\xff\xff\xff\xff\xff\xff\xff\xff",
549                 .dst_addr =
550                         "\xff\xff\xff\xff\xff\xff\xff\xff"
551                         "\xff\xff\xff\xff\xff\xff\xff\xff",
552         },
553         .tcp.hdr = {
554                 .src_port = RTE_BE16(0xffff),
555                 .dst_port = RTE_BE16(0xffff),
556                 .tcp_flags = 0xff,
557         },
558         .udp.hdr = {
559                 .src_port = RTE_BE16(0xffff),
560                 .dst_port = RTE_BE16(0xffff),
561         },
562         .vxlan = {
563                .vni = "\xff\xff\xff",
564         },
565 };
566
567 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
568 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
569 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
570 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
571 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
572
573 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
574
575 /** DPDK port to network interface index (ifindex) conversion. */
576 struct flow_tcf_ptoi {
577         uint16_t port_id; /**< DPDK port ID. */
578         unsigned int ifindex; /**< Network interface index. */
579 };
580
581 /* Due to a limitation on driver/FW. */
582 #define MLX5_TCF_GROUP_ID_MAX 3
583
584 /*
585  * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
586  * Priority in rte_flow attribute starts from 0 and is added by 1 in
587  * translation. This is subject to be changed to determine the max priority
588  * based on trial-and-error like Verbs driver once the restriction is lifted or
589  * the range is extended.
590  */
591 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
592
593 #define MLX5_TCF_FATE_ACTIONS \
594         (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
595          MLX5_FLOW_ACTION_JUMP)
596
597 #define MLX5_TCF_VLAN_ACTIONS \
598         (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
599          MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
600
601 #define MLX5_TCF_VXLAN_ACTIONS \
602         (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
603
604 #define MLX5_TCF_PEDIT_ACTIONS \
605         (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
606          MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
607          MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
608          MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
609          MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
610
611 #define MLX5_TCF_CONFIG_ACTIONS \
612         (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
613          MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
614          MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
615          (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
616
617 #define MAX_PEDIT_KEYS 128
618 #define SZ_PEDIT_KEY_VAL 4
619
620 #define NUM_OF_PEDIT_KEYS(sz) \
621         (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
622
623 struct pedit_key_ex {
624         enum pedit_header_type htype;
625         enum pedit_cmd cmd;
626 };
627
628 struct pedit_parser {
629         struct tc_pedit_sel sel;
630         struct tc_pedit_key keys[MAX_PEDIT_KEYS];
631         struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
632 };
633
634 /**
635  * Create space for using the implicitly created TC flow counter.
636  *
637  * @param[in] dev
638  *   Pointer to the Ethernet device structure.
639  *
640  * @return
641  *   A pointer to the counter data structure, NULL otherwise and
642  *   rte_errno is set.
643  */
644 static struct mlx5_flow_counter *
645 flow_tcf_counter_new(void)
646 {
647         struct mlx5_flow_counter *cnt;
648
649         /*
650          * eswitch counter cannot be shared and its id is unknown.
651          * currently returning all with id 0.
652          * in the future maybe better to switch to unique numbers.
653          */
654         struct mlx5_flow_counter tmpl = {
655                 .ref_cnt = 1,
656         };
657         cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
658         if (!cnt) {
659                 rte_errno = ENOMEM;
660                 return NULL;
661         }
662         *cnt = tmpl;
663         /* Implicit counter, do not add to list. */
664         return cnt;
665 }
666
667 /**
668  * Set pedit key of MAC address
669  *
670  * @param[in] actions
671  *   pointer to action specification
672  * @param[in,out] p_parser
673  *   pointer to pedit_parser
674  */
675 static void
676 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
677                            struct pedit_parser *p_parser)
678 {
679         int idx = p_parser->sel.nkeys;
680         uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
681                                         offsetof(struct ether_hdr, s_addr) :
682                                         offsetof(struct ether_hdr, d_addr);
683         const struct rte_flow_action_set_mac *conf =
684                 (const struct rte_flow_action_set_mac *)actions->conf;
685
686         p_parser->keys[idx].off = off;
687         p_parser->keys[idx].mask = ~UINT32_MAX;
688         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
689         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
690         memcpy(&p_parser->keys[idx].val,
691                 conf->mac_addr, SZ_PEDIT_KEY_VAL);
692         idx++;
693         p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
694         p_parser->keys[idx].mask = 0xFFFF0000;
695         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
696         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
697         memcpy(&p_parser->keys[idx].val,
698                 conf->mac_addr + SZ_PEDIT_KEY_VAL,
699                 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
700         p_parser->sel.nkeys = (++idx);
701 }
702
703 /**
704  * Set pedit key of decrease/set ttl
705  *
706  * @param[in] actions
707  *   pointer to action specification
708  * @param[in,out] p_parser
709  *   pointer to pedit_parser
710  * @param[in] item_flags
711  *   flags of all items presented
712  */
713 static void
714 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
715                                 struct pedit_parser *p_parser,
716                                 uint64_t item_flags)
717 {
718         int idx = p_parser->sel.nkeys;
719
720         p_parser->keys[idx].mask = 0xFFFFFF00;
721         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
722                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
723                 p_parser->keys[idx].off =
724                         offsetof(struct ipv4_hdr, time_to_live);
725         }
726         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
727                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
728                 p_parser->keys[idx].off =
729                         offsetof(struct ipv6_hdr, hop_limits);
730         }
731         if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
732                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
733                 p_parser->keys[idx].val = 0x000000FF;
734         } else {
735                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
736                 p_parser->keys[idx].val =
737                         (__u32)((const struct rte_flow_action_set_ttl *)
738                          actions->conf)->ttl_value;
739         }
740         p_parser->sel.nkeys = (++idx);
741 }
742
743 /**
744  * Set pedit key of transport (TCP/UDP) port value
745  *
746  * @param[in] actions
747  *   pointer to action specification
748  * @param[in,out] p_parser
749  *   pointer to pedit_parser
750  * @param[in] item_flags
751  *   flags of all items presented
752  */
753 static void
754 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
755                                 struct pedit_parser *p_parser,
756                                 uint64_t item_flags)
757 {
758         int idx = p_parser->sel.nkeys;
759
760         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
761                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
762         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
763                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
764         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
765         /* offset of src/dst port is same for TCP and UDP */
766         p_parser->keys[idx].off =
767                 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
768                 offsetof(struct tcp_hdr, src_port) :
769                 offsetof(struct tcp_hdr, dst_port);
770         p_parser->keys[idx].mask = 0xFFFF0000;
771         p_parser->keys[idx].val =
772                 (__u32)((const struct rte_flow_action_set_tp *)
773                                 actions->conf)->port;
774         p_parser->sel.nkeys = (++idx);
775 }
776
777 /**
778  * Set pedit key of ipv6 address
779  *
780  * @param[in] actions
781  *   pointer to action specification
782  * @param[in,out] p_parser
783  *   pointer to pedit_parser
784  */
785 static void
786 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
787                                  struct pedit_parser *p_parser)
788 {
789         int idx = p_parser->sel.nkeys;
790         int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
791         int off_base =
792                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
793                 offsetof(struct ipv6_hdr, src_addr) :
794                 offsetof(struct ipv6_hdr, dst_addr);
795         const struct rte_flow_action_set_ipv6 *conf =
796                 (const struct rte_flow_action_set_ipv6 *)actions->conf;
797
798         for (int i = 0; i < keys; i++, idx++) {
799                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
800                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
801                 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
802                 p_parser->keys[idx].mask = ~UINT32_MAX;
803                 memcpy(&p_parser->keys[idx].val,
804                         conf->ipv6_addr + i *  SZ_PEDIT_KEY_VAL,
805                         SZ_PEDIT_KEY_VAL);
806         }
807         p_parser->sel.nkeys += keys;
808 }
809
810 /**
811  * Set pedit key of ipv4 address
812  *
813  * @param[in] actions
814  *   pointer to action specification
815  * @param[in,out] p_parser
816  *   pointer to pedit_parser
817  */
818 static void
819 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
820                                  struct pedit_parser *p_parser)
821 {
822         int idx = p_parser->sel.nkeys;
823
824         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
825         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
826         p_parser->keys[idx].off =
827                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
828                 offsetof(struct ipv4_hdr, src_addr) :
829                 offsetof(struct ipv4_hdr, dst_addr);
830         p_parser->keys[idx].mask = ~UINT32_MAX;
831         p_parser->keys[idx].val =
832                 ((const struct rte_flow_action_set_ipv4 *)
833                  actions->conf)->ipv4_addr;
834         p_parser->sel.nkeys = (++idx);
835 }
836
837 /**
838  * Create the pedit's na attribute in netlink message
839  * on pre-allocate message buffer
840  *
841  * @param[in,out] nl
842  *   pointer to pre-allocated netlink message buffer
843  * @param[in,out] actions
844  *   pointer to pointer of actions specification.
845  * @param[in,out] action_flags
846  *   pointer to actions flags
847  * @param[in] item_flags
848  *   flags of all item presented
849  */
850 static void
851 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
852                               const struct rte_flow_action **actions,
853                               uint64_t item_flags)
854 {
855         struct pedit_parser p_parser;
856         struct nlattr *na_act_options;
857         struct nlattr *na_pedit_keys;
858
859         memset(&p_parser, 0, sizeof(p_parser));
860         mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
861         na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
862         /* all modify header actions should be in one tc-pedit action */
863         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
864                 switch ((*actions)->type) {
865                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
866                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
867                         flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
868                         break;
869                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
870                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
871                         flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
872                         break;
873                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
874                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
875                         flow_tcf_pedit_key_set_tp_port(*actions,
876                                                         &p_parser, item_flags);
877                         break;
878                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
879                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
880                         flow_tcf_pedit_key_set_dec_ttl(*actions,
881                                                         &p_parser, item_flags);
882                         break;
883                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
884                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
885                         flow_tcf_pedit_key_set_mac(*actions, &p_parser);
886                         break;
887                 default:
888                         goto pedit_mnl_msg_done;
889                 }
890         }
891 pedit_mnl_msg_done:
892         p_parser.sel.action = TC_ACT_PIPE;
893         mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
894                      sizeof(p_parser.sel) +
895                      p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
896                      &p_parser);
897         na_pedit_keys =
898                 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
899         for (int i = 0; i < p_parser.sel.nkeys; i++) {
900                 struct nlattr *na_pedit_key =
901                         mnl_attr_nest_start(nl,
902                                             TCA_PEDIT_KEY_EX | NLA_F_NESTED);
903                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
904                                  p_parser.keys_ex[i].htype);
905                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
906                                  p_parser.keys_ex[i].cmd);
907                 mnl_attr_nest_end(nl, na_pedit_key);
908         }
909         mnl_attr_nest_end(nl, na_pedit_keys);
910         mnl_attr_nest_end(nl, na_act_options);
911         (*actions)--;
912 }
913
914 /**
915  * Calculate max memory size of one TC-pedit actions.
916  * One TC-pedit action can contain set of keys each defining
917  * a rewrite element (rte_flow action)
918  *
919  * @param[in,out] actions
920  *   actions specification.
921  * @param[in,out] action_flags
922  *   actions flags
923  * @param[in,out] size
924  *   accumulated size
925  * @return
926  *   Max memory size of one TC-pedit action
927  */
928 static int
929 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
930                                 uint64_t *action_flags)
931 {
932         int pedit_size = 0;
933         int keys = 0;
934         uint64_t flags = 0;
935
936         pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
937                       SZ_NLATTR_STRZ_OF("pedit") +
938                       SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
939         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
940                 switch ((*actions)->type) {
941                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
942                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
943                         flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
944                         break;
945                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
946                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
947                         flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
948                         break;
949                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
950                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
951                         flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
952                         break;
953                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
954                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
955                         flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
956                         break;
957                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
958                         /* TCP is as same as UDP */
959                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
960                         flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
961                         break;
962                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
963                         /* TCP is as same as UDP */
964                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
965                         flags |= MLX5_FLOW_ACTION_SET_TP_DST;
966                         break;
967                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
968                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
969                         flags |= MLX5_FLOW_ACTION_SET_TTL;
970                         break;
971                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
972                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
973                         flags |= MLX5_FLOW_ACTION_DEC_TTL;
974                         break;
975                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
976                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
977                         flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
978                         break;
979                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
980                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
981                         flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
982                         break;
983                 default:
984                         goto get_pedit_action_size_done;
985                 }
986         }
987 get_pedit_action_size_done:
988         /* TCA_PEDIT_PARAMS_EX */
989         pedit_size +=
990                 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
991                                   keys * sizeof(struct tc_pedit_key));
992         pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
993         pedit_size += keys *
994                       /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
995                       (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
996                        SZ_NLATTR_DATA_OF(2));
997         (*action_flags) |= flags;
998         (*actions)--;
999         return pedit_size;
1000 }
1001
1002 /**
1003  * Retrieve mask for pattern item.
1004  *
1005  * This function does basic sanity checks on a pattern item in order to
1006  * return the most appropriate mask for it.
1007  *
1008  * @param[in] item
1009  *   Item specification.
1010  * @param[in] mask_default
1011  *   Default mask for pattern item as specified by the flow API.
1012  * @param[in] mask_supported
1013  *   Mask fields supported by the implementation.
1014  * @param[in] mask_empty
1015  *   Empty mask to return when there is no specification.
1016  * @param[out] error
1017  *   Perform verbose error reporting if not NULL.
1018  *
1019  * @return
1020  *   Either @p item->mask or one of the mask parameters on success, NULL
1021  *   otherwise and rte_errno is set.
1022  */
1023 static const void *
1024 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1025                    const void *mask_supported, const void *mask_empty,
1026                    size_t mask_size, struct rte_flow_error *error)
1027 {
1028         const uint8_t *mask;
1029         size_t i;
1030
1031         /* item->last and item->mask cannot exist without item->spec. */
1032         if (!item->spec && (item->mask || item->last)) {
1033                 rte_flow_error_set(error, EINVAL,
1034                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
1035                                    "\"mask\" or \"last\" field provided without"
1036                                    " a corresponding \"spec\"");
1037                 return NULL;
1038         }
1039         /* No spec, no mask, no problem. */
1040         if (!item->spec)
1041                 return mask_empty;
1042         mask = item->mask ? item->mask : mask_default;
1043         assert(mask);
1044         /*
1045          * Single-pass check to make sure that:
1046          * - Mask is supported, no bits are set outside mask_supported.
1047          * - Both item->spec and item->last are included in mask.
1048          */
1049         for (i = 0; i != mask_size; ++i) {
1050                 if (!mask[i])
1051                         continue;
1052                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1053                     ((const uint8_t *)mask_supported)[i]) {
1054                         rte_flow_error_set(error, ENOTSUP,
1055                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1056                                            "unsupported field found"
1057                                            " in \"mask\"");
1058                         return NULL;
1059                 }
1060                 if (item->last &&
1061                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
1062                     (((const uint8_t *)item->last)[i] & mask[i])) {
1063                         rte_flow_error_set(error, EINVAL,
1064                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1065                                            item->last,
1066                                            "range between \"spec\" and \"last\""
1067                                            " not comprised in \"mask\"");
1068                         return NULL;
1069                 }
1070         }
1071         return mask;
1072 }
1073
1074 /**
1075  * Build a conversion table between port ID and ifindex.
1076  *
1077  * @param[in] dev
1078  *   Pointer to Ethernet device.
1079  * @param[out] ptoi
1080  *   Pointer to ptoi table.
1081  * @param[in] len
1082  *   Size of ptoi table provided.
1083  *
1084  * @return
1085  *   Size of ptoi table filled.
1086  */
1087 static unsigned int
1088 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1089                           unsigned int len)
1090 {
1091         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1092         uint16_t port_id[n + 1];
1093         unsigned int i;
1094         unsigned int own = 0;
1095
1096         /* At least one port is needed when no switch domain is present. */
1097         if (!n) {
1098                 n = 1;
1099                 port_id[0] = dev->data->port_id;
1100         } else {
1101                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1102         }
1103         if (n > len)
1104                 return 0;
1105         for (i = 0; i != n; ++i) {
1106                 struct rte_eth_dev_info dev_info;
1107
1108                 rte_eth_dev_info_get(port_id[i], &dev_info);
1109                 if (port_id[i] == dev->data->port_id)
1110                         own = i;
1111                 ptoi[i].port_id = port_id[i];
1112                 ptoi[i].ifindex = dev_info.if_index;
1113         }
1114         /* Ensure first entry of ptoi[] is the current device. */
1115         if (own) {
1116                 ptoi[n] = ptoi[0];
1117                 ptoi[0] = ptoi[own];
1118                 ptoi[own] = ptoi[n];
1119         }
1120         /* An entry with zero ifindex terminates ptoi[]. */
1121         ptoi[n].port_id = 0;
1122         ptoi[n].ifindex = 0;
1123         return n;
1124 }
1125
1126 /**
1127  * Verify the @p attr will be correctly understood by the E-switch.
1128  *
1129  * @param[in] attr
1130  *   Pointer to flow attributes
1131  * @param[out] error
1132  *   Pointer to error structure.
1133  *
1134  * @return
1135  *   0 on success, a negative errno value otherwise and rte_errno is set.
1136  */
1137 static int
1138 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1139                              struct rte_flow_error *error)
1140 {
1141         /*
1142          * Supported attributes: groups, some priorities and ingress only.
1143          * group is supported only if kernel supports chain. Don't care about
1144          * transfer as it is the caller's problem.
1145          */
1146         if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1147                 return rte_flow_error_set(error, ENOTSUP,
1148                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1149                                           "group ID larger than "
1150                                           RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1151                                           " isn't supported");
1152         else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1153                 return rte_flow_error_set(error, ENOTSUP,
1154                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1155                                           attr,
1156                                           "priority more than "
1157                                           RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1158                                           " is not supported");
1159         if (!attr->ingress)
1160                 return rte_flow_error_set(error, EINVAL,
1161                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1162                                           attr, "only ingress is supported");
1163         if (attr->egress)
1164                 return rte_flow_error_set(error, ENOTSUP,
1165                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1166                                           attr, "egress is not supported");
1167         return 0;
1168 }
1169
1170 /**
1171  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1172  * The routine checks the L2 fields to be used in encapsulation header.
1173  *
1174  * @param[in] item
1175  *   Pointer to the item structure.
1176  * @param[out] error
1177  *   Pointer to the error structure.
1178  *
1179  * @return
1180  *   0 on success, a negative errno value otherwise and rte_errno is set.
1181  **/
1182 static int
1183 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1184                                   struct rte_flow_error *error)
1185 {
1186         const struct rte_flow_item_eth *spec = item->spec;
1187         const struct rte_flow_item_eth *mask = item->mask;
1188
1189         if (!spec) {
1190                 /*
1191                  * Specification for L2 addresses can be empty
1192                  * because these ones are optional and not
1193                  * required directly by tc rule. Kernel tries
1194                  * to resolve these ones on its own
1195                  */
1196                 return 0;
1197         }
1198         if (!mask) {
1199                 /* If mask is not specified use the default one. */
1200                 mask = &rte_flow_item_eth_mask;
1201         }
1202         if (memcmp(&mask->dst,
1203                    &flow_tcf_mask_empty.eth.dst,
1204                    sizeof(flow_tcf_mask_empty.eth.dst))) {
1205                 if (memcmp(&mask->dst,
1206                            &rte_flow_item_eth_mask.dst,
1207                            sizeof(rte_flow_item_eth_mask.dst)))
1208                         return rte_flow_error_set
1209                                 (error, ENOTSUP,
1210                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1211                                  "no support for partial mask on"
1212                                  " \"eth.dst\" field");
1213         }
1214         if (memcmp(&mask->src,
1215                    &flow_tcf_mask_empty.eth.src,
1216                    sizeof(flow_tcf_mask_empty.eth.src))) {
1217                 if (memcmp(&mask->src,
1218                            &rte_flow_item_eth_mask.src,
1219                            sizeof(rte_flow_item_eth_mask.src)))
1220                         return rte_flow_error_set
1221                                 (error, ENOTSUP,
1222                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1223                                  "no support for partial mask on"
1224                                  " \"eth.src\" field");
1225         }
1226         if (mask->type != RTE_BE16(0x0000)) {
1227                 if (mask->type != RTE_BE16(0xffff))
1228                         return rte_flow_error_set
1229                                 (error, ENOTSUP,
1230                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1231                                  "no support for partial mask on"
1232                                  " \"eth.type\" field");
1233                 DRV_LOG(WARNING,
1234                         "outer ethernet type field"
1235                         " cannot be forced for vxlan"
1236                         " encapsulation, parameter ignored");
1237         }
1238         return 0;
1239 }
1240
1241 /**
1242  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1243  * The routine checks the IPv4 fields to be used in encapsulation header.
1244  *
1245  * @param[in] item
1246  *   Pointer to the item structure.
1247  * @param[out] error
1248  *   Pointer to the error structure.
1249  *
1250  * @return
1251  *   0 on success, a negative errno value otherwise and rte_errno is set.
1252  **/
1253 static int
1254 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1255                                    struct rte_flow_error *error)
1256 {
1257         const struct rte_flow_item_ipv4 *spec = item->spec;
1258         const struct rte_flow_item_ipv4 *mask = item->mask;
1259
1260         if (!spec) {
1261                 /*
1262                  * Specification for IP addresses cannot be empty
1263                  * because it is required by tunnel_key parameter.
1264                  */
1265                 return rte_flow_error_set(error, EINVAL,
1266                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1267                                           "NULL outer ipv4 address"
1268                                           " specification for vxlan"
1269                                           " encapsulation");
1270         }
1271         if (!mask)
1272                 mask = &rte_flow_item_ipv4_mask;
1273         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1274                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1275                         return rte_flow_error_set
1276                                 (error, ENOTSUP,
1277                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1278                                  "no support for partial mask on"
1279                                  " \"ipv4.hdr.dst_addr\" field"
1280                                  " for vxlan encapsulation");
1281                 /* More IPv4 address validations can be put here. */
1282         } else {
1283                 /*
1284                  * Kernel uses the destination IP address to determine
1285                  * the routing path and obtain the MAC destination
1286                  * address, so IP destination address must be
1287                  * specified in the tc rule.
1288                  */
1289                 return rte_flow_error_set(error, EINVAL,
1290                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1291                                           "outer ipv4 destination address"
1292                                           " must be specified for"
1293                                           " vxlan encapsulation");
1294         }
1295         if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1296                 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1297                         return rte_flow_error_set
1298                                 (error, ENOTSUP,
1299                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1300                                  "no support for partial mask on"
1301                                  " \"ipv4.hdr.src_addr\" field"
1302                                  " for vxlan encapsulation");
1303                 /* More IPv4 address validations can be put here. */
1304         } else {
1305                 /*
1306                  * Kernel uses the source IP address to select the
1307                  * interface for egress encapsulated traffic, so
1308                  * it must be specified in the tc rule.
1309                  */
1310                 return rte_flow_error_set(error, EINVAL,
1311                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1312                                           "outer ipv4 source address"
1313                                           " must be specified for"
1314                                           " vxlan encapsulation");
1315         }
1316         if (mask->hdr.type_of_service &&
1317             mask->hdr.type_of_service != 0xff)
1318                 return rte_flow_error_set(error, ENOTSUP,
1319                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1320                                           "no support for partial mask on"
1321                                           " \"ipv4.hdr.type_of_service\" field"
1322                                           " for vxlan encapsulation");
1323         if (mask->hdr.time_to_live &&
1324             mask->hdr.time_to_live != 0xff)
1325                 return rte_flow_error_set(error, ENOTSUP,
1326                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1327                                           "no support for partial mask on"
1328                                           " \"ipv4.hdr.time_to_live\" field"
1329                                           " for vxlan encapsulation");
1330         return 0;
1331 }
1332
1333 /**
1334  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1335  * The routine checks the IPv6 fields to be used in encapsulation header.
1336  *
1337  * @param[in] item
1338  *   Pointer to the item structure.
1339  * @param[out] error
1340  *   Pointer to the error structure.
1341  *
1342  * @return
1343  *   0 on success, a negative errno value otherwise and rte_errno is set.
1344  **/
1345 static int
1346 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1347                                    struct rte_flow_error *error)
1348 {
1349         const struct rte_flow_item_ipv6 *spec = item->spec;
1350         const struct rte_flow_item_ipv6 *mask = item->mask;
1351         uint8_t msk6;
1352
1353         if (!spec) {
1354                 /*
1355                  * Specification for IP addresses cannot be empty
1356                  * because it is required by tunnel_key parameter.
1357                  */
1358                 return rte_flow_error_set(error, EINVAL,
1359                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1360                                           "NULL outer ipv6 address"
1361                                           " specification for"
1362                                           " vxlan encapsulation");
1363         }
1364         if (!mask)
1365                 mask = &rte_flow_item_ipv6_mask;
1366         if (memcmp(&mask->hdr.dst_addr,
1367                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1368                    IPV6_ADDR_LEN)) {
1369                 if (memcmp(&mask->hdr.dst_addr,
1370                            &rte_flow_item_ipv6_mask.hdr.dst_addr,
1371                            IPV6_ADDR_LEN))
1372                         return rte_flow_error_set
1373                                         (error, ENOTSUP,
1374                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1375                                          "no support for partial mask on"
1376                                          " \"ipv6.hdr.dst_addr\" field"
1377                                          " for vxlan encapsulation");
1378                 /* More IPv6 address validations can be put here. */
1379         } else {
1380                 /*
1381                  * Kernel uses the destination IP address to determine
1382                  * the routing path and obtain the MAC destination
1383                  * address (heigh or gate), so IP destination address
1384                  * must be specified within the tc rule.
1385                  */
1386                 return rte_flow_error_set(error, EINVAL,
1387                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1388                                           "outer ipv6 destination address"
1389                                           " must be specified for"
1390                                           " vxlan encapsulation");
1391         }
1392         if (memcmp(&mask->hdr.src_addr,
1393                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1394                    IPV6_ADDR_LEN)) {
1395                 if (memcmp(&mask->hdr.src_addr,
1396                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1397                            IPV6_ADDR_LEN))
1398                         return rte_flow_error_set
1399                                         (error, ENOTSUP,
1400                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1401                                          "no support for partial mask on"
1402                                          " \"ipv6.hdr.src_addr\" field"
1403                                          " for vxlan encapsulation");
1404                 /* More L3 address validation can be put here. */
1405         } else {
1406                 /*
1407                  * Kernel uses the source IP address to select the
1408                  * interface for egress encapsulated traffic, so
1409                  * it must be specified in the tc rule.
1410                  */
1411                 return rte_flow_error_set(error, EINVAL,
1412                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1413                                           "outer L3 source address"
1414                                           " must be specified for"
1415                                           " vxlan encapsulation");
1416         }
1417         msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1418                 IPV6_HDR_TC_SHIFT) & 0xff;
1419         if (msk6 && msk6 != 0xff)
1420                 return rte_flow_error_set(error, ENOTSUP,
1421                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1422                                           "no support for partial mask on"
1423                                           " \"ipv6.hdr.vtc_flow.tos\" field"
1424                                           " for vxlan encapsulation");
1425         if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1426                 return rte_flow_error_set(error, ENOTSUP,
1427                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1428                                           "no support for partial mask on"
1429                                           " \"ipv6.hdr.hop_limits\" field"
1430                                           " for vxlan encapsulation");
1431         return 0;
1432 }
1433
1434 /**
1435  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1436  * The routine checks the UDP fields to be used in encapsulation header.
1437  *
1438  * @param[in] item
1439  *   Pointer to the item structure.
1440  * @param[out] error
1441  *   Pointer to the error structure.
1442  *
1443  * @return
1444  *   0 on success, a negative errno value otherwise and rte_errno is set.
1445  **/
1446 static int
1447 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1448                                   struct rte_flow_error *error)
1449 {
1450         const struct rte_flow_item_udp *spec = item->spec;
1451         const struct rte_flow_item_udp *mask = item->mask;
1452
1453         if (!spec) {
1454                 /*
1455                  * Specification for UDP ports cannot be empty
1456                  * because it is required by tunnel_key parameter.
1457                  */
1458                 return rte_flow_error_set(error, EINVAL,
1459                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1460                                           "NULL UDP port specification "
1461                                           " for vxlan encapsulation");
1462         }
1463         if (!mask)
1464                 mask = &rte_flow_item_udp_mask;
1465         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1466                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1467                         return rte_flow_error_set
1468                                         (error, ENOTSUP,
1469                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1470                                          "no support for partial mask on"
1471                                          " \"udp.hdr.dst_port\" field"
1472                                          " for vxlan encapsulation");
1473                 if (!spec->hdr.dst_port)
1474                         return rte_flow_error_set
1475                                         (error, EINVAL,
1476                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1477                                          "outer UDP remote port cannot be"
1478                                          " 0 for vxlan encapsulation");
1479         } else {
1480                 return rte_flow_error_set(error, EINVAL,
1481                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1482                                           "outer UDP remote port"
1483                                           " must be specified for"
1484                                           " vxlan encapsulation");
1485         }
1486         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1487                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1488                         return rte_flow_error_set
1489                                         (error, ENOTSUP,
1490                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1491                                          "no support for partial mask on"
1492                                          " \"udp.hdr.src_port\" field"
1493                                          " for vxlan encapsulation");
1494                 DRV_LOG(WARNING,
1495                         "outer UDP source port cannot be"
1496                         " forced for vxlan encapsulation,"
1497                         " parameter ignored");
1498         }
1499         return 0;
1500 }
1501
1502 /**
1503  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1504  * The routine checks the VNIP fields to be used in encapsulation header.
1505  *
1506  * @param[in] item
1507  *   Pointer to the item structure.
1508  * @param[out] error
1509  *   Pointer to the error structure.
1510  *
1511  * @return
1512  *   0 on success, a negative errno value otherwise and rte_errno is set.
1513  **/
1514 static int
1515 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1516                                   struct rte_flow_error *error)
1517 {
1518         const struct rte_flow_item_vxlan *spec = item->spec;
1519         const struct rte_flow_item_vxlan *mask = item->mask;
1520
1521         if (!spec) {
1522                 /* Outer VNI is required by tunnel_key parameter. */
1523                 return rte_flow_error_set(error, EINVAL,
1524                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1525                                           "NULL VNI specification"
1526                                           " for vxlan encapsulation");
1527         }
1528         if (!mask)
1529                 mask = &rte_flow_item_vxlan_mask;
1530         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1531                 return rte_flow_error_set(error, EINVAL,
1532                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1533                                           "outer VNI must be specified "
1534                                           "for vxlan encapsulation");
1535         if (mask->vni[0] != 0xff ||
1536             mask->vni[1] != 0xff ||
1537             mask->vni[2] != 0xff)
1538                 return rte_flow_error_set(error, ENOTSUP,
1539                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1540                                           "no support for partial mask on"
1541                                           " \"vxlan.vni\" field");
1542
1543         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1544                 return rte_flow_error_set(error, EINVAL,
1545                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1546                                           "vxlan vni cannot be 0");
1547         return 0;
1548 }
1549
1550 /**
1551  * Validate VXLAN_ENCAP action item list for E-Switch.
1552  * The routine checks items to be used in encapsulation header.
1553  *
1554  * @param[in] action
1555  *   Pointer to the VXLAN_ENCAP action structure.
1556  * @param[out] error
1557  *   Pointer to the error structure.
1558  *
1559  * @return
1560  *   0 on success, a negative errno value otherwise and rte_errno is set.
1561  **/
1562 static int
1563 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1564                               struct rte_flow_error *error)
1565 {
1566         const struct rte_flow_item *items;
1567         int ret;
1568         uint32_t item_flags = 0;
1569
1570         if (!action->conf)
1571                 return rte_flow_error_set(error, EINVAL,
1572                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1573                                           "Missing vxlan tunnel"
1574                                           " action configuration");
1575         items = ((const struct rte_flow_action_vxlan_encap *)
1576                                         action->conf)->definition;
1577         if (!items)
1578                 return rte_flow_error_set(error, EINVAL,
1579                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1580                                           "Missing vxlan tunnel"
1581                                           " encapsulation parameters");
1582         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1583                 switch (items->type) {
1584                 case RTE_FLOW_ITEM_TYPE_VOID:
1585                         break;
1586                 case RTE_FLOW_ITEM_TYPE_ETH:
1587                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1588                                                           error);
1589                         if (ret < 0)
1590                                 return ret;
1591                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1592                         if (ret < 0)
1593                                 return ret;
1594                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1595                         break;
1596                 break;
1597                 case RTE_FLOW_ITEM_TYPE_IPV4:
1598                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1599                                                            error);
1600                         if (ret < 0)
1601                                 return ret;
1602                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1603                         if (ret < 0)
1604                                 return ret;
1605                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1606                         break;
1607                 case RTE_FLOW_ITEM_TYPE_IPV6:
1608                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1609                                                            error);
1610                         if (ret < 0)
1611                                 return ret;
1612                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1613                         if (ret < 0)
1614                                 return ret;
1615                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1616                         break;
1617                 case RTE_FLOW_ITEM_TYPE_UDP:
1618                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1619                                                            0xFF, error);
1620                         if (ret < 0)
1621                                 return ret;
1622                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1623                         if (ret < 0)
1624                                 return ret;
1625                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1626                         break;
1627                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1628                         ret = mlx5_flow_validate_item_vxlan(items,
1629                                                             item_flags, error);
1630                         if (ret < 0)
1631                                 return ret;
1632                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1633                         if (ret < 0)
1634                                 return ret;
1635                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1636                         break;
1637                 default:
1638                         return rte_flow_error_set
1639                                         (error, ENOTSUP,
1640                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1641                                          "vxlan encap item not supported");
1642                 }
1643         }
1644         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1645                 return rte_flow_error_set(error, EINVAL,
1646                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1647                                           "no outer IP layer found"
1648                                           " for vxlan encapsulation");
1649         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1650                 return rte_flow_error_set(error, EINVAL,
1651                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1652                                           "no outer UDP layer found"
1653                                           " for vxlan encapsulation");
1654         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1655                 return rte_flow_error_set(error, EINVAL,
1656                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1657                                           "no VXLAN VNI found"
1658                                           " for vxlan encapsulation");
1659         return 0;
1660 }
1661
1662 /**
1663  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1664  * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1665  *
1666  * @param[in] udp
1667  *   Outer UDP layer item (if any, NULL otherwise).
1668  * @param[out] error
1669  *   Pointer to the error structure.
1670  *
1671  * @return
1672  *   0 on success, a negative errno value otherwise and rte_errno is set.
1673  **/
1674 static int
1675 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1676                                   struct rte_flow_error *error)
1677 {
1678         const struct rte_flow_item_udp *spec = udp->spec;
1679         const struct rte_flow_item_udp *mask = udp->mask;
1680
1681         if (!spec)
1682                 /*
1683                  * Specification for UDP ports cannot be empty
1684                  * because it is required as decap parameter.
1685                  */
1686                 return rte_flow_error_set(error, EINVAL,
1687                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1688                                           "NULL UDP port specification"
1689                                           " for VXLAN decapsulation");
1690         if (!mask)
1691                 mask = &rte_flow_item_udp_mask;
1692         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1693                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1694                         return rte_flow_error_set
1695                                         (error, ENOTSUP,
1696                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1697                                          "no support for partial mask on"
1698                                          " \"udp.hdr.dst_port\" field");
1699                 if (!spec->hdr.dst_port)
1700                         return rte_flow_error_set
1701                                         (error, EINVAL,
1702                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1703                                          "zero decap local UDP port");
1704         } else {
1705                 return rte_flow_error_set(error, EINVAL,
1706                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1707                                           "outer UDP destination port must be "
1708                                           "specified for vxlan decapsulation");
1709         }
1710         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1711                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1712                         return rte_flow_error_set
1713                                         (error, ENOTSUP,
1714                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1715                                          "no support for partial mask on"
1716                                          " \"udp.hdr.src_port\" field");
1717                 DRV_LOG(WARNING,
1718                         "outer UDP local port cannot be "
1719                         "forced for VXLAN encapsulation, "
1720                         "parameter ignored");
1721         }
1722         return 0;
1723 }
1724
1725 /**
1726  * Validate flow for E-Switch.
1727  *
1728  * @param[in] priv
1729  *   Pointer to the priv structure.
1730  * @param[in] attr
1731  *   Pointer to the flow attributes.
1732  * @param[in] items
1733  *   Pointer to the list of items.
1734  * @param[in] actions
1735  *   Pointer to the list of actions.
1736  * @param[out] error
1737  *   Pointer to the error structure.
1738  *
1739  * @return
1740  *   0 on success, a negative errno value otherwise and rte_errno is set.
1741  */
1742 static int
1743 flow_tcf_validate(struct rte_eth_dev *dev,
1744                   const struct rte_flow_attr *attr,
1745                   const struct rte_flow_item items[],
1746                   const struct rte_flow_action actions[],
1747                   struct rte_flow_error *error)
1748 {
1749         union {
1750                 const struct rte_flow_item_port_id *port_id;
1751                 const struct rte_flow_item_eth *eth;
1752                 const struct rte_flow_item_vlan *vlan;
1753                 const struct rte_flow_item_ipv4 *ipv4;
1754                 const struct rte_flow_item_ipv6 *ipv6;
1755                 const struct rte_flow_item_tcp *tcp;
1756                 const struct rte_flow_item_udp *udp;
1757                 const struct rte_flow_item_vxlan *vxlan;
1758         } spec, mask;
1759         union {
1760                 const struct rte_flow_action_port_id *port_id;
1761                 const struct rte_flow_action_jump *jump;
1762                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1763                 const struct rte_flow_action_of_set_vlan_vid *
1764                         of_set_vlan_vid;
1765                 const struct rte_flow_action_of_set_vlan_pcp *
1766                         of_set_vlan_pcp;
1767                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1768                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1769                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1770         } conf;
1771         const struct rte_flow_item *outer_udp = NULL;
1772         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1773         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1774         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1775         uint64_t item_flags = 0;
1776         uint64_t action_flags = 0;
1777         uint8_t next_protocol = 0xff;
1778         unsigned int tcm_ifindex = 0;
1779         uint8_t pedit_validated = 0;
1780         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1781         struct rte_eth_dev *port_id_dev = NULL;
1782         bool in_port_id_set;
1783         int ret;
1784
1785         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1786                                                 PTOI_TABLE_SZ_MAX(dev)));
1787         ret = flow_tcf_validate_attributes(attr, error);
1788         if (ret < 0)
1789                 return ret;
1790         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1791                 unsigned int i;
1792                 uint64_t current_action_flag = 0;
1793
1794                 switch (actions->type) {
1795                 case RTE_FLOW_ACTION_TYPE_VOID:
1796                         break;
1797                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1798                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1799                         if (!actions->conf)
1800                                 break;
1801                         conf.port_id = actions->conf;
1802                         if (conf.port_id->original)
1803                                 i = 0;
1804                         else
1805                                 for (i = 0; ptoi[i].ifindex; ++i)
1806                                         if (ptoi[i].port_id == conf.port_id->id)
1807                                                 break;
1808                         if (!ptoi[i].ifindex)
1809                                 return rte_flow_error_set
1810                                         (error, ENODEV,
1811                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1812                                          conf.port_id,
1813                                          "missing data to convert port ID to"
1814                                          " ifindex");
1815                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1816                         break;
1817                 case RTE_FLOW_ACTION_TYPE_JUMP:
1818                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1819                         if (!actions->conf)
1820                                 break;
1821                         conf.jump = actions->conf;
1822                         if (attr->group >= conf.jump->group)
1823                                 return rte_flow_error_set
1824                                         (error, ENOTSUP,
1825                                          RTE_FLOW_ERROR_TYPE_ACTION,
1826                                          actions,
1827                                          "can jump only to a group forward");
1828                         break;
1829                 case RTE_FLOW_ACTION_TYPE_DROP:
1830                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1831                         break;
1832                 case RTE_FLOW_ACTION_TYPE_COUNT:
1833                         break;
1834                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1835                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1836                         break;
1837                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1838                         rte_be16_t ethertype;
1839
1840                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1841                         if (!actions->conf)
1842                                 break;
1843                         conf.of_push_vlan = actions->conf;
1844                         ethertype = conf.of_push_vlan->ethertype;
1845                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1846                             ethertype != RTE_BE16(ETH_P_8021AD))
1847                                 return rte_flow_error_set
1848                                         (error, EINVAL,
1849                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1850                                          "vlan push TPID must be "
1851                                          "802.1Q or 802.1AD");
1852                         break;
1853                 }
1854                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1855                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1856                                 return rte_flow_error_set
1857                                         (error, ENOTSUP,
1858                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1859                                          "vlan modify is not supported,"
1860                                          " set action must follow push action");
1861                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1862                         break;
1863                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1864                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1865                                 return rte_flow_error_set
1866                                         (error, ENOTSUP,
1867                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1868                                          "vlan modify is not supported,"
1869                                          " set action must follow push action");
1870                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1871                         break;
1872                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1873                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1874                         break;
1875                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1876                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1877                         if (ret < 0)
1878                                 return ret;
1879                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1880                         break;
1881                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1882                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1883                         break;
1884                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1885                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1886                         break;
1887                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1888                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1889                         break;
1890                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1891                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1892                         break;
1893                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1894                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1895                         break;
1896                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1897                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1898                         break;
1899                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1900                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1901                         break;
1902                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1903                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1904                         break;
1905                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1906                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1907                         break;
1908                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1909                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1910                         break;
1911                 default:
1912                         return rte_flow_error_set(error, ENOTSUP,
1913                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1914                                                   actions,
1915                                                   "action not supported");
1916                 }
1917                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1918                         if (!actions->conf)
1919                                 return rte_flow_error_set
1920                                         (error, EINVAL,
1921                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1922                                          actions,
1923                                          "action configuration not set");
1924                 }
1925                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1926                     pedit_validated)
1927                         return rte_flow_error_set(error, ENOTSUP,
1928                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1929                                                   actions,
1930                                                   "set actions should be "
1931                                                   "listed successively");
1932                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1933                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1934                         pedit_validated = 1;
1935                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1936                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1937                         return rte_flow_error_set(error, EINVAL,
1938                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1939                                                   actions,
1940                                                   "can't have multiple fate"
1941                                                   " actions");
1942                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1943                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1944                         return rte_flow_error_set(error, EINVAL,
1945                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1946                                                   actions,
1947                                                   "can't have multiple vxlan"
1948                                                   " actions");
1949                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1950                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1951                         return rte_flow_error_set(error, ENOTSUP,
1952                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1953                                                   actions,
1954                                                   "can't have vxlan and vlan"
1955                                                   " actions in the same rule");
1956                 action_flags |= current_action_flag;
1957         }
1958         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1959                 unsigned int i;
1960
1961                 switch (items->type) {
1962                 case RTE_FLOW_ITEM_TYPE_VOID:
1963                         break;
1964                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1965                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1966                                 return rte_flow_error_set
1967                                         (error, ENOTSUP,
1968                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1969                                          "inner tunnel port id"
1970                                          " item is not supported");
1971                         mask.port_id = flow_tcf_item_mask
1972                                 (items, &rte_flow_item_port_id_mask,
1973                                  &flow_tcf_mask_supported.port_id,
1974                                  &flow_tcf_mask_empty.port_id,
1975                                  sizeof(flow_tcf_mask_supported.port_id),
1976                                  error);
1977                         if (!mask.port_id)
1978                                 return -rte_errno;
1979                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1980                                 in_port_id_set = 1;
1981                                 break;
1982                         }
1983                         spec.port_id = items->spec;
1984                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1985                                 return rte_flow_error_set
1986                                         (error, ENOTSUP,
1987                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1988                                          mask.port_id,
1989                                          "no support for partial mask on"
1990                                          " \"id\" field");
1991                         if (!mask.port_id->id)
1992                                 i = 0;
1993                         else
1994                                 for (i = 0; ptoi[i].ifindex; ++i)
1995                                         if (ptoi[i].port_id == spec.port_id->id)
1996                                                 break;
1997                         if (!ptoi[i].ifindex)
1998                                 return rte_flow_error_set
1999                                         (error, ENODEV,
2000                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2001                                          spec.port_id,
2002                                          "missing data to convert port ID to"
2003                                          " ifindex");
2004                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2005                                 return rte_flow_error_set
2006                                         (error, ENOTSUP,
2007                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2008                                          spec.port_id,
2009                                          "cannot match traffic for"
2010                                          " several port IDs through"
2011                                          " a single flow rule");
2012                         tcm_ifindex = ptoi[i].ifindex;
2013                         in_port_id_set = 1;
2014                         break;
2015                 case RTE_FLOW_ITEM_TYPE_ETH:
2016                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2017                                                           error);
2018                         if (ret < 0)
2019                                 return ret;
2020                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2021                                       MLX5_FLOW_LAYER_INNER_L2 :
2022                                       MLX5_FLOW_LAYER_OUTER_L2;
2023                         /* TODO:
2024                          * Redundant check due to different supported mask.
2025                          * Same for the rest of items.
2026                          */
2027                         mask.eth = flow_tcf_item_mask
2028                                 (items, &rte_flow_item_eth_mask,
2029                                  &flow_tcf_mask_supported.eth,
2030                                  &flow_tcf_mask_empty.eth,
2031                                  sizeof(flow_tcf_mask_supported.eth),
2032                                  error);
2033                         if (!mask.eth)
2034                                 return -rte_errno;
2035                         if (mask.eth->type && mask.eth->type !=
2036                             RTE_BE16(0xffff))
2037                                 return rte_flow_error_set
2038                                         (error, ENOTSUP,
2039                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2040                                          mask.eth,
2041                                          "no support for partial mask on"
2042                                          " \"type\" field");
2043                         assert(items->spec);
2044                         spec.eth = items->spec;
2045                         if (mask.eth->type &&
2046                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2047                             inner_etype != RTE_BE16(ETH_P_ALL) &&
2048                             inner_etype != spec.eth->type)
2049                                 return rte_flow_error_set
2050                                         (error, EINVAL,
2051                                          RTE_FLOW_ERROR_TYPE_ITEM,
2052                                          items,
2053                                          "inner eth_type conflict");
2054                         if (mask.eth->type &&
2055                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2056                             outer_etype != RTE_BE16(ETH_P_ALL) &&
2057                             outer_etype != spec.eth->type)
2058                                 return rte_flow_error_set
2059                                         (error, EINVAL,
2060                                          RTE_FLOW_ERROR_TYPE_ITEM,
2061                                          items,
2062                                          "outer eth_type conflict");
2063                         if (mask.eth->type) {
2064                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2065                                         inner_etype = spec.eth->type;
2066                                 else
2067                                         outer_etype = spec.eth->type;
2068                         }
2069                         break;
2070                 case RTE_FLOW_ITEM_TYPE_VLAN:
2071                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2072                                 return rte_flow_error_set
2073                                         (error, ENOTSUP,
2074                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2075                                          "inner tunnel VLAN"
2076                                          " is not supported");
2077                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2078                                                            error);
2079                         if (ret < 0)
2080                                 return ret;
2081                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2082                         mask.vlan = flow_tcf_item_mask
2083                                 (items, &rte_flow_item_vlan_mask,
2084                                  &flow_tcf_mask_supported.vlan,
2085                                  &flow_tcf_mask_empty.vlan,
2086                                  sizeof(flow_tcf_mask_supported.vlan),
2087                                  error);
2088                         if (!mask.vlan)
2089                                 return -rte_errno;
2090                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2091                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2092                               RTE_BE16(0xe000)) ||
2093                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2094                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2095                               RTE_BE16(0x0fff)) ||
2096                             (mask.vlan->inner_type &&
2097                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2098                                 return rte_flow_error_set
2099                                         (error, ENOTSUP,
2100                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2101                                          mask.vlan,
2102                                          "no support for partial masks on"
2103                                          " \"tci\" (PCP and VID parts) and"
2104                                          " \"inner_type\" fields");
2105                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2106                             outer_etype != RTE_BE16(ETH_P_8021Q))
2107                                 return rte_flow_error_set
2108                                         (error, EINVAL,
2109                                          RTE_FLOW_ERROR_TYPE_ITEM,
2110                                          items,
2111                                          "outer eth_type conflict,"
2112                                          " must be 802.1Q");
2113                         outer_etype = RTE_BE16(ETH_P_8021Q);
2114                         assert(items->spec);
2115                         spec.vlan = items->spec;
2116                         if (mask.vlan->inner_type &&
2117                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2118                             vlan_etype != spec.vlan->inner_type)
2119                                 return rte_flow_error_set
2120                                         (error, EINVAL,
2121                                          RTE_FLOW_ERROR_TYPE_ITEM,
2122                                          items,
2123                                          "vlan eth_type conflict");
2124                         if (mask.vlan->inner_type)
2125                                 vlan_etype = spec.vlan->inner_type;
2126                         break;
2127                 case RTE_FLOW_ITEM_TYPE_IPV4:
2128                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2129                                                            error);
2130                         if (ret < 0)
2131                                 return ret;
2132                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2133                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2134                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2135                         mask.ipv4 = flow_tcf_item_mask
2136                                 (items, &rte_flow_item_ipv4_mask,
2137                                  &flow_tcf_mask_supported.ipv4,
2138                                  &flow_tcf_mask_empty.ipv4,
2139                                  sizeof(flow_tcf_mask_supported.ipv4),
2140                                  error);
2141                         if (!mask.ipv4)
2142                                 return -rte_errno;
2143                         if (mask.ipv4->hdr.next_proto_id &&
2144                             mask.ipv4->hdr.next_proto_id != 0xff)
2145                                 return rte_flow_error_set
2146                                         (error, ENOTSUP,
2147                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2148                                          mask.ipv4,
2149                                          "no support for partial mask on"
2150                                          " \"hdr.next_proto_id\" field");
2151                         else if (mask.ipv4->hdr.next_proto_id)
2152                                 next_protocol =
2153                                         ((const struct rte_flow_item_ipv4 *)
2154                                          (items->spec))->hdr.next_proto_id;
2155                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2156                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2157                                     inner_etype != RTE_BE16(ETH_P_IP))
2158                                         return rte_flow_error_set
2159                                                 (error, EINVAL,
2160                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2161                                                  items,
2162                                                  "inner eth_type conflict,"
2163                                                  " IPv4 is required");
2164                                 inner_etype = RTE_BE16(ETH_P_IP);
2165                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2166                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2167                                     vlan_etype != RTE_BE16(ETH_P_IP))
2168                                         return rte_flow_error_set
2169                                                 (error, EINVAL,
2170                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2171                                                  items,
2172                                                  "vlan eth_type conflict,"
2173                                                  " IPv4 is required");
2174                                 vlan_etype = RTE_BE16(ETH_P_IP);
2175                         } else {
2176                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2177                                     outer_etype != RTE_BE16(ETH_P_IP))
2178                                         return rte_flow_error_set
2179                                                 (error, EINVAL,
2180                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2181                                                  items,
2182                                                  "eth_type conflict,"
2183                                                  " IPv4 is required");
2184                                 outer_etype = RTE_BE16(ETH_P_IP);
2185                         }
2186                         break;
2187                 case RTE_FLOW_ITEM_TYPE_IPV6:
2188                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2189                                                            error);
2190                         if (ret < 0)
2191                                 return ret;
2192                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2193                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2194                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2195                         mask.ipv6 = flow_tcf_item_mask
2196                                 (items, &rte_flow_item_ipv6_mask,
2197                                  &flow_tcf_mask_supported.ipv6,
2198                                  &flow_tcf_mask_empty.ipv6,
2199                                  sizeof(flow_tcf_mask_supported.ipv6),
2200                                  error);
2201                         if (!mask.ipv6)
2202                                 return -rte_errno;
2203                         if (mask.ipv6->hdr.proto &&
2204                             mask.ipv6->hdr.proto != 0xff)
2205                                 return rte_flow_error_set
2206                                         (error, ENOTSUP,
2207                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2208                                          mask.ipv6,
2209                                          "no support for partial mask on"
2210                                          " \"hdr.proto\" field");
2211                         else if (mask.ipv6->hdr.proto)
2212                                 next_protocol =
2213                                         ((const struct rte_flow_item_ipv6 *)
2214                                          (items->spec))->hdr.proto;
2215                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2216                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2217                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2218                                         return rte_flow_error_set
2219                                                 (error, EINVAL,
2220                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2221                                                  items,
2222                                                  "inner eth_type conflict,"
2223                                                  " IPv6 is required");
2224                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2225                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2226                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2227                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2228                                         return rte_flow_error_set
2229                                                 (error, EINVAL,
2230                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2231                                                  items,
2232                                                  "vlan eth_type conflict,"
2233                                                  " IPv6 is required");
2234                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2235                         } else {
2236                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2237                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2238                                         return rte_flow_error_set
2239                                                 (error, EINVAL,
2240                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2241                                                  items,
2242                                                  "eth_type conflict,"
2243                                                  " IPv6 is required");
2244                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2245                         }
2246                         break;
2247                 case RTE_FLOW_ITEM_TYPE_UDP:
2248                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2249                                                           next_protocol, error);
2250                         if (ret < 0)
2251                                 return ret;
2252                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2253                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2254                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2255                         mask.udp = flow_tcf_item_mask
2256                                 (items, &rte_flow_item_udp_mask,
2257                                  &flow_tcf_mask_supported.udp,
2258                                  &flow_tcf_mask_empty.udp,
2259                                  sizeof(flow_tcf_mask_supported.udp),
2260                                  error);
2261                         if (!mask.udp)
2262                                 return -rte_errno;
2263                         /*
2264                          * Save the presumed outer UDP item for extra check
2265                          * if the tunnel item will be found later in the list.
2266                          */
2267                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2268                                 outer_udp = items;
2269                         break;
2270                 case RTE_FLOW_ITEM_TYPE_TCP:
2271                         ret = mlx5_flow_validate_item_tcp
2272                                              (items, item_flags,
2273                                               next_protocol,
2274                                               &flow_tcf_mask_supported.tcp,
2275                                               error);
2276                         if (ret < 0)
2277                                 return ret;
2278                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2279                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2280                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2281                         mask.tcp = flow_tcf_item_mask
2282                                 (items, &rte_flow_item_tcp_mask,
2283                                  &flow_tcf_mask_supported.tcp,
2284                                  &flow_tcf_mask_empty.tcp,
2285                                  sizeof(flow_tcf_mask_supported.tcp),
2286                                  error);
2287                         if (!mask.tcp)
2288                                 return -rte_errno;
2289                         break;
2290                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2291                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2292                                 return rte_flow_error_set
2293                                         (error, ENOTSUP,
2294                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2295                                          "vxlan tunnel over vlan"
2296                                          " is not supported");
2297                         ret = mlx5_flow_validate_item_vxlan(items,
2298                                                             item_flags, error);
2299                         if (ret < 0)
2300                                 return ret;
2301                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2302                         mask.vxlan = flow_tcf_item_mask
2303                                 (items, &rte_flow_item_vxlan_mask,
2304                                  &flow_tcf_mask_supported.vxlan,
2305                                  &flow_tcf_mask_empty.vxlan,
2306                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2307                         if (!mask.vxlan)
2308                                 return -rte_errno;
2309                         if (mask.vxlan->vni[0] != 0xff ||
2310                             mask.vxlan->vni[1] != 0xff ||
2311                             mask.vxlan->vni[2] != 0xff)
2312                                 return rte_flow_error_set
2313                                         (error, ENOTSUP,
2314                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2315                                          mask.vxlan,
2316                                          "no support for partial or "
2317                                          "empty mask on \"vxlan.vni\" field");
2318                         /*
2319                          * The VNI item assumes the VXLAN tunnel, it requires
2320                          * at least the outer destination UDP port must be
2321                          * specified without wildcards to allow kernel select
2322                          * the virtual VXLAN device by port. Also outer IPv4
2323                          * or IPv6 item must be specified (wilcards or even
2324                          * zero mask are allowed) to let driver know the tunnel
2325                          * IP version and process UDP traffic correctly.
2326                          */
2327                         if (!(item_flags &
2328                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2329                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2330                                 return rte_flow_error_set
2331                                                  (error, EINVAL,
2332                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2333                                                   NULL,
2334                                                   "no outer IP pattern found"
2335                                                   " for vxlan tunnel");
2336                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2337                                 return rte_flow_error_set
2338                                                  (error, EINVAL,
2339                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2340                                                   NULL,
2341                                                   "no outer UDP pattern found"
2342                                                   " for vxlan tunnel");
2343                         /*
2344                          * All items preceding the tunnel item become outer
2345                          * ones and we should do extra validation for them
2346                          * due to tc limitations for tunnel outer parameters.
2347                          * Currently only outer UDP item requres extra check,
2348                          * use the saved pointer instead of item list rescan.
2349                          */
2350                         assert(outer_udp);
2351                         ret = flow_tcf_validate_vxlan_decap_udp
2352                                                 (outer_udp, error);
2353                         if (ret < 0)
2354                                 return ret;
2355                         /* Reset L4 protocol for inner parameters. */
2356                         next_protocol = 0xff;
2357                         break;
2358                 default:
2359                         return rte_flow_error_set(error, ENOTSUP,
2360                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2361                                                   items, "item not supported");
2362                 }
2363         }
2364         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2365             (action_flags & MLX5_FLOW_ACTION_DROP))
2366                 return rte_flow_error_set(error, ENOTSUP,
2367                                           RTE_FLOW_ERROR_TYPE_ACTION,
2368                                           actions,
2369                                           "set action is not compatible with "
2370                                           "drop action");
2371         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2372             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2373                 return rte_flow_error_set(error, ENOTSUP,
2374                                           RTE_FLOW_ERROR_TYPE_ACTION,
2375                                           actions,
2376                                           "set action must be followed by "
2377                                           "port_id action");
2378         if (action_flags &
2379            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2380                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2381                         return rte_flow_error_set(error, EINVAL,
2382                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2383                                                   actions,
2384                                                   "no ipv4 item found in"
2385                                                   " pattern");
2386         }
2387         if (action_flags &
2388            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2389                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2390                         return rte_flow_error_set(error, EINVAL,
2391                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2392                                                   actions,
2393                                                   "no ipv6 item found in"
2394                                                   " pattern");
2395         }
2396         if (action_flags &
2397            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2398                 if (!(item_flags &
2399                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2400                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2401                         return rte_flow_error_set(error, EINVAL,
2402                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2403                                                   actions,
2404                                                   "no TCP/UDP item found in"
2405                                                   " pattern");
2406         }
2407         /*
2408          * FW syndrome (0xA9C090):
2409          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2410          *     forward to the uplink.
2411          */
2412         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2413             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2414             ((struct priv *)port_id_dev->data->dev_private)->representor)
2415                 return rte_flow_error_set(error, ENOTSUP,
2416                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2417                                           "vlan push can only be applied"
2418                                           " when forwarding to uplink port");
2419         /*
2420          * FW syndrome (0x294609):
2421          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2422          *     are supported only while forwarding to vport.
2423          */
2424         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2425             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2426                 return rte_flow_error_set(error, ENOTSUP,
2427                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2428                                           "vlan actions are supported"
2429                                           " only with port_id action");
2430         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2431             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2432                 return rte_flow_error_set(error, ENOTSUP,
2433                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2434                                           "vxlan actions are supported"
2435                                           " only with port_id action");
2436         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2437                 return rte_flow_error_set(error, EINVAL,
2438                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2439                                           "no fate action is found");
2440         if (action_flags &
2441            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2442                 if (!(item_flags &
2443                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2444                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2445                         return rte_flow_error_set(error, EINVAL,
2446                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2447                                                   actions,
2448                                                   "no IP found in pattern");
2449         }
2450         if (action_flags &
2451             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2452                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2453                         return rte_flow_error_set(error, ENOTSUP,
2454                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2455                                                   actions,
2456                                                   "no ethernet found in"
2457                                                   " pattern");
2458         }
2459         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2460             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2461                 return rte_flow_error_set(error, EINVAL,
2462                                           RTE_FLOW_ERROR_TYPE_ACTION,
2463                                           NULL,
2464                                           "no VNI pattern found"
2465                                           " for vxlan decap action");
2466         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2467             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2468                 return rte_flow_error_set(error, EINVAL,
2469                                           RTE_FLOW_ERROR_TYPE_ACTION,
2470                                           NULL,
2471                                           "vxlan encap not supported"
2472                                           " for tunneled traffic");
2473         return 0;
2474 }
2475
2476 /**
2477  * Calculate maximum size of memory for flow items of Linux TC flower.
2478  *
2479  * @param[in] attr
2480  *   Pointer to the flow attributes.
2481  * @param[in] items
2482  *   Pointer to the list of items.
2483  * @param[out] action_flags
2484  *   Pointer to the detected actions.
2485  *
2486  * @return
2487  *   Maximum size of memory for items.
2488  */
2489 static int
2490 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2491                         const struct rte_flow_item items[],
2492                         uint64_t *action_flags)
2493 {
2494         int size = 0;
2495
2496         size += SZ_NLATTR_STRZ_OF("flower") +
2497                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2498                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2499                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2500         if (attr->group > 0)
2501                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2502         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2503                 switch (items->type) {
2504                 case RTE_FLOW_ITEM_TYPE_VOID:
2505                         break;
2506                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2507                         break;
2508                 case RTE_FLOW_ITEM_TYPE_ETH:
2509                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2510                                 /* dst/src MAC addr and mask. */
2511                         break;
2512                 case RTE_FLOW_ITEM_TYPE_VLAN:
2513                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2514                                 /* VLAN Ether type. */
2515                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2516                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2517                         break;
2518                 case RTE_FLOW_ITEM_TYPE_IPV4: {
2519                         const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2520
2521                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2522                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2523                                 /* dst/src IP addr and mask. */
2524                         if (ipv4 && ipv4->hdr.time_to_live)
2525                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2526                         if (ipv4 && ipv4->hdr.type_of_service)
2527                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2528                         break;
2529                 }
2530                 case RTE_FLOW_ITEM_TYPE_IPV6: {
2531                         const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2532
2533                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2534                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2535                                 /* dst/src IP addr and mask. */
2536                         if (ipv6 && ipv6->hdr.hop_limits)
2537                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2538                         if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2539                                      (0xfful << IPV6_HDR_TC_SHIFT)))
2540                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2541                         break;
2542                 }
2543                 case RTE_FLOW_ITEM_TYPE_UDP:
2544                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2545                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2546                                 /* dst/src port and mask. */
2547                         break;
2548                 case RTE_FLOW_ITEM_TYPE_TCP:
2549                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2550                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2551                                 /* dst/src port and mask. */
2552                         break;
2553                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2554                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2555                         /*
2556                          * There might be no VXLAN decap action in the action
2557                          * list, nonetheless the VXLAN tunnel flow requires
2558                          * the decap structure to be correctly applied to
2559                          * VXLAN device, set the flag to create the structure.
2560                          * Translation routine will not put the decap action
2561                          * in tne Netlink message if there is no actual action
2562                          * in the list.
2563                          */
2564                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2565                         break;
2566                 default:
2567                         DRV_LOG(WARNING,
2568                                 "unsupported item %p type %d,"
2569                                 " items must be validated before flow creation",
2570                                 (const void *)items, items->type);
2571                         break;
2572                 }
2573         }
2574         return size;
2575 }
2576
2577 /**
2578  * Calculate size of memory to store the VXLAN encapsultion
2579  * related items in the Netlink message buffer. Items list
2580  * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2581  * The item list should be validated.
2582  *
2583  * @param[in] action
2584  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2585  *   List of pattern items to scan data from.
2586  *
2587  * @return
2588  *   The size the part of Netlink message buffer to store the
2589  *   VXLAN encapsulation item attributes.
2590  */
2591 static int
2592 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2593 {
2594         const struct rte_flow_item *items;
2595         int size = 0;
2596
2597         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2598         assert(action->conf);
2599
2600         items = ((const struct rte_flow_action_vxlan_encap *)
2601                                         action->conf)->definition;
2602         assert(items);
2603         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2604                 switch (items->type) {
2605                 case RTE_FLOW_ITEM_TYPE_VOID:
2606                         break;
2607                 case RTE_FLOW_ITEM_TYPE_ETH:
2608                         /* This item does not require message buffer. */
2609                         break;
2610                 case RTE_FLOW_ITEM_TYPE_IPV4: {
2611                         const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2612
2613                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2614                         if (ipv4 && ipv4->hdr.time_to_live)
2615                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2616                         if (ipv4 && ipv4->hdr.type_of_service)
2617                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2618                         break;
2619                 }
2620                 case RTE_FLOW_ITEM_TYPE_IPV6: {
2621                         const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2622
2623                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2624                         if (ipv6 && ipv6->hdr.hop_limits)
2625                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2626                         if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2627                                      (0xfful << IPV6_HDR_TC_SHIFT)))
2628                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2629                         break;
2630                 }
2631                 case RTE_FLOW_ITEM_TYPE_UDP: {
2632                         const struct rte_flow_item_udp *udp = items->mask;
2633
2634                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2635                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2636                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2637                         break;
2638                 }
2639                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2640                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2641                         break;
2642                 default:
2643                         assert(false);
2644                         DRV_LOG(WARNING,
2645                                 "unsupported item %p type %d,"
2646                                 " items must be validated"
2647                                 " before flow creation",
2648                                 (const void *)items, items->type);
2649                         return 0;
2650                 }
2651         }
2652         return size;
2653 }
2654
2655 /**
2656  * Calculate maximum size of memory for flow actions of Linux TC flower and
2657  * extract specified actions.
2658  *
2659  * @param[in] actions
2660  *   Pointer to the list of actions.
2661  * @param[out] action_flags
2662  *   Pointer to the detected actions.
2663  *
2664  * @return
2665  *   Maximum size of memory for actions.
2666  */
2667 static int
2668 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2669                               uint64_t *action_flags)
2670 {
2671         int size = 0;
2672         uint64_t flags = 0;
2673
2674         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2675         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2676                 switch (actions->type) {
2677                 case RTE_FLOW_ACTION_TYPE_VOID:
2678                         break;
2679                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2680                         size += SZ_NLATTR_NEST + /* na_act_index. */
2681                                 SZ_NLATTR_STRZ_OF("mirred") +
2682                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2683                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2684                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2685                         break;
2686                 case RTE_FLOW_ACTION_TYPE_JUMP:
2687                         size += SZ_NLATTR_NEST + /* na_act_index. */
2688                                 SZ_NLATTR_STRZ_OF("gact") +
2689                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2690                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2691                         flags |= MLX5_FLOW_ACTION_JUMP;
2692                         break;
2693                 case RTE_FLOW_ACTION_TYPE_DROP:
2694                         size += SZ_NLATTR_NEST + /* na_act_index. */
2695                                 SZ_NLATTR_STRZ_OF("gact") +
2696                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2697                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2698                         flags |= MLX5_FLOW_ACTION_DROP;
2699                         break;
2700                 case RTE_FLOW_ACTION_TYPE_COUNT:
2701                         break;
2702                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2703                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2704                         goto action_of_vlan;
2705                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2706                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2707                         goto action_of_vlan;
2708                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2709                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2710                         goto action_of_vlan;
2711                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2712                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2713                         goto action_of_vlan;
2714 action_of_vlan:
2715                         size += SZ_NLATTR_NEST + /* na_act_index. */
2716                                 SZ_NLATTR_STRZ_OF("vlan") +
2717                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2718                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2719                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2720                                 /* VLAN protocol. */
2721                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2722                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2723                         break;
2724                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2725                         size += SZ_NLATTR_NEST + /* na_act_index. */
2726                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2727                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2728                                 SZ_NLATTR_TYPE_OF(uint8_t);
2729                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2730                         size += flow_tcf_vxlan_encap_size(actions) +
2731                                 RTE_ALIGN_CEIL /* preceding encap params. */
2732                                 (sizeof(struct flow_tcf_vxlan_encap),
2733                                 MNL_ALIGNTO);
2734                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2735                         break;
2736                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2737                         size += SZ_NLATTR_NEST + /* na_act_index. */
2738                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2739                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2740                                 SZ_NLATTR_TYPE_OF(uint8_t);
2741                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2742                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2743                                 (sizeof(struct flow_tcf_vxlan_decap),
2744                                 MNL_ALIGNTO);
2745                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2746                         break;
2747                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2748                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2749                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2750                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2751                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2752                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2753                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2754                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2755                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2756                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2757                         size += flow_tcf_get_pedit_actions_size(&actions,
2758                                                                 &flags);
2759                         break;
2760                 default:
2761                         DRV_LOG(WARNING,
2762                                 "unsupported action %p type %d,"
2763                                 " items must be validated before flow creation",
2764                                 (const void *)actions, actions->type);
2765                         break;
2766                 }
2767         }
2768         *action_flags = flags;
2769         return size;
2770 }
2771
2772 /**
2773  * Brand rtnetlink buffer with unique handle.
2774  *
2775  * This handle should be unique for a given network interface to avoid
2776  * collisions.
2777  *
2778  * @param nlh
2779  *   Pointer to Netlink message.
2780  * @param handle
2781  *   Unique 32-bit handle to use.
2782  */
2783 static void
2784 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2785 {
2786         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2787
2788         tcm->tcm_handle = handle;
2789         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2790                 (void *)nlh, handle);
2791 }
2792
2793 /**
2794  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2795  * memory required, allocates the memory, initializes Netlink message headers
2796  * and set unique TC message handle.
2797  *
2798  * @param[in] attr
2799  *   Pointer to the flow attributes.
2800  * @param[in] items
2801  *   Pointer to the list of items.
2802  * @param[in] actions
2803  *   Pointer to the list of actions.
2804  * @param[out] error
2805  *   Pointer to the error structure.
2806  *
2807  * @return
2808  *   Pointer to mlx5_flow object on success,
2809  *   otherwise NULL and rte_errno is set.
2810  */
2811 static struct mlx5_flow *
2812 flow_tcf_prepare(const struct rte_flow_attr *attr,
2813                  const struct rte_flow_item items[],
2814                  const struct rte_flow_action actions[],
2815                  struct rte_flow_error *error)
2816 {
2817         size_t size = RTE_ALIGN_CEIL
2818                         (sizeof(struct mlx5_flow),
2819                          alignof(struct flow_tcf_tunnel_hdr)) +
2820                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2821                       MNL_ALIGN(sizeof(struct tcmsg));
2822         struct mlx5_flow *dev_flow;
2823         uint64_t action_flags = 0;
2824         struct nlmsghdr *nlh;
2825         struct tcmsg *tcm;
2826         uint8_t *sp, *tun = NULL;
2827
2828         size += flow_tcf_get_items_size(attr, items, &action_flags);
2829         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2830         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2831         if (!dev_flow) {
2832                 rte_flow_error_set(error, ENOMEM,
2833                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2834                                    "not enough memory to create E-Switch flow");
2835                 return NULL;
2836         }
2837         sp = (uint8_t *)(dev_flow + 1);
2838         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2839                 sp = RTE_PTR_ALIGN
2840                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2841                 tun = sp;
2842                 sp += RTE_ALIGN_CEIL
2843                         (sizeof(struct flow_tcf_vxlan_encap),
2844                         MNL_ALIGNTO);
2845 #ifndef NDEBUG
2846                 size -= RTE_ALIGN_CEIL
2847                         (sizeof(struct flow_tcf_vxlan_encap),
2848                         MNL_ALIGNTO);
2849 #endif
2850         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2851                 sp = RTE_PTR_ALIGN
2852                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2853                 tun = sp;
2854                 sp += RTE_ALIGN_CEIL
2855                         (sizeof(struct flow_tcf_vxlan_decap),
2856                         MNL_ALIGNTO);
2857 #ifndef NDEBUG
2858                 size -= RTE_ALIGN_CEIL
2859                         (sizeof(struct flow_tcf_vxlan_decap),
2860                         MNL_ALIGNTO);
2861 #endif
2862         } else {
2863                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2864         }
2865         nlh = mnl_nlmsg_put_header(sp);
2866         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2867         *dev_flow = (struct mlx5_flow){
2868                 .tcf = (struct mlx5_flow_tcf){
2869 #ifndef NDEBUG
2870                         .nlsize = size - RTE_ALIGN_CEIL
2871                                 (sizeof(struct mlx5_flow),
2872                                  alignof(struct flow_tcf_tunnel_hdr)),
2873 #endif
2874                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2875                         .nlh = nlh,
2876                         .tcm = tcm,
2877                 },
2878         };
2879         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2880                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2881         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2882                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2883         /*
2884          * Generate a reasonably unique handle based on the address of the
2885          * target buffer.
2886          *
2887          * This is straightforward on 32-bit systems where the flow pointer can
2888          * be used directly. Otherwise, its least significant part is taken
2889          * after shifting it by the previous power of two of the pointed buffer
2890          * size.
2891          */
2892         if (sizeof(dev_flow) <= 4)
2893                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2894         else
2895                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2896                                        rte_log2_u32(rte_align32prevpow2(size)));
2897         return dev_flow;
2898 }
2899
2900 /**
2901  * Make adjustments for supporting count actions.
2902  *
2903  * @param[in] dev
2904  *   Pointer to the Ethernet device structure.
2905  * @param[in] dev_flow
2906  *   Pointer to mlx5_flow.
2907  * @param[out] error
2908  *   Pointer to error structure.
2909  *
2910  * @return
2911  *   0 On success else a negative errno value is returned and rte_errno is set.
2912  */
2913 static int
2914 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2915                                   struct mlx5_flow *dev_flow,
2916                                   struct rte_flow_error *error)
2917 {
2918         struct rte_flow *flow = dev_flow->flow;
2919
2920         if (!flow->counter) {
2921                 flow->counter = flow_tcf_counter_new();
2922                 if (!flow->counter)
2923                         return rte_flow_error_set(error, rte_errno,
2924                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2925                                                   NULL,
2926                                                   "cannot get counter"
2927                                                   " context.");
2928         }
2929         return 0;
2930 }
2931
2932 /**
2933  * Convert VXLAN VNI to 32-bit integer.
2934  *
2935  * @param[in] vni
2936  *   VXLAN VNI in 24-bit wire format.
2937  *
2938  * @return
2939  *   VXLAN VNI as a 32-bit integer value in network endian.
2940  */
2941 static inline rte_be32_t
2942 vxlan_vni_as_be32(const uint8_t vni[3])
2943 {
2944         union {
2945                 uint8_t vni[4];
2946                 rte_be32_t dword;
2947         } ret = {
2948                 .vni = { 0, vni[0], vni[1], vni[2] },
2949         };
2950         return ret.dword;
2951 }
2952
2953 /**
2954  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2955  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2956  * in the encapsulation parameters structure. The item must be prevalidated,
2957  * no any validation checks performed by function.
2958  *
2959  * @param[in] spec
2960  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2961  * @param[in] mask
2962  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2963  * @param[out] encap
2964  *   Structure to fill the gathered MAC address data.
2965  */
2966 static void
2967 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2968                                const struct rte_flow_item_eth *mask,
2969                                struct flow_tcf_vxlan_encap *encap)
2970 {
2971         /* Item must be validated before. No redundant checks. */
2972         assert(spec);
2973         if (!mask || !memcmp(&mask->dst,
2974                              &rte_flow_item_eth_mask.dst,
2975                              sizeof(rte_flow_item_eth_mask.dst))) {
2976                 /*
2977                  * Ethernet addresses are not supported by
2978                  * tc as tunnel_key parameters. Destination
2979                  * address is needed to form encap packet
2980                  * header and retrieved by kernel from
2981                  * implicit sources (ARP table, etc),
2982                  * address masks are not supported at all.
2983                  */
2984                 encap->eth.dst = spec->dst;
2985                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2986         }
2987         if (!mask || !memcmp(&mask->src,
2988                              &rte_flow_item_eth_mask.src,
2989                              sizeof(rte_flow_item_eth_mask.src))) {
2990                 /*
2991                  * Ethernet addresses are not supported by
2992                  * tc as tunnel_key parameters. Source ethernet
2993                  * address is ignored anyway.
2994                  */
2995                 encap->eth.src = spec->src;
2996                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2997         }
2998 }
2999
3000 /**
3001  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
3002  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
3003  * in the encapsulation parameters structure. The item must be prevalidated,
3004  * no any validation checks performed by function.
3005  *
3006  * @param[in] spec
3007  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
3008  * @param[in] mask
3009  *  RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
3010  * @param[out] encap
3011  *   Structure to fill the gathered IPV4 address data.
3012  */
3013 static void
3014 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
3015                                 const struct rte_flow_item_ipv4 *mask,
3016                                 struct flow_tcf_vxlan_encap *encap)
3017 {
3018         /* Item must be validated before. No redundant checks. */
3019         assert(spec);
3020         encap->ipv4.dst = spec->hdr.dst_addr;
3021         encap->ipv4.src = spec->hdr.src_addr;
3022         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
3023                        FLOW_TCF_ENCAP_IPV4_DST;
3024         if (mask && mask->hdr.type_of_service) {
3025                 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3026                 encap->ip_tos = spec->hdr.type_of_service;
3027         }
3028         if (mask && mask->hdr.time_to_live) {
3029                 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3030                 encap->ip_ttl_hop = spec->hdr.time_to_live;
3031         }
3032 }
3033
3034 /**
3035  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3036  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3037  * in the encapsulation parameters structure. The item must be prevalidated,
3038  * no any validation checks performed by function.
3039  *
3040  * @param[in] spec
3041  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3042  * @param[in] mask
3043  *  RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3044  * @param[out] encap
3045  *   Structure to fill the gathered IPV6 address data.
3046  */
3047 static void
3048 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3049                                 const struct rte_flow_item_ipv6 *mask,
3050                                 struct flow_tcf_vxlan_encap *encap)
3051 {
3052         /* Item must be validated before. No redundant checks. */
3053         assert(spec);
3054         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3055         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3056         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3057                        FLOW_TCF_ENCAP_IPV6_DST;
3058         if (mask) {
3059                 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3060                     IPV6_HDR_TC_SHIFT) & 0xff) {
3061                         encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3062                         encap->ip_tos = (rte_be_to_cpu_32
3063                                                 (spec->hdr.vtc_flow) >>
3064                                                  IPV6_HDR_TC_SHIFT) & 0xff;
3065                 }
3066                 if (mask->hdr.hop_limits) {
3067                         encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3068                         encap->ip_ttl_hop = spec->hdr.hop_limits;
3069                 }
3070         }
3071 }
3072
3073 /**
3074  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3075  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3076  * in the encapsulation parameters structure. The item must be prevalidated,
3077  * no any validation checks performed by function.
3078  *
3079  * @param[in] spec
3080  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
3081  * @param[in] mask
3082  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
3083  * @param[out] encap
3084  *   Structure to fill the gathered UDP port data.
3085  */
3086 static void
3087 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3088                                const struct rte_flow_item_udp *mask,
3089                                struct flow_tcf_vxlan_encap *encap)
3090 {
3091         assert(spec);
3092         encap->udp.dst = spec->hdr.dst_port;
3093         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3094         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3095                 encap->udp.src = spec->hdr.src_port;
3096                 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC;
3097         }
3098 }
3099
3100 /**
3101  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3102  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3103  * in the encapsulation parameters structure. The item must be prevalidated,
3104  * no any validation checks performed by function.
3105  *
3106  * @param[in] spec
3107  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3108  * @param[out] encap
3109  *   Structure to fill the gathered VNI address data.
3110  */
3111 static void
3112 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3113                                struct flow_tcf_vxlan_encap *encap)
3114 {
3115         /* Item must be validated before. Do not redundant checks. */
3116         assert(spec);
3117         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3118         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3119 }
3120
3121 /**
3122  * Populate consolidated encapsulation object from list of pattern items.
3123  *
3124  * Helper function to process configuration of action such as
3125  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3126  * validated, there is no way to return an meaningful error.
3127  *
3128  * @param[in] action
3129  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3130  *   List of pattern items to gather data from.
3131  * @param[out] src
3132  *   Structure to fill gathered data.
3133  */
3134 static void
3135 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3136                            struct flow_tcf_vxlan_encap *encap)
3137 {
3138         union {
3139                 const struct rte_flow_item_eth *eth;
3140                 const struct rte_flow_item_ipv4 *ipv4;
3141                 const struct rte_flow_item_ipv6 *ipv6;
3142                 const struct rte_flow_item_udp *udp;
3143                 const struct rte_flow_item_vxlan *vxlan;
3144         } spec, mask;
3145         const struct rte_flow_item *items;
3146
3147         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3148         assert(action->conf);
3149
3150         items = ((const struct rte_flow_action_vxlan_encap *)
3151                                         action->conf)->definition;
3152         assert(items);
3153         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3154                 switch (items->type) {
3155                 case RTE_FLOW_ITEM_TYPE_VOID:
3156                         break;
3157                 case RTE_FLOW_ITEM_TYPE_ETH:
3158                         mask.eth = items->mask;
3159                         spec.eth = items->spec;
3160                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3161                                                        encap);
3162                         break;
3163                 case RTE_FLOW_ITEM_TYPE_IPV4:
3164                         spec.ipv4 = items->spec;
3165                         mask.ipv4 = items->mask;
3166                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3167                                                         encap);
3168                         break;
3169                 case RTE_FLOW_ITEM_TYPE_IPV6:
3170                         spec.ipv6 = items->spec;
3171                         mask.ipv6 = items->mask;
3172                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3173                                                         encap);
3174                         break;
3175                 case RTE_FLOW_ITEM_TYPE_UDP:
3176                         mask.udp = items->mask;
3177                         spec.udp = items->spec;
3178                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3179                                                        encap);
3180                         break;
3181                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3182                         spec.vxlan = items->spec;
3183                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3184                         break;
3185                 default:
3186                         assert(false);
3187                         DRV_LOG(WARNING,
3188                                 "unsupported item %p type %d,"
3189                                 " items must be validated"
3190                                 " before flow creation",
3191                                 (const void *)items, items->type);
3192                         encap->mask = 0;
3193                         return;
3194                 }
3195         }
3196 }
3197
3198 /**
3199  * Translate flow for Linux TC flower and construct Netlink message.
3200  *
3201  * @param[in] priv
3202  *   Pointer to the priv structure.
3203  * @param[in, out] flow
3204  *   Pointer to the sub flow.
3205  * @param[in] attr
3206  *   Pointer to the flow attributes.
3207  * @param[in] items
3208  *   Pointer to the list of items.
3209  * @param[in] actions
3210  *   Pointer to the list of actions.
3211  * @param[out] error
3212  *   Pointer to the error structure.
3213  *
3214  * @return
3215  *   0 on success, a negative errno value otherwise and rte_errno is set.
3216  */
3217 static int
3218 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3219                    const struct rte_flow_attr *attr,
3220                    const struct rte_flow_item items[],
3221                    const struct rte_flow_action actions[],
3222                    struct rte_flow_error *error)
3223 {
3224         union {
3225                 const struct rte_flow_item_port_id *port_id;
3226                 const struct rte_flow_item_eth *eth;
3227                 const struct rte_flow_item_vlan *vlan;
3228                 const struct rte_flow_item_ipv4 *ipv4;
3229                 const struct rte_flow_item_ipv6 *ipv6;
3230                 const struct rte_flow_item_tcp *tcp;
3231                 const struct rte_flow_item_udp *udp;
3232                 const struct rte_flow_item_vxlan *vxlan;
3233         } spec, mask;
3234         union {
3235                 const struct rte_flow_action_port_id *port_id;
3236                 const struct rte_flow_action_jump *jump;
3237                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3238                 const struct rte_flow_action_of_set_vlan_vid *
3239                         of_set_vlan_vid;
3240                 const struct rte_flow_action_of_set_vlan_pcp *
3241                         of_set_vlan_pcp;
3242         } conf;
3243         union {
3244                 struct flow_tcf_tunnel_hdr *hdr;
3245                 struct flow_tcf_vxlan_decap *vxlan;
3246         } decap = {
3247                 .hdr = NULL,
3248         };
3249         union {
3250                 struct flow_tcf_tunnel_hdr *hdr;
3251                 struct flow_tcf_vxlan_encap *vxlan;
3252         } encap = {
3253                 .hdr = NULL,
3254         };
3255         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3256         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3257         struct tcmsg *tcm = dev_flow->tcf.tcm;
3258         uint32_t na_act_index_cur;
3259         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3260         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3261         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3262         bool ip_proto_set = 0;
3263         bool tunnel_outer = 0;
3264         struct nlattr *na_flower;
3265         struct nlattr *na_flower_act;
3266         struct nlattr *na_vlan_id = NULL;
3267         struct nlattr *na_vlan_priority = NULL;
3268         uint64_t item_flags = 0;
3269         int ret;
3270
3271         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3272                                                 PTOI_TABLE_SZ_MAX(dev)));
3273         if (dev_flow->tcf.tunnel) {
3274                 switch (dev_flow->tcf.tunnel->type) {
3275                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3276                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3277                         tunnel_outer = 1;
3278                         break;
3279                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3280                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3281                         break;
3282                 /* New tunnel actions can be added here. */
3283                 default:
3284                         assert(false);
3285                         break;
3286                 }
3287         }
3288         nlh = dev_flow->tcf.nlh;
3289         tcm = dev_flow->tcf.tcm;
3290         /* Prepare API must have been called beforehand. */
3291         assert(nlh != NULL && tcm != NULL);
3292         tcm->tcm_family = AF_UNSPEC;
3293         tcm->tcm_ifindex = ptoi[0].ifindex;
3294         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3295         /*
3296          * Priority cannot be zero to prevent the kernel from picking one
3297          * automatically.
3298          */
3299         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3300         if (attr->group > 0)
3301                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3302         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3303         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3304         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3305                 unsigned int i;
3306
3307                 switch (items->type) {
3308                 case RTE_FLOW_ITEM_TYPE_VOID:
3309                         break;
3310                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3311                         mask.port_id = flow_tcf_item_mask
3312                                 (items, &rte_flow_item_port_id_mask,
3313                                  &flow_tcf_mask_supported.port_id,
3314                                  &flow_tcf_mask_empty.port_id,
3315                                  sizeof(flow_tcf_mask_supported.port_id),
3316                                  error);
3317                         assert(mask.port_id);
3318                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3319                                 break;
3320                         spec.port_id = items->spec;
3321                         if (!mask.port_id->id)
3322                                 i = 0;
3323                         else
3324                                 for (i = 0; ptoi[i].ifindex; ++i)
3325                                         if (ptoi[i].port_id == spec.port_id->id)
3326                                                 break;
3327                         assert(ptoi[i].ifindex);
3328                         tcm->tcm_ifindex = ptoi[i].ifindex;
3329                         break;
3330                 case RTE_FLOW_ITEM_TYPE_ETH:
3331                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3332                                       MLX5_FLOW_LAYER_INNER_L2 :
3333                                       MLX5_FLOW_LAYER_OUTER_L2;
3334                         mask.eth = flow_tcf_item_mask
3335                                 (items, &rte_flow_item_eth_mask,
3336                                  &flow_tcf_mask_supported.eth,
3337                                  &flow_tcf_mask_empty.eth,
3338                                  sizeof(flow_tcf_mask_supported.eth),
3339                                  error);
3340                         assert(mask.eth);
3341                         if (mask.eth == &flow_tcf_mask_empty.eth)
3342                                 break;
3343                         spec.eth = items->spec;
3344                         if (mask.eth->type) {
3345                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3346                                         inner_etype = spec.eth->type;
3347                                 else
3348                                         outer_etype = spec.eth->type;
3349                         }
3350                         if (tunnel_outer) {
3351                                 DRV_LOG(WARNING,
3352                                         "outer L2 addresses cannot be"
3353                                         " forced is outer ones for tunnel,"
3354                                         " parameter is ignored");
3355                                 break;
3356                         }
3357                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3358                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3359                                              ETHER_ADDR_LEN,
3360                                              spec.eth->dst.addr_bytes);
3361                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3362                                              ETHER_ADDR_LEN,
3363                                              mask.eth->dst.addr_bytes);
3364                         }
3365                         if (!is_zero_ether_addr(&mask.eth->src)) {
3366                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3367                                              ETHER_ADDR_LEN,
3368                                              spec.eth->src.addr_bytes);
3369                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3370                                              ETHER_ADDR_LEN,
3371                                              mask.eth->src.addr_bytes);
3372                         }
3373                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3374                         break;
3375                 case RTE_FLOW_ITEM_TYPE_VLAN:
3376                         assert(!encap.hdr);
3377                         assert(!decap.hdr);
3378                         assert(!tunnel_outer);
3379                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3380                         mask.vlan = flow_tcf_item_mask
3381                                 (items, &rte_flow_item_vlan_mask,
3382                                  &flow_tcf_mask_supported.vlan,
3383                                  &flow_tcf_mask_empty.vlan,
3384                                  sizeof(flow_tcf_mask_supported.vlan),
3385                                  error);
3386                         assert(mask.vlan);
3387                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3388                                 break;
3389                         spec.vlan = items->spec;
3390                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3391                                outer_etype == RTE_BE16(ETH_P_8021Q));
3392                         outer_etype = RTE_BE16(ETH_P_8021Q);
3393                         if (mask.vlan->inner_type)
3394                                 vlan_etype = spec.vlan->inner_type;
3395                         if (mask.vlan->tci & RTE_BE16(0xe000))
3396                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3397                                                 (rte_be_to_cpu_16
3398                                                  (spec.vlan->tci) >> 13) & 0x7);
3399                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3400                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3401                                                  rte_be_to_cpu_16
3402                                                  (spec.vlan->tci &
3403                                                   RTE_BE16(0x0fff)));
3404                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3405                         break;
3406                 case RTE_FLOW_ITEM_TYPE_IPV4:
3407                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3408                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3409                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3410                         mask.ipv4 = flow_tcf_item_mask
3411                                 (items, &rte_flow_item_ipv4_mask,
3412                                  &flow_tcf_mask_supported.ipv4,
3413                                  &flow_tcf_mask_empty.ipv4,
3414                                  sizeof(flow_tcf_mask_supported.ipv4),
3415                                  error);
3416                         assert(mask.ipv4);
3417                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3418                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3419                                        inner_etype == RTE_BE16(ETH_P_IP));
3420                                 inner_etype = RTE_BE16(ETH_P_IP);
3421                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3422                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3423                                        vlan_etype == RTE_BE16(ETH_P_IP));
3424                                 vlan_etype = RTE_BE16(ETH_P_IP);
3425                         } else {
3426                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3427                                        outer_etype == RTE_BE16(ETH_P_IP));
3428                                 outer_etype = RTE_BE16(ETH_P_IP);
3429                         }
3430                         spec.ipv4 = items->spec;
3431                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3432                                 /*
3433                                  * No way to set IP protocol for outer tunnel
3434                                  * layers. Usually it is fixed, for example,
3435                                  * to UDP for VXLAN/GPE.
3436                                  */
3437                                 assert(spec.ipv4); /* Mask is not empty. */
3438                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3439                                                 spec.ipv4->hdr.next_proto_id);
3440                                 ip_proto_set = 1;
3441                         }
3442                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3443                              (!mask.ipv4->hdr.src_addr &&
3444                               !mask.ipv4->hdr.dst_addr)) {
3445                                 if (!tunnel_outer)
3446                                         break;
3447                                 /*
3448                                  * For tunnel outer we must set outer IP key
3449                                  * anyway, even if the specification/mask is
3450                                  * empty. There is no another way to tell
3451                                  * kernel about he outer layer protocol.
3452                                  */
3453                                 mnl_attr_put_u32
3454                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3455                                          mask.ipv4->hdr.src_addr);
3456                                 mnl_attr_put_u32
3457                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3458                                          mask.ipv4->hdr.src_addr);
3459                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3460                                 break;
3461                         }
3462                         if (mask.ipv4->hdr.src_addr) {
3463                                 mnl_attr_put_u32
3464                                         (nlh, tunnel_outer ?
3465                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3466                                          TCA_FLOWER_KEY_IPV4_SRC,
3467                                          spec.ipv4->hdr.src_addr);
3468                                 mnl_attr_put_u32
3469                                         (nlh, tunnel_outer ?
3470                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3471                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3472                                          mask.ipv4->hdr.src_addr);
3473                         }
3474                         if (mask.ipv4->hdr.dst_addr) {
3475                                 mnl_attr_put_u32
3476                                         (nlh, tunnel_outer ?
3477                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3478                                          TCA_FLOWER_KEY_IPV4_DST,
3479                                          spec.ipv4->hdr.dst_addr);
3480                                 mnl_attr_put_u32
3481                                         (nlh, tunnel_outer ?
3482                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3483                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3484                                          mask.ipv4->hdr.dst_addr);
3485                         }
3486                         if (mask.ipv4->hdr.time_to_live) {
3487                                 mnl_attr_put_u8
3488                                         (nlh, tunnel_outer ?
3489                                          TCA_FLOWER_KEY_ENC_IP_TTL :
3490                                          TCA_FLOWER_KEY_IP_TTL,
3491                                          spec.ipv4->hdr.time_to_live);
3492                                 mnl_attr_put_u8
3493                                         (nlh, tunnel_outer ?
3494                                          TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3495                                          TCA_FLOWER_KEY_IP_TTL_MASK,
3496                                          mask.ipv4->hdr.time_to_live);
3497                         }
3498                         if (mask.ipv4->hdr.type_of_service) {
3499                                 mnl_attr_put_u8
3500                                         (nlh, tunnel_outer ?
3501                                          TCA_FLOWER_KEY_ENC_IP_TOS :
3502                                          TCA_FLOWER_KEY_IP_TOS,
3503                                          spec.ipv4->hdr.type_of_service);
3504                                 mnl_attr_put_u8
3505                                         (nlh, tunnel_outer ?
3506                                          TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3507                                          TCA_FLOWER_KEY_IP_TOS_MASK,
3508                                          mask.ipv4->hdr.type_of_service);
3509                         }
3510                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3511                         break;
3512                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3513                         bool ipv6_src, ipv6_dst;
3514                         uint8_t msk6, tos6;
3515
3516                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3517                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3518                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3519                         mask.ipv6 = flow_tcf_item_mask
3520                                 (items, &rte_flow_item_ipv6_mask,
3521                                  &flow_tcf_mask_supported.ipv6,
3522                                  &flow_tcf_mask_empty.ipv6,
3523                                  sizeof(flow_tcf_mask_supported.ipv6),
3524                                  error);
3525                         assert(mask.ipv6);
3526                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3527                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3528                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3529                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3530                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3531                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3532                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3533                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3534                         } else {
3535                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3536                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3537                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3538                         }
3539                         spec.ipv6 = items->spec;
3540                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3541                                 /*
3542                                  * No way to set IP protocol for outer tunnel
3543                                  * layers. Usually it is fixed, for example,
3544                                  * to UDP for VXLAN/GPE.
3545                                  */
3546                                 assert(spec.ipv6); /* Mask is not empty. */
3547                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3548                                                 spec.ipv6->hdr.proto);
3549                                 ip_proto_set = 1;
3550                         }
3551                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3552                                                 (mask.ipv6->hdr.dst_addr);
3553                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3554                                                 (mask.ipv6->hdr.src_addr);
3555                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3556                              (!ipv6_dst && !ipv6_src)) {
3557                                 if (!tunnel_outer)
3558                                         break;
3559                                 /*
3560                                  * For tunnel outer we must set outer IP key
3561                                  * anyway, even if the specification/mask is
3562                                  * empty. There is no another way to tell
3563                                  * kernel about he outer layer protocol.
3564                                  */
3565                                 mnl_attr_put(nlh,
3566                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3567                                              IPV6_ADDR_LEN,
3568                                              mask.ipv6->hdr.src_addr);
3569                                 mnl_attr_put(nlh,
3570                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3571                                              IPV6_ADDR_LEN,
3572                                              mask.ipv6->hdr.src_addr);
3573                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3574                                 break;
3575                         }
3576                         if (ipv6_src) {
3577                                 mnl_attr_put(nlh, tunnel_outer ?
3578                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3579                                              TCA_FLOWER_KEY_IPV6_SRC,
3580                                              IPV6_ADDR_LEN,
3581                                              spec.ipv6->hdr.src_addr);
3582                                 mnl_attr_put(nlh, tunnel_outer ?
3583                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3584                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3585                                              IPV6_ADDR_LEN,
3586                                              mask.ipv6->hdr.src_addr);
3587                         }
3588                         if (ipv6_dst) {
3589                                 mnl_attr_put(nlh, tunnel_outer ?
3590                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3591                                              TCA_FLOWER_KEY_IPV6_DST,
3592                                              IPV6_ADDR_LEN,
3593                                              spec.ipv6->hdr.dst_addr);
3594                                 mnl_attr_put(nlh, tunnel_outer ?
3595                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3596                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3597                                              IPV6_ADDR_LEN,
3598                                              mask.ipv6->hdr.dst_addr);
3599                         }
3600                         if (mask.ipv6->hdr.hop_limits) {
3601                                 mnl_attr_put_u8
3602                                         (nlh, tunnel_outer ?
3603                                          TCA_FLOWER_KEY_ENC_IP_TTL :
3604                                          TCA_FLOWER_KEY_IP_TTL,
3605                                          spec.ipv6->hdr.hop_limits);
3606                                 mnl_attr_put_u8
3607                                         (nlh, tunnel_outer ?
3608                                          TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3609                                          TCA_FLOWER_KEY_IP_TTL_MASK,
3610                                          mask.ipv6->hdr.hop_limits);
3611                         }
3612                         msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3613                                 IPV6_HDR_TC_SHIFT) & 0xff;
3614                         if (msk6) {
3615                                 tos6 = (rte_be_to_cpu_32
3616                                         (spec.ipv6->hdr.vtc_flow) >>
3617                                                 IPV6_HDR_TC_SHIFT) & 0xff;
3618                                 mnl_attr_put_u8
3619                                         (nlh, tunnel_outer ?
3620                                          TCA_FLOWER_KEY_ENC_IP_TOS :
3621                                          TCA_FLOWER_KEY_IP_TOS, tos6);
3622                                 mnl_attr_put_u8
3623                                         (nlh, tunnel_outer ?
3624                                          TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3625                                          TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3626                         }
3627                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3628                         break;
3629                 }
3630                 case RTE_FLOW_ITEM_TYPE_UDP:
3631                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3632                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3633                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3634                         mask.udp = flow_tcf_item_mask
3635                                 (items, &rte_flow_item_udp_mask,
3636                                  &flow_tcf_mask_supported.udp,
3637                                  &flow_tcf_mask_empty.udp,
3638                                  sizeof(flow_tcf_mask_supported.udp),
3639                                  error);
3640                         assert(mask.udp);
3641                         spec.udp = items->spec;
3642                         if (!tunnel_outer) {
3643                                 if (!ip_proto_set)
3644                                         mnl_attr_put_u8
3645                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3646                                                 IPPROTO_UDP);
3647                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3648                                         break;
3649                         } else {
3650                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3651                                 decap.vxlan->udp_port =
3652                                         rte_be_to_cpu_16
3653                                                 (spec.udp->hdr.dst_port);
3654                         }
3655                         if (mask.udp->hdr.src_port) {
3656                                 mnl_attr_put_u16
3657                                         (nlh, tunnel_outer ?
3658                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3659                                          TCA_FLOWER_KEY_UDP_SRC,
3660                                          spec.udp->hdr.src_port);
3661                                 mnl_attr_put_u16
3662                                         (nlh, tunnel_outer ?
3663                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3664                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3665                                          mask.udp->hdr.src_port);
3666                         }
3667                         if (mask.udp->hdr.dst_port) {
3668                                 mnl_attr_put_u16
3669                                         (nlh, tunnel_outer ?
3670                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3671                                          TCA_FLOWER_KEY_UDP_DST,
3672                                          spec.udp->hdr.dst_port);
3673                                 mnl_attr_put_u16
3674                                         (nlh, tunnel_outer ?
3675                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3676                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3677                                          mask.udp->hdr.dst_port);
3678                         }
3679                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3680                         break;
3681                 case RTE_FLOW_ITEM_TYPE_TCP:
3682                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3683                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3684                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3685                         mask.tcp = flow_tcf_item_mask
3686                                 (items, &rte_flow_item_tcp_mask,
3687                                  &flow_tcf_mask_supported.tcp,
3688                                  &flow_tcf_mask_empty.tcp,
3689                                  sizeof(flow_tcf_mask_supported.tcp),
3690                                  error);
3691                         assert(mask.tcp);
3692                         if (!ip_proto_set)
3693                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3694                                                 IPPROTO_TCP);
3695                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3696                                 break;
3697                         spec.tcp = items->spec;
3698                         if (mask.tcp->hdr.src_port) {
3699                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3700                                                  spec.tcp->hdr.src_port);
3701                                 mnl_attr_put_u16(nlh,
3702                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3703                                                  mask.tcp->hdr.src_port);
3704                         }
3705                         if (mask.tcp->hdr.dst_port) {
3706                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3707                                                  spec.tcp->hdr.dst_port);
3708                                 mnl_attr_put_u16(nlh,
3709                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3710                                                  mask.tcp->hdr.dst_port);
3711                         }
3712                         if (mask.tcp->hdr.tcp_flags) {
3713                                 mnl_attr_put_u16
3714                                         (nlh,
3715                                          TCA_FLOWER_KEY_TCP_FLAGS,
3716                                          rte_cpu_to_be_16
3717                                                 (spec.tcp->hdr.tcp_flags));
3718                                 mnl_attr_put_u16
3719                                         (nlh,
3720                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3721                                          rte_cpu_to_be_16
3722                                                 (mask.tcp->hdr.tcp_flags));
3723                         }
3724                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3725                         break;
3726                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3727                         assert(decap.vxlan);
3728                         tunnel_outer = 0;
3729                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3730                         spec.vxlan = items->spec;
3731                         mnl_attr_put_u32(nlh,
3732                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3733                                          vxlan_vni_as_be32(spec.vxlan->vni));
3734                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3735                         break;
3736                 default:
3737                         return rte_flow_error_set(error, ENOTSUP,
3738                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3739                                                   NULL, "item not supported");
3740                 }
3741         }
3742         /*
3743          * Set the ether_type flower key and tc rule protocol:
3744          * - if there is nor VLAN neither VXLAN the key is taken from
3745          *   eth item directly or deduced from L3 items.
3746          * - if there is vlan item then key is fixed to 802.1q.
3747          * - if there is vxlan item then key is set to inner tunnel type.
3748          * - simultaneous vlan and vxlan items are prohibited.
3749          */
3750         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3751                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3752                                            outer_etype);
3753                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3754                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3755                                 mnl_attr_put_u16(nlh,
3756                                                  TCA_FLOWER_KEY_ETH_TYPE,
3757                                                  inner_etype);
3758                 } else {
3759                         mnl_attr_put_u16(nlh,
3760                                          TCA_FLOWER_KEY_ETH_TYPE,
3761                                          outer_etype);
3762                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3763                             vlan_etype != RTE_BE16(ETH_P_ALL))
3764                                 mnl_attr_put_u16(nlh,
3765                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3766                                                  vlan_etype);
3767                 }
3768                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3769         }
3770         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3771         na_act_index_cur = 1;
3772         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3773                 struct nlattr *na_act_index;
3774                 struct nlattr *na_act;
3775                 unsigned int vlan_act;
3776                 unsigned int i;
3777
3778                 switch (actions->type) {
3779                 case RTE_FLOW_ACTION_TYPE_VOID:
3780                         break;
3781                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3782                         conf.port_id = actions->conf;
3783                         if (conf.port_id->original)
3784                                 i = 0;
3785                         else
3786                                 for (i = 0; ptoi[i].ifindex; ++i)
3787                                         if (ptoi[i].port_id == conf.port_id->id)
3788                                                 break;
3789                         assert(ptoi[i].ifindex);
3790                         na_act_index =
3791                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3792                         assert(na_act_index);
3793                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3794                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3795                         assert(na_act);
3796                         if (encap.hdr) {
3797                                 assert(dev_flow->tcf.tunnel);
3798                                 dev_flow->tcf.tunnel->ifindex_ptr =
3799                                         &((struct tc_mirred *)
3800                                         mnl_attr_get_payload
3801                                         (mnl_nlmsg_get_payload_tail
3802                                                 (nlh)))->ifindex;
3803                         }
3804                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3805                                      sizeof(struct tc_mirred),
3806                                      &(struct tc_mirred){
3807                                         .action = TC_ACT_STOLEN,
3808                                         .eaction = TCA_EGRESS_REDIR,
3809                                         .ifindex = ptoi[i].ifindex,
3810                                      });
3811                         mnl_attr_nest_end(nlh, na_act);
3812                         mnl_attr_nest_end(nlh, na_act_index);
3813                         break;
3814                 case RTE_FLOW_ACTION_TYPE_JUMP:
3815                         conf.jump = actions->conf;
3816                         na_act_index =
3817                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3818                         assert(na_act_index);
3819                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3820                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3821                         assert(na_act);
3822                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3823                                      sizeof(struct tc_gact),
3824                                      &(struct tc_gact){
3825                                         .action = TC_ACT_GOTO_CHAIN |
3826                                                   conf.jump->group,
3827                                      });
3828                         mnl_attr_nest_end(nlh, na_act);
3829                         mnl_attr_nest_end(nlh, na_act_index);
3830                         break;
3831                 case RTE_FLOW_ACTION_TYPE_DROP:
3832                         na_act_index =
3833                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3834                         assert(na_act_index);
3835                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3836                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3837                         assert(na_act);
3838                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3839                                      sizeof(struct tc_gact),
3840                                      &(struct tc_gact){
3841                                         .action = TC_ACT_SHOT,
3842                                      });
3843                         mnl_attr_nest_end(nlh, na_act);
3844                         mnl_attr_nest_end(nlh, na_act_index);
3845                         break;
3846                 case RTE_FLOW_ACTION_TYPE_COUNT:
3847                         /*
3848                          * Driver adds the count action implicitly for
3849                          * each rule it creates.
3850                          */
3851                         ret = flow_tcf_translate_action_count(dev,
3852                                                               dev_flow, error);
3853                         if (ret < 0)
3854                                 return ret;
3855                         break;
3856                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3857                         conf.of_push_vlan = NULL;
3858                         vlan_act = TCA_VLAN_ACT_POP;
3859                         goto action_of_vlan;
3860                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3861                         conf.of_push_vlan = actions->conf;
3862                         vlan_act = TCA_VLAN_ACT_PUSH;
3863                         goto action_of_vlan;
3864                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3865                         conf.of_set_vlan_vid = actions->conf;
3866                         if (na_vlan_id)
3867                                 goto override_na_vlan_id;
3868                         vlan_act = TCA_VLAN_ACT_MODIFY;
3869                         goto action_of_vlan;
3870                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3871                         conf.of_set_vlan_pcp = actions->conf;
3872                         if (na_vlan_priority)
3873                                 goto override_na_vlan_priority;
3874                         vlan_act = TCA_VLAN_ACT_MODIFY;
3875                         goto action_of_vlan;
3876 action_of_vlan:
3877                         na_act_index =
3878                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3879                         assert(na_act_index);
3880                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3881                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3882                         assert(na_act);
3883                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3884                                      sizeof(struct tc_vlan),
3885                                      &(struct tc_vlan){
3886                                         .action = TC_ACT_PIPE,
3887                                         .v_action = vlan_act,
3888                                      });
3889                         if (vlan_act == TCA_VLAN_ACT_POP) {
3890                                 mnl_attr_nest_end(nlh, na_act);
3891                                 mnl_attr_nest_end(nlh, na_act_index);
3892                                 break;
3893                         }
3894                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3895                                 mnl_attr_put_u16(nlh,
3896                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3897                                                  conf.of_push_vlan->ethertype);
3898                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3899                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3900                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3901                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3902                         mnl_attr_nest_end(nlh, na_act);
3903                         mnl_attr_nest_end(nlh, na_act_index);
3904                         if (actions->type ==
3905                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3906 override_na_vlan_id:
3907                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3908                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3909                                         rte_be_to_cpu_16
3910                                         (conf.of_set_vlan_vid->vlan_vid);
3911                         } else if (actions->type ==
3912                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3913 override_na_vlan_priority:
3914                                 na_vlan_priority->nla_type =
3915                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3916                                 *(uint8_t *)mnl_attr_get_payload
3917                                         (na_vlan_priority) =
3918                                         conf.of_set_vlan_pcp->vlan_pcp;
3919                         }
3920                         break;
3921                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3922                         assert(decap.vxlan);
3923                         assert(dev_flow->tcf.tunnel);
3924                         dev_flow->tcf.tunnel->ifindex_ptr =
3925                                 (unsigned int *)&tcm->tcm_ifindex;
3926                         na_act_index =
3927                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3928                         assert(na_act_index);
3929                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3930                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3931                         assert(na_act);
3932                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3933                                 sizeof(struct tc_tunnel_key),
3934                                 &(struct tc_tunnel_key){
3935                                         .action = TC_ACT_PIPE,
3936                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3937                                         });
3938                         mnl_attr_nest_end(nlh, na_act);
3939                         mnl_attr_nest_end(nlh, na_act_index);
3940                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3941                         break;
3942                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3943                         assert(encap.vxlan);
3944                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3945                         na_act_index =
3946                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3947                         assert(na_act_index);
3948                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3949                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3950                         assert(na_act);
3951                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3952                                 sizeof(struct tc_tunnel_key),
3953                                 &(struct tc_tunnel_key){
3954                                         .action = TC_ACT_PIPE,
3955                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3956                                         });
3957                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3958                                 mnl_attr_put_u16(nlh,
3959                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3960                                          encap.vxlan->udp.dst);
3961                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3962                                 mnl_attr_put_u32(nlh,
3963                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3964                                          encap.vxlan->ipv4.src);
3965                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3966                                 mnl_attr_put_u32(nlh,
3967                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3968                                          encap.vxlan->ipv4.dst);
3969                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3970                                 mnl_attr_put(nlh,
3971                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3972                                          sizeof(encap.vxlan->ipv6.src),
3973                                          &encap.vxlan->ipv6.src);
3974                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3975                                 mnl_attr_put(nlh,
3976                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3977                                          sizeof(encap.vxlan->ipv6.dst),
3978                                          &encap.vxlan->ipv6.dst);
3979                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3980                                 mnl_attr_put_u8(nlh,
3981                                          TCA_TUNNEL_KEY_ENC_TTL,
3982                                          encap.vxlan->ip_ttl_hop);
3983                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3984                                 mnl_attr_put_u8(nlh,
3985                                          TCA_TUNNEL_KEY_ENC_TOS,
3986                                          encap.vxlan->ip_tos);
3987                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3988                                 mnl_attr_put_u32(nlh,
3989                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3990                                          vxlan_vni_as_be32
3991                                                 (encap.vxlan->vxlan.vni));
3992                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3993                         mnl_attr_nest_end(nlh, na_act);
3994                         mnl_attr_nest_end(nlh, na_act_index);
3995                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3996                         break;
3997                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3998                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3999                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
4000                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
4001                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
4002                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
4003                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
4004                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
4005                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
4006                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
4007                         na_act_index =
4008                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
4009                         flow_tcf_create_pedit_mnl_msg(nlh,
4010                                                       &actions, item_flags);
4011                         mnl_attr_nest_end(nlh, na_act_index);
4012                         break;
4013                 default:
4014                         return rte_flow_error_set(error, ENOTSUP,
4015                                                   RTE_FLOW_ERROR_TYPE_ACTION,
4016                                                   actions,
4017                                                   "action not supported");
4018                 }
4019         }
4020         assert(na_flower);
4021         assert(na_flower_act);
4022         mnl_attr_nest_end(nlh, na_flower_act);
4023         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
4024                                         (mnl_nlmsg_get_payload_tail(nlh));
4025         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
4026                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
4027         mnl_attr_nest_end(nlh, na_flower);
4028         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4029                 dev_flow->tcf.tunnel->ifindex_org =
4030                         *dev_flow->tcf.tunnel->ifindex_ptr;
4031         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4032         return 0;
4033 }
4034
4035 /**
4036  * Send Netlink message with acknowledgment.
4037  *
4038  * @param tcf
4039  *   Flow context to use.
4040  * @param nlh
4041  *   Message to send. This function always raises the NLM_F_ACK flag before
4042  *   sending.
4043  * @param[in] cb
4044  *   Callback handler for received message.
4045  * @param[in] arg
4046  *   Context pointer for callback handler.
4047  *
4048  * @return
4049  *   0 on success, a negative errno value otherwise and rte_errno is set.
4050  */
4051 static int
4052 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4053                 struct nlmsghdr *nlh,
4054                 mnl_cb_t cb, void *arg)
4055 {
4056         unsigned int portid = mnl_socket_get_portid(tcf->nl);
4057         uint32_t seq = tcf->seq++;
4058         int ret, err = 0;
4059
4060         assert(tcf->nl);
4061         assert(tcf->buf);
4062         if (!seq) {
4063                 /* seq 0 is reserved for kernel event-driven notifications. */
4064                 seq = tcf->seq++;
4065         }
4066         nlh->nlmsg_seq = seq;
4067         nlh->nlmsg_flags |= NLM_F_ACK;
4068         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4069         if (ret <= 0) {
4070                 /* Message send error occurres. */
4071                 rte_errno = errno;
4072                 return -rte_errno;
4073         }
4074         nlh = (struct nlmsghdr *)(tcf->buf);
4075         /*
4076          * The following loop postpones non-fatal errors until multipart
4077          * messages are complete.
4078          */
4079         while (true) {
4080                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4081                 if (ret < 0) {
4082                         err = errno;
4083                         /*
4084                          * In case of overflow Will receive till
4085                          * end of multipart message. We may lost part
4086                          * of reply messages but mark and return an error.
4087                          */
4088                         if (err != ENOSPC ||
4089                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4090                             nlh->nlmsg_type == NLMSG_DONE)
4091                                 break;
4092                 } else {
4093                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4094                         if (!ret) {
4095                                 /*
4096                                  * libmnl returns 0 if DONE or
4097                                  * success ACK message found.
4098                                  */
4099                                 break;
4100                         }
4101                         if (ret < 0) {
4102                                 /*
4103                                  * ACK message with error found
4104                                  * or some error occurred.
4105                                  */
4106                                 err = errno;
4107                                 break;
4108                         }
4109                         /* We should continue receiving. */
4110                 }
4111         }
4112         if (!err)
4113                 return 0;
4114         rte_errno = err;
4115         return -err;
4116 }
4117
4118 #define MNL_BUF_EXTRA_SPACE 16
4119 #define MNL_REQUEST_SIZE_MIN 256
4120 #define MNL_REQUEST_SIZE_MAX 2048
4121 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4122                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
4123
4124 /* Data structures used by flow_tcf_xxx_cb() routines. */
4125 struct tcf_nlcb_buf {
4126         LIST_ENTRY(tcf_nlcb_buf) next;
4127         uint32_t size;
4128         alignas(struct nlmsghdr)
4129         uint8_t msg[]; /**< Netlink message data. */
4130 };
4131
4132 struct tcf_nlcb_context {
4133         unsigned int ifindex; /**< Base interface index. */
4134         uint32_t bufsize;
4135         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4136 };
4137
4138 /**
4139  * Allocate space for netlink command in buffer list
4140  *
4141  * @param[in, out] ctx
4142  *   Pointer to callback context with command buffers list.
4143  * @param[in] size
4144  *   Required size of data buffer to be allocated.
4145  *
4146  * @return
4147  *   Pointer to allocated memory, aligned as message header.
4148  *   NULL if some error occurred.
4149  */
4150 static struct nlmsghdr *
4151 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4152 {
4153         struct tcf_nlcb_buf *buf;
4154         struct nlmsghdr *nlh;
4155
4156         size = NLMSG_ALIGN(size);
4157         buf = LIST_FIRST(&ctx->nlbuf);
4158         if (buf && (buf->size + size) <= ctx->bufsize) {
4159                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4160                 buf->size += size;
4161                 return nlh;
4162         }
4163         if (size > ctx->bufsize) {
4164                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4165                 return NULL;
4166         }
4167         buf = rte_malloc(__func__,
4168                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4169                         alignof(struct tcf_nlcb_buf));
4170         if (!buf) {
4171                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4172                 return NULL;
4173         }
4174         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4175         buf->size = size;
4176         nlh = (struct nlmsghdr *)&buf->msg[0];
4177         return nlh;
4178 }
4179
4180 /**
4181  * Send the buffers with prepared netlink commands. Scans the list and
4182  * sends all found buffers. Buffers are sent and freed anyway in order
4183  * to prevent memory leakage if some every message in received packet.
4184  *
4185  * @param[in] tcf
4186  *   Context object initialized by mlx5_flow_tcf_context_create().
4187  * @param[in, out] ctx
4188  *   Pointer to callback context with command buffers list.
4189  *
4190  * @return
4191  *   Zero value on success, negative errno value otherwise
4192  *   and rte_errno is set.
4193  */
4194 static int
4195 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4196                     struct tcf_nlcb_context *ctx)
4197 {
4198         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4199         int ret = 0;
4200
4201         while (bc) {
4202                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4203                 struct nlmsghdr *nlh;
4204                 uint32_t msg = 0;
4205                 int rc;
4206
4207                 while (msg < bc->size) {
4208                         /*
4209                          * Send Netlink commands from buffer in one by one
4210                          * fashion. If we send multiple rule deletion commands
4211                          * in one Netlink message and some error occurs it may
4212                          * cause multiple ACK error messages and break sequence
4213                          * numbers of Netlink communication, because we expect
4214                          * the only one ACK reply.
4215                          */
4216                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4217                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4218                         assert((bc->size - msg) >= nlh->nlmsg_len);
4219                         msg += nlh->nlmsg_len;
4220                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4221                         if (rc) {
4222                                 DRV_LOG(WARNING,
4223                                         "netlink: cleanup error %d", rc);
4224                                 if (!ret)
4225                                         ret = rc;
4226                         }
4227                 }
4228                 rte_free(bc);
4229                 bc = bn;
4230         }
4231         LIST_INIT(&ctx->nlbuf);
4232         return ret;
4233 }
4234
4235 /**
4236  * Collect local IP address rules with scope link attribute  on specified
4237  * network device. This is callback routine called by libmnl mnl_cb_run()
4238  * in loop for every message in received packet.
4239  *
4240  * @param[in] nlh
4241  *   Pointer to reply header.
4242  * @param[in, out] arg
4243  *   Opaque data pointer for this callback.
4244  *
4245  * @return
4246  *   A positive, nonzero value on success, negative errno value otherwise
4247  *   and rte_errno is set.
4248  */
4249 static int
4250 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4251 {
4252         struct tcf_nlcb_context *ctx = arg;
4253         struct nlmsghdr *cmd;
4254         struct ifaddrmsg *ifa;
4255         struct nlattr *na;
4256         struct nlattr *na_local = NULL;
4257         struct nlattr *na_peer = NULL;
4258         unsigned char family;
4259         uint32_t size;
4260
4261         if (nlh->nlmsg_type != RTM_NEWADDR) {
4262                 rte_errno = EINVAL;
4263                 return -rte_errno;
4264         }
4265         ifa = mnl_nlmsg_get_payload(nlh);
4266         family = ifa->ifa_family;
4267         if (ifa->ifa_index != ctx->ifindex ||
4268             ifa->ifa_scope != RT_SCOPE_LINK ||
4269             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4270             (family != AF_INET && family != AF_INET6))
4271                 return 1;
4272         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4273                 switch (mnl_attr_get_type(na)) {
4274                 case IFA_LOCAL:
4275                         na_local = na;
4276                         break;
4277                 case IFA_ADDRESS:
4278                         na_peer = na;
4279                         break;
4280                 }
4281                 if (na_local && na_peer)
4282                         break;
4283         }
4284         if (!na_local || !na_peer)
4285                 return 1;
4286         /* Local rule found with scope link, permanent and assigned peer. */
4287         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4288                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4289                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4290                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4291         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4292         if (!cmd) {
4293                 rte_errno = ENOMEM;
4294                 return -rte_errno;
4295         }
4296         cmd = mnl_nlmsg_put_header(cmd);
4297         cmd->nlmsg_type = RTM_DELADDR;
4298         cmd->nlmsg_flags = NLM_F_REQUEST;
4299         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4300         ifa->ifa_flags = IFA_F_PERMANENT;
4301         ifa->ifa_scope = RT_SCOPE_LINK;
4302         ifa->ifa_index = ctx->ifindex;
4303         if (family == AF_INET) {
4304                 ifa->ifa_family = AF_INET;
4305                 ifa->ifa_prefixlen = 32;
4306                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4307                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4308         } else {
4309                 ifa->ifa_family = AF_INET6;
4310                 ifa->ifa_prefixlen = 128;
4311                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4312                         mnl_attr_get_payload(na_local));
4313                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4314                         mnl_attr_get_payload(na_peer));
4315         }
4316         assert(size == cmd->nlmsg_len);
4317         return 1;
4318 }
4319
4320 /**
4321  * Cleanup the local IP addresses on outer interface.
4322  *
4323  * @param[in] tcf
4324  *   Context object initialized by mlx5_flow_tcf_context_create().
4325  * @param[in] ifindex
4326  *   Network inferface index to perform cleanup.
4327  */
4328 static void
4329 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4330                             unsigned int ifindex)
4331 {
4332         struct nlmsghdr *nlh;
4333         struct ifaddrmsg *ifa;
4334         struct tcf_nlcb_context ctx = {
4335                 .ifindex = ifindex,
4336                 .bufsize = MNL_REQUEST_SIZE,
4337                 .nlbuf = LIST_HEAD_INITIALIZER(),
4338         };
4339         int ret;
4340
4341         assert(ifindex);
4342         /*
4343          * Seek and destroy leftovers of local IP addresses with
4344          * matching properties "scope link".
4345          */
4346         nlh = mnl_nlmsg_put_header(tcf->buf);
4347         nlh->nlmsg_type = RTM_GETADDR;
4348         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4349         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4350         ifa->ifa_family = AF_UNSPEC;
4351         ifa->ifa_index = ifindex;
4352         ifa->ifa_scope = RT_SCOPE_LINK;
4353         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4354         if (ret)
4355                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4356         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4357         if (ret)
4358                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4359 }
4360
4361 /**
4362  * Collect neigh permament rules on specified network device.
4363  * This is callback routine called by libmnl mnl_cb_run() in loop for
4364  * every message in received packet.
4365  *
4366  * @param[in] nlh
4367  *   Pointer to reply header.
4368  * @param[in, out] arg
4369  *   Opaque data pointer for this callback.
4370  *
4371  * @return
4372  *   A positive, nonzero value on success, negative errno value otherwise
4373  *   and rte_errno is set.
4374  */
4375 static int
4376 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4377 {
4378         struct tcf_nlcb_context *ctx = arg;
4379         struct nlmsghdr *cmd;
4380         struct ndmsg *ndm;
4381         struct nlattr *na;
4382         struct nlattr *na_ip = NULL;
4383         struct nlattr *na_mac = NULL;
4384         unsigned char family;
4385         uint32_t size;
4386
4387         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4388                 rte_errno = EINVAL;
4389                 return -rte_errno;
4390         }
4391         ndm = mnl_nlmsg_get_payload(nlh);
4392         family = ndm->ndm_family;
4393         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4394            !(ndm->ndm_state & NUD_PERMANENT) ||
4395            (family != AF_INET && family != AF_INET6))
4396                 return 1;
4397         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4398                 switch (mnl_attr_get_type(na)) {
4399                 case NDA_DST:
4400                         na_ip = na;
4401                         break;
4402                 case NDA_LLADDR:
4403                         na_mac = na;
4404                         break;
4405                 }
4406                 if (na_mac && na_ip)
4407                         break;
4408         }
4409         if (!na_mac || !na_ip)
4410                 return 1;
4411         /* Neigh rule with permenent attribute found. */
4412         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4413                MNL_ALIGN(sizeof(struct ndmsg)) +
4414                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4415                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4416                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4417         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4418         if (!cmd) {
4419                 rte_errno = ENOMEM;
4420                 return -rte_errno;
4421         }
4422         cmd = mnl_nlmsg_put_header(cmd);
4423         cmd->nlmsg_type = RTM_DELNEIGH;
4424         cmd->nlmsg_flags = NLM_F_REQUEST;
4425         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4426         ndm->ndm_ifindex = ctx->ifindex;
4427         ndm->ndm_state = NUD_PERMANENT;
4428         ndm->ndm_flags = 0;
4429         ndm->ndm_type = 0;
4430         if (family == AF_INET) {
4431                 ndm->ndm_family = AF_INET;
4432                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4433         } else {
4434                 ndm->ndm_family = AF_INET6;
4435                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4436                              mnl_attr_get_payload(na_ip));
4437         }
4438         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4439                      mnl_attr_get_payload(na_mac));
4440         assert(size == cmd->nlmsg_len);
4441         return 1;
4442 }
4443
4444 /**
4445  * Cleanup the neigh rules on outer interface.
4446  *
4447  * @param[in] tcf
4448  *   Context object initialized by mlx5_flow_tcf_context_create().
4449  * @param[in] ifindex
4450  *   Network inferface index to perform cleanup.
4451  */
4452 static void
4453 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4454                             unsigned int ifindex)
4455 {
4456         struct nlmsghdr *nlh;
4457         struct ndmsg *ndm;
4458         struct tcf_nlcb_context ctx = {
4459                 .ifindex = ifindex,
4460                 .bufsize = MNL_REQUEST_SIZE,
4461                 .nlbuf = LIST_HEAD_INITIALIZER(),
4462         };
4463         int ret;
4464
4465         assert(ifindex);
4466         /* Seek and destroy leftovers of neigh rules. */
4467         nlh = mnl_nlmsg_put_header(tcf->buf);
4468         nlh->nlmsg_type = RTM_GETNEIGH;
4469         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4470         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4471         ndm->ndm_family = AF_UNSPEC;
4472         ndm->ndm_ifindex = ifindex;
4473         ndm->ndm_state = NUD_PERMANENT;
4474         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4475         if (ret)
4476                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4477         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4478         if (ret)
4479                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4480 }
4481
4482 /**
4483  * Collect indices of VXLAN encap/decap interfaces associated with device.
4484  * This is callback routine called by libmnl mnl_cb_run() in loop for
4485  * every message in received packet.
4486  *
4487  * @param[in] nlh
4488  *   Pointer to reply header.
4489  * @param[in, out] arg
4490  *   Opaque data pointer for this callback.
4491  *
4492  * @return
4493  *   A positive, nonzero value on success, negative errno value otherwise
4494  *   and rte_errno is set.
4495  */
4496 static int
4497 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4498 {
4499         struct tcf_nlcb_context *ctx = arg;
4500         struct nlmsghdr *cmd;
4501         struct ifinfomsg *ifm;
4502         struct nlattr *na;
4503         struct nlattr *na_info = NULL;
4504         struct nlattr *na_vxlan = NULL;
4505         bool found = false;
4506         unsigned int vxindex;
4507         uint32_t size;
4508
4509         if (nlh->nlmsg_type != RTM_NEWLINK) {
4510                 rte_errno = EINVAL;
4511                 return -rte_errno;
4512         }
4513         ifm = mnl_nlmsg_get_payload(nlh);
4514         if (!ifm->ifi_index) {
4515                 rte_errno = EINVAL;
4516                 return -rte_errno;
4517         }
4518         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4519                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4520                         na_info = na;
4521                         break;
4522                 }
4523         if (!na_info)
4524                 return 1;
4525         mnl_attr_for_each_nested(na, na_info) {
4526                 switch (mnl_attr_get_type(na)) {
4527                 case IFLA_INFO_KIND:
4528                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4529                                      mnl_attr_get_len(na)))
4530                                 found = true;
4531                         break;
4532                 case IFLA_INFO_DATA:
4533                         na_vxlan = na;
4534                         break;
4535                 }
4536                 if (found && na_vxlan)
4537                         break;
4538         }
4539         if (!found || !na_vxlan)
4540                 return 1;
4541         found = false;
4542         mnl_attr_for_each_nested(na, na_vxlan) {
4543                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4544                     mnl_attr_get_u32(na) == ctx->ifindex) {
4545                         found = true;
4546                         break;
4547                 }
4548         }
4549         if (!found)
4550                 return 1;
4551         /* Attached VXLAN device found, store the command to delete. */
4552         vxindex = ifm->ifi_index;
4553         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4554                MNL_ALIGN(sizeof(struct ifinfomsg));
4555         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4556         if (!cmd) {
4557                 rte_errno = ENOMEM;
4558                 return -rte_errno;
4559         }
4560         cmd = mnl_nlmsg_put_header(cmd);
4561         cmd->nlmsg_type = RTM_DELLINK;
4562         cmd->nlmsg_flags = NLM_F_REQUEST;
4563         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4564         ifm->ifi_family = AF_UNSPEC;
4565         ifm->ifi_index = vxindex;
4566         assert(size == cmd->nlmsg_len);
4567         return 1;
4568 }
4569
4570 /**
4571  * Cleanup the outer interface. Removes all found vxlan devices
4572  * attached to specified index, flushes the neigh and local IP
4573  * database.
4574  *
4575  * @param[in] tcf
4576  *   Context object initialized by mlx5_flow_tcf_context_create().
4577  * @param[in] ifindex
4578  *   Network inferface index to perform cleanup.
4579  */
4580 static void
4581 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4582                             unsigned int ifindex)
4583 {
4584         struct nlmsghdr *nlh;
4585         struct ifinfomsg *ifm;
4586         struct tcf_nlcb_context ctx = {
4587                 .ifindex = ifindex,
4588                 .bufsize = MNL_REQUEST_SIZE,
4589                 .nlbuf = LIST_HEAD_INITIALIZER(),
4590         };
4591         int ret;
4592
4593         assert(ifindex);
4594         /*
4595          * Seek and destroy leftover VXLAN encap/decap interfaces with
4596          * matching properties.
4597          */
4598         nlh = mnl_nlmsg_put_header(tcf->buf);
4599         nlh->nlmsg_type = RTM_GETLINK;
4600         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4601         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4602         ifm->ifi_family = AF_UNSPEC;
4603         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4604         if (ret)
4605                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4606         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4607         if (ret)
4608                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4609 }
4610
4611 /**
4612  * Emit Netlink message to add/remove local address to the outer device.
4613  * The address being added is visible within the link only (scope link).
4614  *
4615  * Note that an implicit route is maintained by the kernel due to the
4616  * presence of a peer address (IFA_ADDRESS).
4617  *
4618  * These rules are used for encapsultion only and allow to assign
4619  * the outer tunnel source IP address.
4620  *
4621  * @param[in] tcf
4622  *   Libmnl socket context object.
4623  * @param[in] encap
4624  *   Encapsulation properties (source address and its peer).
4625  * @param[in] ifindex
4626  *   Network interface to apply rule.
4627  * @param[in] enable
4628  *   Toggle between add and remove.
4629  * @param[out] error
4630  *   Perform verbose error reporting if not NULL.
4631  *
4632  * @return
4633  *   0 on success, a negative errno value otherwise and rte_errno is set.
4634  */
4635 static int
4636 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4637                     const struct flow_tcf_vxlan_encap *encap,
4638                     unsigned int ifindex,
4639                     bool enable,
4640                     struct rte_flow_error *error)
4641 {
4642         struct nlmsghdr *nlh;
4643         struct ifaddrmsg *ifa;
4644         alignas(struct nlmsghdr)
4645         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4646
4647         nlh = mnl_nlmsg_put_header(buf);
4648         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4649         nlh->nlmsg_flags =
4650                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4651         nlh->nlmsg_seq = 0;
4652         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4653         ifa->ifa_flags = IFA_F_PERMANENT;
4654         ifa->ifa_scope = RT_SCOPE_LINK;
4655         ifa->ifa_index = ifindex;
4656         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4657                 ifa->ifa_family = AF_INET;
4658                 ifa->ifa_prefixlen = 32;
4659                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4660                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4661                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4662                                               encap->ipv4.dst);
4663         } else {
4664                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4665                 ifa->ifa_family = AF_INET6;
4666                 ifa->ifa_prefixlen = 128;
4667                 mnl_attr_put(nlh, IFA_LOCAL,
4668                                   sizeof(encap->ipv6.src),
4669                                   &encap->ipv6.src);
4670                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4671                         mnl_attr_put(nlh, IFA_ADDRESS,
4672                                           sizeof(encap->ipv6.dst),
4673                                           &encap->ipv6.dst);
4674         }
4675         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4676                 return 0;
4677         return rte_flow_error_set(error, rte_errno,
4678                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4679                                   "netlink: cannot complete IFA request"
4680                                   " (ip addr add)");
4681 }
4682
4683 /**
4684  * Emit Netlink message to add/remove neighbor.
4685  *
4686  * @param[in] tcf
4687  *   Libmnl socket context object.
4688  * @param[in] encap
4689  *   Encapsulation properties (destination address).
4690  * @param[in] ifindex
4691  *   Network interface.
4692  * @param[in] enable
4693  *   Toggle between add and remove.
4694  * @param[out] error
4695  *   Perform verbose error reporting if not NULL.
4696  *
4697  * @return
4698  *   0 on success, a negative errno value otherwise and rte_errno is set.
4699  */
4700 static int
4701 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4702                      const struct flow_tcf_vxlan_encap *encap,
4703                      unsigned int ifindex,
4704                      bool enable,
4705                      struct rte_flow_error *error)
4706 {
4707         struct nlmsghdr *nlh;
4708         struct ndmsg *ndm;
4709         alignas(struct nlmsghdr)
4710         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4711
4712         nlh = mnl_nlmsg_put_header(buf);
4713         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4714         nlh->nlmsg_flags =
4715                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4716         nlh->nlmsg_seq = 0;
4717         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4718         ndm->ndm_ifindex = ifindex;
4719         ndm->ndm_state = NUD_PERMANENT;
4720         ndm->ndm_flags = 0;
4721         ndm->ndm_type = 0;
4722         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4723                 ndm->ndm_family = AF_INET;
4724                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4725         } else {
4726                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4727                 ndm->ndm_family = AF_INET6;
4728                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4729                                                  &encap->ipv6.dst);
4730         }
4731         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4732                 DRV_LOG(WARNING,
4733                         "outer ethernet source address cannot be "
4734                         "forced for VXLAN encapsulation");
4735         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4736                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4737                                                     &encap->eth.dst);
4738         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4739                 return 0;
4740         return rte_flow_error_set(error, rte_errno,
4741                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4742                                   "netlink: cannot complete ND request"
4743                                   " (ip neigh)");
4744 }
4745
4746 /**
4747  * Manage the local IP addresses and their peers IP addresses on the
4748  * outer interface for encapsulation purposes. The kernel searches the
4749  * appropriate device for tunnel egress traffic using the outer source
4750  * IP, this IP should be assigned to the outer network device, otherwise
4751  * kernel rejects the rule.
4752  *
4753  * Adds or removes the addresses using the Netlink command like this:
4754  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4755  *
4756  * The addresses are local to the netdev ("scope link"), this reduces
4757  * the risk of conflicts. Note that an implicit route is maintained by
4758  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4759  *
4760  * @param[in] tcf
4761  *   Libmnl socket context object.
4762  * @param[in] iface
4763  *   Object, contains rule database and ifouter index.
4764  * @param[in] dev_flow
4765  *   Flow object, contains the tunnel parameters (for encap only).
4766  * @param[in] enable
4767  *   Toggle between add and remove.
4768  * @param[out] error
4769  *   Perform verbose error reporting if not NULL.
4770  *
4771  * @return
4772  *   0 on success, a negative errno value otherwise and rte_errno is set.
4773  */
4774 static int
4775 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4776                      struct tcf_irule *iface,
4777                      struct mlx5_flow *dev_flow,
4778                      bool enable,
4779                      struct rte_flow_error *error)
4780 {
4781         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4782         struct tcf_local_rule *rule = NULL;
4783         int ret;
4784
4785         assert(encap);
4786         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4787         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4788                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4789                 LIST_FOREACH(rule, &iface->local, next) {
4790                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4791                             encap->ipv4.src == rule->ipv4.src &&
4792                             encap->ipv4.dst == rule->ipv4.dst) {
4793                                 break;
4794                         }
4795                 }
4796         } else {
4797                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4798                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4799                 LIST_FOREACH(rule, &iface->local, next) {
4800                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4801                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4802                                             sizeof(encap->ipv6.src)) &&
4803                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4804                                             sizeof(encap->ipv6.dst))) {
4805                                 break;
4806                         }
4807                 }
4808         }
4809         if (rule) {
4810                 if (enable) {
4811                         rule->refcnt++;
4812                         return 0;
4813                 }
4814                 if (!rule->refcnt || !--rule->refcnt) {
4815                         LIST_REMOVE(rule, next);
4816                         return flow_tcf_rule_local(tcf, encap,
4817                                         iface->ifouter, false, error);
4818                 }
4819                 return 0;
4820         }
4821         if (!enable) {
4822                 DRV_LOG(WARNING, "disabling not existing local rule");
4823                 rte_flow_error_set(error, ENOENT,
4824                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4825                                    "disabling not existing local rule");
4826                 return -ENOENT;
4827         }
4828         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4829                                 alignof(struct tcf_local_rule));
4830         if (!rule) {
4831                 rte_flow_error_set(error, ENOMEM,
4832                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4833                                    "unable to allocate memory for local rule");
4834                 return -rte_errno;
4835         }
4836         *rule = (struct tcf_local_rule){.refcnt = 0,
4837                                         .mask = 0,
4838                                         };
4839         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4840                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4841                            | FLOW_TCF_ENCAP_IPV4_DST;
4842                 rule->ipv4.src = encap->ipv4.src;
4843                 rule->ipv4.dst = encap->ipv4.dst;
4844         } else {
4845                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4846                            | FLOW_TCF_ENCAP_IPV6_DST;
4847                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4848                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4849         }
4850         ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4851         if (ret) {
4852                 rte_free(rule);
4853                 return ret;
4854         }
4855         rule->refcnt++;
4856         LIST_INSERT_HEAD(&iface->local, rule, next);
4857         return 0;
4858 }
4859
4860 /**
4861  * Manage the destination MAC/IP addresses neigh database, kernel uses
4862  * this one to determine the destination MAC address within encapsulation
4863  * header. Adds or removes the entries using the Netlink command like this:
4864  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4865  *
4866  * @param[in] tcf
4867  *   Libmnl socket context object.
4868  * @param[in] iface
4869  *   Object, contains rule database and ifouter index.
4870  * @param[in] dev_flow
4871  *   Flow object, contains the tunnel parameters (for encap only).
4872  * @param[in] enable
4873  *   Toggle between add and remove.
4874  * @param[out] error
4875  *   Perform verbose error reporting if not NULL.
4876  *
4877  * @return
4878  *   0 on success, a negative errno value otherwise and rte_errno is set.
4879  */
4880 static int
4881 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4882                      struct tcf_irule *iface,
4883                      struct mlx5_flow *dev_flow,
4884                      bool enable,
4885                      struct rte_flow_error *error)
4886 {
4887         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4888         struct tcf_neigh_rule *rule = NULL;
4889         int ret;
4890
4891         assert(encap);
4892         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4893         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4894                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4895                 LIST_FOREACH(rule, &iface->neigh, next) {
4896                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4897                             encap->ipv4.dst == rule->ipv4.dst) {
4898                                 break;
4899                         }
4900                 }
4901         } else {
4902                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4903                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4904                 LIST_FOREACH(rule, &iface->neigh, next) {
4905                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4906                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4907                                                 sizeof(encap->ipv6.dst))) {
4908                                 break;
4909                         }
4910                 }
4911         }
4912         if (rule) {
4913                 if (memcmp(&encap->eth.dst, &rule->eth,
4914                            sizeof(encap->eth.dst))) {
4915                         DRV_LOG(WARNING, "Destination MAC differs"
4916                                          " in neigh rule");
4917                         rte_flow_error_set(error, EEXIST,
4918                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4919                                            NULL, "Different MAC address"
4920                                            " neigh rule for the same"
4921                                            " destination IP");
4922                                         return -EEXIST;
4923                 }
4924                 if (enable) {
4925                         rule->refcnt++;
4926                         return 0;
4927                 }
4928                 if (!rule->refcnt || !--rule->refcnt) {
4929                         LIST_REMOVE(rule, next);
4930                         return flow_tcf_rule_neigh(tcf, encap,
4931                                                    iface->ifouter,
4932                                                    false, error);
4933                 }
4934                 return 0;
4935         }
4936         if (!enable) {
4937                 DRV_LOG(WARNING, "Disabling not existing neigh rule");
4938                 rte_flow_error_set(error, ENOENT,
4939                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4940                                    "unable to allocate memory for neigh rule");
4941                 return -ENOENT;
4942         }
4943         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4944                                 alignof(struct tcf_neigh_rule));
4945         if (!rule) {
4946                 rte_flow_error_set(error, ENOMEM,
4947                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4948                                    "unable to allocate memory for neigh rule");
4949                 return -rte_errno;
4950         }
4951         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4952                                         .mask = 0,
4953                                         };
4954         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4955                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4956                 rule->ipv4.dst = encap->ipv4.dst;
4957         } else {
4958                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4959                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4960         }
4961         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4962         ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4963         if (ret) {
4964                 rte_free(rule);
4965                 return ret;
4966         }
4967         rule->refcnt++;
4968         LIST_INSERT_HEAD(&iface->neigh, rule, next);
4969         return 0;
4970 }
4971
4972 /* VXLAN encap rule database for outer interfaces. */
4973 static  LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4974
4975 /* VTEP device list is shared between PMD port instances. */
4976 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4977 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4978
4979 /**
4980  * Acquire the VXLAN encap rules container for specified interface.
4981  * First looks for the container in the existing ones list, creates
4982  * and initializes the new container if existing not found.
4983  *
4984  * @param[in] tcf
4985  *   Context object initialized by mlx5_flow_tcf_context_create().
4986  * @param[in] ifouter
4987  *   Network interface index to create VXLAN encap rules on.
4988  * @param[out] error
4989  *   Perform verbose error reporting if not NULL.
4990  * @return
4991  *   Rule container pointer on success,
4992  *   NULL otherwise and rte_errno is set.
4993  */
4994 static struct tcf_irule*
4995 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4996                              unsigned int ifouter,
4997                              struct rte_flow_error *error)
4998 {
4999         struct tcf_irule *iface;
5000
5001         /* Look whether the container for encap rules is created. */
5002         assert(ifouter);
5003         LIST_FOREACH(iface, &iface_list_vxlan, next) {
5004                 if (iface->ifouter == ifouter)
5005                         break;
5006         }
5007         if (iface) {
5008                 /* Container already exists, just increment the reference. */
5009                 iface->refcnt++;
5010                 return iface;
5011         }
5012         /* Not found, we should create the new container. */
5013         iface = rte_zmalloc(__func__, sizeof(*iface),
5014                             alignof(struct tcf_irule));
5015         if (!iface) {
5016                 rte_flow_error_set(error, ENOMEM,
5017                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5018                                    "unable to allocate memory for container");
5019                 return NULL;
5020         }
5021         *iface = (struct tcf_irule){
5022                         .local = LIST_HEAD_INITIALIZER(),
5023                         .neigh = LIST_HEAD_INITIALIZER(),
5024                         .ifouter = ifouter,
5025                         .refcnt = 1,
5026         };
5027         /* Interface cleanup for new container created. */
5028         flow_tcf_encap_iface_cleanup(tcf, ifouter);
5029         flow_tcf_encap_local_cleanup(tcf, ifouter);
5030         flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5031         LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5032         return iface;
5033 }
5034
5035 /**
5036  * Releases VXLAN encap rules container by pointer. Decrements the
5037  * reference cointer and deletes the container if counter is zero.
5038  *
5039  * @param[in] irule
5040  *   VXLAN rule container pointer to release.
5041  */
5042 static void
5043 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5044 {
5045         assert(iface->refcnt);
5046         if (--iface->refcnt == 0) {
5047                 /* Reference counter is zero, delete the container. */
5048                 assert(LIST_EMPTY(&iface->local));
5049                 assert(LIST_EMPTY(&iface->neigh));
5050                 LIST_REMOVE(iface, next);
5051                 rte_free(iface);
5052         }
5053 }
5054
5055 /**
5056  * Deletes VTEP network device.
5057  *
5058  * @param[in] tcf
5059  *   Context object initialized by mlx5_flow_tcf_context_create().
5060  * @param[in] vtep
5061  *   Object represinting the network device to delete. Memory
5062  *   allocated for this object is freed by routine.
5063  */
5064 static void
5065 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5066                      struct tcf_vtep *vtep)
5067 {
5068         struct nlmsghdr *nlh;
5069         struct ifinfomsg *ifm;
5070         alignas(struct nlmsghdr)
5071         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5072                     MNL_BUF_EXTRA_SPACE];
5073         int ret;
5074
5075         assert(!vtep->refcnt);
5076         /* Delete only ifaces those we actually created. */
5077         if (vtep->created && vtep->ifindex) {
5078                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5079                 nlh = mnl_nlmsg_put_header(buf);
5080                 nlh->nlmsg_type = RTM_DELLINK;
5081                 nlh->nlmsg_flags = NLM_F_REQUEST;
5082                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5083                 ifm->ifi_family = AF_UNSPEC;
5084                 ifm->ifi_index = vtep->ifindex;
5085                 assert(sizeof(buf) >= nlh->nlmsg_len);
5086                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5087                 if (ret)
5088                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
5089                                          " encap/decap ifindex %u",
5090                                          ifm->ifi_index);
5091         }
5092         rte_free(vtep);
5093 }
5094
5095 /**
5096  * Creates VTEP network device.
5097  *
5098  * @param[in] tcf
5099  *   Context object initialized by mlx5_flow_tcf_context_create().
5100  * @param[in] port
5101  *   UDP port of created VTEP device.
5102  * @param[out] error
5103  *   Perform verbose error reporting if not NULL.
5104  *
5105  * @return
5106  * Pointer to created device structure on success,
5107  * NULL otherwise and rte_errno is set.
5108  */
5109 static struct tcf_vtep*
5110 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5111                      uint16_t port, struct rte_flow_error *error)
5112 {
5113         struct tcf_vtep *vtep;
5114         struct nlmsghdr *nlh;
5115         struct ifinfomsg *ifm;
5116         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5117         alignas(struct nlmsghdr)
5118         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5119                     SZ_NLATTR_DATA_OF(sizeof(name)) +
5120                     SZ_NLATTR_NEST * 2 +
5121                     SZ_NLATTR_STRZ_OF("vxlan") +
5122                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5123                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5124                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5125                     MNL_BUF_EXTRA_SPACE];
5126         struct nlattr *na_info;
5127         struct nlattr *na_vxlan;
5128         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5129         int ret;
5130
5131         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5132         if (!vtep) {
5133                 rte_flow_error_set(error, ENOMEM,
5134                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5135                                    "unable to allocate memory for VTEP");
5136                 return NULL;
5137         }
5138         *vtep = (struct tcf_vtep){
5139                         .port = port,
5140         };
5141         memset(buf, 0, sizeof(buf));
5142         nlh = mnl_nlmsg_put_header(buf);
5143         nlh->nlmsg_type = RTM_NEWLINK;
5144         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE  | NLM_F_EXCL;
5145         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5146         ifm->ifi_family = AF_UNSPEC;
5147         ifm->ifi_type = 0;
5148         ifm->ifi_index = 0;
5149         ifm->ifi_flags = IFF_UP;
5150         ifm->ifi_change = 0xffffffff;
5151         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5152         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5153         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5154         assert(na_info);
5155         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5156         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5157         assert(na_vxlan);
5158 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5159         /*
5160          * RH 7.2 does not support metadata for tunnel device.
5161          * It does not matter because we are going to use the
5162          * hardware offload by mlx5 driver.
5163          */
5164         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5165 #endif
5166         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5167         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5168         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5169 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5170         /*
5171          *  We must specify VNI explicitly if metadata not supported.
5172          *  Note, VNI is transferred with native endianness format.
5173          */
5174         mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5175 #endif
5176         mnl_attr_nest_end(nlh, na_vxlan);
5177         mnl_attr_nest_end(nlh, na_info);
5178         assert(sizeof(buf) >= nlh->nlmsg_len);
5179         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5180         if (ret) {
5181                 DRV_LOG(WARNING,
5182                         "netlink: VTEP %s create failure (%d)",
5183                         name, rte_errno);
5184                 if (rte_errno != EEXIST)
5185                         /*
5186                          * Some unhandled error occurred or device is
5187                          * for encapsulation and cannot be shared.
5188                          */
5189                         goto error;
5190         } else {
5191                 /*
5192                  * Mark device we actually created.
5193                  * We should explicitly delete
5194                  * when we do not need it anymore.
5195                  */
5196                 vtep->created = 1;
5197         }
5198         /* Try to get ifindex of created of pre-existing device. */
5199         ret = if_nametoindex(name);
5200         if (!ret) {
5201                 DRV_LOG(WARNING,
5202                         "VTEP %s failed to get index (%d)", name, errno);
5203                 rte_flow_error_set
5204                         (error, -errno,
5205                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5206                          "netlink: failed to retrieve VTEP ifindex");
5207                 goto error;
5208         }
5209         vtep->ifindex = ret;
5210         memset(buf, 0, sizeof(buf));
5211         nlh = mnl_nlmsg_put_header(buf);
5212         nlh->nlmsg_type = RTM_NEWLINK;
5213         nlh->nlmsg_flags = NLM_F_REQUEST;
5214         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5215         ifm->ifi_family = AF_UNSPEC;
5216         ifm->ifi_type = 0;
5217         ifm->ifi_index = vtep->ifindex;
5218         ifm->ifi_flags = IFF_UP;
5219         ifm->ifi_change = IFF_UP;
5220         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5221         if (ret) {
5222                 rte_flow_error_set(error, -errno,
5223                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5224                                    "netlink: failed to set VTEP link up");
5225                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5226                         name, rte_errno);
5227                 goto clean;
5228         }
5229         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5230         if (ret) {
5231                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5232                 goto clean;
5233         }
5234         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5235         vtep->refcnt = 1;
5236         return vtep;
5237 clean:
5238         flow_tcf_vtep_delete(tcf, vtep);
5239         return NULL;
5240 error:
5241         rte_free(vtep);
5242         return NULL;
5243 }
5244
5245 /**
5246  * Acquire target interface index for VXLAN tunneling decapsulation.
5247  * In order to share the UDP port within the other interfaces the
5248  * VXLAN device created as not attached to any interface (if created).
5249  *
5250  * @param[in] tcf
5251  *   Context object initialized by mlx5_flow_tcf_context_create().
5252  * @param[in] dev_flow
5253  *   Flow tcf object with tunnel structure pointer set.
5254  * @param[out] error
5255  *   Perform verbose error reporting if not NULL.
5256  * @return
5257  *   Interface descriptor pointer on success,
5258  *   NULL otherwise and rte_errno is set.
5259  */
5260 static struct tcf_vtep*
5261 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5262                             struct mlx5_flow *dev_flow,
5263                             struct rte_flow_error *error)
5264 {
5265         struct tcf_vtep *vtep;
5266         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5267
5268         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5269                 if (vtep->port == port)
5270                         break;
5271         }
5272         if (vtep) {
5273                 /* Device exists, just increment the reference counter. */
5274                 vtep->refcnt++;
5275                 assert(vtep->ifindex);
5276                 return vtep;
5277         }
5278         /* No decapsulation device exists, try to create the new one. */
5279         vtep = flow_tcf_vtep_create(tcf, port, error);
5280         if (vtep)
5281                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5282         return vtep;
5283 }
5284
5285 /**
5286  * Aqcuire target interface index for VXLAN tunneling encapsulation.
5287  *
5288  * @param[in] tcf
5289  *   Context object initialized by mlx5_flow_tcf_context_create().
5290  * @param[in] ifouter
5291  *   Network interface index to attach VXLAN encap device to.
5292  * @param[in] dev_flow
5293  *   Flow tcf object with tunnel structure pointer set.
5294  * @param[out] error
5295  *   Perform verbose error reporting if not NULL.
5296  * @return
5297  *   Interface descriptor pointer on success,
5298  *   NULL otherwise and rte_errno is set.
5299  */
5300 static struct tcf_vtep*
5301 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5302                             unsigned int ifouter,
5303                             struct mlx5_flow *dev_flow,
5304                             struct rte_flow_error *error)
5305 {
5306         static uint16_t port;
5307         struct tcf_vtep *vtep;
5308         struct tcf_irule *iface;
5309         int ret;
5310
5311         assert(ifouter);
5312         /* Look whether the VTEP for specified port is created. */
5313         port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5314         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5315                 if (vtep->port == port)
5316                         break;
5317         }
5318         if (vtep) {
5319                 /* VTEP already exists, just increment the reference. */
5320                 vtep->refcnt++;
5321         } else {
5322                 /* Not found, we should create the new VTEP. */
5323                 vtep = flow_tcf_vtep_create(tcf, port, error);
5324                 if (!vtep)
5325                         return NULL;
5326                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5327         }
5328         assert(vtep->ifindex);
5329         iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5330         if (!iface) {
5331                 if (--vtep->refcnt == 0)
5332                         flow_tcf_vtep_delete(tcf, vtep);
5333                 return NULL;
5334         }
5335         dev_flow->tcf.vxlan_encap->iface = iface;
5336         /* Create local ipaddr with peer to specify the outer IPs. */
5337         ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5338         if (!ret) {
5339                 /* Create neigh rule to specify outer destination MAC. */
5340                 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5341                 if (ret)
5342                         flow_tcf_encap_local(tcf, iface,
5343                                              dev_flow, false, error);
5344         }
5345         if (ret) {
5346                 dev_flow->tcf.vxlan_encap->iface = NULL;
5347                 flow_tcf_encap_irule_release(iface);
5348                 if (--vtep->refcnt == 0)
5349                         flow_tcf_vtep_delete(tcf, vtep);
5350                 return NULL;
5351         }
5352         return vtep;
5353 }
5354
5355 /**
5356  * Acquires target interface index for tunneling of any type.
5357  * Creates the new VTEP if needed.
5358  *
5359  * @param[in] tcf
5360  *   Context object initialized by mlx5_flow_tcf_context_create().
5361  * @param[in] ifouter
5362  *   Network interface index to create VXLAN encap rules on.
5363  * @param[in] dev_flow
5364  *   Flow tcf object with tunnel structure pointer set.
5365  * @param[out] error
5366  *   Perform verbose error reporting if not NULL.
5367  * @return
5368  *   Interface descriptor pointer on success,
5369  *   NULL otherwise and rte_errno is set.
5370  */
5371 static struct tcf_vtep*
5372 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5373                       unsigned int ifouter,
5374                       struct mlx5_flow *dev_flow,
5375                       struct rte_flow_error *error)
5376 {
5377         struct tcf_vtep *vtep = NULL;
5378
5379         assert(dev_flow->tcf.tunnel);
5380         pthread_mutex_lock(&vtep_list_mutex);
5381         switch (dev_flow->tcf.tunnel->type) {
5382         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5383                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5384                                                   dev_flow, error);
5385                 break;
5386         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5387                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5388                 break;
5389         default:
5390                 rte_flow_error_set(error, ENOTSUP,
5391                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5392                                    "unsupported tunnel type");
5393                 break;
5394         }
5395         pthread_mutex_unlock(&vtep_list_mutex);
5396         return vtep;
5397 }
5398
5399 /**
5400  * Release tunneling interface by ifindex. Decrements reference
5401  * counter and actually removes the device if counter is zero.
5402  *
5403  * @param[in] tcf
5404  *   Context object initialized by mlx5_flow_tcf_context_create().
5405  * @param[in] vtep
5406  *   VTEP device descriptor structure.
5407  * @param[in] dev_flow
5408  *   Flow tcf object with tunnel structure pointer set.
5409  */
5410 static void
5411 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5412                       struct tcf_vtep *vtep,
5413                       struct mlx5_flow *dev_flow)
5414 {
5415         assert(dev_flow->tcf.tunnel);
5416         pthread_mutex_lock(&vtep_list_mutex);
5417         switch (dev_flow->tcf.tunnel->type) {
5418         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5419                 break;
5420         case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5421                 struct tcf_irule *iface;
5422
5423                 /* Remove the encap ancillary rules first. */
5424                 iface = dev_flow->tcf.vxlan_encap->iface;
5425                 assert(iface);
5426                 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5427                 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5428                 flow_tcf_encap_irule_release(iface);
5429                 dev_flow->tcf.vxlan_encap->iface = NULL;
5430                 break;
5431         }
5432         default:
5433                 assert(false);
5434                 DRV_LOG(WARNING, "Unsupported tunnel type");
5435                 break;
5436         }
5437         assert(vtep->refcnt);
5438         if (--vtep->refcnt == 0) {
5439                 LIST_REMOVE(vtep, next);
5440                 flow_tcf_vtep_delete(tcf, vtep);
5441         }
5442         pthread_mutex_unlock(&vtep_list_mutex);
5443 }
5444
5445 struct tcf_nlcb_query {
5446         uint32_t handle;
5447         uint32_t tc_flags;
5448         uint32_t flags_valid:1;
5449 };
5450
5451 /**
5452  * Collect queried rule attributes. This is callback routine called by
5453  * libmnl mnl_cb_run() in loop for every message in received packet.
5454  * Current implementation collects the flower flags only.
5455  *
5456  * @param[in] nlh
5457  *   Pointer to reply header.
5458  * @param[in, out] arg
5459  *   Context pointer for this callback.
5460  *
5461  * @return
5462  *   A positive, nonzero value on success (required by libmnl
5463  *   to continue messages processing).
5464  */
5465 static int
5466 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5467 {
5468         struct tcf_nlcb_query *query = arg;
5469         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5470         struct nlattr *na, *na_opt;
5471         bool flower = false;
5472
5473         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5474             tcm->tcm_handle != query->handle)
5475                 return 1;
5476         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5477                 switch (mnl_attr_get_type(na)) {
5478                 case TCA_KIND:
5479                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5480                                 /* Not flower filter, drop entire message. */
5481                                 return 1;
5482                         }
5483                         flower = true;
5484                         break;
5485                 case TCA_OPTIONS:
5486                         if (!flower) {
5487                                 /* Not flower options, drop entire message. */
5488                                 return 1;
5489                         }
5490                         /* Check nested flower options. */
5491                         mnl_attr_for_each_nested(na_opt, na) {
5492                                 switch (mnl_attr_get_type(na_opt)) {
5493                                 case TCA_FLOWER_FLAGS:
5494                                         query->flags_valid = 1;
5495                                         query->tc_flags =
5496                                                 mnl_attr_get_u32(na_opt);
5497                                         break;
5498                                 }
5499                         }
5500                         break;
5501                 }
5502         }
5503         return 1;
5504 }
5505
5506 /**
5507  * Query a TC flower rule flags via netlink.
5508  *
5509  * @param[in] tcf
5510  *   Context object initialized by mlx5_flow_tcf_context_create().
5511  * @param[in] dev_flow
5512  *   Pointer to the flow.
5513  * @param[out] pflags
5514  *   pointer to the data retrieved by the query.
5515  *
5516  * @return
5517  *   0 on success, a negative errno value otherwise.
5518  */
5519 static int
5520 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5521                      struct mlx5_flow *dev_flow,
5522                      uint32_t *pflags)
5523 {
5524         struct nlmsghdr *nlh;
5525         struct tcmsg *tcm;
5526         struct tcf_nlcb_query query = {
5527                 .handle = dev_flow->tcf.tcm->tcm_handle,
5528         };
5529
5530         nlh = mnl_nlmsg_put_header(tcf->buf);
5531         nlh->nlmsg_type = RTM_GETTFILTER;
5532         nlh->nlmsg_flags = NLM_F_REQUEST;
5533         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5534         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5535         /*
5536          * Ignore Netlink error for filter query operations.
5537          * The reply length is sent by kernel as errno.
5538          * Just check we got the flags option.
5539          */
5540         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5541         if (!query.flags_valid) {
5542                 *pflags = 0;
5543                 return -ENOENT;
5544         }
5545         *pflags = query.tc_flags;
5546         return 0;
5547 }
5548
5549 /**
5550  * Query and check the in_hw set for specified rule.
5551  *
5552  * @param[in] tcf
5553  *   Context object initialized by mlx5_flow_tcf_context_create().
5554  * @param[in] dev_flow
5555  *   Pointer to the flow to check.
5556  *
5557  * @return
5558  *   0 on success, a negative errno value otherwise.
5559  */
5560 static int
5561 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5562                     struct mlx5_flow *dev_flow)
5563 {
5564         uint32_t flags;
5565         int ret;
5566
5567         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5568         if (ret)
5569                 return ret;
5570         return  (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5571 }
5572
5573 /**
5574  * Remove flow from E-Switch by sending Netlink message.
5575  *
5576  * @param[in] dev
5577  *   Pointer to Ethernet device.
5578  * @param[in, out] flow
5579  *   Pointer to the sub flow.
5580  */
5581 static void
5582 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5583 {
5584         struct priv *priv = dev->data->dev_private;
5585         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5586         struct mlx5_flow *dev_flow;
5587         struct nlmsghdr *nlh;
5588
5589         if (!flow)
5590                 return;
5591         dev_flow = LIST_FIRST(&flow->dev_flows);
5592         if (!dev_flow)
5593                 return;
5594         /* E-Switch flow can't be expanded. */
5595         assert(!LIST_NEXT(dev_flow, next));
5596         if (dev_flow->tcf.applied) {
5597                 nlh = dev_flow->tcf.nlh;
5598                 nlh->nlmsg_type = RTM_DELTFILTER;
5599                 nlh->nlmsg_flags = NLM_F_REQUEST;
5600                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5601                 if (dev_flow->tcf.tunnel) {
5602                         assert(dev_flow->tcf.tunnel->vtep);
5603                         flow_tcf_vtep_release(ctx,
5604                                 dev_flow->tcf.tunnel->vtep,
5605                                 dev_flow);
5606                         dev_flow->tcf.tunnel->vtep = NULL;
5607                 }
5608                 dev_flow->tcf.applied = 0;
5609         }
5610 }
5611
5612 /**
5613  * Apply flow to E-Switch by sending Netlink message.
5614  *
5615  * @param[in] dev
5616  *   Pointer to Ethernet device.
5617  * @param[in, out] flow
5618  *   Pointer to the sub flow.
5619  * @param[out] error
5620  *   Pointer to the error structure.
5621  *
5622  * @return
5623  *   0 on success, a negative errno value otherwise and rte_errno is set.
5624  */
5625 static int
5626 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5627                struct rte_flow_error *error)
5628 {
5629         struct priv *priv = dev->data->dev_private;
5630         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5631         struct mlx5_flow *dev_flow;
5632         struct nlmsghdr *nlh;
5633
5634         dev_flow = LIST_FIRST(&flow->dev_flows);
5635         /* E-Switch flow can't be expanded. */
5636         assert(!LIST_NEXT(dev_flow, next));
5637         if (dev_flow->tcf.applied)
5638                 return 0;
5639         nlh = dev_flow->tcf.nlh;
5640         nlh->nlmsg_type = RTM_NEWTFILTER;
5641         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5642         if (dev_flow->tcf.tunnel) {
5643                 /*
5644                  * Replace the interface index, target for
5645                  * encapsulation, source for decapsulation.
5646                  */
5647                 assert(!dev_flow->tcf.tunnel->vtep);
5648                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5649                 /* Acquire actual VTEP device when rule is being applied. */
5650                 dev_flow->tcf.tunnel->vtep =
5651                         flow_tcf_vtep_acquire(ctx,
5652                                         dev_flow->tcf.tunnel->ifindex_org,
5653                                         dev_flow, error);
5654                 if (!dev_flow->tcf.tunnel->vtep)
5655                         return -rte_errno;
5656                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5657                                 dev_flow->tcf.tunnel->vtep->ifindex,
5658                                 dev_flow->tcf.tunnel->ifindex_org);
5659                 *dev_flow->tcf.tunnel->ifindex_ptr =
5660                         dev_flow->tcf.tunnel->vtep->ifindex;
5661         }
5662         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5663                 dev_flow->tcf.applied = 1;
5664                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5665                         return 0;
5666                 /*
5667                  * Rule was applied without skip_sw flag set.
5668                  * We should check whether the rule was acctually
5669                  * accepted by hardware (have look at in_hw flag).
5670                  */
5671                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5672                         flow_tcf_remove(dev, flow);
5673                         return rte_flow_error_set
5674                                 (error, ENOENT,
5675                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5676                                  "netlink: rule has no in_hw flag set");
5677                 }
5678                 return 0;
5679         }
5680         if (dev_flow->tcf.tunnel) {
5681                 /* Rollback the VTEP configuration if rule apply failed. */
5682                 assert(dev_flow->tcf.tunnel->vtep);
5683                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5684                                       dev_flow);
5685                 dev_flow->tcf.tunnel->vtep = NULL;
5686         }
5687         return rte_flow_error_set(error, rte_errno,
5688                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5689                                   "netlink: failed to create TC flow rule");
5690 }
5691
5692 /**
5693  * Remove flow from E-Switch and release resources of the device flow.
5694  *
5695  * @param[in] dev
5696  *   Pointer to Ethernet device.
5697  * @param[in, out] flow
5698  *   Pointer to the sub flow.
5699  */
5700 static void
5701 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5702 {
5703         struct mlx5_flow *dev_flow;
5704
5705         if (!flow)
5706                 return;
5707         flow_tcf_remove(dev, flow);
5708         if (flow->counter) {
5709                 if (--flow->counter->ref_cnt == 0) {
5710                         rte_free(flow->counter);
5711                         flow->counter = NULL;
5712                 }
5713         }
5714         dev_flow = LIST_FIRST(&flow->dev_flows);
5715         if (!dev_flow)
5716                 return;
5717         /* E-Switch flow can't be expanded. */
5718         assert(!LIST_NEXT(dev_flow, next));
5719         LIST_REMOVE(dev_flow, next);
5720         rte_free(dev_flow);
5721 }
5722
5723 /**
5724  * Helper routine for figuring the space size required for a parse buffer.
5725  *
5726  * @param array
5727  *   array of values to use.
5728  * @param idx
5729  *   Current location in array.
5730  * @param value
5731  *   Value to compare with.
5732  *
5733  * @return
5734  *   The maximum between the given value and the array value on index.
5735  */
5736 static uint16_t
5737 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5738 {
5739         return idx < 0 ? (value) : RTE_MAX((array)[idx], value);
5740 }
5741
5742 /**
5743  * Parse rtnetlink message attributes filling the attribute table with the info
5744  * retrieved.
5745  *
5746  * @param tb
5747  *   Attribute table to be filled.
5748  * @param[out] max
5749  *   Maxinum entry in the attribute table.
5750  * @param rte
5751  *   The attributes section in the message to be parsed.
5752  * @param len
5753  *   The length of the attributes section in the message.
5754  */
5755 static void
5756 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5757                          struct rtattr *rta, int len)
5758 {
5759         unsigned short type;
5760         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5761         while (RTA_OK(rta, len)) {
5762                 type = rta->rta_type;
5763                 if (type <= max && !tb[type])
5764                         tb[type] = rta;
5765                 rta = RTA_NEXT(rta, len);
5766         }
5767 }
5768
5769 /**
5770  * Extract flow counters from flower action.
5771  *
5772  * @param rta
5773  *   flower action stats properties in the Netlink message received.
5774  * @param rta_type
5775  *   The backward sequence of rta_types, as written in the attribute table,
5776  *   we need to traverse in order to get to the requested object.
5777  * @param idx
5778  *   Current location in rta_type table.
5779  * @param[out] data
5780  *   data holding the count statistics of the rte_flow retrieved from
5781  *   the message.
5782  *
5783  * @return
5784  *   0 if data was found and retrieved, -1 otherwise.
5785  */
5786 static int
5787 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5788                                        uint16_t rta_type[], int idx,
5789                                        struct gnet_stats_basic *data)
5790 {
5791         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5792                                                  TCA_STATS_BASIC);
5793         struct rtattr *tbs[tca_stats_max + 1];
5794
5795         if (rta == NULL || idx < 0)
5796                 return -1;
5797         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5798                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5799         switch (rta_type[idx]) {
5800         case TCA_STATS_BASIC:
5801                 if (tbs[TCA_STATS_BASIC]) {
5802                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5803                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5804                                sizeof(*data)));
5805                         return 0;
5806                 }
5807                 break;
5808         default:
5809                 break;
5810         }
5811         return -1;
5812 }
5813
5814 /**
5815  * Parse flower single action retrieving the requested action attribute,
5816  * if found.
5817  *
5818  * @param arg
5819  *   flower action properties in the Netlink message received.
5820  * @param rta_type
5821  *   The backward sequence of rta_types, as written in the attribute table,
5822  *   we need to traverse in order to get to the requested object.
5823  * @param idx
5824  *   Current location in rta_type table.
5825  * @param[out] data
5826  *   Count statistics retrieved from the message query.
5827  *
5828  * @return
5829  *   0 if data was found and retrieved, -1 otherwise.
5830  */
5831 static int
5832 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5833                                      uint16_t rta_type[], int idx, void *data)
5834 {
5835         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5836         struct rtattr *tb[tca_act_max + 1];
5837
5838         if (arg == NULL || idx < 0)
5839                 return -1;
5840         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5841                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5842         if (tb[TCA_ACT_KIND] == NULL)
5843                 return -1;
5844         switch (rta_type[idx]) {
5845         case TCA_ACT_STATS:
5846                 if (tb[TCA_ACT_STATS])
5847                         return flow_tcf_nl_action_stats_parse_and_get
5848                                         (tb[TCA_ACT_STATS],
5849                                          rta_type, --idx,
5850                                          (struct gnet_stats_basic *)data);
5851                 break;
5852         default:
5853                 break;
5854         }
5855         return -1;
5856 }
5857
5858 /**
5859  * Parse flower action section in the message retrieving the requested
5860  * attribute from the first action that provides it.
5861  *
5862  * @param opt
5863  *   flower section in the Netlink message received.
5864  * @param rta_type
5865  *   The backward sequence of rta_types, as written in the attribute table,
5866  *   we need to traverse in order to get to the requested object.
5867  * @param idx
5868  *   Current location in rta_type table.
5869  * @param[out] data
5870  *   data retrieved from the message query.
5871  *
5872  * @return
5873  *   0 if data was found and retrieved, -1 otherwise.
5874  */
5875 static int
5876 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5877                                  uint16_t rta_type[], int idx, void *data)
5878 {
5879         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5880         int i;
5881
5882         if (arg == NULL || idx < 0)
5883                 return -1;
5884         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5885                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5886         switch (rta_type[idx]) {
5887         /*
5888          * flow counters are stored in the actions defined by the flow
5889          * and not in the flow itself, therefore we need to traverse the
5890          * flower chain of actions in search for them.
5891          *
5892          * Note that the index is not decremented here.
5893          */
5894         case TCA_ACT_STATS:
5895                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5896                         if (tb[i] &&
5897                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
5898                                                               rta_type,
5899                                                               idx, data))
5900                                 return 0;
5901                 }
5902                 break;
5903         default:
5904                 break;
5905         }
5906         return -1;
5907 }
5908
5909 /**
5910  * Parse flower classifier options in the message, retrieving the requested
5911  * attribute if found.
5912  *
5913  * @param opt
5914  *   flower section in the Netlink message received.
5915  * @param rta_type
5916  *   The backward sequence of rta_types, as written in the attribute table,
5917  *   we need to traverse in order to get to the requested object.
5918  * @param idx
5919  *   Current location in rta_type table.
5920  * @param[out] data
5921  *   data retrieved from the message query.
5922  *
5923  * @return
5924  *   0 if data was found and retrieved, -1 otherwise.
5925  */
5926 static int
5927 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5928                                uint16_t rta_type[], int idx, void *data)
5929 {
5930         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5931                                                   TCA_FLOWER_ACT);
5932         struct rtattr *tb[tca_flower_max + 1];
5933
5934         if (!opt || idx < 0)
5935                 return -1;
5936         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5937                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5938         switch (rta_type[idx]) {
5939         case TCA_FLOWER_ACT:
5940                 if (tb[TCA_FLOWER_ACT])
5941                         return flow_tcf_nl_action_parse_and_get
5942                                                         (tb[TCA_FLOWER_ACT],
5943                                                          rta_type, --idx, data);
5944                 break;
5945         default:
5946                 break;
5947         }
5948         return -1;
5949 }
5950
5951 /**
5952  * Parse Netlink reply on filter query, retrieving the flow counters.
5953  *
5954  * @param nlh
5955  *   Message received from Netlink.
5956  * @param rta_type
5957  *   The backward sequence of rta_types, as written in the attribute table,
5958  *   we need to traverse in order to get to the requested object.
5959  * @param idx
5960  *   Current location in rta_type table.
5961  * @param[out] data
5962  *   data retrieved from the message query.
5963  *
5964  * @return
5965  *   0 if data was found and retrieved, -1 otherwise.
5966  */
5967 static int
5968 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5969                                  uint16_t rta_type[], int idx, void *data)
5970 {
5971         struct nlmsghdr *nlh = cnlh;
5972         struct tcmsg *t = NLMSG_DATA(nlh);
5973         int len = nlh->nlmsg_len;
5974         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5975         struct rtattr *tb[tca_max + 1];
5976
5977         if (idx < 0)
5978                 return -1;
5979         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5980             nlh->nlmsg_type != RTM_GETTFILTER &&
5981             nlh->nlmsg_type != RTM_DELTFILTER)
5982                 return -1;
5983         len -= NLMSG_LENGTH(sizeof(*t));
5984         if (len < 0)
5985                 return -1;
5986         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5987         /* Not a TC flower flow - bail out */
5988         if (!tb[TCA_KIND] ||
5989             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5990                 return -1;
5991         switch (rta_type[idx]) {
5992         case TCA_OPTIONS:
5993                 if (tb[TCA_OPTIONS])
5994                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5995                                                               rta_type,
5996                                                               --idx, data);
5997                 break;
5998         default:
5999                 break;
6000         }
6001         return -1;
6002 }
6003
6004 /**
6005  * A callback to parse Netlink reply on TC flower query.
6006  *
6007  * @param nlh
6008  *   Message received from Netlink.
6009  * @param[out] data
6010  *   Pointer to data area to be filled by the parsing routine.
6011  *   assumed to be a pointer to struct flow_tcf_stats_basic.
6012  *
6013  * @return
6014  *   MNL_CB_OK value.
6015  */
6016 static int
6017 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6018 {
6019         /*
6020          * The backward sequence of rta_types to pass in order to get
6021          *  to the counters.
6022          */
6023         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6024                                 TCA_FLOWER_ACT, TCA_OPTIONS };
6025         struct flow_tcf_stats_basic *sb_data = data;
6026         union {
6027                 const struct nlmsghdr *c;
6028                 struct nlmsghdr *nc;
6029         } tnlh = { .c = nlh };
6030
6031         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6032                                               RTE_DIM(rta_type) - 1,
6033                                               (void *)&sb_data->counters))
6034                 sb_data->valid = true;
6035         return MNL_CB_OK;
6036 }
6037
6038 /**
6039  * Query a TC flower rule for its statistics via netlink.
6040  *
6041  * @param[in] dev
6042  *   Pointer to Ethernet device.
6043  * @param[in] flow
6044  *   Pointer to the sub flow.
6045  * @param[out] data
6046  *   data retrieved by the query.
6047  * @param[out] error
6048  *   Perform verbose error reporting if not NULL.
6049  *
6050  * @return
6051  *   0 on success, a negative errno value otherwise and rte_errno is set.
6052  */
6053 static int
6054 flow_tcf_query_count(struct rte_eth_dev *dev,
6055                           struct rte_flow *flow,
6056                           void *data,
6057                           struct rte_flow_error *error)
6058 {
6059         struct flow_tcf_stats_basic sb_data;
6060         struct rte_flow_query_count *qc = data;
6061         struct priv *priv = dev->data->dev_private;
6062         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6063         struct mnl_socket *nl = ctx->nl;
6064         struct mlx5_flow *dev_flow;
6065         struct nlmsghdr *nlh;
6066         uint32_t seq = priv->tcf_context->seq++;
6067         ssize_t ret;
6068         assert(qc);
6069
6070         memset(&sb_data, 0, sizeof(sb_data));
6071         dev_flow = LIST_FIRST(&flow->dev_flows);
6072         /* E-Switch flow can't be expanded. */
6073         assert(!LIST_NEXT(dev_flow, next));
6074         if (!dev_flow->flow->counter)
6075                 goto notsup_exit;
6076         nlh = dev_flow->tcf.nlh;
6077         nlh->nlmsg_type = RTM_GETTFILTER;
6078         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6079         nlh->nlmsg_seq = seq;
6080         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
6081                 goto error_exit;
6082         do {
6083                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6084                 if (ret <= 0)
6085                         break;
6086                 ret = mnl_cb_run(ctx->buf, ret, seq,
6087                                  mnl_socket_get_portid(nl),
6088                                  flow_tcf_nl_message_get_stats_basic,
6089                                  (void *)&sb_data);
6090         } while (ret > 0);
6091         /* Return the delta from last reset. */
6092         if (sb_data.valid) {
6093                 /* Return the delta from last reset. */
6094                 qc->hits_set = 1;
6095                 qc->bytes_set = 1;
6096                 qc->hits = sb_data.counters.packets - flow->counter->hits;
6097                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
6098                 if (qc->reset) {
6099                         flow->counter->hits = sb_data.counters.packets;
6100                         flow->counter->bytes = sb_data.counters.bytes;
6101                 }
6102                 return 0;
6103         }
6104         return rte_flow_error_set(error, EINVAL,
6105                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6106                                   NULL,
6107                                   "flow does not have counter");
6108 error_exit:
6109         return rte_flow_error_set
6110                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6111                          NULL, "netlink: failed to read flow rule counters");
6112 notsup_exit:
6113         return rte_flow_error_set
6114                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6115                          NULL, "counters are not available.");
6116 }
6117
6118 /**
6119  * Query a flow.
6120  *
6121  * @see rte_flow_query()
6122  * @see rte_flow_ops
6123  */
6124 static int
6125 flow_tcf_query(struct rte_eth_dev *dev,
6126                struct rte_flow *flow,
6127                const struct rte_flow_action *actions,
6128                void *data,
6129                struct rte_flow_error *error)
6130 {
6131         int ret = -EINVAL;
6132
6133         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6134                 switch (actions->type) {
6135                 case RTE_FLOW_ACTION_TYPE_VOID:
6136                         break;
6137                 case RTE_FLOW_ACTION_TYPE_COUNT:
6138                         ret = flow_tcf_query_count(dev, flow, data, error);
6139                         break;
6140                 default:
6141                         return rte_flow_error_set(error, ENOTSUP,
6142                                                   RTE_FLOW_ERROR_TYPE_ACTION,
6143                                                   actions,
6144                                                   "action not supported");
6145                 }
6146         }
6147         return ret;
6148 }
6149
6150 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6151         .validate = flow_tcf_validate,
6152         .prepare = flow_tcf_prepare,
6153         .translate = flow_tcf_translate,
6154         .apply = flow_tcf_apply,
6155         .remove = flow_tcf_remove,
6156         .destroy = flow_tcf_destroy,
6157         .query = flow_tcf_query,
6158 };
6159
6160 /**
6161  * Create and configure a libmnl socket for Netlink flow rules.
6162  *
6163  * @return
6164  *   A valid libmnl socket object pointer on success, NULL otherwise and
6165  *   rte_errno is set.
6166  */
6167 static struct mnl_socket *
6168 flow_tcf_mnl_socket_create(void)
6169 {
6170         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6171
6172         if (nl) {
6173                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6174                                       sizeof(int));
6175                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6176                         return nl;
6177         }
6178         rte_errno = errno;
6179         if (nl)
6180                 mnl_socket_close(nl);
6181         return NULL;
6182 }
6183
6184 /**
6185  * Destroy a libmnl socket.
6186  *
6187  * @param nl
6188  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6189  */
6190 static void
6191 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6192 {
6193         if (nl)
6194                 mnl_socket_close(nl);
6195 }
6196
6197 /**
6198  * Initialize ingress qdisc of a given network interface.
6199  *
6200  * @param ctx
6201  *   Pointer to tc-flower context to use.
6202  * @param ifindex
6203  *   Index of network interface to initialize.
6204  * @param[out] error
6205  *   Perform verbose error reporting if not NULL.
6206  *
6207  * @return
6208  *   0 on success, a negative errno value otherwise and rte_errno is set.
6209  */
6210 int
6211 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6212                    unsigned int ifindex, struct rte_flow_error *error)
6213 {
6214         struct nlmsghdr *nlh;
6215         struct tcmsg *tcm;
6216         alignas(struct nlmsghdr)
6217         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6218                     SZ_NLATTR_STRZ_OF("ingress") +
6219                     MNL_BUF_EXTRA_SPACE];
6220
6221         /* Destroy existing ingress qdisc and everything attached to it. */
6222         nlh = mnl_nlmsg_put_header(buf);
6223         nlh->nlmsg_type = RTM_DELQDISC;
6224         nlh->nlmsg_flags = NLM_F_REQUEST;
6225         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6226         tcm->tcm_family = AF_UNSPEC;
6227         tcm->tcm_ifindex = ifindex;
6228         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6229         tcm->tcm_parent = TC_H_INGRESS;
6230         assert(sizeof(buf) >= nlh->nlmsg_len);
6231         /* Ignore errors when qdisc is already absent. */
6232         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6233             rte_errno != EINVAL && rte_errno != ENOENT)
6234                 return rte_flow_error_set(error, rte_errno,
6235                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6236                                           "netlink: failed to remove ingress"
6237                                           " qdisc");
6238         /* Create fresh ingress qdisc. */
6239         nlh = mnl_nlmsg_put_header(buf);
6240         nlh->nlmsg_type = RTM_NEWQDISC;
6241         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6242         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6243         tcm->tcm_family = AF_UNSPEC;
6244         tcm->tcm_ifindex = ifindex;
6245         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6246         tcm->tcm_parent = TC_H_INGRESS;
6247         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6248         assert(sizeof(buf) >= nlh->nlmsg_len);
6249         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6250                 return rte_flow_error_set(error, rte_errno,
6251                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6252                                           "netlink: failed to create ingress"
6253                                           " qdisc");
6254         return 0;
6255 }
6256
6257 /**
6258  * Create libmnl context for Netlink flow rules.
6259  *
6260  * @return
6261  *   A valid libmnl socket object pointer on success, NULL otherwise and
6262  *   rte_errno is set.
6263  */
6264 struct mlx5_flow_tcf_context *
6265 mlx5_flow_tcf_context_create(void)
6266 {
6267         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6268                                                         sizeof(*ctx),
6269                                                         sizeof(uint32_t));
6270         if (!ctx)
6271                 goto error;
6272         ctx->nl = flow_tcf_mnl_socket_create();
6273         if (!ctx->nl)
6274                 goto error;
6275         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6276         ctx->buf = rte_zmalloc(__func__,
6277                                ctx->buf_size, sizeof(uint32_t));
6278         if (!ctx->buf)
6279                 goto error;
6280         ctx->seq = random();
6281         return ctx;
6282 error:
6283         mlx5_flow_tcf_context_destroy(ctx);
6284         return NULL;
6285 }
6286
6287 /**
6288  * Destroy a libmnl context.
6289  *
6290  * @param ctx
6291  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6292  */
6293 void
6294 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6295 {
6296         if (!ctx)
6297                 return;
6298         flow_tcf_mnl_socket_destroy(ctx->nl);
6299         rte_free(ctx->buf);
6300         rte_free(ctx);
6301 }