net/mlx5: add E-switch VXLAN tunnel devices management
[dpdk.git] / drivers / net / mlx5 / mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
18 #include <stdalign.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <sys/socket.h>
24
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
28 #include <rte_flow.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31
32 #include "mlx5.h"
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
35
36 #ifdef HAVE_TC_ACT_VLAN
37
38 #include <linux/tc_act/tc_vlan.h>
39
40 #else /* HAVE_TC_ACT_VLAN */
41
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
50
/* Fallback definition of struct tc_vlan (linux/tc_act/tc_vlan.h). */
struct tc_vlan {
        tc_gen; /* Generic TC action fields (tc_gen macro from pkt_cls.h). */
        int v_action; /* One of TCA_VLAN_ACT_POP/PUSH/MODIFY above. */
};
55
56 #endif /* HAVE_TC_ACT_VLAN */
57
58 #ifdef HAVE_TC_ACT_PEDIT
59
60 #include <linux/tc_act/tc_pedit.h>
61
#else /* HAVE_TC_ACT_PEDIT */
63
/*
 * Fallback definitions of the TC pedit action ABI, normally provided by
 * linux/tc_act/tc_pedit.h when HAVE_TC_ACT_PEDIT is set.
 */
enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

/* Nested attributes inside TCA_PEDIT_KEY_EX. */
enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

/* Protocol header a pedit key applies to. */
enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

/* Rewrite semantics: overwrite the masked word or add to it. */
enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* Bits of the target word preserved (AND mask). */
        __u32 val; /* Value XORed into the masked result. */
        __u32 off; /* Byte offset of the target word in the header. */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys; /* Number of valid entries in keys[]. */
        unsigned char flags;
        struct tc_pedit_key keys[0]; /* Kernel ABI: zero-length trailing array. */
};
113
#endif /* HAVE_TC_ACT_PEDIT */
115
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
117
118 #include <linux/tc_act/tc_tunnel_key.h>
119
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
122 #endif
123
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
126 #endif
127
128 #else /* HAVE_TC_ACT_TUNNEL_KEY */
129
130 #define TCA_ACT_TUNNEL_KEY 17
131 #define TCA_TUNNEL_KEY_ACT_SET 1
132 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
133 #define TCA_TUNNEL_KEY_PARMS 2
134 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
135 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
136 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
137 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
138 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
139 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
140 #define TCA_TUNNEL_KEY_NO_CSUM 10
141
/* Fallback definition of struct tc_tunnel_key (linux/tc_act/tc_tunnel_key.h). */
struct tc_tunnel_key {
        tc_gen; /* Generic TC action fields. */
        int t_action; /* TCA_TUNNEL_KEY_ACT_SET or TCA_TUNNEL_KEY_ACT_RELEASE. */
};
146
147 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
148
149 /* Normally found in linux/netlink.h. */
150 #ifndef NETLINK_CAP_ACK
151 #define NETLINK_CAP_ACK 10
152 #endif
153
154 /* Normally found in linux/pkt_sched.h. */
155 #ifndef TC_H_MIN_INGRESS
156 #define TC_H_MIN_INGRESS 0xfff2u
157 #endif
158
159 /* Normally found in linux/pkt_cls.h. */
160 #ifndef TCA_CLS_FLAGS_SKIP_SW
161 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
162 #endif
163 #ifndef HAVE_TCA_CHAIN
164 #define TCA_CHAIN 11
165 #endif
166 #ifndef HAVE_TCA_FLOWER_ACT
167 #define TCA_FLOWER_ACT 3
168 #endif
169 #ifndef HAVE_TCA_FLOWER_FLAGS
170 #define TCA_FLOWER_FLAGS 22
171 #endif
172 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
173 #define TCA_FLOWER_KEY_ETH_TYPE 8
174 #endif
175 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
176 #define TCA_FLOWER_KEY_ETH_DST 4
177 #endif
178 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
179 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
180 #endif
181 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
182 #define TCA_FLOWER_KEY_ETH_SRC 6
183 #endif
184 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
185 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
186 #endif
187 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
188 #define TCA_FLOWER_KEY_IP_PROTO 9
189 #endif
190 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
191 #define TCA_FLOWER_KEY_IPV4_SRC 10
192 #endif
193 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
194 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
195 #endif
196 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
197 #define TCA_FLOWER_KEY_IPV4_DST 12
198 #endif
199 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
200 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
201 #endif
202 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
203 #define TCA_FLOWER_KEY_IPV6_SRC 14
204 #endif
205 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
206 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
207 #endif
208 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
209 #define TCA_FLOWER_KEY_IPV6_DST 16
210 #endif
211 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
212 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
213 #endif
214 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
215 #define TCA_FLOWER_KEY_TCP_SRC 18
216 #endif
217 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
218 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
219 #endif
220 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
221 #define TCA_FLOWER_KEY_TCP_DST 19
222 #endif
223 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
224 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
225 #endif
226 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
227 #define TCA_FLOWER_KEY_UDP_SRC 20
228 #endif
229 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
230 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
231 #endif
232 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
233 #define TCA_FLOWER_KEY_UDP_DST 21
234 #endif
235 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
236 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
237 #endif
238 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
239 #define TCA_FLOWER_KEY_VLAN_ID 23
240 #endif
241 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
242 #define TCA_FLOWER_KEY_VLAN_PRIO 24
243 #endif
244 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
245 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
246 #endif
247 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
248 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
249 #endif
250 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
251 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
252 #endif
253 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
254 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
255 #endif
256 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
257 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
258 #endif
259 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
260 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
261 #endif
262 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
263 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
264 #endif
265 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
266 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
267 #endif
268 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
269 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
270 #endif
271 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
272 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
273 #endif
274 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
275 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
276 #endif
277 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
278 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
279 #endif
280 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
281 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
282 #endif
283 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
284 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
285 #endif
286 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
287 #define TCA_FLOWER_KEY_TCP_FLAGS 71
288 #endif
289 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
290 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
291 #endif
292 #ifndef HAVE_TC_ACT_GOTO_CHAIN
293 #define TC_ACT_GOTO_CHAIN 0x20000000
294 #endif
295
296 #ifndef IPV6_ADDR_LEN
297 #define IPV6_ADDR_LEN 16
298 #endif
299
300 #ifndef IPV4_ADDR_LEN
301 #define IPV4_ADDR_LEN 4
302 #endif
303
304 #ifndef TP_PORT_LEN
305 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
306 #endif
307
308 #ifndef TTL_LEN
309 #define TTL_LEN 1
310 #endif
311
312 #ifndef TCA_ACT_MAX_PRIO
313 #define TCA_ACT_MAX_PRIO 32
314 #endif
315
316 /** UDP port range of VXLAN devices created by driver. */
317 #define MLX5_VXLAN_PORT_MIN 30000
318 #define MLX5_VXLAN_PORT_MAX 60000
319 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
320
321 /** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP, /* VXLAN decapsulation descriptor. */
        FLOW_TCF_TUNACT_VXLAN_ENCAP, /* VXLAN encapsulation descriptor. */
};
326
327 /** Flags used for @p mask in tunnel action encap descriptors. */
328 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
329 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
330 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
331 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
332 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
333 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
334 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
335 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
336 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
337
338 /**
339  * Structure for holding netlink context.
340  * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
341  * Using this (8KB) buffer size ensures that netlink messages will never be
342  * truncated.
343  */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size, in bytes. */
        uint8_t *buf; /* Message buffer of buf_size bytes. */
        /* NOTE(review): buffer allocation/ownership not visible in this chunk. */
};
350
351 /**
352  * Neigh rule structure. The neigh rule is applied via Netlink to
353  * outer tunnel iface in order to provide destination MAC address
354  * for the VXLAN encapsultion. The neigh rule is implicitly related
355  * to the Flow itself and can be shared by multiple Flows.
356  */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next; /* Chained on the owning VTEP list. */
        uint32_t refcnt; /* Number of users sharing this rule. */
        struct ether_addr eth; /* Destination MAC provided by the rule. */
        uint16_t mask; /* Presumably FLOW_TCF_ENCAP_* bits selecting the union member — TODO confirm. */
        union {
                struct {
                        rte_be32_t dst; /* IPv4 destination address. */
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN]; /* IPv6 destination address. */
                } ipv6;
        };
};
371
372 /**
373  * Local rule structure. The local rule is applied via Netlink to
374  * outer tunnel iface in order to provide local and peer IP addresses
375  * of the VXLAN tunnel for encapsulation. The local rule is implicitly
376  * related to the Flow itself and can be shared by multiple Flows.
377  */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next; /* Chained on the owning VTEP list. */
        uint32_t refcnt; /* Number of users sharing this rule. */
        uint16_t mask; /* Presumably FLOW_TCF_ENCAP_* bits selecting the union member — TODO confirm. */
        union {
                struct {
                        rte_be32_t dst; /* Peer IPv4 address. */
                        rte_be32_t src; /* Local IPv4 address. */
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN]; /* Peer IPv6 address. */
                        uint8_t src[IPV6_ADDR_LEN]; /* Local IPv6 address. */
                } ipv6;
        };
};
393
394 /** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next; /* Chained on the global VTEP list. */
        LIST_HEAD(, tcf_neigh_rule) neigh; /* Neigh rules attached to device. */
        LIST_HEAD(, tcf_local_rule) local; /* Local address rules attached. */
        uint32_t refcnt; /* Reference counter. */
        unsigned int ifindex; /**< Own interface index. */
        unsigned int ifouter; /**< Index of device attached to. */
        uint16_t port; /* UDP port; driver devices use MLX5_VXLAN_PORT_MIN..MAX — TODO confirm. */
        uint8_t created; /* Presumably nonzero when the netdev was created (not reused) by the PMD — verify. */
};
405
406 /** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type (enum flow_tcf_tunact_type). */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
        /* NOTE(review): ifindex_ptr appears to point into the prepared Netlink
         * message so the ifindex can be patched in place — confirm at usage. */
};
413
/** VXLAN decapsulation descriptor. */
struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr; /* Common tunnel header, must be first. */
        uint16_t udp_port; /* Presumably the outer UDP destination port — verify against caller. */
};
418
419 struct flow_tcf_vxlan_encap {
420         struct flow_tcf_tunnel_hdr hdr;
421         uint32_t mask;
422         struct {
423                 struct ether_addr dst;
424                 struct ether_addr src;
425         } eth;
426         union {
427                 struct {
428                         rte_be32_t dst;
429                         rte_be32_t src;
430                 } ipv4;
431                 struct {
432                         uint8_t dst[IPV6_ADDR_LEN];
433                         uint8_t src[IPV6_ADDR_LEN];
434                 } ipv6;
435         };
436 struct {
437                 rte_be16_t src;
438                 rte_be16_t dst;
439         } udp;
440         struct {
441                 uint8_t vni[3];
442         } vxlan;
443 };
444
445 /** Structure used when extracting the values of a flow counters
446  * from a netlink message.
447  */
struct flow_tcf_stats_basic {
        bool valid; /* Set when counters below were parsed from the message. */
        struct gnet_stats_basic counters; /* Kernel basic byte/packet counters. */
};
452
453 /** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty; /* Static storage: all-zero masks by definition. */
464
465 /** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                /* Full 24-bit VNI mask. */
               .vni = "\xff\xff\xff",
        },
};
516
517 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
518 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
519 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
520 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
521 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
522
523 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
524
525 /** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
        /* NOTE(review): table sized by PTOI_TABLE_SZ_MAX() (+2 slack); likely
         * terminated by a zero entry — confirm against the table builder. */
};
530
531 /* Due to a limitation on driver/FW. */
532 #define MLX5_TCF_GROUP_ID_MAX 3
533 #define MLX5_TCF_GROUP_PRIORITY_MAX 14
534
535 #define MLX5_TCF_FATE_ACTIONS \
536         (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
537          MLX5_FLOW_ACTION_JUMP)
538
539 #define MLX5_TCF_VLAN_ACTIONS \
540         (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
541          MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
542
543 #define MLX5_TCF_VXLAN_ACTIONS \
544         (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
545
546 #define MLX5_TCF_PEDIT_ACTIONS \
547         (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
548          MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
549          MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
550          MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
551          MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
552
553 #define MLX5_TCF_CONFIG_ACTIONS \
554         (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
555          MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
556          MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
557          (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
558
559 #define MAX_PEDIT_KEYS 128
560 #define SZ_PEDIT_KEY_VAL 4
561
562 #define NUM_OF_PEDIT_KEYS(sz) \
563         (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
564
/* Extended pedit key attributes (contents of a TCA_PEDIT_KEY_EX nest). */
struct pedit_key_ex {
        enum pedit_header_type htype; /* Header the key rewrites. */
        enum pedit_cmd cmd; /* SET (overwrite) or ADD semantics. */
};

/* Accumulates pedit keys while translating rte_flow modify-field actions. */
struct pedit_parser {
        struct tc_pedit_sel sel; /* Selector; sel.nkeys counts used keys. */
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS]; /* Parallel to keys[]. */
};
575
576 /**
577  * Create space for using the implicitly created TC flow counter.
578  *
579  * @param[in] dev
580  *   Pointer to the Ethernet device structure.
581  *
582  * @return
583  *   A pointer to the counter data structure, NULL otherwise and
584  *   rte_errno is set.
585  */
586 static struct mlx5_flow_counter *
587 flow_tcf_counter_new(void)
588 {
589         struct mlx5_flow_counter *cnt;
590
591         /*
592          * eswitch counter cannot be shared and its id is unknown.
593          * currently returning all with id 0.
594          * in the future maybe better to switch to unique numbers.
595          */
596         struct mlx5_flow_counter tmpl = {
597                 .ref_cnt = 1,
598         };
599         cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
600         if (!cnt) {
601                 rte_errno = ENOMEM;
602                 return NULL;
603         }
604         *cnt = tmpl;
605         /* Implicit counter, do not add to list. */
606         return cnt;
607 }
608
609 /**
610  * Set pedit key of MAC address
611  *
612  * @param[in] actions
613  *   pointer to action specification
614  * @param[in,out] p_parser
615  *   pointer to pedit_parser
616  */
617 static void
618 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
619                            struct pedit_parser *p_parser)
620 {
621         int idx = p_parser->sel.nkeys;
622         uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
623                                         offsetof(struct ether_hdr, s_addr) :
624                                         offsetof(struct ether_hdr, d_addr);
625         const struct rte_flow_action_set_mac *conf =
626                 (const struct rte_flow_action_set_mac *)actions->conf;
627
628         p_parser->keys[idx].off = off;
629         p_parser->keys[idx].mask = ~UINT32_MAX;
630         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
631         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
632         memcpy(&p_parser->keys[idx].val,
633                 conf->mac_addr, SZ_PEDIT_KEY_VAL);
634         idx++;
635         p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
636         p_parser->keys[idx].mask = 0xFFFF0000;
637         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
638         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
639         memcpy(&p_parser->keys[idx].val,
640                 conf->mac_addr + SZ_PEDIT_KEY_VAL,
641                 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
642         p_parser->sel.nkeys = (++idx);
643 }
644
645 /**
646  * Set pedit key of decrease/set ttl
647  *
648  * @param[in] actions
649  *   pointer to action specification
650  * @param[in,out] p_parser
651  *   pointer to pedit_parser
652  * @param[in] item_flags
653  *   flags of all items presented
654  */
655 static void
656 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
657                                 struct pedit_parser *p_parser,
658                                 uint64_t item_flags)
659 {
660         int idx = p_parser->sel.nkeys;
661
662         p_parser->keys[idx].mask = 0xFFFFFF00;
663         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
664                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
665                 p_parser->keys[idx].off =
666                         offsetof(struct ipv4_hdr, time_to_live);
667         }
668         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
669                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
670                 p_parser->keys[idx].off =
671                         offsetof(struct ipv6_hdr, hop_limits);
672         }
673         if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
674                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
675                 p_parser->keys[idx].val = 0x000000FF;
676         } else {
677                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
678                 p_parser->keys[idx].val =
679                         (__u32)((const struct rte_flow_action_set_ttl *)
680                          actions->conf)->ttl_value;
681         }
682         p_parser->sel.nkeys = (++idx);
683 }
684
685 /**
686  * Set pedit key of transport (TCP/UDP) port value
687  *
688  * @param[in] actions
689  *   pointer to action specification
690  * @param[in,out] p_parser
691  *   pointer to pedit_parser
692  * @param[in] item_flags
693  *   flags of all items presented
694  */
695 static void
696 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
697                                 struct pedit_parser *p_parser,
698                                 uint64_t item_flags)
699 {
700         int idx = p_parser->sel.nkeys;
701
702         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
703                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
704         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
705                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
706         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
707         /* offset of src/dst port is same for TCP and UDP */
708         p_parser->keys[idx].off =
709                 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
710                 offsetof(struct tcp_hdr, src_port) :
711                 offsetof(struct tcp_hdr, dst_port);
712         p_parser->keys[idx].mask = 0xFFFF0000;
713         p_parser->keys[idx].val =
714                 (__u32)((const struct rte_flow_action_set_tp *)
715                                 actions->conf)->port;
716         p_parser->sel.nkeys = (++idx);
717 }
718
719 /**
720  * Set pedit key of ipv6 address
721  *
722  * @param[in] actions
723  *   pointer to action specification
724  * @param[in,out] p_parser
725  *   pointer to pedit_parser
726  */
727 static void
728 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
729                                  struct pedit_parser *p_parser)
730 {
731         int idx = p_parser->sel.nkeys;
732         int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
733         int off_base =
734                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
735                 offsetof(struct ipv6_hdr, src_addr) :
736                 offsetof(struct ipv6_hdr, dst_addr);
737         const struct rte_flow_action_set_ipv6 *conf =
738                 (const struct rte_flow_action_set_ipv6 *)actions->conf;
739
740         for (int i = 0; i < keys; i++, idx++) {
741                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
742                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
743                 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
744                 p_parser->keys[idx].mask = ~UINT32_MAX;
745                 memcpy(&p_parser->keys[idx].val,
746                         conf->ipv6_addr + i *  SZ_PEDIT_KEY_VAL,
747                         SZ_PEDIT_KEY_VAL);
748         }
749         p_parser->sel.nkeys += keys;
750 }
751
752 /**
753  * Set pedit key of ipv4 address
754  *
755  * @param[in] actions
756  *   pointer to action specification
757  * @param[in,out] p_parser
758  *   pointer to pedit_parser
759  */
760 static void
761 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
762                                  struct pedit_parser *p_parser)
763 {
764         int idx = p_parser->sel.nkeys;
765
766         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
767         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
768         p_parser->keys[idx].off =
769                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
770                 offsetof(struct ipv4_hdr, src_addr) :
771                 offsetof(struct ipv4_hdr, dst_addr);
772         p_parser->keys[idx].mask = ~UINT32_MAX;
773         p_parser->keys[idx].val =
774                 ((const struct rte_flow_action_set_ipv4 *)
775                  actions->conf)->ipv4_addr;
776         p_parser->sel.nkeys = (++idx);
777 }
778
/**
 * Create the pedit's na attribute in netlink message
 * on pre-allocate message buffer
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification; advanced past all
 *   consumed modify-header actions on return.
 * @param[in] item_flags
 *   flags of all item presented
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* all modify header actions should be in one tc-pedit action */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        /* First non-pedit action terminates the batch. */
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
        p_parser.sel.action = TC_ACT_PIPE;
        /* Selector plus the plain key array go into TCA_PEDIT_PARMS_EX. */
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        /* Extended key data (htype/cmd) goes in a parallel nested list. */
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
        /*
         * Step back one action: *actions is at the first unconsumed action
         * (or END) and the caller's loop increment will advance it again.
         */
        (*actions)--;
}
855
/**
 * Calculate maximum memory size needed by one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (rte_flow action).
 *
 * @param[in,out] actions
 *   Actions specification. The pointer is advanced over the consecutive
 *   header-rewrite actions consumed here and left on the last consumed
 *   one, so the caller's loop increment moves to the next action.
 * @param[in,out] action_flags
 *   MLX5_FLOW_ACTION_* flags; bits for every consumed action are OR-ed in.
 *
 * @return
 *   Maximum memory size (netlink attribute bytes) of one TC-pedit action.
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
				uint64_t *action_flags)
{
	int pedit_size = 0;
	int keys = 0;
	uint64_t flags = 0;

	/* Fixed overhead: action index nest + "pedit" kind + options nest. */
	pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
		      SZ_NLATTR_STRZ_OF("pedit") +
		      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
	/* Consume all consecutive header-rewrite actions. */
	for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
		switch ((*actions)->type) {
		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
			keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
			keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
			keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
			keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
			/* TCP is as same as UDP */
			keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
			flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
			/* TCP is as same as UDP */
			keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
			flags |= MLX5_FLOW_ACTION_SET_TP_DST;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_TTL:
			keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
			flags |= MLX5_FLOW_ACTION_SET_TTL;
			break;
		case RTE_FLOW_ACTION_TYPE_DEC_TTL:
			keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
			flags |= MLX5_FLOW_ACTION_DEC_TTL;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
			keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
			break;
		case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
			keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
			flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
			break;
		default:
			/* First non-pedit action terminates the scan. */
			goto get_pedit_action_size_done;
		}
	}
get_pedit_action_size_done:
	/* TCA_PEDIT_PARAMS_EX */
	pedit_size +=
		SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
				  keys * sizeof(struct tc_pedit_key));
	pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
	pedit_size += keys *
		      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
		      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
		       SZ_NLATTR_DATA_OF(2));
	(*action_flags) |= flags;
	/* Step back to compensate the caller's loop increment. */
	(*actions)--;
	return pedit_size;
}
943
944 /**
945  * Retrieve mask for pattern item.
946  *
947  * This function does basic sanity checks on a pattern item in order to
948  * return the most appropriate mask for it.
949  *
950  * @param[in] item
951  *   Item specification.
952  * @param[in] mask_default
953  *   Default mask for pattern item as specified by the flow API.
954  * @param[in] mask_supported
955  *   Mask fields supported by the implementation.
956  * @param[in] mask_empty
957  *   Empty mask to return when there is no specification.
958  * @param[out] error
959  *   Perform verbose error reporting if not NULL.
960  *
961  * @return
962  *   Either @p item->mask or one of the mask parameters on success, NULL
963  *   otherwise and rte_errno is set.
964  */
965 static const void *
966 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
967                    const void *mask_supported, const void *mask_empty,
968                    size_t mask_size, struct rte_flow_error *error)
969 {
970         const uint8_t *mask;
971         size_t i;
972
973         /* item->last and item->mask cannot exist without item->spec. */
974         if (!item->spec && (item->mask || item->last)) {
975                 rte_flow_error_set(error, EINVAL,
976                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
977                                    "\"mask\" or \"last\" field provided without"
978                                    " a corresponding \"spec\"");
979                 return NULL;
980         }
981         /* No spec, no mask, no problem. */
982         if (!item->spec)
983                 return mask_empty;
984         mask = item->mask ? item->mask : mask_default;
985         assert(mask);
986         /*
987          * Single-pass check to make sure that:
988          * - Mask is supported, no bits are set outside mask_supported.
989          * - Both item->spec and item->last are included in mask.
990          */
991         for (i = 0; i != mask_size; ++i) {
992                 if (!mask[i])
993                         continue;
994                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
995                     ((const uint8_t *)mask_supported)[i]) {
996                         rte_flow_error_set(error, ENOTSUP,
997                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
998                                            "unsupported field found"
999                                            " in \"mask\"");
1000                         return NULL;
1001                 }
1002                 if (item->last &&
1003                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
1004                     (((const uint8_t *)item->last)[i] & mask[i])) {
1005                         rte_flow_error_set(error, EINVAL,
1006                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1007                                            item->last,
1008                                            "range between \"spec\" and \"last\""
1009                                            " not comprised in \"mask\"");
1010                         return NULL;
1011                 }
1012         }
1013         return mask;
1014 }
1015
1016 /**
1017  * Build a conversion table between port ID and ifindex.
1018  *
1019  * @param[in] dev
1020  *   Pointer to Ethernet device.
1021  * @param[out] ptoi
1022  *   Pointer to ptoi table.
1023  * @param[in] len
1024  *   Size of ptoi table provided.
1025  *
1026  * @return
1027  *   Size of ptoi table filled.
1028  */
1029 static unsigned int
1030 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1031                           unsigned int len)
1032 {
1033         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1034         uint16_t port_id[n + 1];
1035         unsigned int i;
1036         unsigned int own = 0;
1037
1038         /* At least one port is needed when no switch domain is present. */
1039         if (!n) {
1040                 n = 1;
1041                 port_id[0] = dev->data->port_id;
1042         } else {
1043                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1044         }
1045         if (n > len)
1046                 return 0;
1047         for (i = 0; i != n; ++i) {
1048                 struct rte_eth_dev_info dev_info;
1049
1050                 rte_eth_dev_info_get(port_id[i], &dev_info);
1051                 if (port_id[i] == dev->data->port_id)
1052                         own = i;
1053                 ptoi[i].port_id = port_id[i];
1054                 ptoi[i].ifindex = dev_info.if_index;
1055         }
1056         /* Ensure first entry of ptoi[] is the current device. */
1057         if (own) {
1058                 ptoi[n] = ptoi[0];
1059                 ptoi[0] = ptoi[own];
1060                 ptoi[own] = ptoi[n];
1061         }
1062         /* An entry with zero ifindex terminates ptoi[]. */
1063         ptoi[n].port_id = 0;
1064         ptoi[n].ifindex = 0;
1065         return n;
1066 }
1067
1068 /**
1069  * Verify the @p attr will be correctly understood by the E-switch.
1070  *
1071  * @param[in] attr
1072  *   Pointer to flow attributes
1073  * @param[out] error
1074  *   Pointer to error structure.
1075  *
1076  * @return
1077  *   0 on success, a negative errno value otherwise and rte_errno is set.
1078  */
1079 static int
1080 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1081                              struct rte_flow_error *error)
1082 {
1083         /*
1084          * Supported attributes: groups, some priorities and ingress only.
1085          * group is supported only if kernel supports chain. Don't care about
1086          * transfer as it is the caller's problem.
1087          */
1088         if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1089                 return rte_flow_error_set(error, ENOTSUP,
1090                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1091                                           "group ID larger than "
1092                                           RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1093                                           " isn't supported");
1094         else if (attr->group > 0 &&
1095                  attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1096                 return rte_flow_error_set(error, ENOTSUP,
1097                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1098                                           attr,
1099                                           "lowest priority level is "
1100                                           RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1101                                           " when group is configured");
1102         else if (attr->priority > 0xfffe)
1103                 return rte_flow_error_set(error, ENOTSUP,
1104                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1105                                           attr,
1106                                           "lowest priority level is 0xfffe");
1107         if (!attr->ingress)
1108                 return rte_flow_error_set(error, EINVAL,
1109                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1110                                           attr, "only ingress is supported");
1111         if (attr->egress)
1112                 return rte_flow_error_set(error, ENOTSUP,
1113                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1114                                           attr, "egress is not supported");
1115         return 0;
1116 }
1117
1118 /**
1119  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1120  * The routine checks the L2 fields to be used in encapsulation header.
1121  *
1122  * @param[in] item
1123  *   Pointer to the item structure.
1124  * @param[out] error
1125  *   Pointer to the error structure.
1126  *
1127  * @return
1128  *   0 on success, a negative errno value otherwise and rte_errno is set.
1129  **/
1130 static int
1131 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1132                                   struct rte_flow_error *error)
1133 {
1134         const struct rte_flow_item_eth *spec = item->spec;
1135         const struct rte_flow_item_eth *mask = item->mask;
1136
1137         if (!spec) {
1138                 /*
1139                  * Specification for L2 addresses can be empty
1140                  * because these ones are optional and not
1141                  * required directly by tc rule. Kernel tries
1142                  * to resolve these ones on its own
1143                  */
1144                 return 0;
1145         }
1146         if (!mask) {
1147                 /* If mask is not specified use the default one. */
1148                 mask = &rte_flow_item_eth_mask;
1149         }
1150         if (memcmp(&mask->dst,
1151                    &flow_tcf_mask_empty.eth.dst,
1152                    sizeof(flow_tcf_mask_empty.eth.dst))) {
1153                 if (memcmp(&mask->dst,
1154                            &rte_flow_item_eth_mask.dst,
1155                            sizeof(rte_flow_item_eth_mask.dst)))
1156                         return rte_flow_error_set
1157                                 (error, ENOTSUP,
1158                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1159                                  "no support for partial mask on"
1160                                  " \"eth.dst\" field");
1161         }
1162         if (memcmp(&mask->src,
1163                    &flow_tcf_mask_empty.eth.src,
1164                    sizeof(flow_tcf_mask_empty.eth.src))) {
1165                 if (memcmp(&mask->src,
1166                            &rte_flow_item_eth_mask.src,
1167                            sizeof(rte_flow_item_eth_mask.src)))
1168                         return rte_flow_error_set
1169                                 (error, ENOTSUP,
1170                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1171                                  "no support for partial mask on"
1172                                  " \"eth.src\" field");
1173         }
1174         if (mask->type != RTE_BE16(0x0000)) {
1175                 if (mask->type != RTE_BE16(0xffff))
1176                         return rte_flow_error_set
1177                                 (error, ENOTSUP,
1178                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1179                                  "no support for partial mask on"
1180                                  " \"eth.type\" field");
1181                 DRV_LOG(WARNING,
1182                         "outer ethernet type field"
1183                         " cannot be forced for vxlan"
1184                         " encapsulation, parameter ignored");
1185         }
1186         return 0;
1187 }
1188
1189 /**
1190  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1191  * The routine checks the IPv4 fields to be used in encapsulation header.
1192  *
1193  * @param[in] item
1194  *   Pointer to the item structure.
1195  * @param[out] error
1196  *   Pointer to the error structure.
1197  *
1198  * @return
1199  *   0 on success, a negative errno value otherwise and rte_errno is set.
1200  **/
1201 static int
1202 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1203                                    struct rte_flow_error *error)
1204 {
1205         const struct rte_flow_item_ipv4 *spec = item->spec;
1206         const struct rte_flow_item_ipv4 *mask = item->mask;
1207
1208         if (!spec) {
1209                 /*
1210                  * Specification for IP addresses cannot be empty
1211                  * because it is required by tunnel_key parameter.
1212                  */
1213                 return rte_flow_error_set(error, EINVAL,
1214                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1215                                           "NULL outer ipv4 address"
1216                                           " specification for vxlan"
1217                                           " encapsulation");
1218         }
1219         if (!mask)
1220                 mask = &rte_flow_item_ipv4_mask;
1221         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1222                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1223                         return rte_flow_error_set
1224                                 (error, ENOTSUP,
1225                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1226                                  "no support for partial mask on"
1227                                  " \"ipv4.hdr.dst_addr\" field"
1228                                  " for vxlan encapsulation");
1229                 /* More IPv4 address validations can be put here. */
1230         } else {
1231                 /*
1232                  * Kernel uses the destination IP address to determine
1233                  * the routing path and obtain the MAC destination
1234                  * address, so IP destination address must be
1235                  * specified in the tc rule.
1236                  */
1237                 return rte_flow_error_set(error, EINVAL,
1238                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1239                                           "outer ipv4 destination address"
1240                                           " must be specified for"
1241                                           " vxlan encapsulation");
1242         }
1243         if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1244                 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1245                         return rte_flow_error_set
1246                                 (error, ENOTSUP,
1247                                  RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1248                                  "no support for partial mask on"
1249                                  " \"ipv4.hdr.src_addr\" field"
1250                                  " for vxlan encapsulation");
1251                 /* More IPv4 address validations can be put here. */
1252         } else {
1253                 /*
1254                  * Kernel uses the source IP address to select the
1255                  * interface for egress encapsulated traffic, so
1256                  * it must be specified in the tc rule.
1257                  */
1258                 return rte_flow_error_set(error, EINVAL,
1259                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1260                                           "outer ipv4 source address"
1261                                           " must be specified for"
1262                                           " vxlan encapsulation");
1263         }
1264         return 0;
1265 }
1266
/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
				   struct rte_flow_error *error)
{
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;

	if (!spec) {
		/*
		 * Specification for IP addresses cannot be empty
		 * because it is required by tunnel_key parameter.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "NULL outer ipv6 address"
					  " specification for"
					  " vxlan encapsulation");
	}
	if (!mask)
		mask = &rte_flow_item_ipv6_mask;
	/* Only an empty or a full (all-ones) address mask is supported. */
	if (memcmp(&mask->hdr.dst_addr,
		   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
		   IPV6_ADDR_LEN)) {
		if (memcmp(&mask->hdr.dst_addr,
			   &rte_flow_item_ipv6_mask.hdr.dst_addr,
			   IPV6_ADDR_LEN))
			return rte_flow_error_set
					(error, ENOTSUP,
					 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
					 "no support for partial mask on"
					 " \"ipv6.hdr.dst_addr\" field"
					 " for vxlan encapsulation");
		/* More IPv6 address validations can be put here. */
	} else {
		/*
		 * Kernel uses the destination IP address to determine
		 * the routing path and obtain the MAC destination
		 * address (next hop or gateway), so IP destination
		 * address must be specified within the tc rule.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "outer ipv6 destination address"
					  " must be specified for"
					  " vxlan encapsulation");
	}
	if (memcmp(&mask->hdr.src_addr,
		   &flow_tcf_mask_empty.ipv6.hdr.src_addr,
		   IPV6_ADDR_LEN)) {
		if (memcmp(&mask->hdr.src_addr,
			   &rte_flow_item_ipv6_mask.hdr.src_addr,
			   IPV6_ADDR_LEN))
			return rte_flow_error_set
					(error, ENOTSUP,
					 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
					 "no support for partial mask on"
					 " \"ipv6.hdr.src_addr\" field"
					 " for vxlan encapsulation");
		/* More L3 address validation can be put here. */
	} else {
		/*
		 * Kernel uses the source IP address to select the
		 * interface for egress encapsulated traffic, so
		 * it must be specified in the tc rule.
		 */
		return rte_flow_error_set(error, EINVAL,
					  RTE_FLOW_ERROR_TYPE_ITEM, item,
					  "outer L3 source address"
					  " must be specified for"
					  " vxlan encapsulation");
	}
	return 0;
}
1352
1353 /**
1354  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1355  * The routine checks the UDP fields to be used in encapsulation header.
1356  *
1357  * @param[in] item
1358  *   Pointer to the item structure.
1359  * @param[out] error
1360  *   Pointer to the error structure.
1361  *
1362  * @return
1363  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1364  **/
1365 static int
1366 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1367                                   struct rte_flow_error *error)
1368 {
1369         const struct rte_flow_item_udp *spec = item->spec;
1370         const struct rte_flow_item_udp *mask = item->mask;
1371
1372         if (!spec) {
1373                 /*
1374                  * Specification for UDP ports cannot be empty
1375                  * because it is required by tunnel_key parameter.
1376                  */
1377                 return rte_flow_error_set(error, EINVAL,
1378                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1379                                           "NULL UDP port specification "
1380                                           " for vxlan encapsulation");
1381         }
1382         if (!mask)
1383                 mask = &rte_flow_item_udp_mask;
1384         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1385                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1386                         return rte_flow_error_set
1387                                         (error, ENOTSUP,
1388                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1389                                          "no support for partial mask on"
1390                                          " \"udp.hdr.dst_port\" field"
1391                                          " for vxlan encapsulation");
1392                 if (!spec->hdr.dst_port)
1393                         return rte_flow_error_set
1394                                         (error, EINVAL,
1395                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1396                                          "outer UDP remote port cannot be"
1397                                          " 0 for vxlan encapsulation");
1398         } else {
1399                 return rte_flow_error_set(error, EINVAL,
1400                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1401                                           "outer UDP remote port"
1402                                           " must be specified for"
1403                                           " vxlan encapsulation");
1404         }
1405         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1406                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1407                         return rte_flow_error_set
1408                                         (error, ENOTSUP,
1409                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1410                                          "no support for partial mask on"
1411                                          " \"udp.hdr.src_port\" field"
1412                                          " for vxlan encapsulation");
1413                 DRV_LOG(WARNING,
1414                         "outer UDP source port cannot be"
1415                         " forced for vxlan encapsulation,"
1416                         " parameter ignored");
1417         }
1418         return 0;
1419 }
1420
1421 /**
1422  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1423  * The routine checks the VNIP fields to be used in encapsulation header.
1424  *
1425  * @param[in] item
1426  *   Pointer to the item structure.
1427  * @param[out] error
1428  *   Pointer to the error structure.
1429  *
1430  * @return
1431  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1432  **/
1433 static int
1434 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1435                                   struct rte_flow_error *error)
1436 {
1437         const struct rte_flow_item_vxlan *spec = item->spec;
1438         const struct rte_flow_item_vxlan *mask = item->mask;
1439
1440         if (!spec) {
1441                 /* Outer VNI is required by tunnel_key parameter. */
1442                 return rte_flow_error_set(error, EINVAL,
1443                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1444                                           "NULL VNI specification"
1445                                           " for vxlan encapsulation");
1446         }
1447         if (!mask)
1448                 mask = &rte_flow_item_vxlan_mask;
1449         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1450                 return rte_flow_error_set(error, EINVAL,
1451                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1452                                           "outer VNI must be specified "
1453                                           "for vxlan encapsulation");
1454         if (mask->vni[0] != 0xff ||
1455             mask->vni[1] != 0xff ||
1456             mask->vni[2] != 0xff)
1457                 return rte_flow_error_set(error, ENOTSUP,
1458                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1459                                           "no support for partial mask on"
1460                                           " \"vxlan.vni\" field");
1461
1462         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1463                 return rte_flow_error_set(error, EINVAL,
1464                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1465                                           "vxlan vni cannot be 0");
1466         return 0;
1467 }
1468
1469 /**
1470  * Validate VXLAN_ENCAP action item list for E-Switch.
1471  * The routine checks items to be used in encapsulation header.
1472  *
1473  * @param[in] action
1474  *   Pointer to the VXLAN_ENCAP action structure.
1475  * @param[out] error
1476  *   Pointer to the error structure.
1477  *
1478  * @return
1479  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1480  **/
1481 static int
1482 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1483                               struct rte_flow_error *error)
1484 {
1485         const struct rte_flow_item *items;
1486         int ret;
1487         uint32_t item_flags = 0;
1488
1489         if (!action->conf)
1490                 return rte_flow_error_set(error, EINVAL,
1491                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1492                                           "Missing vxlan tunnel"
1493                                           " action configuration");
1494         items = ((const struct rte_flow_action_vxlan_encap *)
1495                                         action->conf)->definition;
1496         if (!items)
1497                 return rte_flow_error_set(error, EINVAL,
1498                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1499                                           "Missing vxlan tunnel"
1500                                           " encapsulation parameters");
1501         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1502                 switch (items->type) {
1503                 case RTE_FLOW_ITEM_TYPE_VOID:
1504                         break;
1505                 case RTE_FLOW_ITEM_TYPE_ETH:
1506                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1507                                                           error);
1508                         if (ret < 0)
1509                                 return ret;
1510                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1511                         if (ret < 0)
1512                                 return ret;
1513                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1514                         break;
1515                 break;
1516                 case RTE_FLOW_ITEM_TYPE_IPV4:
1517                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1518                                                            error);
1519                         if (ret < 0)
1520                                 return ret;
1521                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1522                         if (ret < 0)
1523                                 return ret;
1524                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1525                         break;
1526                 case RTE_FLOW_ITEM_TYPE_IPV6:
1527                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1528                                                            error);
1529                         if (ret < 0)
1530                                 return ret;
1531                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1532                         if (ret < 0)
1533                                 return ret;
1534                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1535                         break;
1536                 case RTE_FLOW_ITEM_TYPE_UDP:
1537                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1538                                                            0xFF, error);
1539                         if (ret < 0)
1540                                 return ret;
1541                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1542                         if (ret < 0)
1543                                 return ret;
1544                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1545                         break;
1546                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1547                         ret = mlx5_flow_validate_item_vxlan(items,
1548                                                             item_flags, error);
1549                         if (ret < 0)
1550                                 return ret;
1551                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1552                         if (ret < 0)
1553                                 return ret;
1554                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1555                         break;
1556                 default:
1557                         return rte_flow_error_set
1558                                         (error, ENOTSUP,
1559                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1560                                          "vxlan encap item not supported");
1561                 }
1562         }
1563         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1564                 return rte_flow_error_set(error, EINVAL,
1565                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1566                                           "no outer IP layer found"
1567                                           " for vxlan encapsulation");
1568         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1569                 return rte_flow_error_set(error, EINVAL,
1570                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1571                                           "no outer UDP layer found"
1572                                           " for vxlan encapsulation");
1573         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1574                 return rte_flow_error_set(error, EINVAL,
1575                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1576                                           "no VXLAN VNI found"
1577                                           " for vxlan encapsulation");
1578         return 0;
1579 }
1580
1581 /**
1582  * Validate RTE_FLOW_ITEM_TYPE_IPV4 item if VXLAN_DECAP action
1583  * is present in actions list.
1584  *
1585  * @param[in] ipv4
1586  *   Outer IPv4 address item (if any, NULL otherwise).
1587  * @param[out] error
1588  *   Pointer to the error structure.
1589  *
1590  * @return
1591  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1592  **/
1593 static int
1594 flow_tcf_validate_vxlan_decap_ipv4(const struct rte_flow_item *ipv4,
1595                                    struct rte_flow_error *error)
1596 {
1597         const struct rte_flow_item_ipv4 *spec = ipv4->spec;
1598         const struct rte_flow_item_ipv4 *mask = ipv4->mask;
1599
1600         if (!spec) {
1601                 /*
1602                  * Specification for IP addresses cannot be empty
1603                  * because it is required as decap parameter.
1604                  */
1605                 return rte_flow_error_set(error, EINVAL,
1606                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1607                                           "NULL outer ipv4 address"
1608                                           " specification for vxlan"
1609                                           " for vxlan decapsulation");
1610         }
1611         if (!mask)
1612                 mask = &rte_flow_item_ipv4_mask;
1613         if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1614                 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1615                         return rte_flow_error_set
1616                                         (error, ENOTSUP,
1617                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1618                                          "no support for partial mask on"
1619                                          " \"ipv4.hdr.dst_addr\" field");
1620                 /* More IP address validations can be put here. */
1621         } else {
1622                 /*
1623                  * Kernel uses the destination IP address
1624                  * to determine the ingress network interface
1625                  * for traffic being decapsulated.
1626                  */
1627                 return rte_flow_error_set(error, EINVAL,
1628                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv4,
1629                                           "outer ipv4 destination address"
1630                                           " must be specified for"
1631                                           " vxlan decapsulation");
1632         }
1633         /* Source IP address is optional for decap. */
1634         if (mask->hdr.src_addr != RTE_BE32(0x00000000) &&
1635             mask->hdr.src_addr != RTE_BE32(0xffffffff))
1636                 return rte_flow_error_set(error, ENOTSUP,
1637                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1638                                           "no support for partial mask on"
1639                                           " \"ipv4.hdr.src_addr\" field");
1640         return 0;
1641 }
1642
1643 /**
1644  * Validate RTE_FLOW_ITEM_TYPE_IPV6 item if VXLAN_DECAP action
1645  * is present in actions list.
1646  *
1647  * @param[in] ipv6
1648  *   Outer IPv6 address item (if any, NULL otherwise).
1649  * @param[out] error
1650  *   Pointer to the error structure.
1651  *
1652  * @return
1653  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1654  **/
1655 static int
1656 flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6,
1657                                    struct rte_flow_error *error)
1658 {
1659         const struct rte_flow_item_ipv6 *spec = ipv6->spec;
1660         const struct rte_flow_item_ipv6 *mask = ipv6->mask;
1661
1662         if (!spec) {
1663                 /*
1664                  * Specification for IP addresses cannot be empty
1665                  * because it is required as decap parameter.
1666                  */
1667                 return rte_flow_error_set(error, EINVAL,
1668                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1669                                           "NULL outer ipv6 address"
1670                                           " specification for vxlan"
1671                                           " decapsulation");
1672         }
1673         if (!mask)
1674                 mask = &rte_flow_item_ipv6_mask;
1675         if (memcmp(&mask->hdr.dst_addr,
1676                    &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1677                    IPV6_ADDR_LEN)) {
1678                 if (memcmp(&mask->hdr.dst_addr,
1679                         &rte_flow_item_ipv6_mask.hdr.dst_addr,
1680                         IPV6_ADDR_LEN))
1681                         return rte_flow_error_set
1682                                         (error, ENOTSUP,
1683                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1684                                          "no support for partial mask on"
1685                                          " \"ipv6.hdr.dst_addr\" field");
1686                 /* More IP address validations can be put here. */
1687         } else {
1688                 /*
1689                  * Kernel uses the destination IP address
1690                  * to determine the ingress network interface
1691                  * for traffic being decapsulated.
1692                  */
1693                 return rte_flow_error_set(error, EINVAL,
1694                                           RTE_FLOW_ERROR_TYPE_ITEM, ipv6,
1695                                           "outer ipv6 destination address must be "
1696                                           "specified for vxlan decapsulation");
1697         }
1698         /* Source IP address is optional for decap. */
1699         if (memcmp(&mask->hdr.src_addr,
1700                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1701                    IPV6_ADDR_LEN)) {
1702                 if (memcmp(&mask->hdr.src_addr,
1703                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1704                            IPV6_ADDR_LEN))
1705                         return rte_flow_error_set
1706                                         (error, ENOTSUP,
1707                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1708                                          "no support for partial mask on"
1709                                          " \"ipv6.hdr.src_addr\" field");
1710         }
1711         return 0;
1712 }
1713
1714 /**
1715  * Validate RTE_FLOW_ITEM_TYPE_UDP item if VXLAN_DECAP action
1716  * is present in actions list.
1717  *
1718  * @param[in] udp
1719  *   Outer UDP layer item (if any, NULL otherwise).
1720  * @param[out] error
1721  *   Pointer to the error structure.
1722  *
1723  * @return
1724  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1725  **/
1726 static int
1727 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1728                                   struct rte_flow_error *error)
1729 {
1730         const struct rte_flow_item_udp *spec = udp->spec;
1731         const struct rte_flow_item_udp *mask = udp->mask;
1732
1733         if (!spec)
1734                 /*
1735                  * Specification for UDP ports cannot be empty
1736                  * because it is required as decap parameter.
1737                  */
1738                 return rte_flow_error_set(error, EINVAL,
1739                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1740                                           "NULL UDP port specification"
1741                                           " for VXLAN decapsulation");
1742         if (!mask)
1743                 mask = &rte_flow_item_udp_mask;
1744         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1745                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1746                         return rte_flow_error_set
1747                                         (error, ENOTSUP,
1748                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1749                                          "no support for partial mask on"
1750                                          " \"udp.hdr.dst_port\" field");
1751                 if (!spec->hdr.dst_port)
1752                         return rte_flow_error_set
1753                                         (error, EINVAL,
1754                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1755                                          "zero decap local UDP port");
1756         } else {
1757                 return rte_flow_error_set(error, EINVAL,
1758                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1759                                           "outer UDP destination port must be "
1760                                           "specified for vxlan decapsulation");
1761         }
1762         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1763                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1764                         return rte_flow_error_set
1765                                         (error, ENOTSUP,
1766                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1767                                          "no support for partial mask on"
1768                                          " \"udp.hdr.src_port\" field");
1769                 DRV_LOG(WARNING,
1770                         "outer UDP local port cannot be "
1771                         "forced for VXLAN encapsulation, "
1772                         "parameter ignored");
1773         }
1774         return 0;
1775 }
1776
1777 /**
1778  * Validate flow for E-Switch.
1779  *
1780  * @param[in] dev
1781  *   Pointer to the Ethernet device structure.
1782  * @param[in] attr
1783  *   Pointer to the flow attributes.
1784  * @param[in] items
1785  *   Pointer to the list of items.
1786  * @param[in] actions
1787  *   Pointer to the list of actions.
1788  * @param[out] error
1789  *   Pointer to the error structure.
1790  *
1791  * @return
1792  *   0 on success, a negative errno value otherwise and rte_errno is set.
1793  */
1794 static int
1795 flow_tcf_validate(struct rte_eth_dev *dev,
1796                   const struct rte_flow_attr *attr,
1797                   const struct rte_flow_item items[],
1798                   const struct rte_flow_action actions[],
1799                   struct rte_flow_error *error)
1800 {
1801         union {
1802                 const struct rte_flow_item_port_id *port_id;
1803                 const struct rte_flow_item_eth *eth;
1804                 const struct rte_flow_item_vlan *vlan;
1805                 const struct rte_flow_item_ipv4 *ipv4;
1806                 const struct rte_flow_item_ipv6 *ipv6;
1807                 const struct rte_flow_item_tcp *tcp;
1808                 const struct rte_flow_item_udp *udp;
1809                 const struct rte_flow_item_vxlan *vxlan;
1810         } spec, mask;
1811         union {
1812                 const struct rte_flow_action_port_id *port_id;
1813                 const struct rte_flow_action_jump *jump;
1814                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1815                 const struct rte_flow_action_of_set_vlan_vid *
1816                         of_set_vlan_vid;
1817                 const struct rte_flow_action_of_set_vlan_pcp *
1818                         of_set_vlan_pcp;
1819                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1820                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1821                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1822         } conf;
1823         uint64_t item_flags = 0;
1824         uint64_t action_flags = 0;
1825         uint8_t next_protocol = -1;
1826         unsigned int tcm_ifindex = 0;
1827         uint8_t pedit_validated = 0;
1828         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1829         struct rte_eth_dev *port_id_dev = NULL;
1830         bool in_port_id_set;
1831         int ret;
1832
1833         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1834                                                 PTOI_TABLE_SZ_MAX(dev)));
1835         ret = flow_tcf_validate_attributes(attr, error);
1836         if (ret < 0)
1837                 return ret;
1838         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1839                 unsigned int i;
1840                 uint64_t current_action_flag = 0;
1841
1842                 switch (actions->type) {
1843                 case RTE_FLOW_ACTION_TYPE_VOID:
1844                         break;
1845                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1846                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1847                         if (!actions->conf)
1848                                 break;
1849                         conf.port_id = actions->conf;
1850                         if (conf.port_id->original)
1851                                 i = 0;
1852                         else
1853                                 for (i = 0; ptoi[i].ifindex; ++i)
1854                                         if (ptoi[i].port_id == conf.port_id->id)
1855                                                 break;
1856                         if (!ptoi[i].ifindex)
1857                                 return rte_flow_error_set
1858                                         (error, ENODEV,
1859                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1860                                          conf.port_id,
1861                                          "missing data to convert port ID to"
1862                                          " ifindex");
1863                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1864                         break;
1865                 case RTE_FLOW_ACTION_TYPE_JUMP:
1866                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1867                         if (!actions->conf)
1868                                 break;
1869                         conf.jump = actions->conf;
1870                         if (attr->group >= conf.jump->group)
1871                                 return rte_flow_error_set
1872                                         (error, ENOTSUP,
1873                                          RTE_FLOW_ERROR_TYPE_ACTION,
1874                                          actions,
1875                                          "can jump only to a group forward");
1876                         break;
1877                 case RTE_FLOW_ACTION_TYPE_DROP:
1878                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1879                         break;
1880                 case RTE_FLOW_ACTION_TYPE_COUNT:
1881                         break;
1882                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1883                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1884                         break;
1885                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1886                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1887                         break;
1888                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1889                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1890                                 return rte_flow_error_set
1891                                         (error, ENOTSUP,
1892                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1893                                          "vlan modify is not supported,"
1894                                          " set action must follow push action");
1895                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1896                         break;
1897                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1898                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1899                                 return rte_flow_error_set
1900                                         (error, ENOTSUP,
1901                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1902                                          "vlan modify is not supported,"
1903                                          " set action must follow push action");
1904                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1905                         break;
1906                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1907                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1908                         break;
1909                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1910                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1911                         if (ret < 0)
1912                                 return ret;
1913                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1914                         break;
1915                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1916                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1917                         break;
1918                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1919                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1920                         break;
1921                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1922                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1923                         break;
1924                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1925                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1926                         break;
1927                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1928                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1929                         break;
1930                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1931                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1932                         break;
1933                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1934                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1935                         break;
1936                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1937                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1938                         break;
1939                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1940                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1941                         break;
1942                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1943                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1944                         break;
1945                 default:
1946                         return rte_flow_error_set(error, ENOTSUP,
1947                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1948                                                   actions,
1949                                                   "action not supported");
1950                 }
1951                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1952                         if (!actions->conf)
1953                                 return rte_flow_error_set
1954                                         (error, EINVAL,
1955                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1956                                          actions,
1957                                          "action configuration not set");
1958                 }
1959                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1960                     pedit_validated)
1961                         return rte_flow_error_set(error, ENOTSUP,
1962                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1963                                                   actions,
1964                                                   "set actions should be "
1965                                                   "listed successively");
1966                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1967                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1968                         pedit_validated = 1;
1969                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1970                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1971                         return rte_flow_error_set(error, EINVAL,
1972                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1973                                                   actions,
1974                                                   "can't have multiple fate"
1975                                                   " actions");
1976                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1977                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1978                         return rte_flow_error_set(error, EINVAL,
1979                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1980                                                   actions,
1981                                                   "can't have multiple vxlan"
1982                                                   " actions");
1983                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1984                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1985                         return rte_flow_error_set(error, ENOTSUP,
1986                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1987                                                   actions,
1988                                                   "can't have vxlan and vlan"
1989                                                   " actions in the same rule");
1990                 action_flags |= current_action_flag;
1991         }
1992         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1993                 unsigned int i;
1994
1995                 if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
1996                     items->type != RTE_FLOW_ITEM_TYPE_ETH)
1997                         return rte_flow_error_set(error, ENOTSUP,
1998                                                   RTE_FLOW_ERROR_TYPE_ITEM,
1999                                                   items,
2000                                                   "only L2 inner item"
2001                                                   " is supported");
2002                 switch (items->type) {
2003                 case RTE_FLOW_ITEM_TYPE_VOID:
2004                         break;
2005                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2006                         mask.port_id = flow_tcf_item_mask
2007                                 (items, &rte_flow_item_port_id_mask,
2008                                  &flow_tcf_mask_supported.port_id,
2009                                  &flow_tcf_mask_empty.port_id,
2010                                  sizeof(flow_tcf_mask_supported.port_id),
2011                                  error);
2012                         if (!mask.port_id)
2013                                 return -rte_errno;
2014                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
2015                                 in_port_id_set = 1;
2016                                 break;
2017                         }
2018                         spec.port_id = items->spec;
2019                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
2020                                 return rte_flow_error_set
2021                                         (error, ENOTSUP,
2022                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2023                                          mask.port_id,
2024                                          "no support for partial mask on"
2025                                          " \"id\" field");
2026                         if (!mask.port_id->id)
2027                                 i = 0;
2028                         else
2029                                 for (i = 0; ptoi[i].ifindex; ++i)
2030                                         if (ptoi[i].port_id == spec.port_id->id)
2031                                                 break;
2032                         if (!ptoi[i].ifindex)
2033                                 return rte_flow_error_set
2034                                         (error, ENODEV,
2035                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2036                                          spec.port_id,
2037                                          "missing data to convert port ID to"
2038                                          " ifindex");
2039                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2040                                 return rte_flow_error_set
2041                                         (error, ENOTSUP,
2042                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2043                                          spec.port_id,
2044                                          "cannot match traffic for"
2045                                          " several port IDs through"
2046                                          " a single flow rule");
2047                         tcm_ifindex = ptoi[i].ifindex;
2048                         in_port_id_set = 1;
2049                         break;
2050                 case RTE_FLOW_ITEM_TYPE_ETH:
2051                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2052                                                           error);
2053                         if (ret < 0)
2054                                 return ret;
2055                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2056                                         MLX5_FLOW_LAYER_INNER_L2 :
2057                                         MLX5_FLOW_LAYER_OUTER_L2;
2058                         /* TODO:
2059                          * Redundant check due to different supported mask.
2060                          * Same for the rest of items.
2061                          */
2062                         mask.eth = flow_tcf_item_mask
2063                                 (items, &rte_flow_item_eth_mask,
2064                                  &flow_tcf_mask_supported.eth,
2065                                  &flow_tcf_mask_empty.eth,
2066                                  sizeof(flow_tcf_mask_supported.eth),
2067                                  error);
2068                         if (!mask.eth)
2069                                 return -rte_errno;
2070                         if (mask.eth->type && mask.eth->type !=
2071                             RTE_BE16(0xffff))
2072                                 return rte_flow_error_set
2073                                         (error, ENOTSUP,
2074                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2075                                          mask.eth,
2076                                          "no support for partial mask on"
2077                                          " \"type\" field");
2078                         break;
2079                 case RTE_FLOW_ITEM_TYPE_VLAN:
2080                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2081                                                            error);
2082                         if (ret < 0)
2083                                 return ret;
2084                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2085                         mask.vlan = flow_tcf_item_mask
2086                                 (items, &rte_flow_item_vlan_mask,
2087                                  &flow_tcf_mask_supported.vlan,
2088                                  &flow_tcf_mask_empty.vlan,
2089                                  sizeof(flow_tcf_mask_supported.vlan),
2090                                  error);
2091                         if (!mask.vlan)
2092                                 return -rte_errno;
2093                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2094                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2095                               RTE_BE16(0xe000)) ||
2096                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2097                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2098                               RTE_BE16(0x0fff)) ||
2099                             (mask.vlan->inner_type &&
2100                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2101                                 return rte_flow_error_set
2102                                         (error, ENOTSUP,
2103                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2104                                          mask.vlan,
2105                                          "no support for partial masks on"
2106                                          " \"tci\" (PCP and VID parts) and"
2107                                          " \"inner_type\" fields");
2108                         break;
2109                 case RTE_FLOW_ITEM_TYPE_IPV4:
2110                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2111                                                            error);
2112                         if (ret < 0)
2113                                 return ret;
2114                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2115                         mask.ipv4 = flow_tcf_item_mask
2116                                 (items, &rte_flow_item_ipv4_mask,
2117                                  &flow_tcf_mask_supported.ipv4,
2118                                  &flow_tcf_mask_empty.ipv4,
2119                                  sizeof(flow_tcf_mask_supported.ipv4),
2120                                  error);
2121                         if (!mask.ipv4)
2122                                 return -rte_errno;
2123                         if (mask.ipv4->hdr.next_proto_id &&
2124                             mask.ipv4->hdr.next_proto_id != 0xff)
2125                                 return rte_flow_error_set
2126                                         (error, ENOTSUP,
2127                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2128                                          mask.ipv4,
2129                                          "no support for partial mask on"
2130                                          " \"hdr.next_proto_id\" field");
2131                         else if (mask.ipv4->hdr.next_proto_id)
2132                                 next_protocol =
2133                                         ((const struct rte_flow_item_ipv4 *)
2134                                          (items->spec))->hdr.next_proto_id;
2135                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2136                                 ret = flow_tcf_validate_vxlan_decap_ipv4
2137                                                                 (items, error);
2138                                 if (ret < 0)
2139                                         return ret;
2140                         }
2141                         break;
2142                 case RTE_FLOW_ITEM_TYPE_IPV6:
2143                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2144                                                            error);
2145                         if (ret < 0)
2146                                 return ret;
2147                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2148                         mask.ipv6 = flow_tcf_item_mask
2149                                 (items, &rte_flow_item_ipv6_mask,
2150                                  &flow_tcf_mask_supported.ipv6,
2151                                  &flow_tcf_mask_empty.ipv6,
2152                                  sizeof(flow_tcf_mask_supported.ipv6),
2153                                  error);
2154                         if (!mask.ipv6)
2155                                 return -rte_errno;
2156                         if (mask.ipv6->hdr.proto &&
2157                             mask.ipv6->hdr.proto != 0xff)
2158                                 return rte_flow_error_set
2159                                         (error, ENOTSUP,
2160                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2161                                          mask.ipv6,
2162                                          "no support for partial mask on"
2163                                          " \"hdr.proto\" field");
2164                         else if (mask.ipv6->hdr.proto)
2165                                 next_protocol =
2166                                         ((const struct rte_flow_item_ipv6 *)
2167                                          (items->spec))->hdr.proto;
2168                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2169                                 ret = flow_tcf_validate_vxlan_decap_ipv6
2170                                                                 (items, error);
2171                                 if (ret < 0)
2172                                         return ret;
2173                         }
2174                         break;
2175                 case RTE_FLOW_ITEM_TYPE_UDP:
2176                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2177                                                           next_protocol, error);
2178                         if (ret < 0)
2179                                 return ret;
2180                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2181                         mask.udp = flow_tcf_item_mask
2182                                 (items, &rte_flow_item_udp_mask,
2183                                  &flow_tcf_mask_supported.udp,
2184                                  &flow_tcf_mask_empty.udp,
2185                                  sizeof(flow_tcf_mask_supported.udp),
2186                                  error);
2187                         if (!mask.udp)
2188                                 return -rte_errno;
2189                         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2190                                 ret = flow_tcf_validate_vxlan_decap_udp
2191                                                                 (items, error);
2192                                 if (ret < 0)
2193                                         return ret;
2194                         }
2195                         break;
2196                 case RTE_FLOW_ITEM_TYPE_TCP:
2197                         ret = mlx5_flow_validate_item_tcp
2198                                              (items, item_flags,
2199                                               next_protocol,
2200                                               &flow_tcf_mask_supported.tcp,
2201                                               error);
2202                         if (ret < 0)
2203                                 return ret;
2204                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2205                         mask.tcp = flow_tcf_item_mask
2206                                 (items, &rte_flow_item_tcp_mask,
2207                                  &flow_tcf_mask_supported.tcp,
2208                                  &flow_tcf_mask_empty.tcp,
2209                                  sizeof(flow_tcf_mask_supported.tcp),
2210                                  error);
2211                         if (!mask.tcp)
2212                                 return -rte_errno;
2213                         break;
2214                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2215                         if (!(action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP))
2216                                 return rte_flow_error_set
2217                                         (error, ENOTSUP,
2218                                          RTE_FLOW_ERROR_TYPE_ITEM,
2219                                          items,
2220                                          "vni pattern should be followed by"
2221                                          " vxlan decapsulation action");
2222                         ret = mlx5_flow_validate_item_vxlan(items,
2223                                                             item_flags, error);
2224                         if (ret < 0)
2225                                 return ret;
2226                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2227                         mask.vxlan = flow_tcf_item_mask
2228                                 (items, &rte_flow_item_vxlan_mask,
2229                                  &flow_tcf_mask_supported.vxlan,
2230                                  &flow_tcf_mask_empty.vxlan,
2231                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2232                         if (!mask.vxlan)
2233                                 return -rte_errno;
2234                         if (mask.vxlan->vni[0] != 0xff ||
2235                             mask.vxlan->vni[1] != 0xff ||
2236                             mask.vxlan->vni[2] != 0xff)
2237                                 return rte_flow_error_set
2238                                         (error, ENOTSUP,
2239                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2240                                          mask.vxlan,
2241                                          "no support for partial or "
2242                                          "empty mask on \"vxlan.vni\" field");
2243                         break;
2244                 default:
2245                         return rte_flow_error_set(error, ENOTSUP,
2246                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2247                                                   items, "item not supported");
2248                 }
2249         }
2250         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2251             (action_flags & MLX5_FLOW_ACTION_DROP))
2252                 return rte_flow_error_set(error, ENOTSUP,
2253                                           RTE_FLOW_ERROR_TYPE_ACTION,
2254                                           actions,
2255                                           "set action is not compatible with "
2256                                           "drop action");
2257         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2258             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2259                 return rte_flow_error_set(error, ENOTSUP,
2260                                           RTE_FLOW_ERROR_TYPE_ACTION,
2261                                           actions,
2262                                           "set action must be followed by "
2263                                           "port_id action");
2264         if (action_flags &
2265            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2266                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2267                         return rte_flow_error_set(error, EINVAL,
2268                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2269                                                   actions,
2270                                                   "no ipv4 item found in"
2271                                                   " pattern");
2272         }
2273         if (action_flags &
2274            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2275                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2276                         return rte_flow_error_set(error, EINVAL,
2277                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2278                                                   actions,
2279                                                   "no ipv6 item found in"
2280                                                   " pattern");
2281         }
2282         if (action_flags &
2283            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2284                 if (!(item_flags &
2285                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2286                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2287                         return rte_flow_error_set(error, EINVAL,
2288                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2289                                                   actions,
2290                                                   "no TCP/UDP item found in"
2291                                                   " pattern");
2292         }
2293         /*
2294          * FW syndrome (0xA9C090):
2295          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2296          *     forward to the uplink.
2297          */
2298         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2299             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2300             ((struct priv *)port_id_dev->data->dev_private)->representor)
2301                 return rte_flow_error_set(error, ENOTSUP,
2302                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2303                                           "vlan push can only be applied"
2304                                           " when forwarding to uplink port");
2305         /*
2306          * FW syndrome (0x294609):
2307          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2308          *     are supported only while forwarding to vport.
2309          */
2310         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2311             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2312                 return rte_flow_error_set(error, ENOTSUP,
2313                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2314                                           "vlan actions are supported"
2315                                           " only with port_id action");
2316         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2317             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2318                 return rte_flow_error_set(error, ENOTSUP,
2319                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2320                                           "vxlan actions are supported"
2321                                           " only with port_id action");
2322         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2323                 return rte_flow_error_set(error, EINVAL,
2324                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2325                                           "no fate action is found");
2326         if (action_flags &
2327            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2328                 if (!(item_flags &
2329                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2330                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2331                         return rte_flow_error_set(error, EINVAL,
2332                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2333                                                   actions,
2334                                                   "no IP found in pattern");
2335         }
2336         if (action_flags &
2337             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2338                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2339                         return rte_flow_error_set(error, ENOTSUP,
2340                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2341                                                   actions,
2342                                                   "no ethernet found in"
2343                                                   " pattern");
2344         }
2345         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2346                 if (!(item_flags &
2347                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2348                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2349                         return rte_flow_error_set(error, EINVAL,
2350                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2351                                                   NULL,
2352                                                   "no outer IP pattern found"
2353                                                   " for vxlan decap action");
2354                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2355                         return rte_flow_error_set(error, EINVAL,
2356                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2357                                                   NULL,
2358                                                   "no outer UDP pattern found"
2359                                                   " for vxlan decap action");
2360                 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
2361                         return rte_flow_error_set(error, EINVAL,
2362                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2363                                                   NULL,
2364                                                   "no VNI pattern found"
2365                                                   " for vxlan decap action");
2366         }
2367         return 0;
2368 }
2369
2370 /**
2371  * Calculate maximum size of memory for flow items of Linux TC flower and
2372  * extract specified items.
2373  *
2374  * @param[in] items
2375  *   Pointer to the list of items.
2376  * @param[out] item_flags
2377  *   Pointer to the detected items.
2378  *
2379  * @return
2380  *   Maximum size of memory for items.
2381  */
2382 static int
2383 flow_tcf_get_items_and_size(const struct rte_flow_attr *attr,
2384                             const struct rte_flow_item items[],
2385                             uint64_t *item_flags)
2386 {
2387         int size = 0;
2388         uint64_t flags = 0;
2389
2390         size += SZ_NLATTR_STRZ_OF("flower") +
2391                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2392                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2393         if (attr->group > 0)
2394                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2395         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2396                 switch (items->type) {
2397                 case RTE_FLOW_ITEM_TYPE_VOID:
2398                         break;
2399                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2400                         break;
2401                 case RTE_FLOW_ITEM_TYPE_ETH:
2402                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2403                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2404                                 /* dst/src MAC addr and mask. */
2405                         flags |= MLX5_FLOW_LAYER_OUTER_L2;
2406                         break;
2407                 case RTE_FLOW_ITEM_TYPE_VLAN:
2408                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2409                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2410                                 /* VLAN Ether type. */
2411                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2412                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2413                         flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2414                         break;
2415                 case RTE_FLOW_ITEM_TYPE_IPV4:
2416                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2417                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2418                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2419                                 /* dst/src IP addr and mask. */
2420                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2421                         break;
2422                 case RTE_FLOW_ITEM_TYPE_IPV6:
2423                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
2424                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2425                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2426                                 /* dst/src IP addr and mask. */
2427                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2428                         break;
2429                 case RTE_FLOW_ITEM_TYPE_UDP:
2430                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2431                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2432                                 /* dst/src port and mask. */
2433                         flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2434                         break;
2435                 case RTE_FLOW_ITEM_TYPE_TCP:
2436                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2437                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2438                                 /* dst/src port and mask. */
2439                         flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2440                         break;
2441                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2442                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2443                         flags |= MLX5_FLOW_LAYER_VXLAN;
2444                         break;
2445                 default:
2446                         DRV_LOG(WARNING,
2447                                 "unsupported item %p type %d,"
2448                                 " items must be validated before flow creation",
2449                                 (const void *)items, items->type);
2450                         break;
2451                 }
2452         }
2453         *item_flags = flags;
2454         return size;
2455 }
2456
2457 /**
2458  * Calculate size of memory to store the VXLAN encapsultion
2459  * related items in the Netlink message buffer. Items list
2460  * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2461  * The item list should be validated.
2462  *
2463  * @param[in] action
2464  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2465  *   List of pattern items to scan data from.
2466  *
2467  * @return
2468  *   The size the part of Netlink message buffer to store the
2469  *   VXLAN encapsulation item attributes.
2470  */
2471 static int
2472 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2473 {
2474         const struct rte_flow_item *items;
2475         int size = 0;
2476
2477         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2478         assert(action->conf);
2479
2480         items = ((const struct rte_flow_action_vxlan_encap *)
2481                                         action->conf)->definition;
2482         assert(items);
2483         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2484                 switch (items->type) {
2485                 case RTE_FLOW_ITEM_TYPE_VOID:
2486                         break;
2487                 case RTE_FLOW_ITEM_TYPE_ETH:
2488                         /* This item does not require message buffer. */
2489                         break;
2490                 case RTE_FLOW_ITEM_TYPE_IPV4:
2491                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2492                         break;
2493                 case RTE_FLOW_ITEM_TYPE_IPV6:
2494                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2495                         break;
2496                 case RTE_FLOW_ITEM_TYPE_UDP: {
2497                         const struct rte_flow_item_udp *udp = items->mask;
2498
2499                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2500                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2501                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2502                         break;
2503                 }
2504                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2505                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2506                         break;
2507                 default:
2508                         assert(false);
2509                         DRV_LOG(WARNING,
2510                                 "unsupported item %p type %d,"
2511                                 " items must be validated"
2512                                 " before flow creation",
2513                                 (const void *)items, items->type);
2514                         return 0;
2515                 }
2516         }
2517         return size;
2518 }
2519
2520 /**
2521  * Calculate maximum size of memory for flow actions of Linux TC flower and
2522  * extract specified actions.
2523  *
2524  * @param[in] actions
2525  *   Pointer to the list of actions.
2526  * @param[out] action_flags
2527  *   Pointer to the detected actions.
2528  *
2529  * @return
2530  *   Maximum size of memory for actions.
2531  */
2532 static int
2533 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2534                               uint64_t *action_flags)
2535 {
2536         int size = 0;
2537         uint64_t flags = 0;
2538
2539         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2540         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2541                 switch (actions->type) {
2542                 case RTE_FLOW_ACTION_TYPE_VOID:
2543                         break;
2544                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2545                         size += SZ_NLATTR_NEST + /* na_act_index. */
2546                                 SZ_NLATTR_STRZ_OF("mirred") +
2547                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2548                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2549                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2550                         break;
2551                 case RTE_FLOW_ACTION_TYPE_JUMP:
2552                         size += SZ_NLATTR_NEST + /* na_act_index. */
2553                                 SZ_NLATTR_STRZ_OF("gact") +
2554                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2555                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2556                         flags |= MLX5_FLOW_ACTION_JUMP;
2557                         break;
2558                 case RTE_FLOW_ACTION_TYPE_DROP:
2559                         size += SZ_NLATTR_NEST + /* na_act_index. */
2560                                 SZ_NLATTR_STRZ_OF("gact") +
2561                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2562                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2563                         flags |= MLX5_FLOW_ACTION_DROP;
2564                         break;
2565                 case RTE_FLOW_ACTION_TYPE_COUNT:
2566                         break;
2567                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2568                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2569                         goto action_of_vlan;
2570                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2571                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2572                         goto action_of_vlan;
2573                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2574                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2575                         goto action_of_vlan;
2576                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2577                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2578                         goto action_of_vlan;
2579 action_of_vlan:
2580                         size += SZ_NLATTR_NEST + /* na_act_index. */
2581                                 SZ_NLATTR_STRZ_OF("vlan") +
2582                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2583                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2584                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2585                                 /* VLAN protocol. */
2586                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2587                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2588                         break;
2589                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2590                         size += SZ_NLATTR_NEST + /* na_act_index. */
2591                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2592                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2593                                 SZ_NLATTR_TYPE_OF(uint8_t);
2594                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2595                         size += flow_tcf_vxlan_encap_size(actions) +
2596                                 RTE_ALIGN_CEIL /* preceding encap params. */
2597                                 (sizeof(struct flow_tcf_vxlan_encap),
2598                                 MNL_ALIGNTO);
2599                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2600                         break;
2601                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2602                         size += SZ_NLATTR_NEST + /* na_act_index. */
2603                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2604                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2605                                 SZ_NLATTR_TYPE_OF(uint8_t);
2606                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2607                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2608                                 (sizeof(struct flow_tcf_vxlan_decap),
2609                                 MNL_ALIGNTO);
2610                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2611                         break;
2612                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2613                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2614                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2615                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2616                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2617                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2618                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2619                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2620                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2621                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2622                         size += flow_tcf_get_pedit_actions_size(&actions,
2623                                                                 &flags);
2624                         break;
2625                 default:
2626                         DRV_LOG(WARNING,
2627                                 "unsupported action %p type %d,"
2628                                 " items must be validated before flow creation",
2629                                 (const void *)actions, actions->type);
2630                         break;
2631                 }
2632         }
2633         *action_flags = flags;
2634         return size;
2635 }
2636
2637 /**
2638  * Brand rtnetlink buffer with unique handle.
2639  *
2640  * This handle should be unique for a given network interface to avoid
2641  * collisions.
2642  *
2643  * @param nlh
2644  *   Pointer to Netlink message.
2645  * @param handle
2646  *   Unique 32-bit handle to use.
2647  */
2648 static void
2649 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2650 {
2651         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2652
2653         tcm->tcm_handle = handle;
2654         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2655                 (void *)nlh, handle);
2656 }
2657
2658 /**
2659  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2660  * memory required, allocates the memory, initializes Netlink message headers
2661  * and set unique TC message handle.
2662  *
2663  * @param[in] attr
2664  *   Pointer to the flow attributes.
2665  * @param[in] items
2666  *   Pointer to the list of items.
2667  * @param[in] actions
2668  *   Pointer to the list of actions.
2669  * @param[out] item_flags
2670  *   Pointer to bit mask of all items detected.
2671  * @param[out] action_flags
2672  *   Pointer to bit mask of all actions detected.
2673  * @param[out] error
2674  *   Pointer to the error structure.
2675  *
2676  * @return
2677  *   Pointer to mlx5_flow object on success,
2678  *   otherwise NULL and rte_ernno is set.
2679  */
2680 static struct mlx5_flow *
2681 flow_tcf_prepare(const struct rte_flow_attr *attr,
2682                  const struct rte_flow_item items[],
2683                  const struct rte_flow_action actions[],
2684                  uint64_t *item_flags, uint64_t *action_flags,
2685                  struct rte_flow_error *error)
2686 {
2687         size_t size = RTE_ALIGN_CEIL
2688                         (sizeof(struct mlx5_flow),
2689                          alignof(struct flow_tcf_tunnel_hdr)) +
2690                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2691                       MNL_ALIGN(sizeof(struct tcmsg));
2692         struct mlx5_flow *dev_flow;
2693         struct nlmsghdr *nlh;
2694         struct tcmsg *tcm;
2695         uint8_t *sp, *tun = NULL;
2696
2697         size += flow_tcf_get_items_and_size(attr, items, item_flags);
2698         size += flow_tcf_get_actions_and_size(actions, action_flags);
2699         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2700         if (!dev_flow) {
2701                 rte_flow_error_set(error, ENOMEM,
2702                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2703                                    "not enough memory to create E-Switch flow");
2704                 return NULL;
2705         }
2706         sp = (uint8_t *)(dev_flow + 1);
2707         if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2708                 sp = RTE_PTR_ALIGN
2709                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2710                 tun = sp;
2711                 sp += RTE_ALIGN_CEIL
2712                         (sizeof(struct flow_tcf_vxlan_encap),
2713                         MNL_ALIGNTO);
2714 #ifndef NDEBUG
2715                 size -= RTE_ALIGN_CEIL
2716                         (sizeof(struct flow_tcf_vxlan_encap),
2717                         MNL_ALIGNTO);
2718 #endif
2719         } else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2720                 sp = RTE_PTR_ALIGN
2721                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2722                 tun = sp;
2723                 sp += RTE_ALIGN_CEIL
2724                         (sizeof(struct flow_tcf_vxlan_decap),
2725                         MNL_ALIGNTO);
2726 #ifndef NDEBUG
2727                 size -= RTE_ALIGN_CEIL
2728                         (sizeof(struct flow_tcf_vxlan_decap),
2729                         MNL_ALIGNTO);
2730 #endif
2731         } else {
2732                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2733         }
2734         nlh = mnl_nlmsg_put_header(sp);
2735         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2736         *dev_flow = (struct mlx5_flow){
2737                 .tcf = (struct mlx5_flow_tcf){
2738 #ifndef NDEBUG
2739                         .nlsize = size - RTE_ALIGN_CEIL
2740                                 (sizeof(struct mlx5_flow),
2741                                  alignof(struct flow_tcf_tunnel_hdr)),
2742 #endif
2743                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2744                         .nlh = nlh,
2745                         .tcm = tcm,
2746                 },
2747         };
2748         if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2749                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2750         else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2751                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2752         /*
2753          * Generate a reasonably unique handle based on the address of the
2754          * target buffer.
2755          *
2756          * This is straightforward on 32-bit systems where the flow pointer can
2757          * be used directly. Otherwise, its least significant part is taken
2758          * after shifting it by the previous power of two of the pointed buffer
2759          * size.
2760          */
2761         if (sizeof(dev_flow) <= 4)
2762                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2763         else
2764                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2765                                        rte_log2_u32(rte_align32prevpow2(size)));
2766         return dev_flow;
2767 }
2768
2769 /**
2770  * Make adjustments for supporting count actions.
2771  *
2772  * @param[in] dev
2773  *   Pointer to the Ethernet device structure.
2774  * @param[in] dev_flow
2775  *   Pointer to mlx5_flow.
2776  * @param[out] error
2777  *   Pointer to error structure.
2778  *
2779  * @return
2780  *   0 On success else a negative errno value is returned and rte_errno is set.
2781  */
2782 static int
2783 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2784                                   struct mlx5_flow *dev_flow,
2785                                   struct rte_flow_error *error)
2786 {
2787         struct rte_flow *flow = dev_flow->flow;
2788
2789         if (!flow->counter) {
2790                 flow->counter = flow_tcf_counter_new();
2791                 if (!flow->counter)
2792                         return rte_flow_error_set(error, rte_errno,
2793                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2794                                                   NULL,
2795                                                   "cannot get counter"
2796                                                   " context.");
2797         }
2798         return 0;
2799 }
2800
2801 /**
2802  * Convert VXLAN VNI to 32-bit integer.
2803  *
2804  * @param[in] vni
2805  *   VXLAN VNI in 24-bit wire format.
2806  *
2807  * @return
2808  *   VXLAN VNI as a 32-bit integer value in network endian.
2809  */
2810 static inline rte_be32_t
2811 vxlan_vni_as_be32(const uint8_t vni[3])
2812 {
2813         union {
2814                 uint8_t vni[4];
2815                 rte_be32_t dword;
2816         } ret = {
2817                 .vni = { 0, vni[0], vni[1], vni[2] },
2818         };
2819         return ret.dword;
2820 }
2821
2822 /**
2823  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2824  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2825  * in the encapsulation parameters structure. The item must be prevalidated,
2826  * no any validation checks performed by function.
2827  *
2828  * @param[in] spec
2829  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2830  * @param[in] mask
2831  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2832  * @param[out] encap
2833  *   Structure to fill the gathered MAC address data.
2834  */
2835 static void
2836 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2837                                const struct rte_flow_item_eth *mask,
2838                                struct flow_tcf_vxlan_encap *encap)
2839 {
2840         /* Item must be validated before. No redundant checks. */
2841         assert(spec);
2842         if (!mask || !memcmp(&mask->dst,
2843                              &rte_flow_item_eth_mask.dst,
2844                              sizeof(rte_flow_item_eth_mask.dst))) {
2845                 /*
2846                  * Ethernet addresses are not supported by
2847                  * tc as tunnel_key parameters. Destination
2848                  * address is needed to form encap packet
2849                  * header and retrieved by kernel from
2850                  * implicit sources (ARP table, etc),
2851                  * address masks are not supported at all.
2852                  */
2853                 encap->eth.dst = spec->dst;
2854                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2855         }
2856         if (!mask || !memcmp(&mask->src,
2857                              &rte_flow_item_eth_mask.src,
2858                              sizeof(rte_flow_item_eth_mask.src))) {
2859                 /*
2860                  * Ethernet addresses are not supported by
2861                  * tc as tunnel_key parameters. Source ethernet
2862                  * address is ignored anyway.
2863                  */
2864                 encap->eth.src = spec->src;
2865                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2866         }
2867 }
2868
2869 /**
2870  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2871  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2872  * in the encapsulation parameters structure. The item must be prevalidated,
2873  * no any validation checks performed by function.
2874  *
2875  * @param[in] spec
2876  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2877  * @param[out] encap
2878  *   Structure to fill the gathered IPV4 address data.
2879  */
2880 static void
2881 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2882                                 struct flow_tcf_vxlan_encap *encap)
2883 {
2884         /* Item must be validated before. No redundant checks. */
2885         assert(spec);
2886         encap->ipv4.dst = spec->hdr.dst_addr;
2887         encap->ipv4.src = spec->hdr.src_addr;
2888         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2889                        FLOW_TCF_ENCAP_IPV4_DST;
2890 }
2891
2892 /**
2893  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2894  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2895  * in the encapsulation parameters structure. The item must be prevalidated,
2896  * no any validation checks performed by function.
2897  *
2898  * @param[in] spec
2899  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2900  * @param[out] encap
2901  *   Structure to fill the gathered IPV6 address data.
2902  */
2903 static void
2904 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2905                                 struct flow_tcf_vxlan_encap *encap)
2906 {
2907         /* Item must be validated before. No redundant checks. */
2908         assert(spec);
2909         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2910         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2911         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2912                        FLOW_TCF_ENCAP_IPV6_DST;
2913 }
2914
2915 /**
2916  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2917  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2918  * in the encapsulation parameters structure. The item must be prevalidated,
2919  * no any validation checks performed by function.
2920  *
2921  * @param[in] spec
2922  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
2923  * @param[in] mask
2924  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
2925  * @param[out] encap
2926  *   Structure to fill the gathered UDP port data.
2927  */
2928 static void
2929 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2930                                const struct rte_flow_item_udp *mask,
2931                                struct flow_tcf_vxlan_encap *encap)
2932 {
2933         assert(spec);
2934         encap->udp.dst = spec->hdr.dst_port;
2935         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
2936         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
2937                 encap->udp.src = spec->hdr.src_port;
2938                 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC;
2939         }
2940 }
2941
2942 /**
2943  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
2944  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
2945  * in the encapsulation parameters structure. The item must be prevalidated,
2946  * no any validation checks performed by function.
2947  *
2948  * @param[in] spec
2949  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
2950  * @param[out] encap
2951  *   Structure to fill the gathered VNI address data.
2952  */
2953 static void
2954 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
2955                                struct flow_tcf_vxlan_encap *encap)
2956 {
2957         /* Item must be validated before. Do not redundant checks. */
2958         assert(spec);
2959         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
2960         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
2961 }
2962
2963 /**
2964  * Populate consolidated encapsulation object from list of pattern items.
2965  *
2966  * Helper function to process configuration of action such as
2967  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
2968  * validated, there is no way to return an meaningful error.
2969  *
2970  * @param[in] action
2971  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2972  *   List of pattern items to gather data from.
2973  * @param[out] src
2974  *   Structure to fill gathered data.
2975  */
2976 static void
2977 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
2978                            struct flow_tcf_vxlan_encap *encap)
2979 {
2980         union {
2981                 const struct rte_flow_item_eth *eth;
2982                 const struct rte_flow_item_ipv4 *ipv4;
2983                 const struct rte_flow_item_ipv6 *ipv6;
2984                 const struct rte_flow_item_udp *udp;
2985                 const struct rte_flow_item_vxlan *vxlan;
2986         } spec, mask;
2987         const struct rte_flow_item *items;
2988
2989         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2990         assert(action->conf);
2991
2992         items = ((const struct rte_flow_action_vxlan_encap *)
2993                                         action->conf)->definition;
2994         assert(items);
2995         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2996                 switch (items->type) {
2997                 case RTE_FLOW_ITEM_TYPE_VOID:
2998                         break;
2999                 case RTE_FLOW_ITEM_TYPE_ETH:
3000                         mask.eth = items->mask;
3001                         spec.eth = items->spec;
3002                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3003                                                        encap);
3004                         break;
3005                 case RTE_FLOW_ITEM_TYPE_IPV4:
3006                         spec.ipv4 = items->spec;
3007                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3008                         break;
3009                 case RTE_FLOW_ITEM_TYPE_IPV6:
3010                         spec.ipv6 = items->spec;
3011                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3012                         break;
3013                 case RTE_FLOW_ITEM_TYPE_UDP:
3014                         mask.udp = items->mask;
3015                         spec.udp = items->spec;
3016                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3017                                                        encap);
3018                         break;
3019                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3020                         spec.vxlan = items->spec;
3021                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3022                         break;
3023                 default:
3024                         assert(false);
3025                         DRV_LOG(WARNING,
3026                                 "unsupported item %p type %d,"
3027                                 " items must be validated"
3028                                 " before flow creation",
3029                                 (const void *)items, items->type);
3030                         encap->mask = 0;
3031                         return;
3032                 }
3033         }
3034 }
3035
3036 /**
3037  * Translate flow for Linux TC flower and construct Netlink message.
3038  *
3039  * @param[in] priv
3040  *   Pointer to the priv structure.
3041  * @param[in, out] flow
3042  *   Pointer to the sub flow.
3043  * @param[in] attr
3044  *   Pointer to the flow attributes.
3045  * @param[in] items
3046  *   Pointer to the list of items.
3047  * @param[in] actions
3048  *   Pointer to the list of actions.
3049  * @param[out] error
3050  *   Pointer to the error structure.
3051  *
3052  * @return
3053  *   0 on success, a negative errno value otherwise and rte_ernno is set.
3054  */
3055 static int
3056 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3057                    const struct rte_flow_attr *attr,
3058                    const struct rte_flow_item items[],
3059                    const struct rte_flow_action actions[],
3060                    struct rte_flow_error *error)
3061 {
3062         union {
3063                 const struct rte_flow_item_port_id *port_id;
3064                 const struct rte_flow_item_eth *eth;
3065                 const struct rte_flow_item_vlan *vlan;
3066                 const struct rte_flow_item_ipv4 *ipv4;
3067                 const struct rte_flow_item_ipv6 *ipv6;
3068                 const struct rte_flow_item_tcp *tcp;
3069                 const struct rte_flow_item_udp *udp;
3070                 const struct rte_flow_item_vxlan *vxlan;
3071         } spec, mask;
3072         union {
3073                 const struct rte_flow_action_port_id *port_id;
3074                 const struct rte_flow_action_jump *jump;
3075                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3076                 const struct rte_flow_action_of_set_vlan_vid *
3077                         of_set_vlan_vid;
3078                 const struct rte_flow_action_of_set_vlan_pcp *
3079                         of_set_vlan_pcp;
3080         } conf;
3081         union {
3082                 struct flow_tcf_tunnel_hdr *hdr;
3083                 struct flow_tcf_vxlan_decap *vxlan;
3084         } decap = {
3085                 .hdr = NULL,
3086         };
3087         union {
3088                 struct flow_tcf_tunnel_hdr *hdr;
3089                 struct flow_tcf_vxlan_encap *vxlan;
3090         } encap = {
3091                 .hdr = NULL,
3092         };
3093         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3094         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3095         struct tcmsg *tcm = dev_flow->tcf.tcm;
3096         uint32_t na_act_index_cur;
3097         bool eth_type_set = 0;
3098         bool vlan_present = 0;
3099         bool vlan_eth_type_set = 0;
3100         bool ip_proto_set = 0;
3101         struct nlattr *na_flower;
3102         struct nlattr *na_flower_act;
3103         struct nlattr *na_vlan_id = NULL;
3104         struct nlattr *na_vlan_priority = NULL;
3105         uint64_t item_flags = 0;
3106         int ret;
3107
3108         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3109                                                 PTOI_TABLE_SZ_MAX(dev)));
3110         if (dev_flow->tcf.tunnel) {
3111                 switch (dev_flow->tcf.tunnel->type) {
3112                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3113                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3114                         break;
3115                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3116                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3117                         break;
3118                 /* New tunnel actions can be added here. */
3119                 default:
3120                         assert(false);
3121                         break;
3122                 }
3123         }
3124         nlh = dev_flow->tcf.nlh;
3125         tcm = dev_flow->tcf.tcm;
3126         /* Prepare API must have been called beforehand. */
3127         assert(nlh != NULL && tcm != NULL);
3128         tcm->tcm_family = AF_UNSPEC;
3129         tcm->tcm_ifindex = ptoi[0].ifindex;
3130         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3131         /*
3132          * Priority cannot be zero to prevent the kernel from picking one
3133          * automatically.
3134          */
3135         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3136                                   RTE_BE16(ETH_P_ALL));
3137         if (attr->group > 0)
3138                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3139         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3140         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3141         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3142                 unsigned int i;
3143
3144                 switch (items->type) {
3145                 case RTE_FLOW_ITEM_TYPE_VOID:
3146                         break;
3147                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3148                         mask.port_id = flow_tcf_item_mask
3149                                 (items, &rte_flow_item_port_id_mask,
3150                                  &flow_tcf_mask_supported.port_id,
3151                                  &flow_tcf_mask_empty.port_id,
3152                                  sizeof(flow_tcf_mask_supported.port_id),
3153                                  error);
3154                         assert(mask.port_id);
3155                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3156                                 break;
3157                         spec.port_id = items->spec;
3158                         if (!mask.port_id->id)
3159                                 i = 0;
3160                         else
3161                                 for (i = 0; ptoi[i].ifindex; ++i)
3162                                         if (ptoi[i].port_id == spec.port_id->id)
3163                                                 break;
3164                         assert(ptoi[i].ifindex);
3165                         tcm->tcm_ifindex = ptoi[i].ifindex;
3166                         break;
3167                 case RTE_FLOW_ITEM_TYPE_ETH:
3168                         item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ?
3169                                       MLX5_FLOW_LAYER_INNER_L2 :
3170                                       MLX5_FLOW_LAYER_OUTER_L2;
3171                         mask.eth = flow_tcf_item_mask
3172                                 (items, &rte_flow_item_eth_mask,
3173                                  &flow_tcf_mask_supported.eth,
3174                                  &flow_tcf_mask_empty.eth,
3175                                  sizeof(flow_tcf_mask_supported.eth),
3176                                  error);
3177                         assert(mask.eth);
3178                         if (mask.eth == &flow_tcf_mask_empty.eth)
3179                                 break;
3180                         spec.eth = items->spec;
3181                         if (decap.vxlan &&
3182                             !(item_flags & MLX5_FLOW_LAYER_VXLAN)) {
3183                                 DRV_LOG(WARNING,
3184                                         "outer L2 addresses cannot be forced"
3185                                         " for vxlan decapsulation, parameter"
3186                                         " ignored");
3187                                 break;
3188                         }
3189                         if (mask.eth->type) {
3190                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3191                                                  spec.eth->type);
3192                                 eth_type_set = 1;
3193                         }
3194                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3195                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3196                                              ETHER_ADDR_LEN,
3197                                              spec.eth->dst.addr_bytes);
3198                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3199                                              ETHER_ADDR_LEN,
3200                                              mask.eth->dst.addr_bytes);
3201                         }
3202                         if (!is_zero_ether_addr(&mask.eth->src)) {
3203                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3204                                              ETHER_ADDR_LEN,
3205                                              spec.eth->src.addr_bytes);
3206                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3207                                              ETHER_ADDR_LEN,
3208                                              mask.eth->src.addr_bytes);
3209                         }
3210                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3211                         break;
3212                 case RTE_FLOW_ITEM_TYPE_VLAN:
3213                         assert(!encap.hdr);
3214                         assert(!decap.hdr);
3215                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3216                         mask.vlan = flow_tcf_item_mask
3217                                 (items, &rte_flow_item_vlan_mask,
3218                                  &flow_tcf_mask_supported.vlan,
3219                                  &flow_tcf_mask_empty.vlan,
3220                                  sizeof(flow_tcf_mask_supported.vlan),
3221                                  error);
3222                         assert(mask.vlan);
3223                         if (!eth_type_set)
3224                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
3225                                                  RTE_BE16(ETH_P_8021Q));
3226                         eth_type_set = 1;
3227                         vlan_present = 1;
3228                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3229                                 break;
3230                         spec.vlan = items->spec;
3231                         if (mask.vlan->inner_type) {
3232                                 mnl_attr_put_u16(nlh,
3233                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3234                                                  spec.vlan->inner_type);
3235                                 vlan_eth_type_set = 1;
3236                         }
3237                         if (mask.vlan->tci & RTE_BE16(0xe000))
3238                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3239                                                 (rte_be_to_cpu_16
3240                                                  (spec.vlan->tci) >> 13) & 0x7);
3241                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3242                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3243                                                  rte_be_to_cpu_16
3244                                                  (spec.vlan->tci &
3245                                                   RTE_BE16(0x0fff)));
3246                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3247                         break;
3248                 case RTE_FLOW_ITEM_TYPE_IPV4:
3249                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3250                         mask.ipv4 = flow_tcf_item_mask
3251                                 (items, &rte_flow_item_ipv4_mask,
3252                                  &flow_tcf_mask_supported.ipv4,
3253                                  &flow_tcf_mask_empty.ipv4,
3254                                  sizeof(flow_tcf_mask_supported.ipv4),
3255                                  error);
3256                         assert(mask.ipv4);
3257                         spec.ipv4 = items->spec;
3258                         if (!decap.vxlan) {
3259                                 if (!eth_type_set && !vlan_eth_type_set)
3260                                         mnl_attr_put_u16
3261                                                 (nlh,
3262                                                  vlan_present ?
3263                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3264                                                  TCA_FLOWER_KEY_ETH_TYPE,
3265                                                  RTE_BE16(ETH_P_IP));
3266                                 eth_type_set = 1;
3267                                 vlan_eth_type_set = 1;
3268                                 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
3269                                         break;
3270                                 if (mask.ipv4->hdr.next_proto_id) {
3271                                         mnl_attr_put_u8
3272                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3273                                                  spec.ipv4->hdr.next_proto_id);
3274                                         ip_proto_set = 1;
3275                                 }
3276                         } else {
3277                                 assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
3278                         }
3279                         if (mask.ipv4->hdr.src_addr) {
3280                                 mnl_attr_put_u32
3281                                         (nlh, decap.vxlan ?
3282                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3283                                          TCA_FLOWER_KEY_IPV4_SRC,
3284                                          spec.ipv4->hdr.src_addr);
3285                                 mnl_attr_put_u32
3286                                         (nlh, decap.vxlan ?
3287                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3288                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3289                                          mask.ipv4->hdr.src_addr);
3290                         }
3291                         if (mask.ipv4->hdr.dst_addr) {
3292                                 mnl_attr_put_u32
3293                                         (nlh, decap.vxlan ?
3294                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3295                                          TCA_FLOWER_KEY_IPV4_DST,
3296                                          spec.ipv4->hdr.dst_addr);
3297                                 mnl_attr_put_u32
3298                                         (nlh, decap.vxlan ?
3299                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3300                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3301                                          mask.ipv4->hdr.dst_addr);
3302                         }
3303                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3304                         break;
3305                 case RTE_FLOW_ITEM_TYPE_IPV6:
3306                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3307                         mask.ipv6 = flow_tcf_item_mask
3308                                 (items, &rte_flow_item_ipv6_mask,
3309                                  &flow_tcf_mask_supported.ipv6,
3310                                  &flow_tcf_mask_empty.ipv6,
3311                                  sizeof(flow_tcf_mask_supported.ipv6),
3312                                  error);
3313                         assert(mask.ipv6);
3314                         spec.ipv6 = items->spec;
3315                         if (!decap.vxlan) {
3316                                 if (!eth_type_set || !vlan_eth_type_set) {
3317                                         mnl_attr_put_u16
3318                                                 (nlh,
3319                                                  vlan_present ?
3320                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
3321                                                  TCA_FLOWER_KEY_ETH_TYPE,
3322                                                  RTE_BE16(ETH_P_IPV6));
3323                                 }
3324                                 eth_type_set = 1;
3325                                 vlan_eth_type_set = 1;
3326                                 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
3327                                         break;
3328                                 if (mask.ipv6->hdr.proto) {
3329                                         mnl_attr_put_u8
3330                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3331                                                  spec.ipv6->hdr.proto);
3332                                         ip_proto_set = 1;
3333                                 }
3334                         } else {
3335                                 assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
3336                         }
3337                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
3338                                 mnl_attr_put(nlh, decap.vxlan ?
3339                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3340                                              TCA_FLOWER_KEY_IPV6_SRC,
3341                                              IPV6_ADDR_LEN,
3342                                              spec.ipv6->hdr.src_addr);
3343                                 mnl_attr_put(nlh, decap.vxlan ?
3344                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3345                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3346                                              IPV6_ADDR_LEN,
3347                                              mask.ipv6->hdr.src_addr);
3348                         }
3349                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
3350                                 mnl_attr_put(nlh, decap.vxlan ?
3351                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3352                                              TCA_FLOWER_KEY_IPV6_DST,
3353                                              IPV6_ADDR_LEN,
3354                                              spec.ipv6->hdr.dst_addr);
3355                                 mnl_attr_put(nlh, decap.vxlan ?
3356                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3357                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3358                                              IPV6_ADDR_LEN,
3359                                              mask.ipv6->hdr.dst_addr);
3360                         }
3361                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3362                         break;
3363                 case RTE_FLOW_ITEM_TYPE_UDP:
3364                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
3365                         mask.udp = flow_tcf_item_mask
3366                                 (items, &rte_flow_item_udp_mask,
3367                                  &flow_tcf_mask_supported.udp,
3368                                  &flow_tcf_mask_empty.udp,
3369                                  sizeof(flow_tcf_mask_supported.udp),
3370                                  error);
3371                         assert(mask.udp);
3372                         spec.udp = items->spec;
3373                         if (!decap.vxlan) {
3374                                 if (!ip_proto_set)
3375                                         mnl_attr_put_u8
3376                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3377                                                 IPPROTO_UDP);
3378                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3379                                         break;
3380                         } else {
3381                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3382                                 decap.vxlan->udp_port =
3383                                         rte_be_to_cpu_16
3384                                                 (spec.udp->hdr.dst_port);
3385                         }
3386                         if (mask.udp->hdr.src_port) {
3387                                 mnl_attr_put_u16
3388                                         (nlh, decap.vxlan ?
3389                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3390                                          TCA_FLOWER_KEY_UDP_SRC,
3391                                          spec.udp->hdr.src_port);
3392                                 mnl_attr_put_u16
3393                                         (nlh, decap.vxlan ?
3394                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3395                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3396                                          mask.udp->hdr.src_port);
3397                         }
3398                         if (mask.udp->hdr.dst_port) {
3399                                 mnl_attr_put_u16
3400                                         (nlh, decap.vxlan ?
3401                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3402                                          TCA_FLOWER_KEY_UDP_DST,
3403                                          spec.udp->hdr.dst_port);
3404                                 mnl_attr_put_u16
3405                                         (nlh, decap.vxlan ?
3406                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3407                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3408                                          mask.udp->hdr.dst_port);
3409                         }
3410                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3411                         break;
3412                 case RTE_FLOW_ITEM_TYPE_TCP:
3413                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
3414                         mask.tcp = flow_tcf_item_mask
3415                                 (items, &rte_flow_item_tcp_mask,
3416                                  &flow_tcf_mask_supported.tcp,
3417                                  &flow_tcf_mask_empty.tcp,
3418                                  sizeof(flow_tcf_mask_supported.tcp),
3419                                  error);
3420                         assert(mask.tcp);
3421                         if (!ip_proto_set)
3422                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3423                                                 IPPROTO_TCP);
3424                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3425                                 break;
3426                         spec.tcp = items->spec;
3427                         if (mask.tcp->hdr.src_port) {
3428                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3429                                                  spec.tcp->hdr.src_port);
3430                                 mnl_attr_put_u16(nlh,
3431                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3432                                                  mask.tcp->hdr.src_port);
3433                         }
3434                         if (mask.tcp->hdr.dst_port) {
3435                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3436                                                  spec.tcp->hdr.dst_port);
3437                                 mnl_attr_put_u16(nlh,
3438                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3439                                                  mask.tcp->hdr.dst_port);
3440                         }
3441                         if (mask.tcp->hdr.tcp_flags) {
3442                                 mnl_attr_put_u16
3443                                         (nlh,
3444                                          TCA_FLOWER_KEY_TCP_FLAGS,
3445                                          rte_cpu_to_be_16
3446                                                 (spec.tcp->hdr.tcp_flags));
3447                                 mnl_attr_put_u16
3448                                         (nlh,
3449                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3450                                          rte_cpu_to_be_16
3451                                                 (mask.tcp->hdr.tcp_flags));
3452                         }
3453                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3454                         break;
3455                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3456                         assert(decap.vxlan);
3457                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3458                         spec.vxlan = items->spec;
3459                         mnl_attr_put_u32(nlh,
3460                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3461                                          vxlan_vni_as_be32(spec.vxlan->vni));
3462                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3463                         break;
3464                 default:
3465                         return rte_flow_error_set(error, ENOTSUP,
3466                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3467                                                   NULL, "item not supported");
3468                 }
3469         }
3470         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3471         na_act_index_cur = 1;
3472         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3473                 struct nlattr *na_act_index;
3474                 struct nlattr *na_act;
3475                 unsigned int vlan_act;
3476                 unsigned int i;
3477
3478                 switch (actions->type) {
3479                 case RTE_FLOW_ACTION_TYPE_VOID:
3480                         break;
3481                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3482                         conf.port_id = actions->conf;
3483                         if (conf.port_id->original)
3484                                 i = 0;
3485                         else
3486                                 for (i = 0; ptoi[i].ifindex; ++i)
3487                                         if (ptoi[i].port_id == conf.port_id->id)
3488                                                 break;
3489                         assert(ptoi[i].ifindex);
3490                         na_act_index =
3491                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3492                         assert(na_act_index);
3493                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3494                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3495                         assert(na_act);
3496                         if (encap.hdr) {
3497                                 assert(dev_flow->tcf.tunnel);
3498                                 dev_flow->tcf.tunnel->ifindex_ptr =
3499                                         &((struct tc_mirred *)
3500                                         mnl_attr_get_payload
3501                                         (mnl_nlmsg_get_payload_tail
3502                                                 (nlh)))->ifindex;
3503                         }
3504                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3505                                      sizeof(struct tc_mirred),
3506                                      &(struct tc_mirred){
3507                                         .action = TC_ACT_STOLEN,
3508                                         .eaction = TCA_EGRESS_REDIR,
3509                                         .ifindex = ptoi[i].ifindex,
3510                                      });
3511                         mnl_attr_nest_end(nlh, na_act);
3512                         mnl_attr_nest_end(nlh, na_act_index);
3513                         break;
3514                 case RTE_FLOW_ACTION_TYPE_JUMP:
3515                         conf.jump = actions->conf;
3516                         na_act_index =
3517                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3518                         assert(na_act_index);
3519                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3520                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3521                         assert(na_act);
3522                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3523                                      sizeof(struct tc_gact),
3524                                      &(struct tc_gact){
3525                                         .action = TC_ACT_GOTO_CHAIN |
3526                                                   conf.jump->group,
3527                                      });
3528                         mnl_attr_nest_end(nlh, na_act);
3529                         mnl_attr_nest_end(nlh, na_act_index);
3530                         break;
3531                 case RTE_FLOW_ACTION_TYPE_DROP:
3532                         na_act_index =
3533                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3534                         assert(na_act_index);
3535                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3536                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3537                         assert(na_act);
3538                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3539                                      sizeof(struct tc_gact),
3540                                      &(struct tc_gact){
3541                                         .action = TC_ACT_SHOT,
3542                                      });
3543                         mnl_attr_nest_end(nlh, na_act);
3544                         mnl_attr_nest_end(nlh, na_act_index);
3545                         break;
3546                 case RTE_FLOW_ACTION_TYPE_COUNT:
3547                         /*
3548                          * Driver adds the count action implicitly for
3549                          * each rule it creates.
3550                          */
3551                         ret = flow_tcf_translate_action_count(dev,
3552                                                               dev_flow, error);
3553                         if (ret < 0)
3554                                 return ret;
3555                         break;
3556                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3557                         conf.of_push_vlan = NULL;
3558                         vlan_act = TCA_VLAN_ACT_POP;
3559                         goto action_of_vlan;
3560                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3561                         conf.of_push_vlan = actions->conf;
3562                         vlan_act = TCA_VLAN_ACT_PUSH;
3563                         goto action_of_vlan;
3564                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3565                         conf.of_set_vlan_vid = actions->conf;
3566                         if (na_vlan_id)
3567                                 goto override_na_vlan_id;
3568                         vlan_act = TCA_VLAN_ACT_MODIFY;
3569                         goto action_of_vlan;
3570                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3571                         conf.of_set_vlan_pcp = actions->conf;
3572                         if (na_vlan_priority)
3573                                 goto override_na_vlan_priority;
3574                         vlan_act = TCA_VLAN_ACT_MODIFY;
3575                         goto action_of_vlan;
3576 action_of_vlan:
3577                         na_act_index =
3578                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3579                         assert(na_act_index);
3580                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3581                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3582                         assert(na_act);
3583                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3584                                      sizeof(struct tc_vlan),
3585                                      &(struct tc_vlan){
3586                                         .action = TC_ACT_PIPE,
3587                                         .v_action = vlan_act,
3588                                      });
3589                         if (vlan_act == TCA_VLAN_ACT_POP) {
3590                                 mnl_attr_nest_end(nlh, na_act);
3591                                 mnl_attr_nest_end(nlh, na_act_index);
3592                                 break;
3593                         }
3594                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3595                                 mnl_attr_put_u16(nlh,
3596                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3597                                                  conf.of_push_vlan->ethertype);
3598                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3599                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3600                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3601                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3602                         mnl_attr_nest_end(nlh, na_act);
3603                         mnl_attr_nest_end(nlh, na_act_index);
3604                         if (actions->type ==
3605                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3606 override_na_vlan_id:
3607                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3608                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3609                                         rte_be_to_cpu_16
3610                                         (conf.of_set_vlan_vid->vlan_vid);
3611                         } else if (actions->type ==
3612                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3613 override_na_vlan_priority:
3614                                 na_vlan_priority->nla_type =
3615                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3616                                 *(uint8_t *)mnl_attr_get_payload
3617                                         (na_vlan_priority) =
3618                                         conf.of_set_vlan_pcp->vlan_pcp;
3619                         }
3620                         break;
3621                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3622                         assert(decap.vxlan);
3623                         assert(dev_flow->tcf.tunnel);
3624                         dev_flow->tcf.tunnel->ifindex_ptr =
3625                                 (unsigned int *)&tcm->tcm_ifindex;
3626                         na_act_index =
3627                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3628                         assert(na_act_index);
3629                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3630                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3631                         assert(na_act);
3632                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3633                                 sizeof(struct tc_tunnel_key),
3634                                 &(struct tc_tunnel_key){
3635                                         .action = TC_ACT_PIPE,
3636                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3637                                         });
3638                         mnl_attr_nest_end(nlh, na_act);
3639                         mnl_attr_nest_end(nlh, na_act_index);
3640                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3641                         break;
3642                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3643                         assert(encap.vxlan);
3644                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3645                         na_act_index =
3646                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3647                         assert(na_act_index);
3648                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3649                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3650                         assert(na_act);
3651                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3652                                 sizeof(struct tc_tunnel_key),
3653                                 &(struct tc_tunnel_key){
3654                                         .action = TC_ACT_PIPE,
3655                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3656                                         });
3657                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3658                                 mnl_attr_put_u16(nlh,
3659                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3660                                          encap.vxlan->udp.dst);
3661                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3662                                 mnl_attr_put_u32(nlh,
3663                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3664                                          encap.vxlan->ipv4.src);
3665                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3666                                 mnl_attr_put_u32(nlh,
3667                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3668                                          encap.vxlan->ipv4.dst);
3669                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3670                                 mnl_attr_put(nlh,
3671                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3672                                          sizeof(encap.vxlan->ipv6.src),
3673                                          &encap.vxlan->ipv6.src);
3674                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3675                                 mnl_attr_put(nlh,
3676                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3677                                          sizeof(encap.vxlan->ipv6.dst),
3678                                          &encap.vxlan->ipv6.dst);
3679                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3680                                 mnl_attr_put_u32(nlh,
3681                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3682                                          vxlan_vni_as_be32
3683                                                 (encap.vxlan->vxlan.vni));
3684                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3685                         mnl_attr_nest_end(nlh, na_act);
3686                         mnl_attr_nest_end(nlh, na_act_index);
3687                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3688                         break;
3689                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3690                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3691                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3692                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3693                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3694                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3695                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3696                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3697                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3698                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3699                         na_act_index =
3700                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3701                         flow_tcf_create_pedit_mnl_msg(nlh,
3702                                                       &actions, item_flags);
3703                         mnl_attr_nest_end(nlh, na_act_index);
3704                         break;
3705                 default:
3706                         return rte_flow_error_set(error, ENOTSUP,
3707                                                   RTE_FLOW_ERROR_TYPE_ACTION,
3708                                                   actions,
3709                                                   "action not supported");
3710                 }
3711         }
3712         assert(na_flower);
3713         assert(na_flower_act);
3714         mnl_attr_nest_end(nlh, na_flower_act);
3715         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3716                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
3717         mnl_attr_nest_end(nlh, na_flower);
3718         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3719                 dev_flow->tcf.tunnel->ifindex_org =
3720                         *dev_flow->tcf.tunnel->ifindex_ptr;
3721         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3722         return 0;
3723 }
3724
3725 /**
3726  * Send Netlink message with acknowledgment.
3727  *
3728  * @param tcf
3729  *   Flow context to use.
3730  * @param nlh
3731  *   Message to send. This function always raises the NLM_F_ACK flag before
3732  *   sending.
3733  * @param[in] msglen
3734  *   Message length. Message buffer may contain multiple commands and
3735  *   nlmsg_len field not always corresponds to actual message length.
3736  *   If 0 specified the nlmsg_len field in header is used as message length.
3737  * @param[in] cb
3738  *   Callback handler for received message.
3739  * @param[in] arg
3740  *   Context pointer for callback handler.
3741  *
3742  * @return
3743  *   0 on success, a negative errno value otherwise and rte_errno is set.
3744  */
static int
flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
		struct nlmsghdr *nlh,
		uint32_t msglen,
		mnl_cb_t cb, void *arg)
{
	unsigned int portid = mnl_socket_get_portid(tcf->nl);
	uint32_t seq = tcf->seq++;
	int err, ret;

	assert(tcf->nl);
	assert(tcf->buf);
	if (!seq)
		/* seq 0 is reserved for kernel event-driven notifications. */
		seq = tcf->seq++;
	nlh->nlmsg_seq = seq;
	if (!msglen) {
		/*
		 * Single-command message: take the length from the header
		 * and request an explicit acknowledgment from the kernel.
		 */
		msglen = nlh->nlmsg_len;
		nlh->nlmsg_flags |= NLM_F_ACK;
	}
	ret = mnl_socket_sendto(tcf->nl, nlh, msglen);
	/* mnl_socket_sendto() returns bytes sent, or -1 with errno set. */
	err = (ret <= 0) ? errno : 0;
	/* Replies are received into the shared context buffer. */
	nlh = (struct nlmsghdr *)(tcf->buf);
	/*
	 * The following loop postpones non-fatal errors until multipart
	 * messages are complete.
	 */
	if (ret > 0)
		while (true) {
			ret = mnl_socket_recvfrom(tcf->nl, tcf->buf,
						  tcf->buf_size);
			if (ret < 0) {
				err = errno;
				/*
				 * libmnl reports a datagram truncated to the
				 * buffer size as ENOSPC; remember the error
				 * but keep draining the remaining parts.
				 */
				if (err != ENOSPC)
					break;
			}
			if (!err) {
				/* Run the callback over received replies/ACK. */
				ret = mnl_cb_run(nlh, ret, seq, portid,
						 cb, arg);
				if (ret < 0) {
					err = errno;
					break;
				}
			}
			/* Will receive till end of multipart message */
			if (!(nlh->nlmsg_flags & NLM_F_MULTI) ||
			      nlh->nlmsg_type == NLMSG_DONE)
				break;
		}
	if (!err)
		return 0;
	rte_errno = err;
	return -err;
}
3799
/* Extra slack appended to computed netlink message buffer sizes. */
#define MNL_BUF_EXTRA_SPACE 16
/* Netlink request buffer size: page size clamped to [256, 2048] bytes. */
#define MNL_REQUEST_SIZE_MIN 256
#define MNL_REQUEST_SIZE_MAX 2048
#define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
				 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)

/* VTEP device list is shared between PMD port instances. */
static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
/* Serializes all access to vtep_list_vxlan across port instances. */
static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
3809
3810 /**
3811  * Deletes VTEP network device.
3812  *
3813  * @param[in] tcf
3814  *   Context object initialized by mlx5_flow_tcf_context_create().
3815  * @param[in] vtep
3816  *   Object represinting the network device to delete. Memory
3817  *   allocated for this object is freed by routine.
3818  */
3819 static void
3820 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
3821                      struct tcf_vtep *vtep)
3822 {
3823         struct nlmsghdr *nlh;
3824         struct ifinfomsg *ifm;
3825         alignas(struct nlmsghdr)
3826         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
3827                     MNL_BUF_EXTRA_SPACE];
3828         int ret;
3829
3830         assert(!vtep->refcnt);
3831         /* Delete only ifaces those we actually created. */
3832         if (vtep->created && vtep->ifindex) {
3833                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
3834                 nlh = mnl_nlmsg_put_header(buf);
3835                 nlh->nlmsg_type = RTM_DELLINK;
3836                 nlh->nlmsg_flags = NLM_F_REQUEST;
3837                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
3838                 ifm->ifi_family = AF_UNSPEC;
3839                 ifm->ifi_index = vtep->ifindex;
3840                 assert(sizeof(buf) >= nlh->nlmsg_len);
3841                 ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
3842                 if (ret)
3843                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
3844                                          " encap/decap ifindex %u",
3845                                          ifm->ifi_index);
3846         }
3847         rte_free(vtep);
3848 }
3849
3850 /**
3851  * Creates VTEP network device.
3852  *
3853  * @param[in] tcf
3854  *   Context object initialized by mlx5_flow_tcf_context_create().
3855  * @param[in] ifouter
3856  *   Outer interface to attach new-created VXLAN device
3857  *   If zero the VXLAN device will not be attached to any device.
3858  *   These VTEPs are used for decapsulation and can be precreated
3859  *   and shared between processes.
3860  * @param[in] port
3861  *   UDP port of created VTEP device.
3862  * @param[out] error
3863  *   Perform verbose error reporting if not NULL.
3864  *
3865  * @return
3866  * Pointer to created device structure on success,
3867  * NULL otherwise and rte_errno is set.
3868  */
3869 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
3870 static struct tcf_vtep*
3871 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
3872                      unsigned int ifouter,
3873                      uint16_t port, struct rte_flow_error *error)
3874 {
3875         struct tcf_vtep *vtep;
3876         struct nlmsghdr *nlh;
3877         struct ifinfomsg *ifm;
3878         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
3879         alignas(struct nlmsghdr)
3880         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
3881                     SZ_NLATTR_DATA_OF(sizeof(name)) +
3882                     SZ_NLATTR_NEST * 2 +
3883                     SZ_NLATTR_STRZ_OF("vxlan") +
3884                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
3885                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
3886                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
3887                     MNL_BUF_EXTRA_SPACE];
3888         struct nlattr *na_info;
3889         struct nlattr *na_vxlan;
3890         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
3891         int ret;
3892
3893         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
3894         if (!vtep) {
3895                 rte_flow_error_set(error, ENOMEM,
3896                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
3897                                    "unable to allocate memory for VTEP");
3898                 return NULL;
3899         }
3900         *vtep = (struct tcf_vtep){
3901                         .port = port,
3902                         .local = LIST_HEAD_INITIALIZER(),
3903                         .neigh = LIST_HEAD_INITIALIZER(),
3904         };
3905         memset(buf, 0, sizeof(buf));
3906         nlh = mnl_nlmsg_put_header(buf);
3907         nlh->nlmsg_type = RTM_NEWLINK;
3908         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE  | NLM_F_EXCL;
3909         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
3910         ifm->ifi_family = AF_UNSPEC;
3911         ifm->ifi_type = 0;
3912         ifm->ifi_index = 0;
3913         ifm->ifi_flags = IFF_UP;
3914         ifm->ifi_change = 0xffffffff;
3915         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
3916         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
3917         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
3918         assert(na_info);
3919         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
3920         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
3921         if (ifouter)
3922                 mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter);
3923         assert(na_vxlan);
3924         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
3925         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
3926         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
3927         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
3928         mnl_attr_nest_end(nlh, na_vxlan);
3929         mnl_attr_nest_end(nlh, na_info);
3930         assert(sizeof(buf) >= nlh->nlmsg_len);
3931         ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
3932         if (ret) {
3933                 DRV_LOG(WARNING,
3934                         "netlink: VTEP %s create failure (%d)",
3935                         name, rte_errno);
3936                 if (rte_errno != EEXIST || ifouter)
3937                         /*
3938                          * Some unhandled error occurred or device is
3939                          * for encapsulation and cannot be shared.
3940                          */
3941                         goto error;
3942         } else {
3943                 /*
3944                  * Mark device we actually created.
3945                  * We should explicitly delete
3946                  * when we do not need it anymore.
3947                  */
3948                 vtep->created = 1;
3949         }
3950         /* Try to get ifindex of created of pre-existing device. */
3951         ret = if_nametoindex(name);
3952         if (!ret) {
3953                 DRV_LOG(WARNING,
3954                         "VTEP %s failed to get index (%d)", name, errno);
3955                 rte_flow_error_set
3956                         (error, -errno,
3957                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
3958                          "netlink: failed to retrieve VTEP ifindex");
3959                 goto error;
3960         }
3961         vtep->ifindex = ret;
3962         vtep->ifouter = ifouter;
3963         memset(buf, 0, sizeof(buf));
3964         nlh = mnl_nlmsg_put_header(buf);
3965         nlh->nlmsg_type = RTM_NEWLINK;
3966         nlh->nlmsg_flags = NLM_F_REQUEST;
3967         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
3968         ifm->ifi_family = AF_UNSPEC;
3969         ifm->ifi_type = 0;
3970         ifm->ifi_index = vtep->ifindex;
3971         ifm->ifi_flags = IFF_UP;
3972         ifm->ifi_change = IFF_UP;
3973         ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
3974         if (ret) {
3975                 rte_flow_error_set(error, -errno,
3976                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
3977                                    "netlink: failed to set VTEP link up");
3978                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
3979                         name, rte_errno);
3980                 goto clean;
3981         }
3982         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
3983         if (ret) {
3984                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
3985                 goto clean;
3986         }
3987         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
3988         vtep->refcnt = 1;
3989         return vtep;
3990 clean:
3991         flow_tcf_vtep_delete(tcf, vtep);
3992         return NULL;
3993 error:
3994         rte_free(vtep);
3995         return NULL;
3996 }
3997 #else
/*
 * Fallback stub compiled when the kernel headers lack
 * IFLA_VXLAN_COLLECT_METADATA: VTEP devices cannot be created at all,
 * so always fail with ENOTSUP and set verbose error info.
 */
static struct tcf_vtep*
flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
		     unsigned int ifouter __rte_unused,
		     uint16_t port __rte_unused,
		     struct rte_flow_error *error)
{
	rte_flow_error_set(error, ENOTSUP,
			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
			   "netlink: failed to create VTEP, "
			   "vxlan metadata are not supported by kernel");
	return NULL;
}
4010 #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
4011
4012 /**
4013  * Acquire target interface index for VXLAN tunneling decapsulation.
4014  * In order to share the UDP port within the other interfaces the
4015  * VXLAN device created as not attached to any interface (if created).
4016  *
4017  * @param[in] tcf
4018  *   Context object initialized by mlx5_flow_tcf_context_create().
4019  * @param[in] dev_flow
4020  *   Flow tcf object with tunnel structure pointer set.
4021  * @param[out] error
4022  *   Perform verbose error reporting if not NULL.
4023  * @return
4024  *   Interface descriptor pointer on success,
4025  *   NULL otherwise and rte_errno is set.
4026  */
4027 static struct tcf_vtep*
4028 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4029                             struct mlx5_flow *dev_flow,
4030                             struct rte_flow_error *error)
4031 {
4032         struct tcf_vtep *vtep;
4033         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
4034
4035         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
4036                 if (vtep->port == port)
4037                         break;
4038         }
4039         if (vtep && vtep->ifouter) {
4040                 rte_flow_error_set(error, -errno,
4041                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4042                                    "Failed to create decap VTEP with specified"
4043                                    " UDP port, atatched device exists");
4044                 return NULL;
4045         }
4046         if (vtep) {
4047                 /* Device exists, just increment the reference counter. */
4048                 vtep->refcnt++;
4049                 assert(vtep->ifindex);
4050                 return vtep;
4051         }
4052         /* No decapsulation device exists, try to create the new one. */
4053         vtep = flow_tcf_vtep_create(tcf, 0, port, error);
4054         if (vtep)
4055                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
4056         return vtep;
4057 }
4058
/**
 * Acquire target interface index for VXLAN tunneling encapsulation.
 *
 * The encap VTEP must be attached to the specified outer interface;
 * one attached device per outer interface is shared and reference
 * counted. A free UDP port from the MLX5_VXLAN_PORT_MIN..MAX range is
 * picked for newly created devices.
 *
 * @param[in] tcf
 *   Context object initialized by mlx5_flow_tcf_context_create().
 * @param[in] ifouter
 *   Network interface index to attach VXLAN encap device to.
 * @param[in] dev_flow
 *   Flow tcf object with tunnel structure pointer set.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @return
 *   Interface descriptor pointer on success,
 *   NULL otherwise and rte_errno is set.
 */
static struct tcf_vtep*
flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
			    unsigned int ifouter,
			    struct mlx5_flow *dev_flow __rte_unused,
			    struct rte_flow_error *error)
{
	/* Rotating UDP port selector, persists across calls. */
	static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
	struct tcf_vtep *vtep;

	assert(ifouter);
	/* Look whether the attached VTEP for encap is created. */
	LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
		if (vtep->ifouter == ifouter)
			break;
	}
	if (vtep) {
		/* VTEP already exists, just increment the reference. */
		vtep->refcnt++;
	} else {
		uint16_t pcnt;

		/* Not found, we should create the new attached VTEP. */
		for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX
				     - MLX5_VXLAN_PORT_MIN); pcnt++) {
			encap_port++;
			/* Wraparound the UDP port index. */
			if (encap_port < MLX5_VXLAN_PORT_MIN ||
			    encap_port > MLX5_VXLAN_PORT_MAX)
				encap_port = MLX5_VXLAN_PORT_MIN;
			/* Check whether the UDP port is already in use. */
			LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
				if (vtep->port == encap_port)
					break;
			}
			if (vtep) {
				/* Port is in use, try the next one. */
				vtep = NULL;
				continue;
			}
			vtep = flow_tcf_vtep_create(tcf, ifouter,
						    encap_port, error);
			if (vtep) {
				LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
				break;
			}
			/* Retry with another port only if this one is busy. */
			if (rte_errno != EEXIST)
				break;
		}
		if (!vtep)
			return NULL;
	}
	assert(vtep->ifouter == ifouter);
	assert(vtep->ifindex);
	return vtep;
}
4129
4130 /**
4131  * Acquires target interface index for tunneling of any type.
4132  * Creates the new VTEP if needed.
4133  *
4134  * @param[in] tcf
4135  *   Context object initialized by mlx5_flow_tcf_context_create().
4136  * @param[in] ifouter
4137  *   Network interface index to attach VXLAN encap device to.
4138  * @param[in] dev_flow
4139  *   Flow tcf object with tunnel structure pointer set.
4140  * @param[out] error
4141  *   Perform verbose error reporting if not NULL.
4142  * @return
4143  *   Interface descriptor pointer on success,
4144  *   NULL otherwise and rte_errno is set.
4145  */
4146 static struct tcf_vtep*
4147 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
4148                       unsigned int ifouter,
4149                       struct mlx5_flow *dev_flow,
4150                       struct rte_flow_error *error)
4151 {
4152         struct tcf_vtep *vtep = NULL;
4153
4154         assert(dev_flow->tcf.tunnel);
4155         pthread_mutex_lock(&vtep_list_mutex);
4156         switch (dev_flow->tcf.tunnel->type) {
4157         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
4158                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
4159                                                   dev_flow, error);
4160                 break;
4161         case FLOW_TCF_TUNACT_VXLAN_DECAP:
4162                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
4163                 break;
4164         default:
4165                 rte_flow_error_set(error, ENOTSUP,
4166                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4167                                    "unsupported tunnel type");
4168                 break;
4169         }
4170         pthread_mutex_unlock(&vtep_list_mutex);
4171         return vtep;
4172 }
4173
4174 /**
4175  * Release tunneling interface by ifindex. Decrements reference
4176  * counter and actually removes the device if counter is zero.
4177  *
4178  * @param[in] tcf
4179  *   Context object initialized by mlx5_flow_tcf_context_create().
4180  * @param[in] vtep
4181  *   VTEP device descriptor structure.
4182  * @param[in] dev_flow
4183  *   Flow tcf object with tunnel structure pointer set.
4184  */
4185 static void
4186 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
4187                       struct tcf_vtep *vtep,
4188                       struct mlx5_flow *dev_flow)
4189 {
4190         assert(dev_flow->tcf.tunnel);
4191         pthread_mutex_lock(&vtep_list_mutex);
4192         switch (dev_flow->tcf.tunnel->type) {
4193         case FLOW_TCF_TUNACT_VXLAN_DECAP:
4194                 break;
4195         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
4196                 break;
4197         default:
4198                 assert(false);
4199                 DRV_LOG(WARNING, "Unsupported tunnel type");
4200                 break;
4201         }
4202         assert(vtep->refcnt);
4203         if (--vtep->refcnt == 0) {
4204                 LIST_REMOVE(vtep, next);
4205                 flow_tcf_vtep_delete(tcf, vtep);
4206         }
4207         pthread_mutex_unlock(&vtep_list_mutex);
4208 }
4209
4210
/**
 * Apply flow to E-Switch by sending Netlink message.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[in, out] flow
 *   Pointer to the sub flow.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
	       struct rte_flow_error *error)
{
	struct priv *priv = dev->data->dev_private;
	struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
	struct mlx5_flow *dev_flow;
	struct nlmsghdr *nlh;

	dev_flow = LIST_FIRST(&flow->dev_flows);
	/* E-Switch flow can't be expanded. */
	assert(!LIST_NEXT(dev_flow, next));
	/* Already applied rules are not sent to the kernel again. */
	if (dev_flow->tcf.applied)
		return 0;
	nlh = dev_flow->tcf.nlh;
	nlh->nlmsg_type = RTM_NEWTFILTER;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	if (dev_flow->tcf.tunnel) {
		/*
		 * Replace the interface index, target for
		 * encapsulation, source for decapsulation.
		 */
		assert(!dev_flow->tcf.tunnel->vtep);
		assert(dev_flow->tcf.tunnel->ifindex_ptr);
		/* Acquire actual VTEP device when rule is being applied. */
		dev_flow->tcf.tunnel->vtep =
			flow_tcf_vtep_acquire(ctx,
					dev_flow->tcf.tunnel->ifindex_org,
					dev_flow, error);
		if (!dev_flow->tcf.tunnel->vtep)
			return -rte_errno;
		DRV_LOG(INFO, "Replace ifindex: %d->%d",
				dev_flow->tcf.tunnel->vtep->ifindex,
				dev_flow->tcf.tunnel->ifindex_org);
		/* Patch the ifindex inside the prepared Netlink message. */
		*dev_flow->tcf.tunnel->ifindex_ptr =
			dev_flow->tcf.tunnel->vtep->ifindex;
	}
	if (!flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) {
		dev_flow->tcf.applied = 1;
		return 0;
	}
	return rte_flow_error_set(error, rte_errno,
				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
				  "netlink: failed to create TC flow rule");
}
4269
4270 /**
4271  * Remove flow from E-Switch by sending Netlink message.
4272  *
4273  * @param[in] dev
4274  *   Pointer to Ethernet device.
4275  * @param[in, out] flow
4276  *   Pointer to the sub flow.
4277  */
4278 static void
4279 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
4280 {
4281         struct priv *priv = dev->data->dev_private;
4282         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
4283         struct mlx5_flow *dev_flow;
4284         struct nlmsghdr *nlh;
4285
4286         if (!flow)
4287                 return;
4288         dev_flow = LIST_FIRST(&flow->dev_flows);
4289         if (!dev_flow)
4290                 return;
4291         /* E-Switch flow can't be expanded. */
4292         assert(!LIST_NEXT(dev_flow, next));
4293         if (dev_flow->tcf.applied) {
4294                 nlh = dev_flow->tcf.nlh;
4295                 nlh->nlmsg_type = RTM_DELTFILTER;
4296                 nlh->nlmsg_flags = NLM_F_REQUEST;
4297                 flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL);
4298                 if (dev_flow->tcf.tunnel) {
4299                         assert(dev_flow->tcf.tunnel->vtep);
4300                         flow_tcf_vtep_release(ctx,
4301                                 dev_flow->tcf.tunnel->vtep,
4302                                 dev_flow);
4303                         dev_flow->tcf.tunnel->vtep = NULL;
4304                 }
4305                 dev_flow->tcf.applied = 0;
4306         }
4307 }
4308
4309 /**
4310  * Remove flow from E-Switch and release resources of the device flow.
4311  *
4312  * @param[in] dev
4313  *   Pointer to Ethernet device.
4314  * @param[in, out] flow
4315  *   Pointer to the sub flow.
4316  */
4317 static void
4318 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
4319 {
4320         struct mlx5_flow *dev_flow;
4321
4322         if (!flow)
4323                 return;
4324         flow_tcf_remove(dev, flow);
4325         if (flow->counter) {
4326                 if (--flow->counter->ref_cnt == 0) {
4327                         rte_free(flow->counter);
4328                         flow->counter = NULL;
4329                 }
4330         }
4331         dev_flow = LIST_FIRST(&flow->dev_flows);
4332         if (!dev_flow)
4333                 return;
4334         /* E-Switch flow can't be expanded. */
4335         assert(!LIST_NEXT(dev_flow, next));
4336         LIST_REMOVE(dev_flow, next);
4337         rte_free(dev_flow);
4338 }
4339
/**
 * Helper routine for figuring the space size required for a parse buffer.
 *
 * @param array
 *   array of values to use.
 * @param idx
 *   Current location in array.
 * @param value
 *   Value to compare with.
 *
 * @return
 *   The maximum between the given value and the array value on index,
 *   or the given value alone when the index is negative.
 */
static uint16_t
flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
{
	if (idx < 0)
		return value;
	return array[idx] > value ? array[idx] : value;
}
4358
/**
 * Parse rtnetlink message attributes filling the attribute table with the info
 * retrieved.
 *
 * @param tb
 *   Attribute table to be filled.
 * @param[out] max
 *   Maximum entry in the attribute table.
 * @param rta
 *   The attributes section in the message to be parsed.
 * @param len
 *   The length of the attributes section in the message.
 */
static void
flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
			 struct rtattr *rta, int len)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		unsigned short type = rta->rta_type;

		/* Keep only the first occurrence of each known type. */
		if (type <= max && !tb[type])
			tb[type] = rta;
	}
}
4385
/**
 * Extract flow counters from flower action.
 *
 * @param rta
 *   flower action stats properties in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   data holding the count statistics of the rte_flow retrieved from
 *   the message.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
				       uint16_t rta_type[], int idx,
				       struct gnet_stats_basic *data)
{
	int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
						 TCA_STATS_BASIC);
	struct rtattr *tbs[tca_stats_max + 1];
	struct rtattr *attr;

	if (rta == NULL || idx < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
				 RTA_DATA(rta), RTA_PAYLOAD(rta));
	/* Only the basic statistics object is supported here. */
	if (rta_type[idx] != TCA_STATS_BASIC)
		return -1;
	attr = tbs[TCA_STATS_BASIC];
	if (!attr)
		return -1;
	/* Copy no more than the attribute actually carries. */
	memcpy(data, RTA_DATA(attr),
	       RTE_MIN(RTA_PAYLOAD(attr), sizeof(*data)));
	return 0;
}
4430
4431 /**
4432  * Parse flower single action retrieving the requested action attribute,
4433  * if found.
4434  *
4435  * @param arg
4436  *   flower action properties in the Netlink message received.
4437  * @param rta_type
4438  *   The backward sequence of rta_types, as written in the attribute table,
4439  *   we need to traverse in order to get to the requested object.
4440  * @param idx
4441  *   Current location in rta_type table.
4442  * @param[out] data
4443  *   Count statistics retrieved from the message query.
4444  *
4445  * @return
4446  *   0 if data was found and retrieved, -1 otherwise.
4447  */
4448 static int
4449 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
4450                                      uint16_t rta_type[], int idx, void *data)
4451 {
4452         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
4453         struct rtattr *tb[tca_act_max + 1];
4454
4455         if (arg == NULL || idx < 0)
4456                 return -1;
4457         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
4458                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
4459         if (tb[TCA_ACT_KIND] == NULL)
4460                 return -1;
4461         switch (rta_type[idx]) {
4462         case TCA_ACT_STATS:
4463                 if (tb[TCA_ACT_STATS])
4464                         return flow_tcf_nl_action_stats_parse_and_get
4465                                         (tb[TCA_ACT_STATS],
4466                                          rta_type, --idx,
4467                                          (struct gnet_stats_basic *)data);
4468                 break;
4469         default:
4470                 break;
4471         }
4472         return -1;
4473 }
4474
/**
 * Parse flower action section in the message retrieving the requested
 * attribute from the first action that provides it.
 *
 * @param arg
 *   flower section in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
				 uint16_t rta_type[], int idx, void *data)
{
	struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
	int i;

	if (arg == NULL || idx < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
				 RTA_DATA(arg), RTA_PAYLOAD(arg));
	switch (rta_type[idx]) {
	/*
	 * flow counters are stored in the actions defined by the flow
	 * and not in the flow itself, therefore we need to traverse the
	 * flower chain of actions in search for them.
	 *
	 * Note that the index is not decremented here.
	 */
	case TCA_ACT_STATS:
		for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
			if (tb[i] &&
			!flow_tcf_nl_parse_one_action_and_get(tb[i],
							      rta_type,
							      idx, data))
				return 0;
		}
		break;
	default:
		break;
	}
	return -1;
}
4525
4526 /**
4527  * Parse flower classifier options in the message, retrieving the requested
4528  * attribute if found.
4529  *
4530  * @param opt
4531  *   flower section in the Netlink message received.
4532  * @param rta_type
4533  *   The backward sequence of rta_types, as written in the attribute table,
4534  *   we need to traverse in order to get to the requested object.
4535  * @param idx
4536  *   Current location in rta_type table.
4537  * @param[out] data
4538  *   data retrieved from the message query.
4539  *
4540  * @return
4541  *   0 if data was found and retrieved, -1 otherwise.
4542  */
4543 static int
4544 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
4545                                uint16_t rta_type[], int idx, void *data)
4546 {
4547         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
4548                                                   TCA_FLOWER_ACT);
4549         struct rtattr *tb[tca_flower_max + 1];
4550
4551         if (!opt || idx < 0)
4552                 return -1;
4553         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
4554                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
4555         switch (rta_type[idx]) {
4556         case TCA_FLOWER_ACT:
4557                 if (tb[TCA_FLOWER_ACT])
4558                         return flow_tcf_nl_action_parse_and_get
4559                                                         (tb[TCA_FLOWER_ACT],
4560                                                          rta_type, --idx, data);
4561                 break;
4562         default:
4563                 break;
4564         }
4565         return -1;
4566 }
4567
/**
 * Parse Netlink reply on filter query, retrieving the flow counters.
 *
 * @param cnlh
 *   Message received from Netlink.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
				 uint16_t rta_type[], int idx, void *data)
{
	struct tcmsg *t = NLMSG_DATA(cnlh);
	int len = cnlh->nlmsg_len;
	int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
	struct rtattr *tb[tca_max + 1];

	if (idx < 0)
		return -1;
	/* Accept only TC filter messages. */
	switch (cnlh->nlmsg_type) {
	case RTM_NEWTFILTER:
	case RTM_GETTFILTER:
	case RTM_DELTFILTER:
		break;
	default:
		return -1;
	}
	len -= NLMSG_LENGTH(sizeof(*t));
	if (len < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
	/* Not a TC flower flow - bail out */
	if (!tb[TCA_KIND] || strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
		return -1;
	if (rta_type[idx] == TCA_OPTIONS && tb[TCA_OPTIONS])
		return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
						      rta_type, idx - 1, data);
	return -1;
}
4620
/**
 * A callback to parse Netlink reply on TC flower query.
 *
 * @param nlh
 *   Message received from Netlink.
 * @param[out] data
 *   Pointer to data area to be filled by the parsing routine.
 *   assumed to be a pointer to struct flow_tcf_stats_basic.
 *
 * @return
 *   MNL_CB_OK value.
 */
static int
flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
{
	/*
	 * The backward sequence of rta_types to pass in order to get
	 *  to the counters.
	 */
	uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
				TCA_FLOWER_ACT, TCA_OPTIONS };
	struct flow_tcf_stats_basic *sb_data = data;
	/* Union legitimately drops const - the parser does not modify
	 * the message but takes a non-const pointer.
	 */
	union {
		const struct nlmsghdr *c;
		struct nlmsghdr *nc;
	} tnlh = { .c = nlh };

	if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
					      RTE_DIM(rta_type) - 1,
					      (void *)&sb_data->counters))
		sb_data->valid = true;
	return MNL_CB_OK;
}
4654
/**
 * Query a TC flower rule for its statistics via netlink.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[in] flow
 *   Pointer to the sub flow.
 * @param[out] data
 *   data retrieved by the query.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_query_count(struct rte_eth_dev *dev,
			  struct rte_flow *flow,
			  void *data,
			  struct rte_flow_error *error)
{
	struct flow_tcf_stats_basic sb_data = { 0 };
	struct rte_flow_query_count *qc = data;
	struct priv *priv = dev->data->dev_private;
	struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
	struct mnl_socket *nl = ctx->nl;
	struct mlx5_flow *dev_flow;
	struct nlmsghdr *nlh;
	uint32_t seq = priv->tcf_context->seq++;
	ssize_t ret;
	assert(qc);

	dev_flow = LIST_FIRST(&flow->dev_flows);
	/* E-Switch flow can't be expanded. */
	assert(!LIST_NEXT(dev_flow, next));
	if (!dev_flow->flow->counter)
		goto notsup_exit;
	/* Reuse the prepared rule message as a GET request with echo. */
	nlh = dev_flow->tcf.nlh;
	nlh->nlmsg_type = RTM_GETTFILTER;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
	nlh->nlmsg_seq = seq;
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
		goto error_exit;
	/* Drain replies, extracting the basic statistics via callback. */
	do {
		ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
		if (ret <= 0)
			break;
		ret = mnl_cb_run(ctx->buf, ret, seq,
				 mnl_socket_get_portid(nl),
				 flow_tcf_nl_message_get_stats_basic,
				 (void *)&sb_data);
	} while (ret > 0);
	if (sb_data.valid) {
		/* Return the delta from last reset. */
		qc->hits_set = 1;
		qc->bytes_set = 1;
		qc->hits = sb_data.counters.packets - flow->counter->hits;
		qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
		if (qc->reset) {
			flow->counter->hits = sb_data.counters.packets;
			flow->counter->bytes = sb_data.counters.bytes;
		}
		return 0;
	}
	return rte_flow_error_set(error, EINVAL,
				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
				  NULL,
				  "flow does not have counter");
error_exit:
	return rte_flow_error_set
			(error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
			 NULL, "netlink: failed to read flow rule counters");
notsup_exit:
	return rte_flow_error_set
			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
			 NULL, "counters are not available.");
}
4733
4734 /**
4735  * Query a flow.
4736  *
4737  * @see rte_flow_query()
4738  * @see rte_flow_ops
4739  */
4740 static int
4741 flow_tcf_query(struct rte_eth_dev *dev,
4742                struct rte_flow *flow,
4743                const struct rte_flow_action *actions,
4744                void *data,
4745                struct rte_flow_error *error)
4746 {
4747         int ret = -EINVAL;
4748
4749         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
4750                 switch (actions->type) {
4751                 case RTE_FLOW_ACTION_TYPE_VOID:
4752                         break;
4753                 case RTE_FLOW_ACTION_TYPE_COUNT:
4754                         ret = flow_tcf_query_count(dev, flow, data, error);
4755                         break;
4756                 default:
4757                         return rte_flow_error_set(error, ENOTSUP,
4758                                                   RTE_FLOW_ERROR_TYPE_ACTION,
4759                                                   actions,
4760                                                   "action not supported");
4761                 }
4762         }
4763         return ret;
4764 }
4765
/* Flow driver callbacks for the TC-flower (E-Switch) offload path. */
const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
	.validate = flow_tcf_validate,
	.prepare = flow_tcf_prepare,
	.translate = flow_tcf_translate,
	.apply = flow_tcf_apply,
	.remove = flow_tcf_remove,
	.destroy = flow_tcf_destroy,
	.query = flow_tcf_query,
};
4775
4776 /**
4777  * Create and configure a libmnl socket for Netlink flow rules.
4778  *
4779  * @return
4780  *   A valid libmnl socket object pointer on success, NULL otherwise and
4781  *   rte_errno is set.
4782  */
4783 static struct mnl_socket *
4784 flow_tcf_mnl_socket_create(void)
4785 {
4786         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
4787
4788         if (nl) {
4789                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
4790                                       sizeof(int));
4791                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
4792                         return nl;
4793         }
4794         rte_errno = errno;
4795         if (nl)
4796                 mnl_socket_close(nl);
4797         return NULL;
4798 }
4799
/**
 * Destroy a libmnl socket.
 *
 * Safe to call with NULL, in which case nothing is done.
 *
 * @param nl
 *   Libmnl socket of the @p NETLINK_ROUTE kind.
 */
static void
flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
{
	if (nl == NULL)
		return;
	mnl_socket_close(nl);
}
4812
/**
 * Initialize ingress qdisc of a given network interface.
 *
 * The sequence is delete-then-create so the interface always ends up with
 * a fresh, empty ingress qdisc (all previously attached filters dropped).
 *
 * @param ctx
 *   Pointer to tc-flower context to use.
 * @param ifindex
 *   Index of network interface to initialize.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
		   unsigned int ifindex, struct rte_flow_error *error)
{
	struct nlmsghdr *nlh;
	struct tcmsg *tcm;
	/*
	 * Stack buffer reused for both messages; sized for the tcmsg
	 * header plus the single "ingress" TCA_KIND string attribute.
	 */
	alignas(struct nlmsghdr)
	uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
		    SZ_NLATTR_STRZ_OF("ingress") +
		    MNL_BUF_EXTRA_SPACE];

	/* Destroy existing ingress qdisc and everything attached to it. */
	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_DELQDISC;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = ifindex;
	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
	tcm->tcm_parent = TC_H_INGRESS;
	assert(sizeof(buf) >= nlh->nlmsg_len);
	/* Ignore errors when qdisc is already absent. */
	if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL) &&
	    rte_errno != EINVAL && rte_errno != ENOENT)
		return rte_flow_error_set(error, rte_errno,
					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
					  "netlink: failed to remove ingress"
					  " qdisc");
	/* Create fresh ingress qdisc (NLM_F_EXCL: fail if one remains). */
	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_NEWQDISC;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = ifindex;
	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
	tcm->tcm_parent = TC_H_INGRESS;
	mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
	assert(sizeof(buf) >= nlh->nlmsg_len);
	if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL))
		return rte_flow_error_set(error, rte_errno,
					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
					  "netlink: failed to create ingress"
					  " qdisc");
	return 0;
}
4872
4873 /**
4874  * Create libmnl context for Netlink flow rules.
4875  *
4876  * @return
4877  *   A valid libmnl socket object pointer on success, NULL otherwise and
4878  *   rte_errno is set.
4879  */
4880 struct mlx5_flow_tcf_context *
4881 mlx5_flow_tcf_context_create(void)
4882 {
4883         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
4884                                                         sizeof(*ctx),
4885                                                         sizeof(uint32_t));
4886         if (!ctx)
4887                 goto error;
4888         ctx->nl = flow_tcf_mnl_socket_create();
4889         if (!ctx->nl)
4890                 goto error;
4891         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
4892         ctx->buf = rte_zmalloc(__func__,
4893                                ctx->buf_size, sizeof(uint32_t));
4894         if (!ctx->buf)
4895                 goto error;
4896         ctx->seq = random();
4897         return ctx;
4898 error:
4899         mlx5_flow_tcf_context_destroy(ctx);
4900         return NULL;
4901 }
4902
4903 /**
4904  * Destroy a libmnl context.
4905  *
4906  * @param ctx
4907  *   Libmnl socket of the @p NETLINK_ROUTE kind.
4908  */
4909 void
4910 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
4911 {
4912         if (!ctx)
4913                 return;
4914         flow_tcf_mnl_socket_destroy(ctx->nl);
4915         rte_free(ctx->buf);
4916         rte_free(ctx);
4917 }