net/mlx5: validate TOS and TTL on E-Switch
drivers/net/mlx5/mlx5_flow_tcf.c (dpdk.git)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_common.h>

#include "mlx5.h"
#include "mlx5_flow.h"
#include "mlx5_autoconf.h"

#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

struct tc_vlan {
        tc_gen;
        int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */

#ifdef HAVE_TC_ACT_PEDIT

#include <linux/tc_act/tc_pedit.h>

#else /* HAVE_TC_ACT_PEDIT */

enum {
        TCA_PEDIT_UNSPEC,
        TCA_PEDIT_TM,
        TCA_PEDIT_PARMS,
        TCA_PEDIT_PAD,
        TCA_PEDIT_PARMS_EX,
        TCA_PEDIT_KEYS_EX,
        TCA_PEDIT_KEY_EX,
        __TCA_PEDIT_MAX
};

enum {
        TCA_PEDIT_KEY_EX_HTYPE = 1,
        TCA_PEDIT_KEY_EX_CMD = 2,
        __TCA_PEDIT_KEY_EX_MAX
};

enum pedit_header_type {
        TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
        TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
        TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
        TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
        TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
        __PEDIT_HDR_TYPE_MAX,
};

enum pedit_cmd {
        TCA_PEDIT_KEY_EX_CMD_SET = 0,
        TCA_PEDIT_KEY_EX_CMD_ADD = 1,
        __PEDIT_CMD_MAX,
};

struct tc_pedit_key {
        __u32 mask; /* AND */
        __u32 val; /* XOR */
        __u32 off; /* offset */
        __u32 at;
        __u32 offmask;
        __u32 shift;
};

__extension__
struct tc_pedit_sel {
        tc_gen;
        unsigned char nkeys;
        unsigned char flags;
        struct tc_pedit_key keys[0];
};

#endif /* HAVE_TC_ACT_PEDIT */

#ifdef HAVE_TC_ACT_TUNNEL_KEY

#include <linux/tc_act/tc_tunnel_key.h>

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
#define TCA_TUNNEL_KEY_NO_CSUM 10
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
#define TCA_TUNNEL_KEY_ENC_TOS 12
#endif

#ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
#define TCA_TUNNEL_KEY_ENC_TTL 13
#endif

#else /* HAVE_TC_ACT_TUNNEL_KEY */

#define TCA_ACT_TUNNEL_KEY 17
#define TCA_TUNNEL_KEY_ACT_SET 1
#define TCA_TUNNEL_KEY_ACT_RELEASE 2
#define TCA_TUNNEL_KEY_PARMS 2
#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
#define TCA_TUNNEL_KEY_NO_CSUM 10
#define TCA_TUNNEL_KEY_ENC_TOS 12
#define TCA_TUNNEL_KEY_ENC_TTL 13

struct tc_tunnel_key {
        tc_gen;
        int t_action;
};

#endif /* HAVE_TC_ACT_TUNNEL_KEY */

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef TCA_CLS_FLAGS_IN_HW
#define TCA_CLS_FLAGS_IN_HW (1 << 2)
#endif
#ifndef HAVE_TCA_CHAIN
#define TCA_CHAIN 11
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
#define TCA_FLOWER_KEY_ENC_KEY_ID 26
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
#define TCA_FLOWER_KEY_IP_TOS 73
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
#define TCA_FLOWER_KEY_IP_TOS_MASK 74
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
#define TCA_FLOWER_KEY_IP_TTL 75
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
#define TCA_FLOWER_KEY_IP_TTL_MASK 76
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
#define TCA_FLOWER_KEY_ENC_IP_TOS 80
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
#define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
#define TCA_FLOWER_KEY_ENC_IP_TTL 82
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
#define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
#endif

#ifndef HAVE_TC_ACT_GOTO_CHAIN
#define TC_ACT_GOTO_CHAIN 0x20000000
#endif

#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif

#ifndef IPV4_ADDR_LEN
#define IPV4_ADDR_LEN 4
#endif

#ifndef TP_PORT_LEN
#define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
#endif

#ifndef TTL_LEN
#define TTL_LEN 1
#endif

#ifndef TCA_ACT_MAX_PRIO
#define TCA_ACT_MAX_PRIO 32
#endif

/** Parameters of VXLAN devices created by driver. */
#define MLX5_VXLAN_DEFAULT_VNI  1
#define MLX5_VXLAN_DEVICE_PFX "vmlx_"
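/*
 * Note: VTEP netdevs created by this driver are, to the best of our
 * reading of the code further below, named with this prefix followed
 * by the UDP port number, e.g. "vmlx_4789".
 */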

/** Tunnel action type, used for @p type in header structure. */
enum flow_tcf_tunact_type {
        FLOW_TCF_TUNACT_VXLAN_DECAP,
        FLOW_TCF_TUNACT_VXLAN_ENCAP,
};

/** Flags used for @p mask in tunnel action encap descriptors. */
#define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
#define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
#define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
#define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
#define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
#define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
#define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
#define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
#define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
#define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
#define FLOW_TCF_ENCAP_IP_TOS (1u << 10)

/**
 * Structure for holding netlink context.
 * The message buffer size is MNL_SOCKET_BUFFER_SIZE (8KB), which
 * ensures that netlink messages are never truncated.
 */
struct mlx5_flow_tcf_context {
        struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
        uint32_t seq; /* Message sequence number. */
        uint32_t buf_size; /* Message buffer size. */
        uint8_t *buf; /* Message buffer. */
};

/**
 * Neigh rule structure. The neigh rule is applied via Netlink to
 * outer tunnel iface in order to provide destination MAC address
 * for the VXLAN encapsulation. The neigh rule is implicitly related
 * to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_neigh_rule {
        LIST_ENTRY(tcf_neigh_rule) next;
        uint32_t refcnt;
        struct ether_addr eth;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/**
 * Local rule structure. The local rule is applied via Netlink to
 * outer tunnel iface in order to provide local and peer IP addresses
 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
 * related to the Flow itself and can be shared by multiple Flows.
 */
struct tcf_local_rule {
        LIST_ENTRY(tcf_local_rule) next;
        uint32_t refcnt;
        uint16_t mask;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
};

/** Outer interface VXLAN encapsulation rules container. */
struct tcf_irule {
        LIST_ENTRY(tcf_irule) next;
        LIST_HEAD(, tcf_neigh_rule) neigh;
        LIST_HEAD(, tcf_local_rule) local;
        uint32_t refcnt;
        unsigned int ifouter; /**< Own interface index. */
};

/** VXLAN virtual netdev. */
struct tcf_vtep {
        LIST_ENTRY(tcf_vtep) next;
        uint32_t refcnt;
        unsigned int ifindex; /**< Own interface index. */
        uint16_t port;
        uint8_t created;
};

/** Tunnel descriptor header, common for all tunnel types. */
struct flow_tcf_tunnel_hdr {
        uint32_t type; /**< Tunnel action type. */
        struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
        unsigned int ifindex_org; /**< Original dst/src interface */
        unsigned int *ifindex_ptr; /**< Interface ptr in message. */
};

struct flow_tcf_vxlan_decap {
        struct flow_tcf_tunnel_hdr hdr;
        uint16_t udp_port;
};

struct flow_tcf_vxlan_encap {
        struct flow_tcf_tunnel_hdr hdr;
        struct tcf_irule *iface;
        uint32_t mask;
        uint8_t ip_tos;
        uint8_t ip_ttl_hop;
        struct {
                struct ether_addr dst;
                struct ether_addr src;
        } eth;
        union {
                struct {
                        rte_be32_t dst;
                        rte_be32_t src;
                } ipv4;
                struct {
                        uint8_t dst[IPV6_ADDR_LEN];
                        uint8_t src[IPV6_ADDR_LEN];
                } ipv6;
        };
        struct {
                rte_be16_t src;
                rte_be16_t dst;
        } udp;
        struct {
                uint8_t vni[3];
        } vxlan;
};

/** Structure used when extracting the values of flow counters
 * from a netlink message.
 */
struct flow_tcf_stats_basic {
        bool valid;
        struct gnet_stats_basic counters;
};

/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_empty = {
        {0},
};

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_port_id port_id;
        struct rte_flow_item_eth eth;
        struct rte_flow_item_vlan vlan;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
        struct rte_flow_item_vxlan vxlan;
} flow_tcf_mask_supported = {
        .port_id = {
                .id = 0xffffffff,
        },
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .vlan = {
                /* PCP and VID only, no DEI. */
                .tci = RTE_BE16(0xefff),
                .inner_type = RTE_BE16(0xffff),
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .time_to_live = 0xff,
                .type_of_service = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_FL_SHIFT),
                .hop_limits = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
                .tcp_flags = 0xff,
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .vxlan = {
                .vni = "\xff\xff\xff",
        },
};

#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
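
/*
 * For reference: struct nlattr is 4 bytes and MNL_ALIGN rounds up to a
 * multiple of 4, so e.g. SZ_NLATTR_TYPE_OF(uint32_t) == MNL_ALIGN(4 + 4)
 * == 8 bytes and SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) == MNL_ALIGN(4 + 6)
 * == 12 bytes.
 */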

#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
        uint16_t port_id; /**< DPDK port ID. */
        unsigned int ifindex; /**< Network interface index. */
};

/* Due to a limitation on driver/FW. */
#define MLX5_TCF_GROUP_ID_MAX 3

/*
 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
 * Priority in the rte_flow attribute starts from 0 and is incremented by 1
 * during translation. This is subject to change: the maximum priority may
 * be determined by trial-and-error, as in the Verbs driver, once the
 * restriction is lifted or the range is extended.
 */
#define MLX5_TCF_GROUP_PRIORITY_MAX 15

#define MLX5_TCF_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
         MLX5_FLOW_ACTION_JUMP)

#define MLX5_TCF_VLAN_ACTIONS \
        (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)

#define MLX5_TCF_VXLAN_ACTIONS \
        (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)

#define MLX5_TCF_PEDIT_ACTIONS \
        (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
         MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
         MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
         MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
         MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)

#define MLX5_TCF_CONFIG_ACTIONS \
        (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
         MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
         MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
         (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))

#define MAX_PEDIT_KEYS 128
#define SZ_PEDIT_KEY_VAL 4

#define NUM_OF_PEDIT_KEYS(sz) \
        (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
struct pedit_key_ex {
        enum pedit_header_type htype;
        enum pedit_cmd cmd;
};

struct pedit_parser {
        struct tc_pedit_sel sel;
        struct tc_pedit_key keys[MAX_PEDIT_KEYS];
        struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
};
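
/*
 * For reference, kernel act_pedit applies each key roughly as
 * new = (old & mask) ^ val (for CMD_ADD, val is first summed with the
 * old word and trimmed by ~mask). A zero mask therefore rewrites the
 * whole 32-bit word, while e.g. 0xFFFF0000 keeps half of it intact.
 */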

/**
 * Create space for using the implicitly created TC flow counter.
 *
 * @return
 *   A pointer to the counter data structure, NULL otherwise and
 *   rte_errno is set.
 */
static struct mlx5_flow_counter *
flow_tcf_counter_new(void)
{
        struct mlx5_flow_counter *cnt;

        /*
         * E-Switch counters cannot be shared and their IDs are unknown,
         * so all are currently returned with ID 0. Switching to unique
         * numbers may be better in the future.
         */
        struct mlx5_flow_counter tmpl = {
                .ref_cnt = 1,
        };
        cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
        if (!cnt) {
                rte_errno = ENOMEM;
                return NULL;
        }
        *cnt = tmpl;
        /* Implicit counter, do not add to list. */
        return cnt;
}

/**
 * Set pedit key of MAC address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
                           struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
                                        offsetof(struct ether_hdr, s_addr) :
                                        offsetof(struct ether_hdr, d_addr);
        const struct rte_flow_action_set_mac *conf =
                (const struct rte_flow_action_set_mac *)actions->conf;

        p_parser->keys[idx].off = off;
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr, SZ_PEDIT_KEY_VAL);
        idx++;
        p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        memcpy(&p_parser->keys[idx].val,
                conf->mac_addr + SZ_PEDIT_KEY_VAL,
                ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of TTL set/decrease actions.
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys[idx].mask = 0xFFFFFF00;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
                p_parser->keys[idx].off =
                        offsetof(struct ipv4_hdr, time_to_live);
        }
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys[idx].off =
                        offsetof(struct ipv6_hdr, hop_limits);
        }
        if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
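                /*
                 * Note: pedit ADD is modular within the unmasked byte,
                 * so adding 0xFF to the one-byte TTL field decrements
                 * it by one (wrapping at zero).
                 */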
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
                p_parser->keys[idx].val = 0x000000FF;
        } else {
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].val =
                        (__u32)((const struct rte_flow_action_set_ttl *)
                         actions->conf)->ttl_value;
        }
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of transport (TCP/UDP) port value
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
                                struct pedit_parser *p_parser,
                                uint64_t item_flags)
{
        int idx = p_parser->sel.nkeys;

        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
        if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        /* The offset of src/dst port is the same for TCP and UDP. */
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
                offsetof(struct tcp_hdr, src_port) :
                offsetof(struct tcp_hdr, dst_port);
        p_parser->keys[idx].mask = 0xFFFF0000;
        p_parser->keys[idx].val =
                (__u32)((const struct rte_flow_action_set_tp *)
                                actions->conf)->port;
        p_parser->sel.nkeys = (++idx);
}

/**
 * Set pedit key of ipv6 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;
        int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
        int off_base =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
                offsetof(struct ipv6_hdr, src_addr) :
                offsetof(struct ipv6_hdr, dst_addr);
        const struct rte_flow_action_set_ipv6 *conf =
                (const struct rte_flow_action_set_ipv6 *)actions->conf;

        for (int i = 0; i < keys; i++, idx++) {
                p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
                p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
                p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
                p_parser->keys[idx].mask = ~UINT32_MAX;
                memcpy(&p_parser->keys[idx].val,
                        conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
                        SZ_PEDIT_KEY_VAL);
        }
        p_parser->sel.nkeys += keys;
}

/**
 * Set pedit key of ipv4 address
 *
 * @param[in] actions
 *   pointer to action specification
 * @param[in,out] p_parser
 *   pointer to pedit_parser
 */
static void
flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
                                 struct pedit_parser *p_parser)
{
        int idx = p_parser->sel.nkeys;

        p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
        p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
        p_parser->keys[idx].off =
                actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
                offsetof(struct ipv4_hdr, src_addr) :
                offsetof(struct ipv4_hdr, dst_addr);
        p_parser->keys[idx].mask = ~UINT32_MAX;
        p_parser->keys[idx].val =
                ((const struct rte_flow_action_set_ipv4 *)
                 actions->conf)->ipv4_addr;
        p_parser->sel.nkeys = (++idx);
}
/**
 * Create the pedit netlink attribute in the pre-allocated
 * netlink message buffer.
 *
 * @param[in,out] nl
 *   pointer to pre-allocated netlink message buffer
 * @param[in,out] actions
 *   pointer to pointer of actions specification.
 * @param[in] item_flags
 *   flags of all items present
 */
static void
flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
                              const struct rte_flow_action **actions,
                              uint64_t item_flags)
{
        struct pedit_parser p_parser;
        struct nlattr *na_act_options;
        struct nlattr *na_pedit_keys;

        memset(&p_parser, 0, sizeof(p_parser));
        mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
        na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
        /* All modify-header actions must be collected in one tc-pedit action. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        flow_tcf_pedit_key_set_tp_port(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        flow_tcf_pedit_key_set_dec_ttl(*actions,
                                                        &p_parser, item_flags);
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        flow_tcf_pedit_key_set_mac(*actions, &p_parser);
                        break;
                default:
                        goto pedit_mnl_msg_done;
                }
        }
pedit_mnl_msg_done:
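        /*
         * Serialize the collected keys: TCA_PEDIT_PARMS_EX carries the
         * tc_pedit_sel header plus the key array, then each extended key
         * gets its own nested TCA_PEDIT_KEY_EX {HTYPE, CMD} attribute.
         */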
        p_parser.sel.action = TC_ACT_PIPE;
        mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
                     sizeof(p_parser.sel) +
                     p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
                     &p_parser);
        na_pedit_keys =
                mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
        for (int i = 0; i < p_parser.sel.nkeys; i++) {
                struct nlattr *na_pedit_key =
                        mnl_attr_nest_start(nl,
                                            TCA_PEDIT_KEY_EX | NLA_F_NESTED);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
                                 p_parser.keys_ex[i].htype);
                mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
                                 p_parser.keys_ex[i].cmd);
                mnl_attr_nest_end(nl, na_pedit_key);
        }
        mnl_attr_nest_end(nl, na_pedit_keys);
        mnl_attr_nest_end(nl, na_act_options);
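        /* Step back so the caller's loop revisits the first non-pedit action. */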
        (*actions)--;
}

/**
 * Calculate the maximum memory size of one TC-pedit action.
 * One TC-pedit action can contain a set of keys, each defining
 * a rewrite element (an rte_flow action).
 *
 * @param[in,out] actions
 *   actions specification.
 * @param[in,out] action_flags
 *   actions flags
 * @return
 *   Max memory size of one TC-pedit action
 */
static int
flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
                                uint64_t *action_flags)
{
        int pedit_size = 0;
        int keys = 0;
        uint64_t flags = 0;

        pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
                      SZ_NLATTR_STRZ_OF("pedit") +
                      SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
        for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
                switch ((*actions)->type) {
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
                        keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
                        /* TCP is the same as UDP. */
                        keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TP_DST;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_DEC_TTL:
                        keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
                        flags |= MLX5_FLOW_ACTION_DEC_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
                        break;
                case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
                        keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
                        flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
                        break;
                default:
                        goto get_pedit_action_size_done;
                }
        }
get_pedit_action_size_done:
        /* TCA_PEDIT_PARMS_EX */
        pedit_size +=
                SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
                                  keys * sizeof(struct tc_pedit_key));
        pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
        pedit_size += keys *
                      /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
                      (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
                       SZ_NLATTR_DATA_OF(2));
        (*action_flags) |= flags;
        (*actions)--;
        return pedit_size;
}

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[in] mask_size
 *   Size of the mask to check, in bytes.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
                   const void *mask_supported, const void *mask_empty,
                   size_t mask_size, struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set(error, EINVAL,
                                   RTE_FLOW_ERROR_TYPE_ITEM, item,
                                   "\"mask\" or \"last\" field provided without"
                                   " a corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set(error, ENOTSUP,
                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                           "unsupported field found"
                                           " in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set(error, EINVAL,
                                           RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                           item->last,
                                           "range between \"spec\" and \"last\""
                                           " not comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}
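
/*
 * Typical use (a sketch; the mask tables are the ones defined above,
 * the exact call sites live in the validation/translation paths):
 *
 *   mask.ipv4 = flow_tcf_item_mask
 *                   (items, &rte_flow_item_ipv4_mask,
 *                    &flow_tcf_mask_supported.ipv4,
 *                    &flow_tcf_mask_empty.ipv4,
 *                    sizeof(flow_tcf_mask_supported.ipv4), error);
 */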

/**
 * Build a conversion table between port ID and ifindex.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 * @param[out] ptoi
 *   Pointer to ptoi table.
 * @param[in] len
 *   Size of ptoi table provided.
 *
 * @return
 *   Size of ptoi table filled.
 */
static unsigned int
flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
                          unsigned int len)
{
        unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
        uint16_t port_id[n + 1];
        unsigned int i;
        unsigned int own = 0;

        /* At least one port is needed when no switch domain is present. */
        if (!n) {
                n = 1;
                port_id[0] = dev->data->port_id;
        } else {
                n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
        }
        if (n > len)
                return 0;
        for (i = 0; i != n; ++i) {
                struct rte_eth_dev_info dev_info;

                rte_eth_dev_info_get(port_id[i], &dev_info);
                if (port_id[i] == dev->data->port_id)
                        own = i;
                ptoi[i].port_id = port_id[i];
                ptoi[i].ifindex = dev_info.if_index;
        }
        /* Ensure first entry of ptoi[] is the current device. */
        if (own) {
                ptoi[n] = ptoi[0];
                ptoi[0] = ptoi[own];
                ptoi[own] = ptoi[n];
        }
        /* An entry with zero ifindex terminates ptoi[]. */
        ptoi[n].port_id = 0;
        ptoi[n].ifindex = 0;
        return n;
}

/**
 * Verify the @p attr will be correctly understood by the E-switch.
 *
 * @param[in] attr
 *   Pointer to flow attributes
 * @param[out] error
 *   Pointer to error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
                             struct rte_flow_error *error)
{
        /*
         * Supported attributes: groups, some priorities and ingress only.
         * Group is supported only if the kernel supports chains. Transfer
         * is not checked here as it is the caller's responsibility.
         */
        if (attr->group > MLX5_TCF_GROUP_ID_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
                                          "group ID larger than "
                                          RTE_STR(MLX5_TCF_GROUP_ID_MAX)
                                          " isn't supported");
        else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                          attr,
                                          "priority more than "
                                          RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
                                          " is not supported");
        if (!attr->ingress)
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "only ingress is supported");
        if (attr->egress)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                          attr, "egress is not supported");
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
 * The routine checks the L2 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
                                  struct rte_flow_error *error)
{
        const struct rte_flow_item_eth *spec = item->spec;
        const struct rte_flow_item_eth *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for L2 addresses can be empty
                 * because they are optional and not required
                 * directly by the tc rule; the kernel tries to
                 * resolve them on its own.
                 */
                return 0;
        }
        if (!mask) {
                /* If mask is not specified use the default one. */
                mask = &rte_flow_item_eth_mask;
        }
        if (memcmp(&mask->dst,
                   &flow_tcf_mask_empty.eth.dst,
                   sizeof(flow_tcf_mask_empty.eth.dst))) {
                if (memcmp(&mask->dst,
                           &rte_flow_item_eth_mask.dst,
                           sizeof(rte_flow_item_eth_mask.dst)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.dst\" field");
        }
        if (memcmp(&mask->src,
                   &flow_tcf_mask_empty.eth.src,
                   sizeof(flow_tcf_mask_empty.eth.src))) {
                if (memcmp(&mask->src,
                           &rte_flow_item_eth_mask.src,
                           sizeof(rte_flow_item_eth_mask.src)))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.src\" field");
        }
        if (mask->type != RTE_BE16(0x0000)) {
                if (mask->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"eth.type\" field");
                DRV_LOG(WARNING,
                        "outer ethernet type field"
                        " cannot be forced for vxlan"
                        " encapsulation, parameter ignored");
        }
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
 * The routine checks the IPv4 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv4 *spec = item->spec;
        const struct rte_flow_item_ipv4 *mask = item->mask;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv4 address"
                                          " specification for vxlan"
                                          " encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv4_mask;
        if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.dst_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address, so IP destination address must be
                 * specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 destination address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
                if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                 "no support for partial mask on"
                                 " \"ipv4.hdr.src_addr\" field"
                                 " for vxlan encapsulation");
                /* More IPv4 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the source IP address to select the
                 * interface for egress encapsulated traffic, so
                 * it must be specified in the tc rule.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "outer ipv4 source address"
                                          " must be specified for"
                                          " vxlan encapsulation");
        }
        if (mask->hdr.type_of_service &&
            mask->hdr.type_of_service != 0xff)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                          "no support for partial mask on"
                                          " \"ipv4.hdr.type_of_service\" field"
                                          " for vxlan encapsulation");
        if (mask->hdr.time_to_live &&
            mask->hdr.time_to_live != 0xff)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                          "no support for partial mask on"
                                          " \"ipv4.hdr.time_to_live\" field"
                                          " for vxlan encapsulation");
        return 0;
}

/**
 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
 * The routine checks the IPv6 fields to be used in encapsulation header.
 *
 * @param[in] item
 *   Pointer to the item structure.
 * @param[out] error
 *   Pointer to the error structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 **/
static int
flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
                                   struct rte_flow_error *error)
{
        const struct rte_flow_item_ipv6 *spec = item->spec;
        const struct rte_flow_item_ipv6 *mask = item->mask;
        uint8_t msk6;

        if (!spec) {
                /*
                 * Specification for IP addresses cannot be empty
                 * because it is required by tunnel_key parameter.
                 */
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "NULL outer ipv6 address"
                                          " specification for"
                                          " vxlan encapsulation");
        }
        if (!mask)
                mask = &rte_flow_item_ipv6_mask;
        if (memcmp(&mask->hdr.dst_addr,
                   &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
                   IPV6_ADDR_LEN)) {
                if (memcmp(&mask->hdr.dst_addr,
                           &rte_flow_item_ipv6_mask.hdr.dst_addr,
                           IPV6_ADDR_LEN))
                        return rte_flow_error_set
                                        (error, ENOTSUP,
                                         RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
                                         "no support for partial mask on"
                                         " \"ipv6.hdr.dst_addr\" field"
                                         " for vxlan encapsulation");
                /* More IPv6 address validations can be put here. */
        } else {
                /*
                 * Kernel uses the destination IP address to determine
                 * the routing path and obtain the MAC destination
                 * address (neighbor or gateway), so the IP destination
                 * address must be specified within the tc rule.
                 */
1390                 return rte_flow_error_set(error, EINVAL,
1391                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1392                                           "outer ipv6 destination address"
1393                                           " must be specified for"
1394                                           " vxlan encapsulation");
1395         }
1396         if (memcmp(&mask->hdr.src_addr,
1397                    &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1398                    IPV6_ADDR_LEN)) {
1399                 if (memcmp(&mask->hdr.src_addr,
1400                            &rte_flow_item_ipv6_mask.hdr.src_addr,
1401                            IPV6_ADDR_LEN))
1402                         return rte_flow_error_set
1403                                         (error, ENOTSUP,
1404                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1405                                          "no support for partial mask on"
1406                                          " \"ipv6.hdr.src_addr\" field"
1407                                          " for vxlan encapsulation");
1408                 /* More L3 address validation can be put here. */
1409         } else {
1410                 /*
1411                  * Kernel uses the source IP address to select the
1412                  * interface for egress encapsulated traffic, so
1413                  * it must be specified in the tc rule.
1414                  */
1415                 return rte_flow_error_set(error, EINVAL,
1416                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1417                                           "outer L3 source address"
1418                                           " must be specified for"
1419                                           " vxlan encapsulation");
1420         }
1421         msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1422                 IPV6_HDR_TC_SHIFT) & 0xff;
1423         if (msk6 && msk6 != 0xff)
1424                 return rte_flow_error_set(error, ENOTSUP,
1425                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1426                                           "no support for partial mask on"
1427                                           " \"ipv6.hdr.vtc_flow.tos\" field"
1428                                           " for vxlan encapsulation");
1429         if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1430                 return rte_flow_error_set(error, ENOTSUP,
1431                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1432                                           "no support for partial mask on"
1433                                           " \"ipv6.hdr.hop_limits\" field"
1434                                           " for vxlan encapsulation");
1435         return 0;
1436 }
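
     /*
      * A minimal usage sketch (the addresses are hypothetical): an outer
      * IPv6 item accepted by the routine above specifies both addresses
      * and takes full masks, e.g. by reusing rte_flow_item_ipv6_mask:
      *
      *     struct rte_flow_item_ipv6 ipv6_spec = {
      *             .hdr = {
      *                     .src_addr = "\x20\x01\x0d\xb8\x00\x00\x00\x00"
      *                                 "\x00\x00\x00\x00\x00\x00\x00\x01",
      *                     .dst_addr = "\x20\x01\x0d\xb8\x00\x00\x00\x00"
      *                                 "\x00\x00\x00\x00\x00\x00\x00\x02",
      *             },
      *     };
      *     struct rte_flow_item item = {
      *             .type = RTE_FLOW_ITEM_TYPE_IPV6,
      *             .spec = &ipv6_spec,
      *             .mask = &rte_flow_item_ipv6_mask,
      *     };
      */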
1437
1438 /**
1439  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1440  * The routine checks the UDP fields to be used in the encapsulation header.
1441  *
1442  * @param[in] item
1443  *   Pointer to the item structure.
1444  * @param[out] error
1445  *   Pointer to the error structure.
1446  *
1447  * @return
1448  *   0 on success, a negative errno value otherwise and rte_errno is set.
1449  */
1450 static int
1451 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1452                                   struct rte_flow_error *error)
1453 {
1454         const struct rte_flow_item_udp *spec = item->spec;
1455         const struct rte_flow_item_udp *mask = item->mask;
1456
1457         if (!spec) {
1458                 /*
1459                  * Specification for UDP ports cannot be empty
1460                  * because it is required by the tunnel_key parameter.
1461                  */
1462                 return rte_flow_error_set(error, EINVAL,
1463                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1464                                           "NULL UDP port specification"
1465                                           " for vxlan encapsulation");
1466         }
1467         if (!mask)
1468                 mask = &rte_flow_item_udp_mask;
1469         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1470                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1471                         return rte_flow_error_set
1472                                         (error, ENOTSUP,
1473                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1474                                          "no support for partial mask on"
1475                                          " \"udp.hdr.dst_port\" field"
1476                                          " for vxlan encapsulation");
1477                 if (!spec->hdr.dst_port)
1478                         return rte_flow_error_set
1479                                         (error, EINVAL,
1480                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
1481                                          "outer UDP remote port cannot be"
1482                                          " 0 for vxlan encapsulation");
1483         } else {
1484                 return rte_flow_error_set(error, EINVAL,
1485                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1486                                           "outer UDP remote port"
1487                                           " must be specified for"
1488                                           " vxlan encapsulation");
1489         }
1490         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1491                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1492                         return rte_flow_error_set
1493                                         (error, ENOTSUP,
1494                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1495                                          "no support for partial mask on"
1496                                          " \"udp.hdr.src_port\" field"
1497                                          " for vxlan encapsulation");
1498                 DRV_LOG(WARNING,
1499                         "outer UDP source port cannot be"
1500                         " forced for vxlan encapsulation,"
1501                         " parameter ignored");
1502         }
1503         return 0;
1504 }
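
     /*
      * A minimal usage sketch: a UDP item passing the routine above pins
      * only the destination port (4789 is the IANA-assigned VXLAN port);
      * the source port is selected by the kernel and is left unmasked:
      *
      *     struct rte_flow_item_udp udp_spec = {
      *             .hdr = { .dst_port = RTE_BE16(4789) },
      *     };
      *     struct rte_flow_item_udp udp_mask = {
      *             .hdr = { .dst_port = RTE_BE16(0xffff) },
      *     };
      */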
1505
1506 /**
1507  * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1508  * The routine checks the VNI field to be used in the encapsulation header.
1509  *
1510  * @param[in] item
1511  *   Pointer to the item structure.
1512  * @param[out] error
1513  *   Pointer to the error structure.
1514  *
1515  * @return
1516  *   0 on success, a negative errno value otherwise and rte_errno is set.
1517  */
1518 static int
1519 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1520                                   struct rte_flow_error *error)
1521 {
1522         const struct rte_flow_item_vxlan *spec = item->spec;
1523         const struct rte_flow_item_vxlan *mask = item->mask;
1524
1525         if (!spec) {
1526                 /* Outer VNI is required by tunnel_key parameter. */
1527                 return rte_flow_error_set(error, EINVAL,
1528                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1529                                           "NULL VNI specification"
1530                                           " for vxlan encapsulation");
1531         }
1532         if (!mask)
1533                 mask = &rte_flow_item_vxlan_mask;
1534         if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1535                 return rte_flow_error_set(error, EINVAL,
1536                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1537                                           "outer VNI must be specified "
1538                                           "for vxlan encapsulation");
1539         if (mask->vni[0] != 0xff ||
1540             mask->vni[1] != 0xff ||
1541             mask->vni[2] != 0xff)
1542                 return rte_flow_error_set(error, ENOTSUP,
1543                                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1544                                           "no support for partial mask on"
1545                                           " \"vxlan.vni\" field");
1547         if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1548                 return rte_flow_error_set(error, EINVAL,
1549                                           RTE_FLOW_ERROR_TYPE_ITEM, item,
1550                                           "vxlan vni cannot be 0");
1551         return 0;
1552 }
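
     /*
      * A minimal usage sketch (the VNI value is arbitrary): the VNI must
      * be non-zero and fully masked; the default item mask fits:
      *
      *     struct rte_flow_item_vxlan vxlan_spec = {
      *             .vni = { 0x00, 0x00, 0x2a },
      *     };
      *     struct rte_flow_item item = {
      *             .type = RTE_FLOW_ITEM_TYPE_VXLAN,
      *             .spec = &vxlan_spec,
      *             .mask = &rte_flow_item_vxlan_mask,
      *     };
      */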
1553
1554 /**
1555  * Validate VXLAN_ENCAP action item list for E-Switch.
1556  * The routine checks the items to be used in the encapsulation header.
1557  *
1558  * @param[in] action
1559  *   Pointer to the VXLAN_ENCAP action structure.
1560  * @param[out] error
1561  *   Pointer to the error structure.
1562  *
1563  * @return
1564  *   0 on success, a negative errno value otherwise and rte_errno is set.
1565  */
1566 static int
1567 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1568                               struct rte_flow_error *error)
1569 {
1570         const struct rte_flow_item *items;
1571         int ret;
1572         uint32_t item_flags = 0;
1573
1574         if (!action->conf)
1575                 return rte_flow_error_set(error, EINVAL,
1576                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1577                                           "Missing vxlan tunnel"
1578                                           " action configuration");
1579         items = ((const struct rte_flow_action_vxlan_encap *)
1580                                         action->conf)->definition;
1581         if (!items)
1582                 return rte_flow_error_set(error, EINVAL,
1583                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1584                                           "Missing vxlan tunnel"
1585                                           " encapsulation parameters");
1586         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1587                 switch (items->type) {
1588                 case RTE_FLOW_ITEM_TYPE_VOID:
1589                         break;
1590                 case RTE_FLOW_ITEM_TYPE_ETH:
1591                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1592                                                           error);
1593                         if (ret < 0)
1594                                 return ret;
1595                         ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1596                         if (ret < 0)
1597                                 return ret;
1598                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1599                         break;
1601                 case RTE_FLOW_ITEM_TYPE_IPV4:
1602                         ret = mlx5_flow_validate_item_ipv4
1603                                         (items, item_flags,
1604                                          &flow_tcf_mask_supported.ipv4, error);
1605                         if (ret < 0)
1606                                 return ret;
1607                         ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1608                         if (ret < 0)
1609                                 return ret;
1610                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1611                         break;
1612                 case RTE_FLOW_ITEM_TYPE_IPV6:
1613                         ret = mlx5_flow_validate_item_ipv6
1614                                         (items, item_flags,
1615                                          &flow_tcf_mask_supported.ipv6, error);
1616                         if (ret < 0)
1617                                 return ret;
1618                         ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1619                         if (ret < 0)
1620                                 return ret;
1621                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1622                         break;
1623                 case RTE_FLOW_ITEM_TYPE_UDP:
1624                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1625                                                           0xff, error);
1626                         if (ret < 0)
1627                                 return ret;
1628                         ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1629                         if (ret < 0)
1630                                 return ret;
1631                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1632                         break;
1633                 case RTE_FLOW_ITEM_TYPE_VXLAN:
1634                         ret = mlx5_flow_validate_item_vxlan(items,
1635                                                             item_flags, error);
1636                         if (ret < 0)
1637                                 return ret;
1638                         ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1639                         if (ret < 0)
1640                                 return ret;
1641                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
1642                         break;
1643                 default:
1644                         return rte_flow_error_set
1645                                         (error, ENOTSUP,
1646                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1647                                          "vxlan encap item not supported");
1648                 }
1649         }
1650         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1651                 return rte_flow_error_set(error, EINVAL,
1652                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1653                                           "no outer IP layer found"
1654                                           " for vxlan encapsulation");
1655         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1656                 return rte_flow_error_set(error, EINVAL,
1657                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1658                                           "no outer UDP layer found"
1659                                           " for vxlan encapsulation");
1660         if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1661                 return rte_flow_error_set(error, EINVAL,
1662                                           RTE_FLOW_ERROR_TYPE_ACTION, action,
1663                                           "no VXLAN VNI found"
1664                                           " for vxlan encapsulation");
1665         return 0;
1666 }
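
     /*
      * A sketch of a VXLAN_ENCAP action configuration satisfying the
      * checks above; the spec/mask objects are hypothetical and built as
      * in the per-item sketches:
      *
      *     struct rte_flow_item definition[] = {
      *             { .type = RTE_FLOW_ITEM_TYPE_ETH,
      *               .spec = &eth_spec, .mask = &eth_mask },
      *             { .type = RTE_FLOW_ITEM_TYPE_IPV4,
      *               .spec = &ipv4_spec, .mask = &ipv4_mask },
      *             { .type = RTE_FLOW_ITEM_TYPE_UDP,
      *               .spec = &udp_spec, .mask = &udp_mask },
      *             { .type = RTE_FLOW_ITEM_TYPE_VXLAN,
      *               .spec = &vxlan_spec, .mask = &vxlan_mask },
      *             { .type = RTE_FLOW_ITEM_TYPE_END },
      *     };
      *     struct rte_flow_action_vxlan_encap encap_conf = {
      *             .definition = definition,
      *     };
      */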
1667
1668 /**
1669  * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1670  * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1671  *
1672  * @param[in] udp
1673  *   Outer UDP layer item (if any, NULL otherwise).
1674  * @param[out] error
1675  *   Pointer to the error structure.
1676  *
1677  * @return
1678  *   0 on success, a negative errno value otherwise and rte_errno is set.
1679  */
1680 static int
1681 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1682                                   struct rte_flow_error *error)
1683 {
1684         const struct rte_flow_item_udp *spec = udp->spec;
1685         const struct rte_flow_item_udp *mask = udp->mask;
1686
1687         if (!spec)
1688                 /*
1689                  * Specification for UDP ports cannot be empty
1690                  * because it is required as a decap parameter.
1691                  */
1692                 return rte_flow_error_set(error, EINVAL,
1693                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1694                                           "NULL UDP port specification"
1695                                           " for VXLAN decapsulation");
1696         if (!mask)
1697                 mask = &rte_flow_item_udp_mask;
1698         if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1699                 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1700                         return rte_flow_error_set
1701                                         (error, ENOTSUP,
1702                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1703                                          "no support for partial mask on"
1704                                          " \"udp.hdr.dst_port\" field");
1705                 if (!spec->hdr.dst_port)
1706                         return rte_flow_error_set
1707                                         (error, EINVAL,
1708                                          RTE_FLOW_ERROR_TYPE_ITEM, udp,
1709                                          "zero decap local UDP port");
1710         } else {
1711                 return rte_flow_error_set(error, EINVAL,
1712                                           RTE_FLOW_ERROR_TYPE_ITEM, udp,
1713                                           "outer UDP destination port must be "
1714                                           "specified for vxlan decapsulation");
1715         }
1716         if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1717                 if (mask->hdr.src_port != RTE_BE16(0xffff))
1718                         return rte_flow_error_set
1719                                         (error, ENOTSUP,
1720                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1721                                          "no support for partial mask on"
1722                                          " \"udp.hdr.src_port\" field");
1723                 DRV_LOG(WARNING,
1724                         "outer UDP local port cannot be "
1725                         "forced for VXLAN decapsulation, "
1726                         "parameter ignored");
1727         }
1728         return 0;
1729 }
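
     /*
      * A minimal usage sketch: for decapsulation the outer UDP item must
      * pin the local (destination) port with a full mask, while the
      * remote (source) port stays wildcarded:
      *
      *     struct rte_flow_item_udp udp_spec = {
      *             .hdr = { .dst_port = RTE_BE16(4789) },
      *     };
      *     struct rte_flow_item_udp udp_mask = {
      *             .hdr = { .dst_port = RTE_BE16(0xffff) },
      *     };
      */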
1730
1731 /**
1732  * Validate flow for E-Switch.
1733  *
1734  * @param[in] dev
1735  *   Pointer to the Ethernet device structure.
1736  * @param[in] attr
1737  *   Pointer to the flow attributes.
1738  * @param[in] items
1739  *   Pointer to the list of items.
1740  * @param[in] actions
1741  *   Pointer to the list of actions.
1742  * @param[out] error
1743  *   Pointer to the error structure.
1744  *
1745  * @return
1746  *   0 on success, a negative errno value otherwise and rte_errno is set.
1747  */
1748 static int
1749 flow_tcf_validate(struct rte_eth_dev *dev,
1750                   const struct rte_flow_attr *attr,
1751                   const struct rte_flow_item items[],
1752                   const struct rte_flow_action actions[],
1753                   struct rte_flow_error *error)
1754 {
1755         union {
1756                 const struct rte_flow_item_port_id *port_id;
1757                 const struct rte_flow_item_eth *eth;
1758                 const struct rte_flow_item_vlan *vlan;
1759                 const struct rte_flow_item_ipv4 *ipv4;
1760                 const struct rte_flow_item_ipv6 *ipv6;
1761                 const struct rte_flow_item_tcp *tcp;
1762                 const struct rte_flow_item_udp *udp;
1763                 const struct rte_flow_item_vxlan *vxlan;
1764         } spec, mask;
1765         union {
1766                 const struct rte_flow_action_port_id *port_id;
1767                 const struct rte_flow_action_jump *jump;
1768                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1769                 const struct rte_flow_action_of_set_vlan_vid *
1770                         of_set_vlan_vid;
1771                 const struct rte_flow_action_of_set_vlan_pcp *
1772                         of_set_vlan_pcp;
1773                 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1774                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1775                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1776         } conf;
1777         const struct rte_flow_item *outer_udp = NULL;
1778         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1779         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1780         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1781         uint64_t item_flags = 0;
1782         uint64_t action_flags = 0;
1783         uint8_t next_protocol = 0xff;
1784         unsigned int tcm_ifindex = 0;
1785         uint8_t pedit_validated = 0;
1786         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1787         struct rte_eth_dev *port_id_dev = NULL;
1788         bool in_port_id_set = false;
1789         int ret;
1790
1791         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1792                                                 PTOI_TABLE_SZ_MAX(dev)));
1793         ret = flow_tcf_validate_attributes(attr, error);
1794         if (ret < 0)
1795                 return ret;
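             /*
              * First pass: validate each action on its own and accumulate
              * the action flags for the cross-checks after both loops.
              */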
1796         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1797                 unsigned int i;
1798                 uint64_t current_action_flag = 0;
1799
1800                 switch (actions->type) {
1801                 case RTE_FLOW_ACTION_TYPE_VOID:
1802                         break;
1803                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1804                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1805                         if (!actions->conf)
1806                                 break;
1807                         conf.port_id = actions->conf;
1808                         if (conf.port_id->original)
1809                                 i = 0;
1810                         else
1811                                 for (i = 0; ptoi[i].ifindex; ++i)
1812                                         if (ptoi[i].port_id == conf.port_id->id)
1813                                                 break;
1814                         if (!ptoi[i].ifindex)
1815                                 return rte_flow_error_set
1816                                         (error, ENODEV,
1817                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1818                                          conf.port_id,
1819                                          "missing data to convert port ID to"
1820                                          " ifindex");
1821                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1822                         break;
1823                 case RTE_FLOW_ACTION_TYPE_JUMP:
1824                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1825                         if (!actions->conf)
1826                                 break;
1827                         conf.jump = actions->conf;
1828                         if (attr->group >= conf.jump->group)
1829                                 return rte_flow_error_set
1830                                         (error, ENOTSUP,
1831                                          RTE_FLOW_ERROR_TYPE_ACTION,
1832                                          actions,
1833                                          "can jump only to a higher group");
1834                         break;
1835                 case RTE_FLOW_ACTION_TYPE_DROP:
1836                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1837                         break;
1838                 case RTE_FLOW_ACTION_TYPE_COUNT:
1839                         break;
1840                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1841                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1842                         break;
1843                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1844                         rte_be16_t ethertype;
1845
1846                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1847                         if (!actions->conf)
1848                                 break;
1849                         conf.of_push_vlan = actions->conf;
1850                         ethertype = conf.of_push_vlan->ethertype;
1851                         if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1852                             ethertype != RTE_BE16(ETH_P_8021AD))
1853                                 return rte_flow_error_set
1854                                         (error, EINVAL,
1855                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1856                                          "vlan push TPID must be "
1857                                          "802.1Q or 802.1AD");
1858                         break;
1859                 }
1860                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1861                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1862                                 return rte_flow_error_set
1863                                         (error, ENOTSUP,
1864                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1865                                          "vlan modify is not supported,"
1866                                          " set action must follow push action");
1867                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1868                         break;
1869                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1870                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1871                                 return rte_flow_error_set
1872                                         (error, ENOTSUP,
1873                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1874                                          "vlan modify is not supported,"
1875                                          " set action must follow push action");
1876                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1877                         break;
1878                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1879                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1880                         break;
1881                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1882                         ret = flow_tcf_validate_vxlan_encap(actions, error);
1883                         if (ret < 0)
1884                                 return ret;
1885                         current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1886                         break;
1887                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1888                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1889                         break;
1890                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1891                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1892                         break;
1893                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1894                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1895                         break;
1896                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1897                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1898                         break;
1899                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1900                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1901                         break;
1902                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1903                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1904                         break;
1905                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1906                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1907                         break;
1908                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1909                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1910                         break;
1911                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1912                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1913                         break;
1914                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1915                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1916                         break;
1917                 default:
1918                         return rte_flow_error_set(error, ENOTSUP,
1919                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1920                                                   actions,
1921                                                   "action not supported");
1922                 }
1923                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1924                         if (!actions->conf)
1925                                 return rte_flow_error_set
1926                                         (error, EINVAL,
1927                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1928                                          actions,
1929                                          "action configuration not set");
1930                 }
1931                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1932                     pedit_validated)
1933                         return rte_flow_error_set(error, ENOTSUP,
1934                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1935                                                   actions,
1936                                                   "set actions should be "
1937                                                   "listed successively");
1938                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1939                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1940                         pedit_validated = 1;
1941                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1942                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1943                         return rte_flow_error_set(error, EINVAL,
1944                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1945                                                   actions,
1946                                                   "can't have multiple fate"
1947                                                   " actions");
1948                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1949                     (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1950                         return rte_flow_error_set(error, EINVAL,
1951                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1952                                                   actions,
1953                                                   "can't have multiple vxlan"
1954                                                   " actions");
1955                 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1956                     (action_flags & MLX5_TCF_VLAN_ACTIONS))
1957                         return rte_flow_error_set(error, ENOTSUP,
1958                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1959                                                   actions,
1960                                                   "can't have vxlan and vlan"
1961                                                   " actions in the same rule");
1962                 action_flags |= current_action_flag;
1963         }
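             /*
              * Second pass: validate the pattern items. Once a tunnel
              * item is seen the layer flags switch from outer to inner.
              */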
1964         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1965                 unsigned int i;
1966
1967                 switch (items->type) {
1968                 case RTE_FLOW_ITEM_TYPE_VOID:
1969                         break;
1970                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1971                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1972                                 return rte_flow_error_set
1973                                         (error, ENOTSUP,
1974                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
1975                                          "inner tunnel port id"
1976                                          " item is not supported");
1977                         mask.port_id = flow_tcf_item_mask
1978                                 (items, &rte_flow_item_port_id_mask,
1979                                  &flow_tcf_mask_supported.port_id,
1980                                  &flow_tcf_mask_empty.port_id,
1981                                  sizeof(flow_tcf_mask_supported.port_id),
1982                                  error);
1983                         if (!mask.port_id)
1984                                 return -rte_errno;
1985                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1986                                 in_port_id_set = true;
1987                                 break;
1988                         }
1989                         spec.port_id = items->spec;
1990                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1991                                 return rte_flow_error_set
1992                                         (error, ENOTSUP,
1993                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1994                                          mask.port_id,
1995                                          "no support for partial mask on"
1996                                          " \"id\" field");
1997                         if (!mask.port_id->id)
1998                                 i = 0;
1999                         else
2000                                 for (i = 0; ptoi[i].ifindex; ++i)
2001                                         if (ptoi[i].port_id == spec.port_id->id)
2002                                                 break;
2003                         if (!ptoi[i].ifindex)
2004                                 return rte_flow_error_set
2005                                         (error, ENODEV,
2006                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2007                                          spec.port_id,
2008                                          "missing data to convert port ID to"
2009                                          " ifindex");
2010                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2011                                 return rte_flow_error_set
2012                                         (error, ENOTSUP,
2013                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2014                                          spec.port_id,
2015                                          "cannot match traffic for"
2016                                          " several port IDs through"
2017                                          " a single flow rule");
2018                         tcm_ifindex = ptoi[i].ifindex;
2019                         in_port_id_set = true;
2020                         break;
2021                 case RTE_FLOW_ITEM_TYPE_ETH:
2022                         ret = mlx5_flow_validate_item_eth(items, item_flags,
2023                                                           error);
2024                         if (ret < 0)
2025                                 return ret;
2026                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2027                                       MLX5_FLOW_LAYER_INNER_L2 :
2028                                       MLX5_FLOW_LAYER_OUTER_L2;
2029                         /*
2030                          * TODO: redundant check due to different supported
2031                          * masks. The same applies to the rest of the items.
2032                          */
2033                         mask.eth = flow_tcf_item_mask
2034                                 (items, &rte_flow_item_eth_mask,
2035                                  &flow_tcf_mask_supported.eth,
2036                                  &flow_tcf_mask_empty.eth,
2037                                  sizeof(flow_tcf_mask_supported.eth),
2038                                  error);
2039                         if (!mask.eth)
2040                                 return -rte_errno;
2041                         if (mask.eth->type && mask.eth->type !=
2042                             RTE_BE16(0xffff))
2043                                 return rte_flow_error_set
2044                                         (error, ENOTSUP,
2045                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2046                                          mask.eth,
2047                                          "no support for partial mask on"
2048                                          " \"type\" field");
2049                         assert(items->spec);
2050                         spec.eth = items->spec;
2051                         if (mask.eth->type &&
2052                             (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2053                             inner_etype != RTE_BE16(ETH_P_ALL) &&
2054                             inner_etype != spec.eth->type)
2055                                 return rte_flow_error_set
2056                                         (error, EINVAL,
2057                                          RTE_FLOW_ERROR_TYPE_ITEM,
2058                                          items,
2059                                          "inner eth_type conflict");
2060                         if (mask.eth->type &&
2061                             !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2062                             outer_etype != RTE_BE16(ETH_P_ALL) &&
2063                             outer_etype != spec.eth->type)
2064                                 return rte_flow_error_set
2065                                         (error, EINVAL,
2066                                          RTE_FLOW_ERROR_TYPE_ITEM,
2067                                          items,
2068                                          "outer eth_type conflict");
2069                         if (mask.eth->type) {
2070                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2071                                         inner_etype = spec.eth->type;
2072                                 else
2073                                         outer_etype = spec.eth->type;
2074                         }
2075                         break;
2076                 case RTE_FLOW_ITEM_TYPE_VLAN:
2077                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2078                                 return rte_flow_error_set
2079                                         (error, ENOTSUP,
2080                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2081                                          "inner tunnel VLAN"
2082                                          " is not supported");
2083                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
2084                                                            error);
2085                         if (ret < 0)
2086                                 return ret;
2087                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2088                         mask.vlan = flow_tcf_item_mask
2089                                 (items, &rte_flow_item_vlan_mask,
2090                                  &flow_tcf_mask_supported.vlan,
2091                                  &flow_tcf_mask_empty.vlan,
2092                                  sizeof(flow_tcf_mask_supported.vlan),
2093                                  error);
2094                         if (!mask.vlan)
2095                                 return -rte_errno;
2096                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2097                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
2098                               RTE_BE16(0xe000)) ||
2099                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
2100                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2101                               RTE_BE16(0x0fff)) ||
2102                             (mask.vlan->inner_type &&
2103                              mask.vlan->inner_type != RTE_BE16(0xffff)))
2104                                 return rte_flow_error_set
2105                                         (error, ENOTSUP,
2106                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2107                                          mask.vlan,
2108                                          "no support for partial masks on"
2109                                          " \"tci\" (PCP and VID parts) and"
2110                                          " \"inner_type\" fields");
2111                         if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2112                             outer_etype != RTE_BE16(ETH_P_8021Q))
2113                                 return rte_flow_error_set
2114                                         (error, EINVAL,
2115                                          RTE_FLOW_ERROR_TYPE_ITEM,
2116                                          items,
2117                                          "outer eth_type conflict,"
2118                                          " must be 802.1Q");
2119                         outer_etype = RTE_BE16(ETH_P_8021Q);
2120                         assert(items->spec);
2121                         spec.vlan = items->spec;
2122                         if (mask.vlan->inner_type &&
2123                             vlan_etype != RTE_BE16(ETH_P_ALL) &&
2124                             vlan_etype != spec.vlan->inner_type)
2125                                 return rte_flow_error_set
2126                                         (error, EINVAL,
2127                                          RTE_FLOW_ERROR_TYPE_ITEM,
2128                                          items,
2129                                          "vlan eth_type conflict");
2130                         if (mask.vlan->inner_type)
2131                                 vlan_etype = spec.vlan->inner_type;
2132                         break;
2133                 case RTE_FLOW_ITEM_TYPE_IPV4:
2134                         ret = mlx5_flow_validate_item_ipv4
2135                                         (items, item_flags,
2136                                          &flow_tcf_mask_supported.ipv4, error);
2137                         if (ret < 0)
2138                                 return ret;
2139                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2140                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2141                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2142                         mask.ipv4 = flow_tcf_item_mask
2143                                 (items, &rte_flow_item_ipv4_mask,
2144                                  &flow_tcf_mask_supported.ipv4,
2145                                  &flow_tcf_mask_empty.ipv4,
2146                                  sizeof(flow_tcf_mask_supported.ipv4),
2147                                  error);
2148                         if (!mask.ipv4)
2149                                 return -rte_errno;
2150                         if (mask.ipv4->hdr.next_proto_id &&
2151                             mask.ipv4->hdr.next_proto_id != 0xff)
2152                                 return rte_flow_error_set
2153                                         (error, ENOTSUP,
2154                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2155                                          mask.ipv4,
2156                                          "no support for partial mask on"
2157                                          " \"hdr.next_proto_id\" field");
2158                         else if (mask.ipv4->hdr.next_proto_id)
2159                                 next_protocol =
2160                                         ((const struct rte_flow_item_ipv4 *)
2161                                          (items->spec))->hdr.next_proto_id;
2162                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2163                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2164                                     inner_etype != RTE_BE16(ETH_P_IP))
2165                                         return rte_flow_error_set
2166                                                 (error, EINVAL,
2167                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2168                                                  items,
2169                                                  "inner eth_type conflict,"
2170                                                  " IPv4 is required");
2171                                 inner_etype = RTE_BE16(ETH_P_IP);
2172                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2173                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2174                                     vlan_etype != RTE_BE16(ETH_P_IP))
2175                                         return rte_flow_error_set
2176                                                 (error, EINVAL,
2177                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2178                                                  items,
2179                                                  "vlan eth_type conflict,"
2180                                                  " IPv4 is required");
2181                                 vlan_etype = RTE_BE16(ETH_P_IP);
2182                         } else {
2183                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2184                                     outer_etype != RTE_BE16(ETH_P_IP))
2185                                         return rte_flow_error_set
2186                                                 (error, EINVAL,
2187                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2188                                                  items,
2189                                                  "eth_type conflict,"
2190                                                  " IPv4 is required");
2191                                 outer_etype = RTE_BE16(ETH_P_IP);
2192                         }
2193                         break;
2194                 case RTE_FLOW_ITEM_TYPE_IPV6:
2195                         ret = mlx5_flow_validate_item_ipv6
2196                                         (items, item_flags,
2197                                          &flow_tcf_mask_supported.ipv6, error);
2198                         if (ret < 0)
2199                                 return ret;
2200                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2201                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2202                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2203                         mask.ipv6 = flow_tcf_item_mask
2204                                 (items, &rte_flow_item_ipv6_mask,
2205                                  &flow_tcf_mask_supported.ipv6,
2206                                  &flow_tcf_mask_empty.ipv6,
2207                                  sizeof(flow_tcf_mask_supported.ipv6),
2208                                  error);
2209                         if (!mask.ipv6)
2210                                 return -rte_errno;
2211                         if (mask.ipv6->hdr.proto &&
2212                             mask.ipv6->hdr.proto != 0xff)
2213                                 return rte_flow_error_set
2214                                         (error, ENOTSUP,
2215                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2216                                          mask.ipv6,
2217                                          "no support for partial mask on"
2218                                          " \"hdr.proto\" field");
2219                         else if (mask.ipv6->hdr.proto)
2220                                 next_protocol =
2221                                         ((const struct rte_flow_item_ipv6 *)
2222                                          (items->spec))->hdr.proto;
2223                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2224                                 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2225                                     inner_etype != RTE_BE16(ETH_P_IPV6))
2226                                         return rte_flow_error_set
2227                                                 (error, EINVAL,
2228                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2229                                                  items,
2230                                                  "inner eth_type conflict,"
2231                                                  " IPv6 is required");
2232                                 inner_etype = RTE_BE16(ETH_P_IPV6);
2233                         } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2234                                 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2235                                     vlan_etype != RTE_BE16(ETH_P_IPV6))
2236                                         return rte_flow_error_set
2237                                                 (error, EINVAL,
2238                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2239                                                  items,
2240                                                  "vlan eth_type conflict,"
2241                                                  " IPv6 is required");
2242                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
2243                         } else {
2244                                 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2245                                     outer_etype != RTE_BE16(ETH_P_IPV6))
2246                                         return rte_flow_error_set
2247                                                 (error, EINVAL,
2248                                                  RTE_FLOW_ERROR_TYPE_ITEM,
2249                                                  items,
2250                                                  "eth_type conflict,"
2251                                                  " IPv6 is required");
2252                                 outer_etype = RTE_BE16(ETH_P_IPV6);
2253                         }
2254                         break;
2255                 case RTE_FLOW_ITEM_TYPE_UDP:
2256                         ret = mlx5_flow_validate_item_udp(items, item_flags,
2257                                                           next_protocol, error);
2258                         if (ret < 0)
2259                                 return ret;
2260                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2261                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
2262                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
2263                         mask.udp = flow_tcf_item_mask
2264                                 (items, &rte_flow_item_udp_mask,
2265                                  &flow_tcf_mask_supported.udp,
2266                                  &flow_tcf_mask_empty.udp,
2267                                  sizeof(flow_tcf_mask_supported.udp),
2268                                  error);
2269                         if (!mask.udp)
2270                                 return -rte_errno;
2271                         /*
2272                          * Save the presumed outer UDP item for an extra
2273                          * check if a tunnel item is found later in the list.
2274                          */
2275                         if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2276                                 outer_udp = items;
2277                         break;
2278                 case RTE_FLOW_ITEM_TYPE_TCP:
2279                         ret = mlx5_flow_validate_item_tcp
2280                                              (items, item_flags,
2281                                               next_protocol,
2282                                               &flow_tcf_mask_supported.tcp,
2283                                               error);
2284                         if (ret < 0)
2285                                 return ret;
2286                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2287                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
2288                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
2289                         mask.tcp = flow_tcf_item_mask
2290                                 (items, &rte_flow_item_tcp_mask,
2291                                  &flow_tcf_mask_supported.tcp,
2292                                  &flow_tcf_mask_empty.tcp,
2293                                  sizeof(flow_tcf_mask_supported.tcp),
2294                                  error);
2295                         if (!mask.tcp)
2296                                 return -rte_errno;
2297                         break;
2298                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2299                         if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2300                                 return rte_flow_error_set
2301                                         (error, ENOTSUP,
2302                                          RTE_FLOW_ERROR_TYPE_ITEM, items,
2303                                          "vxlan tunnel over vlan"
2304                                          " is not supported");
2305                         ret = mlx5_flow_validate_item_vxlan(items,
2306                                                             item_flags, error);
2307                         if (ret < 0)
2308                                 return ret;
2309                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
2310                         mask.vxlan = flow_tcf_item_mask
2311                                 (items, &rte_flow_item_vxlan_mask,
2312                                  &flow_tcf_mask_supported.vxlan,
2313                                  &flow_tcf_mask_empty.vxlan,
2314                                  sizeof(flow_tcf_mask_supported.vxlan), error);
2315                         if (!mask.vxlan)
2316                                 return -rte_errno;
2317                         if (mask.vxlan->vni[0] != 0xff ||
2318                             mask.vxlan->vni[1] != 0xff ||
2319                             mask.vxlan->vni[2] != 0xff)
2320                                 return rte_flow_error_set
2321                                         (error, ENOTSUP,
2322                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2323                                          mask.vxlan,
2324                                          "no support for partial or "
2325                                          "empty mask on \"vxlan.vni\" field");
2326                         /*
2327                          * The VNI item implies a VXLAN tunnel: at least the
2328                          * outer destination UDP port must be specified without
2329                          * wildcards to let the kernel select the virtual VXLAN
2330                          * device by port. An outer IPv4 or IPv6 item must also
2331                          * be present (wildcards or even a zero mask are
2332                          * allowed) to let the driver know the tunnel IP version
2333                          * and process UDP traffic correctly.
2334                          */
2335                         if (!(item_flags &
2336                              (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2337                               MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2338                                 return rte_flow_error_set
2339                                                  (error, EINVAL,
2340                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2341                                                   items,
2342                                                   "no outer IP pattern found"
2343                                                   " for vxlan tunnel");
2344                         if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2345                                 return rte_flow_error_set
2346                                                  (error, EINVAL,
2347                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2348                                                   items,
2349                                                   "no outer UDP pattern found"
2350                                                   " for vxlan tunnel");
2351                         /*
2352                          * All items preceding the tunnel item become outer
2353                          * ones and need extra validation due to tc limitations
2354                          * on tunnel outer parameters. Currently only the outer
2355                          * UDP item requires an extra check, so use the saved
2356                          * pointer instead of rescanning the item list.
2357                          */
2358                         assert(outer_udp);
2359                         ret = flow_tcf_validate_vxlan_decap_udp
2360                                                 (outer_udp, error);
2361                         if (ret < 0)
2362                                 return ret;
2363                         /* Reset L4 protocol for inner parameters. */
2364                         next_protocol = 0xff;
2365                         break;
2366                 default:
2367                         return rte_flow_error_set(error, ENOTSUP,
2368                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2369                                                   items, "item not supported");
2370                 }
2371         }
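             /*
              * Cross-checks: the accumulated action flags must be mutually
              * consistent and match the pattern layers collected above.
              */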
2372         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2373             (action_flags & MLX5_FLOW_ACTION_DROP))
2374                 return rte_flow_error_set(error, ENOTSUP,
2375                                           RTE_FLOW_ERROR_TYPE_ACTION,
2376                                           actions,
2377                                           "set action is not compatible with "
2378                                           "drop action");
2379         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2380             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2381                 return rte_flow_error_set(error, ENOTSUP,
2382                                           RTE_FLOW_ERROR_TYPE_ACTION,
2383                                           actions,
2384                                           "set action must be followed by "
2385                                           "port_id action");
2386         if (action_flags &
2387            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2388                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2389                         return rte_flow_error_set(error, EINVAL,
2390                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2391                                                   actions,
2392                                                   "no ipv4 item found in"
2393                                                   " pattern");
2394         }
2395         if (action_flags &
2396            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2397                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2398                         return rte_flow_error_set(error, EINVAL,
2399                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2400                                                   actions,
2401                                                   "no ipv6 item found in"
2402                                                   " pattern");
2403         }
2404         if (action_flags &
2405            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2406                 if (!(item_flags &
2407                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2408                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2409                         return rte_flow_error_set(error, EINVAL,
2410                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2411                                                   actions,
2412                                                   "no TCP/UDP item found in"
2413                                                   " pattern");
2414         }
2415         /*
2416          * FW syndrome (0xA9C090):
2417          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
2418          *     forwarded to the uplink.
2419          */
2420         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2421             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2422             ((struct priv *)port_id_dev->data->dev_private)->representor)
2423                 return rte_flow_error_set(error, ENOTSUP,
2424                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2425                                           "vlan push can only be applied"
2426                                           " when forwarding to uplink port");
2427         /*
2428          * FW syndrome (0x294609):
2429          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
2430          *     are supported only while forwarding to vport.
2431          */
2432         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2433             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2434                 return rte_flow_error_set(error, ENOTSUP,
2435                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2436                                           "vlan actions are supported"
2437                                           " only with port_id action");
2438         if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2439             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2440                 return rte_flow_error_set(error, ENOTSUP,
2441                                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2442                                           "vxlan actions are supported"
2443                                           " only with port_id action");
2444         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2445                 return rte_flow_error_set(error, EINVAL,
2446                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
2447                                           "no fate action is found");
2448         if (action_flags &
2449            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2450                 if (!(item_flags &
2451                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2452                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2453                         return rte_flow_error_set(error, EINVAL,
2454                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2455                                                   actions,
2456                                                   "no IP found in pattern");
2457         }
2458         if (action_flags &
2459             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2460                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2461                         return rte_flow_error_set(error, ENOTSUP,
2462                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2463                                                   actions,
2464                                                   "no ethernet found in"
2465                                                   " pattern");
2466         }
2467         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2468             !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2469                 return rte_flow_error_set(error, EINVAL,
2470                                           RTE_FLOW_ERROR_TYPE_ACTION,
2471                                           NULL,
2472                                           "no VNI pattern found"
2473                                           " for vxlan decap action");
2474         if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2475             (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2476                 return rte_flow_error_set(error, EINVAL,
2477                                           RTE_FLOW_ERROR_TYPE_ACTION,
2478                                           NULL,
2479                                           "vxlan encap not supported"
2480                                           " for tunneled traffic");
2481         return 0;
2482 }
2483
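/*
 * Illustrative sketch only, not part of the driver: a minimal
 * pattern/action layout that satisfies the E-Switch constraints
 * validated above: the pedit "set" action is paired with the
 * mandatory PORT_ID fate action and the pattern provides the IPv4
 * item the action rewrites. All identifiers and values local to
 * this example are hypothetical.
 */
static void __rte_unused
flow_tcf_example_valid_rule(void)
{
	static const struct rte_flow_action_set_ipv4 set_src = {
		.ipv4_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
	};
	static const struct rte_flow_action_port_id port = {
		.id = 1, /* Hypothetical peer port. */
	};
	static const struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	static const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC,
		  .conf = &set_src },
		{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &port },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	(void)pattern;
	(void)actions;
}
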
2484 /**
2485  * Calculate maximum size of memory for flow items of Linux TC flower.
2486  *
2487  * @param[in] attr
2488  *   Pointer to the flow attributes.
2489  * @param[in] items
2490  *   Pointer to the list of items.
2491  * @param[out] action_flags
2492  *   Pointer to the detected actions.
2493  *
2494  * @return
2495  *   Maximum size of memory for items.
2496  */
2497 static int
2498 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2499                         const struct rte_flow_item items[],
2500                         uint64_t *action_flags)
2501 {
2502         int size = 0;
2503
2504         size += SZ_NLATTR_STRZ_OF("flower") +
2505                 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2506                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2507                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2508         if (attr->group > 0)
2509                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2510         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2511                 switch (items->type) {
2512                 case RTE_FLOW_ITEM_TYPE_VOID:
2513                         break;
2514                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2515                         break;
2516                 case RTE_FLOW_ITEM_TYPE_ETH:
2517                         size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2518                                 /* dst/src MAC addr and mask. */
2519                         break;
2520                 case RTE_FLOW_ITEM_TYPE_VLAN:
2521                         size += SZ_NLATTR_TYPE_OF(uint16_t) +
2522                                 /* VLAN Ether type. */
2523                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2524                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2525                         break;
2526                 case RTE_FLOW_ITEM_TYPE_IPV4: {
2527                         const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2528
2529                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2530                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2531                                 /* dst/src IP addr and mask. */
2532                         if (ipv4 && ipv4->hdr.time_to_live)
2533                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2534                         if (ipv4 && ipv4->hdr.type_of_service)
2535                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2536                         break;
2537                 }
2538                 case RTE_FLOW_ITEM_TYPE_IPV6: {
2539                         const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2540
2541                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2542                                 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2543                                 /* dst/src IP addr and mask. */
2544                         if (ipv6 && ipv6->hdr.hop_limits)
2545                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2546                         if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2547                                      (0xfful << IPV6_HDR_TC_SHIFT)))
2548                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2549                         break;
2550                 }
2551                 case RTE_FLOW_ITEM_TYPE_UDP:
2552                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2553                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2554                                 /* dst/src port and mask. */
2555                         break;
2556                 case RTE_FLOW_ITEM_TYPE_TCP:
2557                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2558                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2559                                 /* dst/src port and mask. */
2560                         break;
2561                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2562                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2563                         /*
2564                          * There might be no VXLAN decap action in the action
2565                          * list, nonetheless the VXLAN tunnel flow requires
2566                          * the decap structure to be correctly applied to the
2567                          * VXLAN device, so set the flag to create the structure.
2568                          * The translation routine will not put the decap action
2569                          * in the Netlink message if there is no actual action
2570                          * in the list.
2571                          */
2572                         *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2573                         break;
2574                 default:
2575                         DRV_LOG(WARNING,
2576                                 "unsupported item %p type %d,"
2577                                 " items must be validated before flow creation",
2578                                 (const void *)items, items->type);
2579                         break;
2580                 }
2581         }
2582         return size;
2583 }
2584
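/*
 * Worked example (an upper-bound estimate, matching the switch above):
 * for a pattern eth / ipv4 / udp with no TTL/TOS bits in the IPv4 mask,
 * flow_tcf_get_items_size() accumulates:
 *   base : "flower" kind + outer ether type + TCA_OPTIONS nest + flags;
 *   eth  : 4 attributes of ETHER_ADDR_LEN (dst/src address and mask);
 *   ipv4 : 1 x uint8_t (IP proto) + 4 x uint32_t (addresses and masks);
 *   udp  : 1 x uint8_t (IP proto) + 4 x uint16_t (ports and masks).
 * The result only bounds the Netlink buffer size, unused room is left
 * over after translation.
 */
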
2585 /**
2586  * Calculate size of memory to store the VXLAN encapsulation
2587  * related items in the Netlink message buffer. The item list
2588  * is specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action
2589  * and should be validated beforehand.
2590  *
2591  * @param[in] action
2592  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2593  *   List of pattern items to scan data from.
2594  *
2595  * @return
2596  *   The size of the part of the Netlink message buffer needed to
2597  *   store the VXLAN encapsulation item attributes.
2598  */
2599 static int
2600 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2601 {
2602         const struct rte_flow_item *items;
2603         int size = 0;
2604
2605         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2606         assert(action->conf);
2607
2608         items = ((const struct rte_flow_action_vxlan_encap *)
2609                                         action->conf)->definition;
2610         assert(items);
2611         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2612                 switch (items->type) {
2613                 case RTE_FLOW_ITEM_TYPE_VOID:
2614                         break;
2615                 case RTE_FLOW_ITEM_TYPE_ETH:
2616                         /* This item does not require message buffer. */
2617                         break;
2618                 case RTE_FLOW_ITEM_TYPE_IPV4: {
2619                         const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2620
2621                         size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2622                         if (ipv4 && ipv4->hdr.time_to_live)
2623                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2624                         if (ipv4 && ipv4->hdr.type_of_service)
2625                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2626                         break;
2627                 }
2628                 case RTE_FLOW_ITEM_TYPE_IPV6: {
2629                         const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2630
2631                         size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2632                         if (ipv6 && ipv6->hdr.hop_limits)
2633                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2634                         if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2635                                      (0xfful << IPV6_HDR_TC_SHIFT)))
2636                                 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2637                         break;
2638                 }
2639                 case RTE_FLOW_ITEM_TYPE_UDP: {
2640                         const struct rte_flow_item_udp *udp = items->mask;
2641
2642                         size += SZ_NLATTR_TYPE_OF(uint16_t);
2643                         if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2644                                 size += SZ_NLATTR_TYPE_OF(uint16_t);
2645                         break;
2646                 }
2647                 case RTE_FLOW_ITEM_TYPE_VXLAN:
2648                         size += SZ_NLATTR_TYPE_OF(uint32_t);
2649                         break;
2650                 default:
2651                         assert(false);
2652                         DRV_LOG(WARNING,
2653                                 "unsupported item %p type %d,"
2654                                 " items must be validated"
2655                                 " before flow creation",
2656                                 (const void *)items, items->type);
2657                         return 0;
2658                 }
2659         }
2660         return size;
2661 }
2662
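/*
 * Worked example for the sizing above: an encap definition
 * eth / ipv4 / udp / vxlan with no TTL/TOS bits in the IPv4 mask and a
 * UDP mask with zero source port contributes 2 x IPV4_ADDR_LEN
 * attributes (src/dst addresses), one uint16_t attribute (UDP
 * destination port) and one uint32_t attribute (VNI). The Ethernet
 * item needs no buffer space at all.
 */
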
2663 /**
2664  * Calculate maximum size of memory for flow actions of Linux TC flower and
2665  * extract specified actions.
2666  *
2667  * @param[in] actions
2668  *   Pointer to the list of actions.
2669  * @param[out] action_flags
2670  *   Pointer to the detected actions.
2671  *
2672  * @return
2673  *   Maximum size of memory for actions.
2674  */
2675 static int
2676 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2677                               uint64_t *action_flags)
2678 {
2679         int size = 0;
2680         uint64_t flags = 0;
2681
2682         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2683         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2684                 switch (actions->type) {
2685                 case RTE_FLOW_ACTION_TYPE_VOID:
2686                         break;
2687                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2688                         size += SZ_NLATTR_NEST + /* na_act_index. */
2689                                 SZ_NLATTR_STRZ_OF("mirred") +
2690                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2691                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2692                         flags |= MLX5_FLOW_ACTION_PORT_ID;
2693                         break;
2694                 case RTE_FLOW_ACTION_TYPE_JUMP:
2695                         size += SZ_NLATTR_NEST + /* na_act_index. */
2696                                 SZ_NLATTR_STRZ_OF("gact") +
2697                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2698                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2699                         flags |= MLX5_FLOW_ACTION_JUMP;
2700                         break;
2701                 case RTE_FLOW_ACTION_TYPE_DROP:
2702                         size += SZ_NLATTR_NEST + /* na_act_index. */
2703                                 SZ_NLATTR_STRZ_OF("gact") +
2704                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2705                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
2706                         flags |= MLX5_FLOW_ACTION_DROP;
2707                         break;
2708                 case RTE_FLOW_ACTION_TYPE_COUNT:
2709                         break;
2710                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2711                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2712                         goto action_of_vlan;
2713                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2714                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2715                         goto action_of_vlan;
2716                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2717                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2718                         goto action_of_vlan;
2719                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2720                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2721                         goto action_of_vlan;
2722 action_of_vlan:
2723                         size += SZ_NLATTR_NEST + /* na_act_index. */
2724                                 SZ_NLATTR_STRZ_OF("vlan") +
2725                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2726                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2727                                 SZ_NLATTR_TYPE_OF(uint16_t) +
2728                                 /* VLAN protocol. */
2729                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2730                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2731                         break;
2732                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2733                         size += SZ_NLATTR_NEST + /* na_act_index. */
2734                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2735                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2736                                 SZ_NLATTR_TYPE_OF(uint8_t);
2737                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2738                         size += flow_tcf_vxlan_encap_size(actions) +
2739                                 RTE_ALIGN_CEIL /* preceding encap params. */
2740                                 (sizeof(struct flow_tcf_vxlan_encap),
2741                                 MNL_ALIGNTO);
2742                         flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2743                         break;
2744                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2745                         size += SZ_NLATTR_NEST + /* na_act_index. */
2746                                 SZ_NLATTR_STRZ_OF("tunnel_key") +
2747                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2748                                 SZ_NLATTR_TYPE_OF(uint8_t);
2749                         size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2750                         size += RTE_ALIGN_CEIL /* preceding decap params. */
2751                                 (sizeof(struct flow_tcf_vxlan_decap),
2752                                 MNL_ALIGNTO);
2753                         flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2754                         break;
2755                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2756                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2757                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2758                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2759                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2760                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2761                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2762                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2763                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2764                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2765                         size += flow_tcf_get_pedit_actions_size(&actions,
2766                                                                 &flags);
2767                         break;
2768                 default:
2769                         DRV_LOG(WARNING,
2770                                 "unsupported action %p type %d,"
2771                                 " actions must be validated before flow creation",
2772                                 (const void *)actions, actions->type);
2773                         break;
2774                 }
2775         }
2776         *action_flags = flags;
2777         return size;
2778 }
2779
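/*
 * Worked example for the sizing above: a lone PORT_ID fate action
 * contributes one na_act_index nest, the "mirred" kind string, one
 * TCA_ACT_OPTIONS nest and one struct tc_mirred attribute on top of
 * the leading TCA_FLOWER_ACT nest; again an upper bound on the
 * Netlink buffer rather than an exact size.
 */
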
2780 /**
2781  * Brand rtnetlink buffer with unique handle.
2782  *
2783  * This handle should be unique for a given network interface to avoid
2784  * collisions.
2785  *
2786  * @param nlh
2787  *   Pointer to Netlink message.
2788  * @param handle
2789  *   Unique 32-bit handle to use.
2790  */
2791 static void
2792 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2793 {
2794         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2795
2796         tcm->tcm_handle = handle;
2797         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2798                 (void *)nlh, handle);
2799 }
2800
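/*
 * Worked example of the branding as used by flow_tcf_prepare() below
 * (hypothetical numbers): on a 64-bit system with a flow buffer of
 * 1500 bytes, rte_align32prevpow2(1500) is 1024 and rte_log2_u32(1024)
 * is 10, so the handle is the buffer address shifted right by 10 bits
 * and truncated to 32 bits. Buffers living at least their own size
 * apart are thus likely to receive distinct handles.
 */
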
2801 /**
2802  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2803  * memory required, allocates the memory, initializes Netlink message headers
2804  * and sets a unique TC message handle.
2805  *
2806  * @param[in] attr
2807  *   Pointer to the flow attributes.
2808  * @param[in] items
2809  *   Pointer to the list of items.
2810  * @param[in] actions
2811  *   Pointer to the list of actions.
2812  * @param[out] error
2813  *   Pointer to the error structure.
2814  *
2815  * @return
2816  *   Pointer to mlx5_flow object on success,
2817  *   otherwise NULL and rte_errno is set.
2818  */
2819 static struct mlx5_flow *
2820 flow_tcf_prepare(const struct rte_flow_attr *attr,
2821                  const struct rte_flow_item items[],
2822                  const struct rte_flow_action actions[],
2823                  struct rte_flow_error *error)
2824 {
2825         size_t size = RTE_ALIGN_CEIL
2826                         (sizeof(struct mlx5_flow),
2827                          alignof(struct flow_tcf_tunnel_hdr)) +
2828                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
2829                       MNL_ALIGN(sizeof(struct tcmsg));
2830         struct mlx5_flow *dev_flow;
2831         uint64_t action_flags = 0;
2832         struct nlmsghdr *nlh;
2833         struct tcmsg *tcm;
2834         uint8_t *sp, *tun = NULL;
2835
2836         size += flow_tcf_get_items_size(attr, items, &action_flags);
2837         size += flow_tcf_get_actions_and_size(actions, &action_flags);
2838         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2839         if (!dev_flow) {
2840                 rte_flow_error_set(error, ENOMEM,
2841                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2842                                    "not enough memory to create E-Switch flow");
2843                 return NULL;
2844         }
2845         sp = (uint8_t *)(dev_flow + 1);
2846         if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2847                 sp = RTE_PTR_ALIGN
2848                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2849                 tun = sp;
2850                 sp += RTE_ALIGN_CEIL
2851                         (sizeof(struct flow_tcf_vxlan_encap),
2852                         MNL_ALIGNTO);
2853 #ifndef NDEBUG
2854                 size -= RTE_ALIGN_CEIL
2855                         (sizeof(struct flow_tcf_vxlan_encap),
2856                         MNL_ALIGNTO);
2857 #endif
2858         } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2859                 sp = RTE_PTR_ALIGN
2860                         (sp, alignof(struct flow_tcf_tunnel_hdr));
2861                 tun = sp;
2862                 sp += RTE_ALIGN_CEIL
2863                         (sizeof(struct flow_tcf_vxlan_decap),
2864                         MNL_ALIGNTO);
2865 #ifndef NDEBUG
2866                 size -= RTE_ALIGN_CEIL
2867                         (sizeof(struct flow_tcf_vxlan_decap),
2868                         MNL_ALIGNTO);
2869 #endif
2870         } else {
2871                 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2872         }
2873         nlh = mnl_nlmsg_put_header(sp);
2874         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2875         *dev_flow = (struct mlx5_flow){
2876                 .tcf = (struct mlx5_flow_tcf){
2877 #ifndef NDEBUG
2878                         .nlsize = size - RTE_ALIGN_CEIL
2879                                 (sizeof(struct mlx5_flow),
2880                                  alignof(struct flow_tcf_tunnel_hdr)),
2881 #endif
2882                         .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2883                         .nlh = nlh,
2884                         .tcm = tcm,
2885                 },
2886         };
2887         if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2888                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2889         else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2890                 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2891         /*
2892          * Generate a reasonably unique handle based on the address of the
2893          * target buffer.
2894          *
2895          * This is straightforward on 32-bit systems where the flow pointer can
2896          * be used directly. Otherwise, the pointer is shifted right by the
2897          * base-2 logarithm of the previous power of two of the buffer size
2898          * and its least significant 32 bits are used.
2899          */
2900         if (sizeof(dev_flow) <= 4)
2901                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2902         else
2903                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2904                                        rte_log2_u32(rte_align32prevpow2(size)));
2905         return dev_flow;
2906 }
2907
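/*
 * Sketch of the buffer layout produced by flow_tcf_prepare() above:
 *
 *   +--------------------+ <- rte_zmalloc(size, MNL_ALIGNTO)
 *   | struct mlx5_flow   |
 *   +--------------------+ <- aligned to flow_tcf_tunnel_hdr, present
 *   | encap/decap params |    only for VXLAN encap/decap flows
 *   +--------------------+
 *   | struct nlmsghdr    | <- dev_flow->tcf.nlh
 *   +--------------------+
 *   | struct tcmsg       | <- dev_flow->tcf.tcm
 *   +--------------------+
 *   | netlink attributes |    filled in later by flow_tcf_translate()
 *   +--------------------+
 */
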
2908 /**
2909  * Make adjustments for supporting count actions.
2910  *
2911  * @param[in] dev
2912  *   Pointer to the Ethernet device structure.
2913  * @param[in] dev_flow
2914  *   Pointer to mlx5_flow.
2915  * @param[out] error
2916  *   Pointer to error structure.
2917  *
2918  * @return
2919  *   0 on success, a negative errno value otherwise and rte_errno is set.
2920  */
2921 static int
2922 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2923                                   struct mlx5_flow *dev_flow,
2924                                   struct rte_flow_error *error)
2925 {
2926         struct rte_flow *flow = dev_flow->flow;
2927
2928         if (!flow->counter) {
2929                 flow->counter = flow_tcf_counter_new();
2930                 if (!flow->counter)
2931                         return rte_flow_error_set(error, rte_errno,
2932                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2933                                                   NULL,
2934                                                   "cannot get counter"
2935                                                   " context");
2936         }
2937         return 0;
2938 }
2939
2940 /**
2941  * Convert VXLAN VNI to 32-bit integer.
2942  *
2943  * @param[in] vni
2944  *   VXLAN VNI in 24-bit wire format.
2945  *
2946  * @return
2947  *   VXLAN VNI as a 32-bit integer value in network byte order.
2948  */
2949 static inline rte_be32_t
2950 vxlan_vni_as_be32(const uint8_t vni[3])
2951 {
2952         union {
2953                 uint8_t vni[4];
2954                 rte_be32_t dword;
2955         } ret = {
2956                 .vni = { 0, vni[0], vni[1], vni[2] },
2957         };
2958         return ret.dword;
2959 }
2960
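/*
 * Worked example: vni[] = { 0x12, 0x34, 0x56 } produces the bytes
 * 00 12 34 56 in memory, i.e. the value 0x00123456 in network byte
 * order, which is the layout the flower TCA_FLOWER_KEY_ENC_KEY_ID
 * attribute expects for the VXLAN key.
 */
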
2961 /**
2962  * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2963  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2964  * in the encapsulation parameters structure. The item must be prevalidated;
2965  * no validation checks are performed by this function.
2966  *
2967  * @param[in] spec
2968  *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
2969  * @param[in] mask
2970  *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
2971  * @param[out] encap
2972  *   Structure to fill the gathered MAC address data.
2973  */
2974 static void
2975 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2976                                const struct rte_flow_item_eth *mask,
2977                                struct flow_tcf_vxlan_encap *encap)
2978 {
2979         /* Item must be validated before. No redundant checks. */
2980         assert(spec);
2981         if (!mask || !memcmp(&mask->dst,
2982                              &rte_flow_item_eth_mask.dst,
2983                              sizeof(rte_flow_item_eth_mask.dst))) {
2984                 /*
2985                  * Ethernet addresses are not supported by
2986                  * tc as tunnel_key parameters. The destination
2987                  * address is needed to form the encap packet
2988                  * header and is retrieved by the kernel from
2989                  * implicit sources (ARP table, etc.);
2990                  * address masks are not supported at all.
2991                  */
2992                 encap->eth.dst = spec->dst;
2993                 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2994         }
2995         if (!mask || !memcmp(&mask->src,
2996                              &rte_flow_item_eth_mask.src,
2997                              sizeof(rte_flow_item_eth_mask.src))) {
2998                 /*
2999                  * Ethernet addresses are not supported by
3000                  * tc as tunnel_key parameters. Source ethernet
3001                  * address is ignored anyway.
3002                  */
3003                 encap->eth.src = spec->src;
3004                 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
3005         }
3006 }
3007
3008 /**
3009  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
3010  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
3011  * in the encapsulation parameters structure. The item must be prevalidated;
3012  * no validation checks are performed by this function.
3013  *
3014  * @param[in] spec
3015  *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
3016  * @param[in] mask
3017  *   RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
3018  * @param[out] encap
3019  *   Structure to fill the gathered IPV4 address data.
3020  */
3021 static void
3022 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
3023                                 const struct rte_flow_item_ipv4 *mask,
3024                                 struct flow_tcf_vxlan_encap *encap)
3025 {
3026         /* Item must be validated before. No redundant checks. */
3027         assert(spec);
3028         encap->ipv4.dst = spec->hdr.dst_addr;
3029         encap->ipv4.src = spec->hdr.src_addr;
3030         encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
3031                        FLOW_TCF_ENCAP_IPV4_DST;
3032         if (mask && mask->hdr.type_of_service) {
3033                 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3034                 encap->ip_tos = spec->hdr.type_of_service;
3035         }
3036         if (mask && mask->hdr.time_to_live) {
3037                 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3038                 encap->ip_ttl_hop = spec->hdr.time_to_live;
3039         }
3040 }
3041
3042 /**
3043  * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3044  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3045  * in the encapsulation parameters structure. The item must be prevalidated;
3046  * no validation checks are performed by this function.
3047  *
3048  * @param[in] spec
3049  *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3050  * @param[in] mask
3051  *   RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3052  * @param[out] encap
3053  *   Structure to fill the gathered IPV6 address data.
3054  */
3055 static void
3056 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3057                                 const struct rte_flow_item_ipv6 *mask,
3058                                 struct flow_tcf_vxlan_encap *encap)
3059 {
3060         /* Item must be validated before. No redundant checks. */
3061         assert(spec);
3062         memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3063         memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3064         encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3065                        FLOW_TCF_ENCAP_IPV6_DST;
3066         if (mask) {
3067                 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3068                     IPV6_HDR_TC_SHIFT) & 0xff) {
3069                         encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3070                         encap->ip_tos = (rte_be_to_cpu_32
3071                                                 (spec->hdr.vtc_flow) >>
3072                                                  IPV6_HDR_TC_SHIFT) & 0xff;
3073                 }
3074                 if (mask->hdr.hop_limits) {
3075                         encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3076                         encap->ip_ttl_hop = spec->hdr.hop_limits;
3077                 }
3078         }
3079 }
3080
3081 /**
3082  * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3083  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3084  * in the encapsulation parameters structure. The item must be prevalidated;
3085  * no validation checks are performed by this function.
3086  *
3087  * @param[in] spec
3088  *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
3089  * @param[in] mask
3090  *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
3091  * @param[out] encap
3092  *   Structure to fill the gathered UDP port data.
3093  */
3094 static void
3095 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3096                                const struct rte_flow_item_udp *mask,
3097                                struct flow_tcf_vxlan_encap *encap)
3098 {
3099         assert(spec);
3100         encap->udp.dst = spec->hdr.dst_port;
3101         encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3102         if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3103                 encap->udp.src = spec->hdr.src_port;
3104                 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
3105         }
3106 }
3107
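/*
 * Example for the parser above (hypothetical values): with
 * spec = { src_port = 5000, dst_port = 4789 } and a NULL mask (all
 * bits significant by default) both ports are recorded; with a mask
 * of { src_port = 0, dst_port = 0xffff } only the destination port
 * is recorded and the kernel is left to choose the source port of
 * the encapsulated packets.
 */
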
3108 /**
3109  * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3110  * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI field
3111  * in the encapsulation parameters structure. The item must be prevalidated;
3112  * no validation checks are performed by this function.
3113  *
3114  * @param[in] spec
3115  *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3116  * @param[out] encap
3117  *   Structure to fill with the gathered VNI data.
3118  */
3119 static void
3120 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3121                                struct flow_tcf_vxlan_encap *encap)
3122 {
3123         /* Item must be validated before. No redundant checks. */
3124         assert(spec);
3125         memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3126         encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3127 }
3128
3129 /**
3130  * Populate consolidated encapsulation object from list of pattern items.
3131  *
3132  * Helper function to process the configuration of actions such as
3133  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be validated
3134  * beforehand, since there is no way to return a meaningful error.
3135  *
3136  * @param[in] action
3137  *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3138  *   List of pattern items to gather data from.
3139  * @param[out] encap
3140  *   Structure to fill with gathered data.
3141  */
3142 static void
3143 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3144                            struct flow_tcf_vxlan_encap *encap)
3145 {
3146         union {
3147                 const struct rte_flow_item_eth *eth;
3148                 const struct rte_flow_item_ipv4 *ipv4;
3149                 const struct rte_flow_item_ipv6 *ipv6;
3150                 const struct rte_flow_item_udp *udp;
3151                 const struct rte_flow_item_vxlan *vxlan;
3152         } spec, mask;
3153         const struct rte_flow_item *items;
3154
3155         assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3156         assert(action->conf);
3157
3158         items = ((const struct rte_flow_action_vxlan_encap *)
3159                                         action->conf)->definition;
3160         assert(items);
3161         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3162                 switch (items->type) {
3163                 case RTE_FLOW_ITEM_TYPE_VOID:
3164                         break;
3165                 case RTE_FLOW_ITEM_TYPE_ETH:
3166                         mask.eth = items->mask;
3167                         spec.eth = items->spec;
3168                         flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3169                                                        encap);
3170                         break;
3171                 case RTE_FLOW_ITEM_TYPE_IPV4:
3172                         spec.ipv4 = items->spec;
3173                         mask.ipv4 = items->mask;
3174                         flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3175                                                         encap);
3176                         break;
3177                 case RTE_FLOW_ITEM_TYPE_IPV6:
3178                         spec.ipv6 = items->spec;
3179                         mask.ipv6 = items->mask;
3180                         flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3181                                                         encap);
3182                         break;
3183                 case RTE_FLOW_ITEM_TYPE_UDP:
3184                         mask.udp = items->mask;
3185                         spec.udp = items->spec;
3186                         flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3187                                                        encap);
3188                         break;
3189                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3190                         spec.vxlan = items->spec;
3191                         flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3192                         break;
3193                 default:
3194                         assert(false);
3195                         DRV_LOG(WARNING,
3196                                 "unsupported item %p type %d,"
3197                                 " items must be validated"
3198                                 " before flow creation",
3199                                 (const void *)items, items->type);
3200                         encap->mask = 0;
3201                         return;
3202                 }
3203         }
3204 }
3205
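/*
 * Usage sketch for the parser above (hypothetical values): given an
 * encap definition eth / ipv4 { src = 1.1.1.1, dst = 2.2.2.2 } /
 * udp { dst = 4789 } / vxlan { vni = 42 } with default masks except a
 * zero UDP source port, the routine leaves encap->mask equal to
 *   FLOW_TCF_ENCAP_ETH_DST | FLOW_TCF_ENCAP_ETH_SRC |
 *   FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST |
 *   FLOW_TCF_ENCAP_UDP_DST | FLOW_TCF_ENCAP_VXLAN_VNI
 * with the corresponding header fields copied into *encap.
 */
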
3206 /**
3207  * Translate flow for Linux TC flower and construct Netlink message.
3208  *
3209  * @param[in] priv
3210  *   Pointer to the priv structure.
3211  * @param[in, out] flow
3212  *   Pointer to the sub flow.
3213  * @param[in] attr
3214  *   Pointer to the flow attributes.
3215  * @param[in] items
3216  *   Pointer to the list of items.
3217  * @param[in] actions
3218  *   Pointer to the list of actions.
3219  * @param[out] error
3220  *   Pointer to the error structure.
3221  *
3222  * @return
3223  *   0 on success, a negative errno value otherwise and rte_errno is set.
3224  */
3225 static int
3226 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3227                    const struct rte_flow_attr *attr,
3228                    const struct rte_flow_item items[],
3229                    const struct rte_flow_action actions[],
3230                    struct rte_flow_error *error)
3231 {
3232         union {
3233                 const struct rte_flow_item_port_id *port_id;
3234                 const struct rte_flow_item_eth *eth;
3235                 const struct rte_flow_item_vlan *vlan;
3236                 const struct rte_flow_item_ipv4 *ipv4;
3237                 const struct rte_flow_item_ipv6 *ipv6;
3238                 const struct rte_flow_item_tcp *tcp;
3239                 const struct rte_flow_item_udp *udp;
3240                 const struct rte_flow_item_vxlan *vxlan;
3241         } spec, mask;
3242         union {
3243                 const struct rte_flow_action_port_id *port_id;
3244                 const struct rte_flow_action_jump *jump;
3245                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3246                 const struct rte_flow_action_of_set_vlan_vid *
3247                         of_set_vlan_vid;
3248                 const struct rte_flow_action_of_set_vlan_pcp *
3249                         of_set_vlan_pcp;
3250         } conf;
3251         union {
3252                 struct flow_tcf_tunnel_hdr *hdr;
3253                 struct flow_tcf_vxlan_decap *vxlan;
3254         } decap = {
3255                 .hdr = NULL,
3256         };
3257         union {
3258                 struct flow_tcf_tunnel_hdr *hdr;
3259                 struct flow_tcf_vxlan_encap *vxlan;
3260         } encap = {
3261                 .hdr = NULL,
3262         };
3263         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3264         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3265         struct tcmsg *tcm = dev_flow->tcf.tcm;
3266         uint32_t na_act_index_cur;
3267         rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3268         rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3269         rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3270         bool ip_proto_set = 0;
3271         bool tunnel_outer = 0;
3272         struct nlattr *na_flower;
3273         struct nlattr *na_flower_act;
3274         struct nlattr *na_vlan_id = NULL;
3275         struct nlattr *na_vlan_priority = NULL;
3276         uint64_t item_flags = 0;
3277         int ret;
3278
3279         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3280                                                 PTOI_TABLE_SZ_MAX(dev)));
3281         if (dev_flow->tcf.tunnel) {
3282                 switch (dev_flow->tcf.tunnel->type) {
3283                 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3284                         decap.vxlan = dev_flow->tcf.vxlan_decap;
3285                         tunnel_outer = 1;
3286                         break;
3287                 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3288                         encap.vxlan = dev_flow->tcf.vxlan_encap;
3289                         break;
3290                 /* New tunnel actions can be added here. */
3291                 default:
3292                         assert(false);
3293                         break;
3294                 }
3295         }
3296         nlh = dev_flow->tcf.nlh;
3297         tcm = dev_flow->tcf.tcm;
3298         /* Prepare API must have been called beforehand. */
3299         assert(nlh != NULL && tcm != NULL);
3300         tcm->tcm_family = AF_UNSPEC;
3301         tcm->tcm_ifindex = ptoi[0].ifindex;
3302         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3303         /*
3304          * Priority cannot be zero to prevent the kernel from picking one
3305          * automatically.
3306          */
3307         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3308         if (attr->group > 0)
3309                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3310         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3311         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3312         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3313                 unsigned int i;
3314
3315                 switch (items->type) {
3316                 case RTE_FLOW_ITEM_TYPE_VOID:
3317                         break;
3318                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3319                         mask.port_id = flow_tcf_item_mask
3320                                 (items, &rte_flow_item_port_id_mask,
3321                                  &flow_tcf_mask_supported.port_id,
3322                                  &flow_tcf_mask_empty.port_id,
3323                                  sizeof(flow_tcf_mask_supported.port_id),
3324                                  error);
3325                         assert(mask.port_id);
3326                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
3327                                 break;
3328                         spec.port_id = items->spec;
3329                         if (!mask.port_id->id)
3330                                 i = 0;
3331                         else
3332                                 for (i = 0; ptoi[i].ifindex; ++i)
3333                                         if (ptoi[i].port_id == spec.port_id->id)
3334                                                 break;
3335                         assert(ptoi[i].ifindex);
3336                         tcm->tcm_ifindex = ptoi[i].ifindex;
3337                         break;
3338                 case RTE_FLOW_ITEM_TYPE_ETH:
3339                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3340                                       MLX5_FLOW_LAYER_INNER_L2 :
3341                                       MLX5_FLOW_LAYER_OUTER_L2;
3342                         mask.eth = flow_tcf_item_mask
3343                                 (items, &rte_flow_item_eth_mask,
3344                                  &flow_tcf_mask_supported.eth,
3345                                  &flow_tcf_mask_empty.eth,
3346                                  sizeof(flow_tcf_mask_supported.eth),
3347                                  error);
3348                         assert(mask.eth);
3349                         if (mask.eth == &flow_tcf_mask_empty.eth)
3350                                 break;
3351                         spec.eth = items->spec;
3352                         if (mask.eth->type) {
3353                                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3354                                         inner_etype = spec.eth->type;
3355                                 else
3356                                         outer_etype = spec.eth->type;
3357                         }
3358                         if (tunnel_outer) {
3359                                 DRV_LOG(WARNING,
3360                                         "outer L2 addresses cannot be"
3361                                         " forced for a tunnel,"
3362                                         " parameter is ignored");
3363                                 break;
3364                         }
3365                         if (!is_zero_ether_addr(&mask.eth->dst)) {
3366                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3367                                              ETHER_ADDR_LEN,
3368                                              spec.eth->dst.addr_bytes);
3369                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3370                                              ETHER_ADDR_LEN,
3371                                              mask.eth->dst.addr_bytes);
3372                         }
3373                         if (!is_zero_ether_addr(&mask.eth->src)) {
3374                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3375                                              ETHER_ADDR_LEN,
3376                                              spec.eth->src.addr_bytes);
3377                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3378                                              ETHER_ADDR_LEN,
3379                                              mask.eth->src.addr_bytes);
3380                         }
3381                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3382                         break;
3383                 case RTE_FLOW_ITEM_TYPE_VLAN:
3384                         assert(!encap.hdr);
3385                         assert(!decap.hdr);
3386                         assert(!tunnel_outer);
3387                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3388                         mask.vlan = flow_tcf_item_mask
3389                                 (items, &rte_flow_item_vlan_mask,
3390                                  &flow_tcf_mask_supported.vlan,
3391                                  &flow_tcf_mask_empty.vlan,
3392                                  sizeof(flow_tcf_mask_supported.vlan),
3393                                  error);
3394                         assert(mask.vlan);
3395                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
3396                                 break;
3397                         spec.vlan = items->spec;
3398                         assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3399                                outer_etype == RTE_BE16(ETH_P_8021Q));
3400                         outer_etype = RTE_BE16(ETH_P_8021Q);
3401                         if (mask.vlan->inner_type)
3402                                 vlan_etype = spec.vlan->inner_type;
3403                         if (mask.vlan->tci & RTE_BE16(0xe000))
3404                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3405                                                 (rte_be_to_cpu_16
3406                                                  (spec.vlan->tci) >> 13) & 0x7);
3407                         if (mask.vlan->tci & RTE_BE16(0x0fff))
3408                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3409                                                  rte_be_to_cpu_16
3410                                                  (spec.vlan->tci &
3411                                                   RTE_BE16(0x0fff)));
3412                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3413                         break;
3414                 case RTE_FLOW_ITEM_TYPE_IPV4:
3415                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3416                                       MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3417                                       MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3418                         mask.ipv4 = flow_tcf_item_mask
3419                                 (items, &rte_flow_item_ipv4_mask,
3420                                  &flow_tcf_mask_supported.ipv4,
3421                                  &flow_tcf_mask_empty.ipv4,
3422                                  sizeof(flow_tcf_mask_supported.ipv4),
3423                                  error);
3424                         assert(mask.ipv4);
3425                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3426                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3427                                        inner_etype == RTE_BE16(ETH_P_IP));
3428                                 inner_etype = RTE_BE16(ETH_P_IP);
3429                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3430                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3431                                        vlan_etype == RTE_BE16(ETH_P_IP));
3432                                 vlan_etype = RTE_BE16(ETH_P_IP);
3433                         } else {
3434                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3435                                        outer_etype == RTE_BE16(ETH_P_IP));
3436                                 outer_etype = RTE_BE16(ETH_P_IP);
3437                         }
3438                         spec.ipv4 = items->spec;
3439                         if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3440                                 /*
3441                                  * No way to set IP protocol for outer tunnel
3442                                  * layers. Usually it is fixed, for example,
3443                                  * to UDP for VXLAN/GPE.
3444                                  */
3445                                 assert(spec.ipv4); /* Mask is not empty. */
3446                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3447                                                 spec.ipv4->hdr.next_proto_id);
3448                                 ip_proto_set = 1;
3449                         }
3450                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3451                              (!mask.ipv4->hdr.src_addr &&
3452                               !mask.ipv4->hdr.dst_addr)) {
3453                                 if (!tunnel_outer)
3454                                         break;
3455                                 /*
3456                                  * For the tunnel outer layer we must set the
3457                                  * outer IP key anyway, even if the spec/mask
3458                                  * is empty. There is no other way to tell the
3459                                  * kernel about the outer layer protocol.
3460                                  */
3461                                 mnl_attr_put_u32
3462                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3463                                          mask.ipv4->hdr.src_addr);
3464                                 mnl_attr_put_u32
3465                                         (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3466                                          mask.ipv4->hdr.src_addr);
3467                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3468                                 break;
3469                         }
3470                         if (mask.ipv4->hdr.src_addr) {
3471                                 mnl_attr_put_u32
3472                                         (nlh, tunnel_outer ?
3473                                          TCA_FLOWER_KEY_ENC_IPV4_SRC :
3474                                          TCA_FLOWER_KEY_IPV4_SRC,
3475                                          spec.ipv4->hdr.src_addr);
3476                                 mnl_attr_put_u32
3477                                         (nlh, tunnel_outer ?
3478                                          TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3479                                          TCA_FLOWER_KEY_IPV4_SRC_MASK,
3480                                          mask.ipv4->hdr.src_addr);
3481                         }
3482                         if (mask.ipv4->hdr.dst_addr) {
3483                                 mnl_attr_put_u32
3484                                         (nlh, tunnel_outer ?
3485                                          TCA_FLOWER_KEY_ENC_IPV4_DST :
3486                                          TCA_FLOWER_KEY_IPV4_DST,
3487                                          spec.ipv4->hdr.dst_addr);
3488                                 mnl_attr_put_u32
3489                                         (nlh, tunnel_outer ?
3490                                          TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3491                                          TCA_FLOWER_KEY_IPV4_DST_MASK,
3492                                          mask.ipv4->hdr.dst_addr);
3493                         }
3494                         if (mask.ipv4->hdr.time_to_live) {
3495                                 mnl_attr_put_u8
3496                                         (nlh, tunnel_outer ?
3497                                          TCA_FLOWER_KEY_ENC_IP_TTL :
3498                                          TCA_FLOWER_KEY_IP_TTL,
3499                                          spec.ipv4->hdr.time_to_live);
3500                                 mnl_attr_put_u8
3501                                         (nlh, tunnel_outer ?
3502                                          TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3503                                          TCA_FLOWER_KEY_IP_TTL_MASK,
3504                                          mask.ipv4->hdr.time_to_live);
3505                         }
3506                         if (mask.ipv4->hdr.type_of_service) {
3507                                 mnl_attr_put_u8
3508                                         (nlh, tunnel_outer ?
3509                                          TCA_FLOWER_KEY_ENC_IP_TOS :
3510                                          TCA_FLOWER_KEY_IP_TOS,
3511                                          spec.ipv4->hdr.type_of_service);
3512                                 mnl_attr_put_u8
3513                                         (nlh, tunnel_outer ?
3514                                          TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3515                                          TCA_FLOWER_KEY_IP_TOS_MASK,
3516                                          mask.ipv4->hdr.type_of_service);
3517                         }
3518                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3519                         break;
3520                 case RTE_FLOW_ITEM_TYPE_IPV6: {
3521                         bool ipv6_src, ipv6_dst;
3522                         uint8_t msk6, tos6;
3523
3524                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3525                                       MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3526                                       MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3527                         mask.ipv6 = flow_tcf_item_mask
3528                                 (items, &rte_flow_item_ipv6_mask,
3529                                  &flow_tcf_mask_supported.ipv6,
3530                                  &flow_tcf_mask_empty.ipv6,
3531                                  sizeof(flow_tcf_mask_supported.ipv6),
3532                                  error);
3533                         assert(mask.ipv6);
3534                         if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3535                                 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3536                                        inner_etype == RTE_BE16(ETH_P_IPV6));
3537                                 inner_etype = RTE_BE16(ETH_P_IPV6);
3538                         } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3539                                 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3540                                        vlan_etype == RTE_BE16(ETH_P_IPV6));
3541                                 vlan_etype = RTE_BE16(ETH_P_IPV6);
3542                         } else {
3543                                 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3544                                        outer_etype == RTE_BE16(ETH_P_IPV6));
3545                                 outer_etype = RTE_BE16(ETH_P_IPV6);
3546                         }
3547                         spec.ipv6 = items->spec;
3548                         if (!tunnel_outer && mask.ipv6->hdr.proto) {
3549                                 /*
3550                                  * There is no way to set the IP protocol
3551                                  * for outer tunnel layers. It is usually
3552                                  * fixed, for example, to UDP for VXLAN/GPE.
3553                                  */
3554                                 assert(spec.ipv6); /* Mask is not empty. */
3555                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3556                                                 spec.ipv6->hdr.proto);
3557                                 ip_proto_set = 1;
3558                         }
3559                         ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3560                                                 (mask.ipv6->hdr.dst_addr);
3561                         ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3562                                                 (mask.ipv6->hdr.src_addr);
3563                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3564                              (!ipv6_dst && !ipv6_src)) {
3565                                 if (!tunnel_outer)
3566                                         break;
3567                                 /*
3568                                  * For tunnel outer we must set the outer IP
3569                                  * key anyway, even if the specification/mask
3570                                  * is empty. There is no other way to tell
3571                                  * the kernel about the outer layer protocol.
3572                                  */
3573                                 mnl_attr_put(nlh,
3574                                              TCA_FLOWER_KEY_ENC_IPV6_SRC,
3575                                              IPV6_ADDR_LEN,
3576                                              mask.ipv6->hdr.src_addr);
3577                                 mnl_attr_put(nlh,
3578                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3579                                              IPV6_ADDR_LEN,
3580                                              mask.ipv6->hdr.src_addr);
3581                                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3582                                 break;
3583                         }
3584                         if (ipv6_src) {
3585                                 mnl_attr_put(nlh, tunnel_outer ?
3586                                              TCA_FLOWER_KEY_ENC_IPV6_SRC :
3587                                              TCA_FLOWER_KEY_IPV6_SRC,
3588                                              IPV6_ADDR_LEN,
3589                                              spec.ipv6->hdr.src_addr);
3590                                 mnl_attr_put(nlh, tunnel_outer ?
3591                                              TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3592                                              TCA_FLOWER_KEY_IPV6_SRC_MASK,
3593                                              IPV6_ADDR_LEN,
3594                                              mask.ipv6->hdr.src_addr);
3595                         }
3596                         if (ipv6_dst) {
3597                                 mnl_attr_put(nlh, tunnel_outer ?
3598                                              TCA_FLOWER_KEY_ENC_IPV6_DST :
3599                                              TCA_FLOWER_KEY_IPV6_DST,
3600                                              IPV6_ADDR_LEN,
3601                                              spec.ipv6->hdr.dst_addr);
3602                                 mnl_attr_put(nlh, tunnel_outer ?
3603                                              TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3604                                              TCA_FLOWER_KEY_IPV6_DST_MASK,
3605                                              IPV6_ADDR_LEN,
3606                                              mask.ipv6->hdr.dst_addr);
3607                         }
3608                         if (mask.ipv6->hdr.hop_limits) {
3609                                 mnl_attr_put_u8
3610                                         (nlh, tunnel_outer ?
3611                                          TCA_FLOWER_KEY_ENC_IP_TTL :
3612                                          TCA_FLOWER_KEY_IP_TTL,
3613                                          spec.ipv6->hdr.hop_limits);
3614                                 mnl_attr_put_u8
3615                                         (nlh, tunnel_outer ?
3616                                          TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3617                                          TCA_FLOWER_KEY_IP_TTL_MASK,
3618                                          mask.ipv6->hdr.hop_limits);
3619                         }
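                        /*
                         * The IPv6 Traffic Class occupies bits 20-27 of
                         * vtc_flow in host byte order, hence the shift by
                         * IPV6_HDR_TC_SHIFT and the 0xff mask below.
                         */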
3620                         msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3621                                 IPV6_HDR_TC_SHIFT) & 0xff;
3622                         if (msk6) {
3623                                 tos6 = (rte_be_to_cpu_32
3624                                         (spec.ipv6->hdr.vtc_flow) >>
3625                                                 IPV6_HDR_TC_SHIFT) & 0xff;
3626                                 mnl_attr_put_u8
3627                                         (nlh, tunnel_outer ?
3628                                          TCA_FLOWER_KEY_ENC_IP_TOS :
3629                                          TCA_FLOWER_KEY_IP_TOS, tos6);
3630                                 mnl_attr_put_u8
3631                                         (nlh, tunnel_outer ?
3632                                          TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3633                                          TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3634                         }
3635                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3636                         break;
3637                 }
3638                 case RTE_FLOW_ITEM_TYPE_UDP:
3639                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3640                                       MLX5_FLOW_LAYER_INNER_L4_UDP :
3641                                       MLX5_FLOW_LAYER_OUTER_L4_UDP;
3642                         mask.udp = flow_tcf_item_mask
3643                                 (items, &rte_flow_item_udp_mask,
3644                                  &flow_tcf_mask_supported.udp,
3645                                  &flow_tcf_mask_empty.udp,
3646                                  sizeof(flow_tcf_mask_supported.udp),
3647                                  error);
3648                         assert(mask.udp);
3649                         spec.udp = items->spec;
3650                         if (!tunnel_outer) {
3651                                 if (!ip_proto_set)
3652                                         mnl_attr_put_u8
3653                                                 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3654                                                 IPPROTO_UDP);
3655                                 if (mask.udp == &flow_tcf_mask_empty.udp)
3656                                         break;
3657                         } else {
3658                                 assert(mask.udp != &flow_tcf_mask_empty.udp);
3659                                 decap.vxlan->udp_port =
3660                                         rte_be_to_cpu_16
3661                                                 (spec.udp->hdr.dst_port);
3662                         }
3663                         if (mask.udp->hdr.src_port) {
3664                                 mnl_attr_put_u16
3665                                         (nlh, tunnel_outer ?
3666                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3667                                          TCA_FLOWER_KEY_UDP_SRC,
3668                                          spec.udp->hdr.src_port);
3669                                 mnl_attr_put_u16
3670                                         (nlh, tunnel_outer ?
3671                                          TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3672                                          TCA_FLOWER_KEY_UDP_SRC_MASK,
3673                                          mask.udp->hdr.src_port);
3674                         }
3675                         if (mask.udp->hdr.dst_port) {
3676                                 mnl_attr_put_u16
3677                                         (nlh, tunnel_outer ?
3678                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3679                                          TCA_FLOWER_KEY_UDP_DST,
3680                                          spec.udp->hdr.dst_port);
3681                                 mnl_attr_put_u16
3682                                         (nlh, tunnel_outer ?
3683                                          TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3684                                          TCA_FLOWER_KEY_UDP_DST_MASK,
3685                                          mask.udp->hdr.dst_port);
3686                         }
3687                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3688                         break;
3689                 case RTE_FLOW_ITEM_TYPE_TCP:
3690                         item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3691                                       MLX5_FLOW_LAYER_INNER_L4_TCP :
3692                                       MLX5_FLOW_LAYER_OUTER_L4_TCP;
3693                         mask.tcp = flow_tcf_item_mask
3694                                 (items, &rte_flow_item_tcp_mask,
3695                                  &flow_tcf_mask_supported.tcp,
3696                                  &flow_tcf_mask_empty.tcp,
3697                                  sizeof(flow_tcf_mask_supported.tcp),
3698                                  error);
3699                         assert(mask.tcp);
3700                         if (!ip_proto_set)
3701                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3702                                                 IPPROTO_TCP);
3703                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
3704                                 break;
3705                         spec.tcp = items->spec;
3706                         if (mask.tcp->hdr.src_port) {
3707                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3708                                                  spec.tcp->hdr.src_port);
3709                                 mnl_attr_put_u16(nlh,
3710                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
3711                                                  mask.tcp->hdr.src_port);
3712                         }
3713                         if (mask.tcp->hdr.dst_port) {
3714                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3715                                                  spec.tcp->hdr.dst_port);
3716                                 mnl_attr_put_u16(nlh,
3717                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
3718                                                  mask.tcp->hdr.dst_port);
3719                         }
3720                         if (mask.tcp->hdr.tcp_flags) {
3721                                 mnl_attr_put_u16
3722                                         (nlh,
3723                                          TCA_FLOWER_KEY_TCP_FLAGS,
3724                                          rte_cpu_to_be_16
3725                                                 (spec.tcp->hdr.tcp_flags));
3726                                 mnl_attr_put_u16
3727                                         (nlh,
3728                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3729                                          rte_cpu_to_be_16
3730                                                 (mask.tcp->hdr.tcp_flags));
3731                         }
3732                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3733                         break;
3734                 case RTE_FLOW_ITEM_TYPE_VXLAN:
3735                         assert(decap.vxlan);
3736                         tunnel_outer = 0;
3737                         item_flags |= MLX5_FLOW_LAYER_VXLAN;
3738                         spec.vxlan = items->spec;
3739                         mnl_attr_put_u32(nlh,
3740                                          TCA_FLOWER_KEY_ENC_KEY_ID,
3741                                          vxlan_vni_as_be32(spec.vxlan->vni));
3742                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3743                         break;
3744                 default:
3745                         return rte_flow_error_set(error, ENOTSUP,
3746                                                   RTE_FLOW_ERROR_TYPE_ITEM,
3747                                                   NULL, "item not supported");
3748                 }
3749         }
3750         /*
3751          * Set the ether_type flower key and tc rule protocol:
3752          * - if there is neither VLAN nor VXLAN the key is taken
3753          *   from the eth item directly or deduced from L3 items.
3754          * - if there is a vlan item then the key is fixed to 802.1q.
3755          * - if there is a vxlan item then the key is the inner tunnel type.
3756          * - simultaneous vlan and vxlan items are prohibited.
3757          */
3758         if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3759                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3760                                            outer_etype);
3761                 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3762                         if (inner_etype != RTE_BE16(ETH_P_ALL))
3763                                 mnl_attr_put_u16(nlh,
3764                                                  TCA_FLOWER_KEY_ETH_TYPE,
3765                                                  inner_etype);
3766                 } else {
3767                         mnl_attr_put_u16(nlh,
3768                                          TCA_FLOWER_KEY_ETH_TYPE,
3769                                          outer_etype);
3770                         if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3771                             vlan_etype != RTE_BE16(ETH_P_ALL))
3772                                 mnl_attr_put_u16(nlh,
3773                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3774                                                  vlan_etype);
3775                 }
3776                 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3777         }
3778         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3779         na_act_index_cur = 1;
3780         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3781                 struct nlattr *na_act_index;
3782                 struct nlattr *na_act;
3783                 unsigned int vlan_act;
3784                 unsigned int i;
3785
3786                 switch (actions->type) {
3787                 case RTE_FLOW_ACTION_TYPE_VOID:
3788                         break;
3789                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3790                         conf.port_id = actions->conf;
3791                         if (conf.port_id->original)
3792                                 i = 0;
3793                         else
3794                                 for (i = 0; ptoi[i].ifindex; ++i)
3795                                         if (ptoi[i].port_id == conf.port_id->id)
3796                                                 break;
3797                         assert(ptoi[i].ifindex);
3798                         na_act_index =
3799                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3800                         assert(na_act_index);
3801                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3802                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3803                         assert(na_act);
3804                         if (encap.hdr) {
3805                                 assert(dev_flow->tcf.tunnel);
3806                                 dev_flow->tcf.tunnel->ifindex_ptr =
3807                                         &((struct tc_mirred *)
3808                                         mnl_attr_get_payload
3809                                         (mnl_nlmsg_get_payload_tail
3810                                                 (nlh)))->ifindex;
3811                         }
3812                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3813                                      sizeof(struct tc_mirred),
3814                                      &(struct tc_mirred){
3815                                         .action = TC_ACT_STOLEN,
3816                                         .eaction = TCA_EGRESS_REDIR,
3817                                         .ifindex = ptoi[i].ifindex,
3818                                      });
3819                         mnl_attr_nest_end(nlh, na_act);
3820                         mnl_attr_nest_end(nlh, na_act_index);
3821                         break;
3822                 case RTE_FLOW_ACTION_TYPE_JUMP:
3823                         conf.jump = actions->conf;
3824                         na_act_index =
3825                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3826                         assert(na_act_index);
3827                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3828                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3829                         assert(na_act);
3830                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3831                                      sizeof(struct tc_gact),
3832                                      &(struct tc_gact){
3833                                         .action = TC_ACT_GOTO_CHAIN |
3834                                                   conf.jump->group,
3835                                      });
3836                         mnl_attr_nest_end(nlh, na_act);
3837                         mnl_attr_nest_end(nlh, na_act_index);
3838                         break;
3839                 case RTE_FLOW_ACTION_TYPE_DROP:
3840                         na_act_index =
3841                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3842                         assert(na_act_index);
3843                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3844                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3845                         assert(na_act);
3846                         mnl_attr_put(nlh, TCA_GACT_PARMS,
3847                                      sizeof(struct tc_gact),
3848                                      &(struct tc_gact){
3849                                         .action = TC_ACT_SHOT,
3850                                      });
3851                         mnl_attr_nest_end(nlh, na_act);
3852                         mnl_attr_nest_end(nlh, na_act_index);
3853                         break;
3854                 case RTE_FLOW_ACTION_TYPE_COUNT:
3855                         /*
3856                          * Driver adds the count action implicitly for
3857                          * each rule it creates.
3858                          */
3859                         ret = flow_tcf_translate_action_count(dev,
3860                                                               dev_flow, error);
3861                         if (ret < 0)
3862                                 return ret;
3863                         break;
3864                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3865                         conf.of_push_vlan = NULL;
3866                         vlan_act = TCA_VLAN_ACT_POP;
3867                         goto action_of_vlan;
3868                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3869                         conf.of_push_vlan = actions->conf;
3870                         vlan_act = TCA_VLAN_ACT_PUSH;
3871                         goto action_of_vlan;
3872                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3873                         conf.of_set_vlan_vid = actions->conf;
3874                         if (na_vlan_id)
3875                                 goto override_na_vlan_id;
3876                         vlan_act = TCA_VLAN_ACT_MODIFY;
3877                         goto action_of_vlan;
3878                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3879                         conf.of_set_vlan_pcp = actions->conf;
3880                         if (na_vlan_priority)
3881                                 goto override_na_vlan_priority;
3882                         vlan_act = TCA_VLAN_ACT_MODIFY;
3883                         goto action_of_vlan;
3884 action_of_vlan:
3885                         na_act_index =
3886                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3887                         assert(na_act_index);
3888                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3889                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3890                         assert(na_act);
3891                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
3892                                      sizeof(struct tc_vlan),
3893                                      &(struct tc_vlan){
3894                                         .action = TC_ACT_PIPE,
3895                                         .v_action = vlan_act,
3896                                      });
3897                         if (vlan_act == TCA_VLAN_ACT_POP) {
3898                                 mnl_attr_nest_end(nlh, na_act);
3899                                 mnl_attr_nest_end(nlh, na_act_index);
3900                                 break;
3901                         }
3902                         if (vlan_act == TCA_VLAN_ACT_PUSH)
3903                                 mnl_attr_put_u16(nlh,
3904                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
3905                                                  conf.of_push_vlan->ethertype);
3906                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3907                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3908                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3909                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3910                         mnl_attr_nest_end(nlh, na_act);
3911                         mnl_attr_nest_end(nlh, na_act_index);
3912                         if (actions->type ==
3913                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3914 override_na_vlan_id:
3915                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3916                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3917                                         rte_be_to_cpu_16
3918                                         (conf.of_set_vlan_vid->vlan_vid);
3919                         } else if (actions->type ==
3920                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3921 override_na_vlan_priority:
3922                                 na_vlan_priority->nla_type =
3923                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
3924                                 *(uint8_t *)mnl_attr_get_payload
3925                                         (na_vlan_priority) =
3926                                         conf.of_set_vlan_pcp->vlan_pcp;
3927                         }
3928                         break;
3929                 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3930                         assert(decap.vxlan);
3931                         assert(dev_flow->tcf.tunnel);
3932                         dev_flow->tcf.tunnel->ifindex_ptr =
3933                                 (unsigned int *)&tcm->tcm_ifindex;
3934                         na_act_index =
3935                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3936                         assert(na_act_index);
3937                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3938                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3939                         assert(na_act);
3940                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3941                                 sizeof(struct tc_tunnel_key),
3942                                 &(struct tc_tunnel_key){
3943                                         .action = TC_ACT_PIPE,
3944                                         .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3945                                         });
3946                         mnl_attr_nest_end(nlh, na_act);
3947                         mnl_attr_nest_end(nlh, na_act_index);
3948                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3949                         break;
3950                 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3951                         assert(encap.vxlan);
3952                         flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3953                         na_act_index =
3954                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
3955                         assert(na_act_index);
3956                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3957                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3958                         assert(na_act);
3959                         mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3960                                 sizeof(struct tc_tunnel_key),
3961                                 &(struct tc_tunnel_key){
3962                                         .action = TC_ACT_PIPE,
3963                                         .t_action = TCA_TUNNEL_KEY_ACT_SET,
3964                                         });
3965                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3966                                 mnl_attr_put_u16(nlh,
3967                                          TCA_TUNNEL_KEY_ENC_DST_PORT,
3968                                          encap.vxlan->udp.dst);
3969                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3970                                 mnl_attr_put_u32(nlh,
3971                                          TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3972                                          encap.vxlan->ipv4.src);
3973                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3974                                 mnl_attr_put_u32(nlh,
3975                                          TCA_TUNNEL_KEY_ENC_IPV4_DST,
3976                                          encap.vxlan->ipv4.dst);
3977                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3978                                 mnl_attr_put(nlh,
3979                                          TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3980                                          sizeof(encap.vxlan->ipv6.src),
3981                                          &encap.vxlan->ipv6.src);
3982                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3983                                 mnl_attr_put(nlh,
3984                                          TCA_TUNNEL_KEY_ENC_IPV6_DST,
3985                                          sizeof(encap.vxlan->ipv6.dst),
3986                                          &encap.vxlan->ipv6.dst);
3987                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3988                                 mnl_attr_put_u8(nlh,
3989                                          TCA_TUNNEL_KEY_ENC_TTL,
3990                                          encap.vxlan->ip_ttl_hop);
3991                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3992                                 mnl_attr_put_u8(nlh,
3993                                          TCA_TUNNEL_KEY_ENC_TOS,
3994                                          encap.vxlan->ip_tos);
3995                         if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3996                                 mnl_attr_put_u32(nlh,
3997                                          TCA_TUNNEL_KEY_ENC_KEY_ID,
3998                                          vxlan_vni_as_be32
3999                                                 (encap.vxlan->vxlan.vni));
4000                         mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
4001                         mnl_attr_nest_end(nlh, na_act);
4002                         mnl_attr_nest_end(nlh, na_act_index);
4003                         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4004                         break;
4005                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
4006                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
4007                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
4008                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
4009                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
4010                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
4011                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
4012                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
4013                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
4014                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
4015                         na_act_index =
4016                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
4017                         flow_tcf_create_pedit_mnl_msg(nlh,
4018                                                       &actions, item_flags);
4019                         mnl_attr_nest_end(nlh, na_act_index);
4020                         break;
4021                 default:
4022                         return rte_flow_error_set(error, ENOTSUP,
4023                                                   RTE_FLOW_ERROR_TYPE_ACTION,
4024                                                   actions,
4025                                                   "action not supported");
4026                 }
4027         }
4028         assert(na_flower);
4029         assert(na_flower_act);
4030         mnl_attr_nest_end(nlh, na_flower_act);
4031         dev_flow->tcf.ptc_flags = mnl_attr_get_payload
4032                                         (mnl_nlmsg_get_payload_tail(nlh));
4033         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
4034                                                 0 : TCA_CLS_FLAGS_SKIP_SW);
4035         mnl_attr_nest_end(nlh, na_flower);
4036         if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4037                 dev_flow->tcf.tunnel->ifindex_org =
4038                         *dev_flow->tcf.tunnel->ifindex_ptr;
4039         assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4040         return 0;
4041 }
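
/*
 * For reference (illustrative only, not emitted by the driver): a message
 * assembled by the routine above corresponds roughly to a tc flower filter
 * of the form
 *
 *   tc filter add dev <device> <parent> protocol <ether type> \
 *      chain <group> pref <priority + 1> flower <match keys> \
 *      action <mirred/gact/vlan/tunnel_key/pedit ...>
 *
 * with the exact keys and actions taken from the rte_flow items/actions.
 */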
4042
4043 /**
4044  * Send Netlink message with acknowledgment.
4045  *
4046  * @param tcf
4047  *   Flow context to use.
4048  * @param nlh
4049  *   Message to send. This function always raises the NLM_F_ACK flag before
4050  *   sending.
4051  * @param[in] cb
4052  *   Callback handler for received message.
4053  * @param[in] arg
4054  *   Context pointer for callback handler.
4055  *
4056  * @return
4057  *   0 on success, a negative errno value otherwise and rte_errno is set.
4058  */
4059 static int
4060 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4061                 struct nlmsghdr *nlh,
4062                 mnl_cb_t cb, void *arg)
4063 {
4064         unsigned int portid = mnl_socket_get_portid(tcf->nl);
4065         uint32_t seq = tcf->seq++;
4066         int ret, err = 0;
4067
4068         assert(tcf->nl);
4069         assert(tcf->buf);
4070         if (!seq) {
4071                 /* seq 0 is reserved for kernel event-driven notifications. */
4072                 seq = tcf->seq++;
4073         }
4074         nlh->nlmsg_seq = seq;
4075         nlh->nlmsg_flags |= NLM_F_ACK;
4076         ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4077         if (ret <= 0) {
4078                 /* Message send error occurred. */
4079                 rte_errno = errno;
4080                 return -rte_errno;
4081         }
4082         nlh = (struct nlmsghdr *)(tcf->buf);
4083         /*
4084          * The following loop postpones non-fatal errors until multipart
4085          * messages are complete.
4086          */
4087         while (true) {
4088                 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4089                 if (ret < 0) {
4090                         err = errno;
4091                         /*
4092                          * In case of overflow we keep receiving until the
4093                          * end of the multipart message. We may lose part
4094                          * of the reply messages but mark and return an error.
4095                          */
4096                         if (err != ENOSPC ||
4097                             !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4098                             nlh->nlmsg_type == NLMSG_DONE)
4099                                 break;
4100                 } else {
4101                         ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4102                         if (!ret) {
4103                                 /*
4104                                  * libmnl returns 0 if a DONE or a
4105                                  * successful ACK message is found.
4106                                  */
4107                                 break;
4108                         }
4109                         if (ret < 0) {
4110                                 /*
4111                                  * ACK message with error found
4112                                  * or some error occurred.
4113                                  */
4114                                 err = errno;
4115                                 break;
4116                         }
4117                         /* We should continue receiving. */
4118                 }
4119         }
4120         if (!err)
4121                 return 0;
4122         rte_errno = err;
4123         return -err;
4124 }
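
/*
 * A minimal usage sketch (mirroring the cleanup helpers below): build a
 * request in the context buffer and wait for the kernel acknowledgment:
 *
 *   nlh = mnl_nlmsg_put_header(tcf->buf);
 *   nlh->nlmsg_type = RTM_GETLINK;
 *   nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *   ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
 */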
4125
4126 #define MNL_BUF_EXTRA_SPACE 16
4127 #define MNL_REQUEST_SIZE_MIN 256
4128 #define MNL_REQUEST_SIZE_MAX 2048
4129 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4130                                  MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
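
/*
 * Example: with a common 4 KiB page size MNL_REQUEST_SIZE evaluates to
 * RTE_MIN(RTE_MAX(4096, 256), 2048) == 2048, i.e. it is clamped to
 * MNL_REQUEST_SIZE_MAX.
 */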
4131
4132 /* Data structures used by flow_tcf_xxx_cb() routines. */
4133 struct tcf_nlcb_buf {
4134         LIST_ENTRY(tcf_nlcb_buf) next;
4135         uint32_t size;
4136         alignas(struct nlmsghdr)
4137         uint8_t msg[]; /**< Netlink message data. */
4138 };
4139
4140 struct tcf_nlcb_context {
4141         unsigned int ifindex; /**< Base interface index. */
4142         uint32_t bufsize;
4143         LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4144 };
4145
4146 /**
4147  * Allocate space for a netlink command in the buffer list
4148  *
4149  * @param[in, out] ctx
4150  *   Pointer to callback context with command buffers list.
4151  * @param[in] size
4152  *   Required size of data buffer to be allocated.
4153  *
4154  * @return
4155  *   Pointer to allocated memory, aligned as message header.
4156  *   NULL if some error occurred.
4157  */
4158 static struct nlmsghdr *
4159 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4160 {
4161         struct tcf_nlcb_buf *buf;
4162         struct nlmsghdr *nlh;
4163
4164         size = NLMSG_ALIGN(size);
4165         buf = LIST_FIRST(&ctx->nlbuf);
4166         if (buf && (buf->size + size) <= ctx->bufsize) {
4167                 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4168                 buf->size += size;
4169                 return nlh;
4170         }
4171         if (size > ctx->bufsize) {
4172                 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4173                 return NULL;
4174         }
4175         buf = rte_malloc(__func__,
4176                         ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4177                         alignof(struct tcf_nlcb_buf));
4178         if (!buf) {
4179                 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4180                 return NULL;
4181         }
4182         LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4183         buf->size = size;
4184         nlh = (struct nlmsghdr *)&buf->msg[0];
4185         return nlh;
4186 }
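
/*
 * Intended pattern (a sketch based on the collector callbacks below):
 * compute the exact message size, reserve space in the buffer list and
 * build the command in place:
 *
 *   size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
 *          MNL_ALIGN(sizeof(struct ifinfomsg));
 *   cmd = flow_tcf_alloc_nlcmd(ctx, size);
 *   if (!cmd)
 *           return -rte_errno;
 *   cmd = mnl_nlmsg_put_header(cmd);
 *   cmd->nlmsg_type = RTM_DELLINK;
 *   cmd->nlmsg_flags = NLM_F_REQUEST;
 */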
4187
4188 /**
4189  * Send the buffers with prepared netlink commands. Scans the list and
4190  * sends all found buffers. Buffers are sent and freed in any case, to
4191  * prevent memory leakage if an error occurs for some message.
4192  *
4193  * @param[in] tcf
4194  *   Context object initialized by mlx5_flow_tcf_context_create().
4195  * @param[in, out] ctx
4196  *   Pointer to callback context with command buffers list.
4197  *
4198  * @return
4199  *   Zero value on success, negative errno value otherwise
4200  *   and rte_errno is set.
4201  */
4202 static int
4203 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4204                     struct tcf_nlcb_context *ctx)
4205 {
4206         struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4207         int ret = 0;
4208
4209         while (bc) {
4210                 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4211                 struct nlmsghdr *nlh;
4212                 uint32_t msg = 0;
4213                 int rc;
4214
4215                 while (msg < bc->size) {
4216                         /*
4217                          * Send the Netlink commands from the buffer one by
4218                          * one. If we sent multiple rule deletion commands in
4219                          * one Netlink message and some error occurred, it
4220                          * could cause multiple ACK error messages and break
4221                          * the sequence numbers of the Netlink communication,
4222                          * because we expect exactly one ACK reply.
4223                          */
4224                         assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4225                         nlh = (struct nlmsghdr *)&bc->msg[msg];
4226                         assert((bc->size - msg) >= nlh->nlmsg_len);
4227                         msg += nlh->nlmsg_len;
4228                         rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4229                         if (rc) {
4230                                 DRV_LOG(WARNING,
4231                                         "netlink: cleanup error %d", rc);
4232                                 if (!ret)
4233                                         ret = rc;
4234                         }
4235                 }
4236                 rte_free(bc);
4237                 bc = bn;
4238         }
4239         LIST_INIT(&ctx->nlbuf);
4240         return ret;
4241 }
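
/*
 * The collect-then-send pattern used by the cleanup routines below, as a
 * minimal sketch:
 *
 *   struct tcf_nlcb_context ctx = {
 *           .ifindex = ifindex,
 *           .bufsize = MNL_REQUEST_SIZE,
 *           .nlbuf = LIST_HEAD_INITIALIZER(),
 *   };
 *
 *   (build an RTM_GET* dump request in tcf->buf)
 *   flow_tcf_nl_ack(tcf, nlh, <collector callback>, &ctx);
 *   flow_tcf_send_nlcmd(tcf, &ctx);
 */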
4242
4243 /**
4244  * Collect local IP address rules with the scope link attribute on the
4245  * specified network device. This is a callback routine called by libmnl
4246  * mnl_cb_run() in a loop for every message in the received packet.
4247  *
4248  * @param[in] nlh
4249  *   Pointer to reply header.
4250  * @param[in, out] arg
4251  *   Opaque data pointer for this callback.
4252  *
4253  * @return
4254  *   A positive, nonzero value on success, negative errno value otherwise
4255  *   and rte_errno is set.
4256  */
4257 static int
4258 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4259 {
4260         struct tcf_nlcb_context *ctx = arg;
4261         struct nlmsghdr *cmd;
4262         struct ifaddrmsg *ifa;
4263         struct nlattr *na;
4264         struct nlattr *na_local = NULL;
4265         struct nlattr *na_peer = NULL;
4266         unsigned char family;
4267         uint32_t size;
4268
4269         if (nlh->nlmsg_type != RTM_NEWADDR) {
4270                 rte_errno = EINVAL;
4271                 return -rte_errno;
4272         }
4273         ifa = mnl_nlmsg_get_payload(nlh);
4274         family = ifa->ifa_family;
4275         if (ifa->ifa_index != ctx->ifindex ||
4276             ifa->ifa_scope != RT_SCOPE_LINK ||
4277             !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4278             (family != AF_INET && family != AF_INET6))
4279                 return 1;
4280         mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4281                 switch (mnl_attr_get_type(na)) {
4282                 case IFA_LOCAL:
4283                         na_local = na;
4284                         break;
4285                 case IFA_ADDRESS:
4286                         na_peer = na;
4287                         break;
4288                 }
4289                 if (na_local && na_peer)
4290                         break;
4291         }
4292         if (!na_local || !na_peer)
4293                 return 1;
4294         /* Local rule found with scope link, permanent and assigned peer. */
4295         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4296                MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4297                (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4298                                    : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4299         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4300         if (!cmd) {
4301                 rte_errno = ENOMEM;
4302                 return -rte_errno;
4303         }
4304         cmd = mnl_nlmsg_put_header(cmd);
4305         cmd->nlmsg_type = RTM_DELADDR;
4306         cmd->nlmsg_flags = NLM_F_REQUEST;
4307         ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4308         ifa->ifa_flags = IFA_F_PERMANENT;
4309         ifa->ifa_scope = RT_SCOPE_LINK;
4310         ifa->ifa_index = ctx->ifindex;
4311         if (family == AF_INET) {
4312                 ifa->ifa_family = AF_INET;
4313                 ifa->ifa_prefixlen = 32;
4314                 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4315                 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4316         } else {
4317                 ifa->ifa_family = AF_INET6;
4318                 ifa->ifa_prefixlen = 128;
4319                 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4320                         mnl_attr_get_payload(na_local));
4321                 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4322                         mnl_attr_get_payload(na_peer));
4323         }
4324         assert(size == cmd->nlmsg_len);
4325         return 1;
4326 }
4327
4328 /**
4329  * Cleanup the local IP addresses on outer interface.
4330  *
4331  * @param[in] tcf
4332  *   Context object initialized by mlx5_flow_tcf_context_create().
4333  * @param[in] ifindex
4334  *   Network interface index to perform cleanup.
4335  */
4336 static void
4337 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4338                             unsigned int ifindex)
4339 {
4340         struct nlmsghdr *nlh;
4341         struct ifaddrmsg *ifa;
4342         struct tcf_nlcb_context ctx = {
4343                 .ifindex = ifindex,
4344                 .bufsize = MNL_REQUEST_SIZE,
4345                 .nlbuf = LIST_HEAD_INITIALIZER(),
4346         };
4347         int ret;
4348
4349         assert(ifindex);
4350         /*
4351          * Seek and destroy leftovers of local IP addresses with
4352          * matching properties "scope link".
4353          */
4354         nlh = mnl_nlmsg_put_header(tcf->buf);
4355         nlh->nlmsg_type = RTM_GETADDR;
4356         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4357         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4358         ifa->ifa_family = AF_UNSPEC;
4359         ifa->ifa_index = ifindex;
4360         ifa->ifa_scope = RT_SCOPE_LINK;
4361         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4362         if (ret)
4363                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4364         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4365         if (ret)
4366                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4367 }
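
/*
 * Roughly the iproute2 equivalent of the cleanup above (illustrative
 * only, assuming <ifname> maps to ifindex):
 *
 *   ip addr flush dev <ifname> scope link permanent
 */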
4368
4369 /**
4370  * Collect permanent neigh rules on the specified network device.
4371  * This is a callback routine called by libmnl mnl_cb_run() in a loop
4372  * for every message in the received packet.
4373  *
4374  * @param[in] nlh
4375  *   Pointer to reply header.
4376  * @param[in, out] arg
4377  *   Opaque data pointer for this callback.
4378  *
4379  * @return
4380  *   A positive, nonzero value on success, negative errno value otherwise
4381  *   and rte_errno is set.
4382  */
4383 static int
4384 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4385 {
4386         struct tcf_nlcb_context *ctx = arg;
4387         struct nlmsghdr *cmd;
4388         struct ndmsg *ndm;
4389         struct nlattr *na;
4390         struct nlattr *na_ip = NULL;
4391         struct nlattr *na_mac = NULL;
4392         unsigned char family;
4393         uint32_t size;
4394
4395         if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4396                 rte_errno = EINVAL;
4397                 return -rte_errno;
4398         }
4399         ndm = mnl_nlmsg_get_payload(nlh);
4400         family = ndm->ndm_family;
4401         if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4402            !(ndm->ndm_state & NUD_PERMANENT) ||
4403            (family != AF_INET && family != AF_INET6))
4404                 return 1;
4405         mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4406                 switch (mnl_attr_get_type(na)) {
4407                 case NDA_DST:
4408                         na_ip = na;
4409                         break;
4410                 case NDA_LLADDR:
4411                         na_mac = na;
4412                         break;
4413                 }
4414                 if (na_mac && na_ip)
4415                         break;
4416         }
4417         if (!na_mac || !na_ip)
4418                 return 1;
4419         /* Neigh rule with the permanent attribute found. */
4420         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4421                MNL_ALIGN(sizeof(struct ndmsg)) +
4422                SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4423                (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4424                                    : SZ_NLATTR_TYPE_OF(uint32_t));
4425         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4426         if (!cmd) {
4427                 rte_errno = ENOMEM;
4428                 return -rte_errno;
4429         }
4430         cmd = mnl_nlmsg_put_header(cmd);
4431         cmd->nlmsg_type = RTM_DELNEIGH;
4432         cmd->nlmsg_flags = NLM_F_REQUEST;
4433         ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4434         ndm->ndm_ifindex = ctx->ifindex;
4435         ndm->ndm_state = NUD_PERMANENT;
4436         ndm->ndm_flags = 0;
4437         ndm->ndm_type = 0;
4438         if (family == AF_INET) {
4439                 ndm->ndm_family = AF_INET;
4440                 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4441         } else {
4442                 ndm->ndm_family = AF_INET6;
4443                 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4444                              mnl_attr_get_payload(na_ip));
4445         }
4446         mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4447                      mnl_attr_get_payload(na_mac));
4448         assert(size == cmd->nlmsg_len);
4449         return 1;
4450 }
4451
4452 /**
4453  * Cleanup the neigh rules on outer interface.
4454  *
4455  * @param[in] tcf
4456  *   Context object initialized by mlx5_flow_tcf_context_create().
4457  * @param[in] ifindex
4458  *   Network interface index to perform cleanup.
4459  */
4460 static void
4461 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4462                             unsigned int ifindex)
4463 {
4464         struct nlmsghdr *nlh;
4465         struct ndmsg *ndm;
4466         struct tcf_nlcb_context ctx = {
4467                 .ifindex = ifindex,
4468                 .bufsize = MNL_REQUEST_SIZE,
4469                 .nlbuf = LIST_HEAD_INITIALIZER(),
4470         };
4471         int ret;
4472
4473         assert(ifindex);
4474         /* Seek and destroy leftovers of neigh rules. */
4475         nlh = mnl_nlmsg_put_header(tcf->buf);
4476         nlh->nlmsg_type = RTM_GETNEIGH;
4477         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4478         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4479         ndm->ndm_family = AF_UNSPEC;
4480         ndm->ndm_ifindex = ifindex;
4481         ndm->ndm_state = NUD_PERMANENT;
4482         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4483         if (ret)
4484                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4485         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4486         if (ret)
4487                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4488 }
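
/*
 * Roughly the iproute2 equivalent (illustrative only):
 *
 *   ip neigh flush dev <ifname> nud permanent
 */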
4489
4490 /**
4491  * Collect indices of VXLAN encap/decap interfaces associated with the
4492  * device. This is a callback routine called by libmnl mnl_cb_run() in a
4493  * loop for every message in the received packet.
4494  *
4495  * @param[in] nlh
4496  *   Pointer to reply header.
4497  * @param[in, out] arg
4498  *   Opaque data pointer for this callback.
4499  *
4500  * @return
4501  *   A positive, nonzero value on success, negative errno value otherwise
4502  *   and rte_errno is set.
4503  */
4504 static int
4505 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4506 {
4507         struct tcf_nlcb_context *ctx = arg;
4508         struct nlmsghdr *cmd;
4509         struct ifinfomsg *ifm;
4510         struct nlattr *na;
4511         struct nlattr *na_info = NULL;
4512         struct nlattr *na_vxlan = NULL;
4513         bool found = false;
4514         unsigned int vxindex;
4515         uint32_t size;
4516
4517         if (nlh->nlmsg_type != RTM_NEWLINK) {
4518                 rte_errno = EINVAL;
4519                 return -rte_errno;
4520         }
4521         ifm = mnl_nlmsg_get_payload(nlh);
4522         if (!ifm->ifi_index) {
4523                 rte_errno = EINVAL;
4524                 return -rte_errno;
4525         }
4526         mnl_attr_for_each(na, nlh, sizeof(*ifm))
4527                 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4528                         na_info = na;
4529                         break;
4530                 }
4531         if (!na_info)
4532                 return 1;
4533         mnl_attr_for_each_nested(na, na_info) {
4534                 switch (mnl_attr_get_type(na)) {
4535                 case IFLA_INFO_KIND:
4536                         if (!strncmp("vxlan", mnl_attr_get_str(na),
4537                                      mnl_attr_get_len(na)))
4538                                 found = true;
4539                         break;
4540                 case IFLA_INFO_DATA:
4541                         na_vxlan = na;
4542                         break;
4543                 }
4544                 if (found && na_vxlan)
4545                         break;
4546         }
4547         if (!found || !na_vxlan)
4548                 return 1;
4549         found = false;
4550         mnl_attr_for_each_nested(na, na_vxlan) {
4551                 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4552                     mnl_attr_get_u32(na) == ctx->ifindex) {
4553                         found = true;
4554                         break;
4555                 }
4556         }
4557         if (!found)
4558                 return 1;
4559         /* Attached VXLAN device found, store the command to delete. */
4560         vxindex = ifm->ifi_index;
4561         size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4562                MNL_ALIGN(sizeof(struct ifinfomsg));
4563         cmd = flow_tcf_alloc_nlcmd(ctx, size);
4564         if (!cmd) {
4565                 rte_errno = ENOMEM;
4566                 return -rte_errno;
4567         }
4568         cmd = mnl_nlmsg_put_header(cmd);
4569         cmd->nlmsg_type = RTM_DELLINK;
4570         cmd->nlmsg_flags = NLM_F_REQUEST;
4571         ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4572         ifm->ifi_family = AF_UNSPEC;
4573         ifm->ifi_index = vxindex;
4574         assert(size == cmd->nlmsg_len);
4575         return 1;
4576 }
4577
4578 /**
4579  * Cleanup the outer interface. Removes all found vxlan devices
4580  * attached to specified index, flushes the neigh and local IP
4581  * database.
4582  *
4583  * @param[in] tcf
4584  *   Context object initialized by mlx5_flow_tcf_context_create().
4585  * @param[in] ifindex
4586  *   Network interface index to perform cleanup.
4587  */
4588 static void
4589 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4590                             unsigned int ifindex)
4591 {
4592         struct nlmsghdr *nlh;
4593         struct ifinfomsg *ifm;
4594         struct tcf_nlcb_context ctx = {
4595                 .ifindex = ifindex,
4596                 .bufsize = MNL_REQUEST_SIZE,
4597                 .nlbuf = LIST_HEAD_INITIALIZER(),
4598         };
4599         int ret;
4600
4601         assert(ifindex);
4602         /*
4603          * Seek and destroy leftover VXLAN encap/decap interfaces with
4604          * matching properties.
4605          */
4606         nlh = mnl_nlmsg_put_header(tcf->buf);
4607         nlh->nlmsg_type = RTM_GETLINK;
4608         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4609         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4610         ifm->ifi_family = AF_UNSPEC;
4611         ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4612         if (ret)
4613                 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4614         ret = flow_tcf_send_nlcmd(tcf, &ctx);
4615         if (ret)
4616                 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4617 }
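
/*
 * The stored RTM_DELLINK commands have the same effect as removing each
 * leftover VXLAN device by hand (illustrative only):
 *
 *   ip link del <vxlan device>
 */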
4618
4619 /**
4620  * Emit Netlink message to add/remove local address to the outer device.
4621  * The address being added is visible within the link only (scope link).
4622  *
4623  * Note that an implicit route is maintained by the kernel due to the
4624  * presence of a peer address (IFA_ADDRESS).
4625  *
4626  * These rules are used for encapsulation only and allow assigning
4627  * the outer tunnel source IP address.
4628  *
4629  * @param[in] tcf
4630  *   Libmnl socket context object.
4631  * @param[in] encap
4632  *   Encapsulation properties (source address and its peer).
4633  * @param[in] ifindex
4634  *   Network interface to apply rule.
4635  * @param[in] enable
4636  *   Toggle between add and remove.
4637  * @param[out] error
4638  *   Perform verbose error reporting if not NULL.
4639  *
4640  * @return
4641  *   0 on success, a negative errno value otherwise and rte_errno is set.
4642  */
4643 static int
4644 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4645                     const struct flow_tcf_vxlan_encap *encap,
4646                     unsigned int ifindex,
4647                     bool enable,
4648                     struct rte_flow_error *error)
4649 {
4650         struct nlmsghdr *nlh;
4651         struct ifaddrmsg *ifa;
4652         alignas(struct nlmsghdr)
4653         uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4654
4655         nlh = mnl_nlmsg_put_header(buf);
4656         nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4657         nlh->nlmsg_flags =
4658                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4659         nlh->nlmsg_seq = 0;
4660         ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4661         ifa->ifa_flags = IFA_F_PERMANENT;
4662         ifa->ifa_scope = RT_SCOPE_LINK;
4663         ifa->ifa_index = ifindex;
4664         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4665                 ifa->ifa_family = AF_INET;
4666                 ifa->ifa_prefixlen = 32;
4667                 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4668                 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4669                         mnl_attr_put_u32(nlh, IFA_ADDRESS,
4670                                               encap->ipv4.dst);
4671         } else {
4672                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4673                 ifa->ifa_family = AF_INET6;
4674                 ifa->ifa_prefixlen = 128;
4675                 mnl_attr_put(nlh, IFA_LOCAL,
4676                                   sizeof(encap->ipv6.src),
4677                                   &encap->ipv6.src);
4678                 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4679                         mnl_attr_put(nlh, IFA_ADDRESS,
4680                                           sizeof(encap->ipv6.dst),
4681                                           &encap->ipv6.dst);
4682         }
4683         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4684                 return 0;
4685         return rte_flow_error_set(error, rte_errno,
4686                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4687                                   "netlink: cannot complete IFA request"
4688                                   " (ip addr add)");
4689 }
4690
4691 /**
4692  * Emit Netlink message to add/remove neighbor.
4693  *
4694  * @param[in] tcf
4695  *   Libmnl socket context object.
4696  * @param[in] encap
4697  *   Encapsulation properties (destination address).
4698  * @param[in] ifindex
4699  *   Network interface.
4700  * @param[in] enable
4701  *   Toggle between add and remove.
4702  * @param[out] error
4703  *   Perform verbose error reporting if not NULL.
4704  *
4705  * @return
4706  *   0 on success, a negative errno value otherwise and rte_errno is set.
4707  */
4708 static int
4709 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4710                      const struct flow_tcf_vxlan_encap *encap,
4711                      unsigned int ifindex,
4712                      bool enable,
4713                      struct rte_flow_error *error)
4714 {
4715         struct nlmsghdr *nlh;
4716         struct ndmsg *ndm;
4717         alignas(struct nlmsghdr)
4718         uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4719
4720         nlh = mnl_nlmsg_put_header(buf);
4721         nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4722         nlh->nlmsg_flags =
4723                 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4724         nlh->nlmsg_seq = 0;
4725         ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4726         ndm->ndm_ifindex = ifindex;
4727         ndm->ndm_state = NUD_PERMANENT;
4728         ndm->ndm_flags = 0;
4729         ndm->ndm_type = 0;
4730         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4731                 ndm->ndm_family = AF_INET;
4732                 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4733         } else {
4734                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4735                 ndm->ndm_family = AF_INET6;
4736                 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4737                                                  &encap->ipv6.dst);
4738         }
4739         if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4740                 DRV_LOG(WARNING,
4741                         "outer ethernet source address cannot be "
4742                         "forced for VXLAN encapsulation");
4743         if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4744                 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4745                                                     &encap->eth.dst);
4746         if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4747                 return 0;
4748         return rte_flow_error_set(error, rte_errno,
4749                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4750                                   "netlink: cannot complete ND request"
4751                                   " (ip neigh)");
4752 }
4753
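/*
 * Illustrative sketch, not part of the driver: a standalone
 * RTM_NEWNEIGH request mirroring flow_tcf_rule_neigh() above, i.e.
 * the Netlink equivalent of:
 *   ip neigh add dev <ifindex> lladdr <mac> to <ipv4> nud permanent
 * The helper name and parameters are hypothetical; NUD_PERMANENT and
 * the NDA_* attributes come from <linux/neighbour.h>.
 */
#if 0
static int
example_neigh_add(struct mnl_socket *nl, unsigned int seq,
		  unsigned int ifindex, rte_be32_t ipv4_dst,
		  const uint8_t mac[ETHER_ADDR_LEN])
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct ndmsg *ndm;
	ssize_t ret;

	nlh->nlmsg_type = RTM_NEWNEIGH;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE |
			   NLM_F_ACK;
	nlh->nlmsg_seq = seq;
	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
	ndm->ndm_family = AF_INET;
	ndm->ndm_ifindex = ifindex;
	ndm->ndm_state = NUD_PERMANENT;
	mnl_attr_put_u32(nlh, NDA_DST, ipv4_dst);
	mnl_attr_put(nlh, NDA_LLADDR, ETHER_ADDR_LEN, mac);
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0)
		return -1;
	ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
	if (ret < 0)
		return -1;
	/* mnl_cb_run() with no callback just validates the ACK. */
	return mnl_cb_run(buf, ret, seq, mnl_socket_get_portid(nl),
			  NULL, NULL) < 0 ? -1 : 0;
}
#endif
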
4754 /**
4755  * Manage the local IP addresses and their peer IP addresses on the
4756  * outer interface for encapsulation purposes. The kernel searches for
4757  * the appropriate device for tunnel egress traffic using the outer
4758  * source IP; this IP should be assigned to the outer network device,
4759  * otherwise the kernel rejects the rule.
4760  *
4761  * Adds or removes the addresses using a Netlink command like this:
4762  *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4763  *
4764  * The addresses are local to the netdev ("scope link"), which reduces
4765  * the risk of conflicts. Note that an implicit route is maintained by
4766  * the kernel due to the presence of a peer address (IFA_ADDRESS).
4767  *
4768  * @param[in] tcf
4769  *   Libmnl socket context object.
4770  * @param[in] iface
4771  *   Object, contains rule database and ifouter index.
4772  * @param[in] dev_flow
4773  *   Flow object, contains the tunnel parameters (for encap only).
4774  * @param[in] enable
4775  *   Toggle between add and remove.
4776  * @param[out] error
4777  *   Perform verbose error reporting if not NULL.
4778  *
4779  * @return
4780  *   0 on success, a negative errno value otherwise and rte_errno is set.
4781  */
4782 static int
4783 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4784                      struct tcf_irule *iface,
4785                      struct mlx5_flow *dev_flow,
4786                      bool enable,
4787                      struct rte_flow_error *error)
4788 {
4789         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4790         struct tcf_local_rule *rule = NULL;
4791         int ret;
4792
4793         assert(encap);
4794         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4795         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4796                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4797                 LIST_FOREACH(rule, &iface->local, next) {
4798                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4799                             encap->ipv4.src == rule->ipv4.src &&
4800                             encap->ipv4.dst == rule->ipv4.dst) {
4801                                 break;
4802                         }
4803                 }
4804         } else {
4805                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4806                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4807                 LIST_FOREACH(rule, &iface->local, next) {
4808                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4809                             !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4810                                             sizeof(encap->ipv6.src)) &&
4811                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4812                                             sizeof(encap->ipv6.dst))) {
4813                                 break;
4814                         }
4815                 }
4816         }
4817         if (rule) {
4818                 if (enable) {
4819                         rule->refcnt++;
4820                         return 0;
4821                 }
4822                 if (!rule->refcnt || !--rule->refcnt) {
4823                         LIST_REMOVE(rule, next);
4824                         return flow_tcf_rule_local(tcf, encap,
4825                                         iface->ifouter, false, error);
4826                 }
4827                 return 0;
4828         }
4829         if (!enable) {
4830                 DRV_LOG(WARNING, "disabling non-existent local rule");
4831                 rte_flow_error_set(error, ENOENT,
4832                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4833                                    "disabling non-existent local rule");
4834                 return -ENOENT;
4835         }
4836         rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4837                                 alignof(struct tcf_local_rule));
4838         if (!rule) {
4839                 rte_flow_error_set(error, ENOMEM,
4840                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4841                                    "unable to allocate memory for local rule");
4842                 return -rte_errno;
4843         }
4844         *rule = (struct tcf_local_rule){.refcnt = 0,
4845                                         .mask = 0,
4846                                         };
4847         if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4848                 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4849                            | FLOW_TCF_ENCAP_IPV4_DST;
4850                 rule->ipv4.src = encap->ipv4.src;
4851                 rule->ipv4.dst = encap->ipv4.dst;
4852         } else {
4853                 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4854                            | FLOW_TCF_ENCAP_IPV6_DST;
4855                 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4856                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4857         }
4858         ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4859         if (ret) {
4860                 rte_free(rule);
4861                 return ret;
4862         }
4863         rule->refcnt++;
4864         LIST_INSERT_HEAD(&iface->local, rule, next);
4865         return 0;
4866 }
4867
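/*
 * Usage sketch with hypothetical flow objects: the local-rule
 * database above is reference counted, so enable/disable calls must
 * be symmetric. Two flows sharing the same outer source/destination
 * pair reuse one kernel address rule; the Netlink delete is issued
 * only when the last reference goes away.
 */
#if 0
	/* First encap flow: the address rule is created, refcnt == 1. */
	flow_tcf_encap_local(tcf, iface, flow_a, true, error);
	/* Second flow with the same IPs: refcnt == 2, no Netlink I/O. */
	flow_tcf_encap_local(tcf, iface, flow_b, true, error);
	/* Drops the reference counter only. */
	flow_tcf_encap_local(tcf, iface, flow_b, false, error);
	/* Last reference: the kernel rule is removed here. */
	flow_tcf_encap_local(tcf, iface, flow_a, false, error);
#endif
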
4868 /**
4869  * Manage the destination MAC/IP address neigh database; the kernel uses
4870  * this one to determine the destination MAC address within the
4871  * encapsulation header. Adds or removes the entries using a Netlink
4872  * command like this:
4872  *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4873  *
4874  * @param[in] tcf
4875  *   Libmnl socket context object.
4876  * @param[in] iface
4877  *   Object, contains rule database and ifouter index.
4878  * @param[in] dev_flow
4879  *   Flow object, contains the tunnel parameters (for encap only).
4880  * @param[in] enable
4881  *   Toggle between add and remove.
4882  * @param[out] error
4883  *   Perform verbose error reporting if not NULL.
4884  *
4885  * @return
4886  *   0 on success, a negative errno value otherwise and rte_errno is set.
4887  */
4888 static int
4889 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4890                      struct tcf_irule *iface,
4891                      struct mlx5_flow *dev_flow,
4892                      bool enable,
4893                      struct rte_flow_error *error)
4894 {
4895         const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4896         struct tcf_neigh_rule *rule = NULL;
4897         int ret;
4898
4899         assert(encap);
4900         assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4901         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4902                 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4903                 LIST_FOREACH(rule, &iface->neigh, next) {
4904                         if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4905                             encap->ipv4.dst == rule->ipv4.dst) {
4906                                 break;
4907                         }
4908                 }
4909         } else {
4910                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4911                 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4912                 LIST_FOREACH(rule, &iface->neigh, next) {
4913                         if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4914                             !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4915                                                 sizeof(encap->ipv6.dst))) {
4916                                 break;
4917                         }
4918                 }
4919         }
4920         if (rule) {
4921                 if (memcmp(&encap->eth.dst, &rule->eth,
4922                            sizeof(encap->eth.dst))) {
4923                         DRV_LOG(WARNING, "Destination MAC differs"
4924                                          " in neigh rule");
4925                         rte_flow_error_set(error, EEXIST,
4926                                            RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4927                                            NULL, "Different MAC address"
4928                                            " neigh rule for the same"
4929                                            " destination IP");
4930                         return -EEXIST;
4931                 }
4932                 if (enable) {
4933                         rule->refcnt++;
4934                         return 0;
4935                 }
4936                 if (!rule->refcnt || !--rule->refcnt) {
4937                         LIST_REMOVE(rule, next);
4938                         return flow_tcf_rule_neigh(tcf, encap,
4939                                                    iface->ifouter,
4940                                                    false, error);
4941                 }
4942                 return 0;
4943         }
4944         if (!enable) {
4945                 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4946                 rte_flow_error_set(error, ENOENT,
4947                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4948                                    "disabling non-existent neigh rule");
4949                 return -ENOENT;
4950         }
4951         rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4952                                 alignof(struct tcf_neigh_rule));
4953         if (!rule) {
4954                 rte_flow_error_set(error, ENOMEM,
4955                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4956                                    "unable to allocate memory for neigh rule");
4957                 return -rte_errno;
4958         }
4959         *rule = (struct tcf_neigh_rule){.refcnt = 0,
4960                                         .mask = 0,
4961                                         };
4962         if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4963                 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4964                 rule->ipv4.dst = encap->ipv4.dst;
4965         } else {
4966                 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4967                 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4968         }
4969         memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4970         ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4971         if (ret) {
4972                 rte_free(rule);
4973                 return ret;
4974         }
4975         rule->refcnt++;
4976         LIST_INSERT_HEAD(&iface->neigh, rule, next);
4977         return 0;
4978 }
4979
4980 /* VXLAN encap rule database for outer interfaces. */
4981 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4982
4983 /* VTEP device list is shared between PMD port instances. */
4984 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4985 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4986
4987 /**
4988  * Acquire the VXLAN encap rules container for the specified interface.
4989  * First looks for the container in the list of existing ones and
4990  * creates and initializes a new container if none is found.
4991  *
4992  * @param[in] tcf
4993  *   Context object initialized by mlx5_flow_tcf_context_create().
4994  * @param[in] ifouter
4995  *   Network interface index to create VXLAN encap rules on.
4996  * @param[out] error
4997  *   Perform verbose error reporting if not NULL.
4998  * @return
4999  *   Rule container pointer on success,
5000  *   NULL otherwise and rte_errno is set.
5001  */
5002 static struct tcf_irule*
5003 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
5004                              unsigned int ifouter,
5005                              struct rte_flow_error *error)
5006 {
5007         struct tcf_irule *iface;
5008
5009         /* Look whether the container for encap rules is created. */
5010         assert(ifouter);
5011         LIST_FOREACH(iface, &iface_list_vxlan, next) {
5012                 if (iface->ifouter == ifouter)
5013                         break;
5014         }
5015         if (iface) {
5016                 /* Container already exists, just increment the reference. */
5017                 iface->refcnt++;
5018                 return iface;
5019         }
5020         /* Not found, we should create the new container. */
5021         iface = rte_zmalloc(__func__, sizeof(*iface),
5022                             alignof(struct tcf_irule));
5023         if (!iface) {
5024                 rte_flow_error_set(error, ENOMEM,
5025                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5026                                    "unable to allocate memory for container");
5027                 return NULL;
5028         }
5029         *iface = (struct tcf_irule){
5030                         .local = LIST_HEAD_INITIALIZER(),
5031                         .neigh = LIST_HEAD_INITIALIZER(),
5032                         .ifouter = ifouter,
5033                         .refcnt = 1,
5034         };
5035         /* Clean up the interface for the newly created container. */
5036         flow_tcf_encap_iface_cleanup(tcf, ifouter);
5037         flow_tcf_encap_local_cleanup(tcf, ifouter);
5038         flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5039         LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5040         return iface;
5041 }
5042
5043 /**
5044  * Releases the VXLAN encap rules container by pointer. Decrements the
5045  * reference counter and deletes the container if the counter is zero.
5046  *
5047  * @param[in] irule
5048  *   VXLAN rule container pointer to release.
5049  */
5050 static void
5051 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5052 {
5053         assert(iface->refcnt);
5054         if (--iface->refcnt == 0) {
5055                 /* Reference counter is zero, delete the container. */
5056                 assert(LIST_EMPTY(&iface->local));
5057                 assert(LIST_EMPTY(&iface->neigh));
5058                 LIST_REMOVE(iface, next);
5059                 rte_free(iface);
5060         }
5061 }
5062
5063 /**
5064  * Deletes VTEP network device.
5065  *
5066  * @param[in] tcf
5067  *   Context object initialized by mlx5_flow_tcf_context_create().
5068  * @param[in] vtep
5069  *   Object representing the network device to delete. Memory
5070  *   allocated for this object is freed by the routine.
5071  */
5072 static void
5073 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5074                      struct tcf_vtep *vtep)
5075 {
5076         struct nlmsghdr *nlh;
5077         struct ifinfomsg *ifm;
5078         alignas(struct nlmsghdr)
5079         uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5080                     MNL_BUF_EXTRA_SPACE];
5081         int ret;
5082
5083         assert(!vtep->refcnt);
5084         /* Delete only ifaces that we actually created. */
5085         if (vtep->created && vtep->ifindex) {
5086                 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5087                 nlh = mnl_nlmsg_put_header(buf);
5088                 nlh->nlmsg_type = RTM_DELLINK;
5089                 nlh->nlmsg_flags = NLM_F_REQUEST;
5090                 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5091                 ifm->ifi_family = AF_UNSPEC;
5092                 ifm->ifi_index = vtep->ifindex;
5093                 assert(sizeof(buf) >= nlh->nlmsg_len);
5094                 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5095                 if (ret)
5096                         DRV_LOG(WARNING, "netlink: error deleting vxlan"
5097                                          " encap/decap ifindex %u",
5098                                          ifm->ifi_index);
5099         }
5100         rte_free(vtep);
5101 }
5102
5103 /**
5104  * Creates VTEP network device.
5105  *
5106  * @param[in] tcf
5107  *   Context object initialized by mlx5_flow_tcf_context_create().
5108  * @param[in] port
5109  *   UDP port of created VTEP device.
5110  * @param[out] error
5111  *   Perform verbose error reporting if not NULL.
5112  *
5113  * @return
5114  *   Pointer to the created device structure on success,
5115  *   NULL otherwise and rte_errno is set.
5116  */
5117 static struct tcf_vtep*
5118 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5119                      uint16_t port, struct rte_flow_error *error)
5120 {
5121         struct tcf_vtep *vtep;
5122         struct nlmsghdr *nlh;
5123         struct ifinfomsg *ifm;
5124         char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5125         alignas(struct nlmsghdr)
5126         uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5127                     SZ_NLATTR_DATA_OF(sizeof(name)) +
5128                     SZ_NLATTR_NEST * 2 +
5129                     SZ_NLATTR_STRZ_OF("vxlan") +
5130                     SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5131                     SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5132                     SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5133                     MNL_BUF_EXTRA_SPACE];
5134         struct nlattr *na_info;
5135         struct nlattr *na_vxlan;
5136         rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5137         int ret;
5138
5139         vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5140         if (!vtep) {
5141                 rte_flow_error_set(error, ENOMEM,
5142                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5143                                    "unable to allocate memory for VTEP");
5144                 return NULL;
5145         }
5146         *vtep = (struct tcf_vtep){
5147                         .port = port,
5148         };
5149         memset(buf, 0, sizeof(buf));
5150         nlh = mnl_nlmsg_put_header(buf);
5151         nlh->nlmsg_type = RTM_NEWLINK;
5152         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5153         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5154         ifm->ifi_family = AF_UNSPEC;
5155         ifm->ifi_type = 0;
5156         ifm->ifi_index = 0;
5157         ifm->ifi_flags = IFF_UP;
5158         ifm->ifi_change = 0xffffffff;
5159         snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5160         mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5161         na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5162         assert(na_info);
5163         mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5164         na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5165         assert(na_vxlan);
5166 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5167         /*
5168          * RH 7.2 does not support metadata for the tunnel device.
5169          * It does not matter because we are going to use the
5170          * hardware offload provided by the mlx5 driver.
5171          */
5172         mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5173 #endif
5174         mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5175         mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5176         mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5177 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5178         /*
5179          * We must specify the VNI explicitly if metadata is not
5180          * supported. Note, the VNI is transferred in native endianness.
5181          */
5182         mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5183 #endif
5184         mnl_attr_nest_end(nlh, na_vxlan);
5185         mnl_attr_nest_end(nlh, na_info);
5186         assert(sizeof(buf) >= nlh->nlmsg_len);
5187         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5188         if (ret) {
5189                 DRV_LOG(WARNING,
5190                         "netlink: VTEP %s create failure (%d)",
5191                         name, rte_errno);
5192                 if (rte_errno != EEXIST)
5193                         /*
5194                          * Some unhandled error occurred or device is
5195                          * for encapsulation and cannot be shared.
5196                          */
5197                         goto error;
5198         } else {
5199                 /*
5200                  * Mark device we actually created.
5201                  * We should explicitly delete
5202                  * when we do not need it anymore.
5203                  */
5204                 vtep->created = 1;
5205         }
5206         /* Try to get the ifindex of the created or pre-existing device. */
5207         ret = if_nametoindex(name);
5208         if (!ret) {
5209                 DRV_LOG(WARNING,
5210                         "VTEP %s failed to get index (%d)", name, errno);
5211                 rte_flow_error_set
5212                         (error, -errno,
5213                          RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5214                          "netlink: failed to retrieve VTEP ifindex");
5215                 goto error;
5216         }
5217         vtep->ifindex = ret;
5218         memset(buf, 0, sizeof(buf));
5219         nlh = mnl_nlmsg_put_header(buf);
5220         nlh->nlmsg_type = RTM_NEWLINK;
5221         nlh->nlmsg_flags = NLM_F_REQUEST;
5222         ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5223         ifm->ifi_family = AF_UNSPEC;
5224         ifm->ifi_type = 0;
5225         ifm->ifi_index = vtep->ifindex;
5226         ifm->ifi_flags = IFF_UP;
5227         ifm->ifi_change = IFF_UP;
5228         ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5229         if (ret) {
5230                 rte_flow_error_set(error, -errno,
5231                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5232                                    "netlink: failed to set VTEP link up");
5233                 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5234                         name, rte_errno);
5235                 goto clean;
5236         }
5237         ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5238         if (ret) {
5239                 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5240                 goto clean;
5241         }
5242         DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5243         vtep->refcnt = 1;
5244         return vtep;
5245 clean:
5246         flow_tcf_vtep_delete(tcf, vtep);
5247         return NULL;
5248 error:
5249         rte_free(vtep);
5250         return NULL;
5251 }
5252
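/*
 * For reference, the request built above is roughly (assuming the
 * metadata-capable kernel branch) the Netlink equivalent of:
 *   ip link add <MLX5_VXLAN_DEVICE_PFX><port> up type vxlan \
 *      external nolearning udp6zerocsumrx dstport <port>
 * A minimal user-space check that the VTEP netdev exists (the helper
 * name is hypothetical):
 */
#if 0
#include <stdio.h>
#include <net/if.h>

static int
example_vtep_exists(uint16_t port)
{
	char name[IF_NAMESIZE];

	snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
	/* A nonzero ifindex means the VTEP netdev is present. */
	return if_nametoindex(name) != 0;
}
#endif
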
5253 /**
5254  * Acquire the target interface index for VXLAN tunnel decapsulation.
5255  * In order to share the UDP port with the other interfaces, the VXLAN
5256  * device is created without being attached to any interface (if created).
5257  *
5258  * @param[in] tcf
5259  *   Context object initialized by mlx5_flow_tcf_context_create().
5260  * @param[in] dev_flow
5261  *   Flow tcf object with tunnel structure pointer set.
5262  * @param[out] error
5263  *   Perform verbose error reporting if not NULL.
5264  * @return
5265  *   Interface descriptor pointer on success,
5266  *   NULL otherwise and rte_errno is set.
5267  */
5268 static struct tcf_vtep*
5269 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5270                             struct mlx5_flow *dev_flow,
5271                             struct rte_flow_error *error)
5272 {
5273         struct tcf_vtep *vtep;
5274         uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5275
5276         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5277                 if (vtep->port == port)
5278                         break;
5279         }
5280         if (vtep) {
5281                 /* Device exists, just increment the reference counter. */
5282                 vtep->refcnt++;
5283                 assert(vtep->ifindex);
5284                 return vtep;
5285         }
5286         /* No decapsulation device exists, try to create the new one. */
5287         vtep = flow_tcf_vtep_create(tcf, port, error);
5288         if (vtep)
5289                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5290         return vtep;
5291 }
5292
5293 /**
5294  * Acquire the target interface index for VXLAN tunnel encapsulation.
5295  *
5296  * @param[in] tcf
5297  *   Context object initialized by mlx5_flow_tcf_context_create().
5298  * @param[in] ifouter
5299  *   Network interface index to attach VXLAN encap device to.
5300  * @param[in] dev_flow
5301  *   Flow tcf object with tunnel structure pointer set.
5302  * @param[out] error
5303  *   Perform verbose error reporting if not NULL.
5304  * @return
5305  *   Interface descriptor pointer on success,
5306  *   NULL otherwise and rte_errno is set.
5307  */
5308 static struct tcf_vtep*
5309 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5310                             unsigned int ifouter,
5311                             struct mlx5_flow *dev_flow,
5312                             struct rte_flow_error *error)
5313 {
5314         uint16_t port;
5315         struct tcf_vtep *vtep;
5316         struct tcf_irule *iface;
5317         int ret;
5318
5319         assert(ifouter);
5320         /* Look whether the VTEP for specified port is created. */
5321         port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5322         LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5323                 if (vtep->port == port)
5324                         break;
5325         }
5326         if (vtep) {
5327                 /* VTEP already exists, just increment the reference. */
5328                 vtep->refcnt++;
5329         } else {
5330                 /* Not found, we should create the new VTEP. */
5331                 vtep = flow_tcf_vtep_create(tcf, port, error);
5332                 if (!vtep)
5333                         return NULL;
5334                 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5335         }
5336         assert(vtep->ifindex);
5337         iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5338         if (!iface) {
5339                 if (--vtep->refcnt == 0)
5340                         flow_tcf_vtep_delete(tcf, vtep);
5341                 return NULL;
5342         }
5343         dev_flow->tcf.vxlan_encap->iface = iface;
5344         /* Create local ipaddr with peer to specify the outer IPs. */
5345         ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5346         if (!ret) {
5347                 /* Create neigh rule to specify outer destination MAC. */
5348                 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5349                 if (ret)
5350                         flow_tcf_encap_local(tcf, iface,
5351                                              dev_flow, false, error);
5352         }
5353         if (ret) {
5354                 dev_flow->tcf.vxlan_encap->iface = NULL;
5355                 flow_tcf_encap_irule_release(iface);
5356                 if (--vtep->refcnt == 0)
5357                         flow_tcf_vtep_delete(tcf, vtep);
5358                 return NULL;
5359         }
5360         return vtep;
5361 }
5362
5363 /**
5364  * Acquires the target interface index for tunneling of any type.
5365  * Creates a new VTEP if needed.
5366  *
5367  * @param[in] tcf
5368  *   Context object initialized by mlx5_flow_tcf_context_create().
5369  * @param[in] ifouter
5370  *   Network interface index to create VXLAN encap rules on.
5371  * @param[in] dev_flow
5372  *   Flow tcf object with tunnel structure pointer set.
5373  * @param[out] error
5374  *   Perform verbose error reporting if not NULL.
5375  * @return
5376  *   Interface descriptor pointer on success,
5377  *   NULL otherwise and rte_errno is set.
5378  */
5379 static struct tcf_vtep*
5380 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5381                       unsigned int ifouter,
5382                       struct mlx5_flow *dev_flow,
5383                       struct rte_flow_error *error)
5384 {
5385         struct tcf_vtep *vtep = NULL;
5386
5387         assert(dev_flow->tcf.tunnel);
5388         pthread_mutex_lock(&vtep_list_mutex);
5389         switch (dev_flow->tcf.tunnel->type) {
5390         case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5391                 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5392                                                   dev_flow, error);
5393                 break;
5394         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5395                 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5396                 break;
5397         default:
5398                 rte_flow_error_set(error, ENOTSUP,
5399                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5400                                    "unsupported tunnel type");
5401                 break;
5402         }
5403         pthread_mutex_unlock(&vtep_list_mutex);
5404         return vtep;
5405 }
5406
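/*
 * Lifecycle sketch with hypothetical variables: a VTEP is acquired
 * while a tunnel rule is being applied and must be released with the
 * same dev_flow once the rule is removed, so the shared device stays
 * alive exactly as long as some rule references it.
 */
#if 0
	struct tcf_vtep *vtep;

	vtep = flow_tcf_vtep_acquire(tcf, ifouter, dev_flow, error);
	if (!vtep)
		return -rte_errno;
	/* ... send the TC rule using vtep->ifindex as target/source ... */
	flow_tcf_vtep_release(tcf, vtep, dev_flow);
#endif
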
5407 /**
5408  * Release the tunneling interface by ifindex. Decrements the reference
5409  * counter and actually removes the device if the counter is zero.
5410  *
5411  * @param[in] tcf
5412  *   Context object initialized by mlx5_flow_tcf_context_create().
5413  * @param[in] vtep
5414  *   VTEP device descriptor structure.
5415  * @param[in] dev_flow
5416  *   Flow tcf object with tunnel structure pointer set.
5417  */
5418 static void
5419 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5420                       struct tcf_vtep *vtep,
5421                       struct mlx5_flow *dev_flow)
5422 {
5423         assert(dev_flow->tcf.tunnel);
5424         pthread_mutex_lock(&vtep_list_mutex);
5425         switch (dev_flow->tcf.tunnel->type) {
5426         case FLOW_TCF_TUNACT_VXLAN_DECAP:
5427                 break;
5428         case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5429                 struct tcf_irule *iface;
5430
5431                 /* Remove the encap ancillary rules first. */
5432                 iface = dev_flow->tcf.vxlan_encap->iface;
5433                 assert(iface);
5434                 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5435                 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5436                 flow_tcf_encap_irule_release(iface);
5437                 dev_flow->tcf.vxlan_encap->iface = NULL;
5438                 break;
5439         }
5440         default:
5441                 assert(false);
5442                 DRV_LOG(WARNING, "Unsupported tunnel type");
5443                 break;
5444         }
5445         assert(vtep->refcnt);
5446         if (--vtep->refcnt == 0) {
5447                 LIST_REMOVE(vtep, next);
5448                 flow_tcf_vtep_delete(tcf, vtep);
5449         }
5450         pthread_mutex_unlock(&vtep_list_mutex);
5451 }
5452
5453 struct tcf_nlcb_query {
5454         uint32_t handle;
5455         uint32_t tc_flags;
5456         uint32_t flags_valid:1;
5457 };
5458
5459 /**
5460  * Collect queried rule attributes. This is a callback routine called by
5461  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5462  * The current implementation collects the flower flags only.
5463  *
5464  * @param[in] nlh
5465  *   Pointer to reply header.
5466  * @param[in, out] arg
5467  *   Context pointer for this callback.
5468  *
5469  * @return
5470  *   A positive, nonzero value on success (required by libmnl
5471  *   to continue message processing).
5472  */
5473 static int
5474 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5475 {
5476         struct tcf_nlcb_query *query = arg;
5477         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5478         struct nlattr *na, *na_opt;
5479         bool flower = false;
5480
5481         if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5482             tcm->tcm_handle != query->handle)
5483                 return 1;
5484         mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5485                 switch (mnl_attr_get_type(na)) {
5486                 case TCA_KIND:
5487                         if (strcmp(mnl_attr_get_payload(na), "flower")) {
5488                                 /* Not flower filter, drop entire message. */
5489                                 return 1;
5490                         }
5491                         flower = true;
5492                         break;
5493                 case TCA_OPTIONS:
5494                         if (!flower) {
5495                                 /* Not flower options, drop entire message. */
5496                                 return 1;
5497                         }
5498                         /* Check nested flower options. */
5499                         mnl_attr_for_each_nested(na_opt, na) {
5500                                 switch (mnl_attr_get_type(na_opt)) {
5501                                 case TCA_FLOWER_FLAGS:
5502                                         query->flags_valid = 1;
5503                                         query->tc_flags =
5504                                                 mnl_attr_get_u32(na_opt);
5505                                         break;
5506                                 }
5507                         }
5508                         break;
5509                 }
5510         }
5511         return 1;
5512 }
5513
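/*
 * A more defensive variant of the nested walk above would validate
 * the attribute payload before reading it, e.g. (sketch of the
 * TCA_OPTIONS branch only, using the same local names):
 */
#if 0
	mnl_attr_for_each_nested(na_opt, na) {
		if (mnl_attr_get_type(na_opt) == TCA_FLOWER_FLAGS &&
		    mnl_attr_validate(na_opt, MNL_TYPE_U32) == 0) {
			query->flags_valid = 1;
			query->tc_flags = mnl_attr_get_u32(na_opt);
		}
	}
#endif
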
5514 /**
5515  * Query the TC flower rule flags via Netlink.
5516  *
5517  * @param[in] tcf
5518  *   Context object initialized by mlx5_flow_tcf_context_create().
5519  * @param[in] dev_flow
5520  *   Pointer to the flow.
5521  * @param[out] pflags
5522  *   Pointer to the data retrieved by the query.
5523  *
5524  * @return
5525  *   0 on success, a negative errno value otherwise.
5526  */
5527 static int
5528 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5529                      struct mlx5_flow *dev_flow,
5530                      uint32_t *pflags)
5531 {
5532         struct nlmsghdr *nlh;
5533         struct tcmsg *tcm;
5534         struct tcf_nlcb_query query = {
5535                 .handle = dev_flow->tcf.tcm->tcm_handle,
5536         };
5537
5538         nlh = mnl_nlmsg_put_header(tcf->buf);
5539         nlh->nlmsg_type = RTM_GETTFILTER;
5540         nlh->nlmsg_flags = NLM_F_REQUEST;
5541         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5542         memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5543         /*
5544          * Ignore Netlink error for filter query operations.
5545          * The reply length is sent by the kernel as errno.
5546          * Just check that we got the flags option.
5547          */
5548         flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5549         if (!query.flags_valid) {
5550                 *pflags = 0;
5551                 return -ENOENT;
5552         }
5553         *pflags = query.tc_flags;
5554         return 0;
5555 }
5556
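/*
 * Usage sketch in a hypothetical context: verifying hardware offload
 * after applying a rule that was created without TCA_CLS_FLAGS_SKIP_SW
 * (this is essentially what flow_tcf_check_inhw() below wraps):
 */
#if 0
	uint32_t flags;

	if (!flow_tcf_query_flags(tcf, dev_flow, &flags) &&
	    !(flags & TCA_CLS_FLAGS_IN_HW))
		DRV_LOG(WARNING, "rule is not offloaded to hardware");
#endif
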
5557 /**
5558  * Query and check the in_hw set for specified rule.
5559  *
5560  * @param[in] tcf
5561  *   Context object initialized by mlx5_flow_tcf_context_create().
5562  * @param[in] dev_flow
5563  *   Pointer to the flow to check.
5564  *
5565  * @return
5566  *   0 on success, a negative errno value otherwise.
5567  */
5568 static int
5569 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5570                     struct mlx5_flow *dev_flow)
5571 {
5572         uint32_t flags;
5573         int ret;
5574
5575         ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5576         if (ret)
5577                 return ret;
5578         return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5579 }
5580
5581 /**
5582  * Remove flow from E-Switch by sending Netlink message.
5583  *
5584  * @param[in] dev
5585  *   Pointer to Ethernet device.
5586  * @param[in, out] flow
5587  *   Pointer to the sub flow.
5588  */
5589 static void
5590 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5591 {
5592         struct priv *priv = dev->data->dev_private;
5593         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5594         struct mlx5_flow *dev_flow;
5595         struct nlmsghdr *nlh;
5596
5597         if (!flow)
5598                 return;
5599         dev_flow = LIST_FIRST(&flow->dev_flows);
5600         if (!dev_flow)
5601                 return;
5602         /* E-Switch flow can't be expanded. */
5603         assert(!LIST_NEXT(dev_flow, next));
5604         if (dev_flow->tcf.applied) {
5605                 nlh = dev_flow->tcf.nlh;
5606                 nlh->nlmsg_type = RTM_DELTFILTER;
5607                 nlh->nlmsg_flags = NLM_F_REQUEST;
5608                 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5609                 if (dev_flow->tcf.tunnel) {
5610                         assert(dev_flow->tcf.tunnel->vtep);
5611                         flow_tcf_vtep_release(ctx,
5612                                 dev_flow->tcf.tunnel->vtep,
5613                                 dev_flow);
5614                         dev_flow->tcf.tunnel->vtep = NULL;
5615                 }
5616                 dev_flow->tcf.applied = 0;
5617         }
5618 }
5619
5620 /**
5621  * Apply flow to E-Switch by sending Netlink message.
5622  *
5623  * @param[in] dev
5624  *   Pointer to Ethernet device.
5625  * @param[in, out] flow
5626  *   Pointer to the sub flow.
5627  * @param[out] error
5628  *   Pointer to the error structure.
5629  *
5630  * @return
5631  *   0 on success, a negative errno value otherwise and rte_errno is set.
5632  */
5633 static int
5634 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5635                struct rte_flow_error *error)
5636 {
5637         struct priv *priv = dev->data->dev_private;
5638         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5639         struct mlx5_flow *dev_flow;
5640         struct nlmsghdr *nlh;
5641
5642         dev_flow = LIST_FIRST(&flow->dev_flows);
5643         /* E-Switch flow can't be expanded. */
5644         assert(!LIST_NEXT(dev_flow, next));
5645         if (dev_flow->tcf.applied)
5646                 return 0;
5647         nlh = dev_flow->tcf.nlh;
5648         nlh->nlmsg_type = RTM_NEWTFILTER;
5649         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5650         if (dev_flow->tcf.tunnel) {
5651                 /*
5652                  * Replace the interface index, target for
5653                  * encapsulation, source for decapsulation.
5654                  */
5655                 assert(!dev_flow->tcf.tunnel->vtep);
5656                 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5657                 /* Acquire actual VTEP device when rule is being applied. */
5658                 dev_flow->tcf.tunnel->vtep =
5659                         flow_tcf_vtep_acquire(ctx,
5660                                         dev_flow->tcf.tunnel->ifindex_org,
5661                                         dev_flow, error);
5662                 if (!dev_flow->tcf.tunnel->vtep)
5663                         return -rte_errno;
5664                 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5665                                 dev_flow->tcf.tunnel->vtep->ifindex,
5666                                 dev_flow->tcf.tunnel->ifindex_org);
5667                 *dev_flow->tcf.tunnel->ifindex_ptr =
5668                         dev_flow->tcf.tunnel->vtep->ifindex;
5669         }
5670         if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5671                 dev_flow->tcf.applied = 1;
5672                 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5673                         return 0;
5674                 /*
5675                  * The rule was applied without the skip_sw flag set.
5676                  * We should check whether the rule was actually
5677                  * accepted by the hardware (look at the in_hw flag).
5678                  */
5679                 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5680                         flow_tcf_remove(dev, flow);
5681                         return rte_flow_error_set
5682                                 (error, ENOENT,
5683                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5684                                  "netlink: rule has no in_hw flag set");
5685                 }
5686                 return 0;
5687         }
5688         if (dev_flow->tcf.tunnel) {
5689                 /* Rollback the VTEP configuration if rule apply failed. */
5690                 assert(dev_flow->tcf.tunnel->vtep);
5691                 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5692                                       dev_flow);
5693                 dev_flow->tcf.tunnel->vtep = NULL;
5694         }
5695         return rte_flow_error_set(error, rte_errno,
5696                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5697                                   "netlink: failed to create TC flow rule");
5698 }
5699
5700 /**
5701  * Remove flow from E-Switch and release resources of the device flow.
5702  *
5703  * @param[in] dev
5704  *   Pointer to Ethernet device.
5705  * @param[in, out] flow
5706  *   Pointer to the sub flow.
5707  */
5708 static void
5709 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5710 {
5711         struct mlx5_flow *dev_flow;
5712
5713         if (!flow)
5714                 return;
5715         flow_tcf_remove(dev, flow);
5716         if (flow->counter) {
5717                 if (--flow->counter->ref_cnt == 0) {
5718                         rte_free(flow->counter);
5719                         flow->counter = NULL;
5720                 }
5721         }
5722         dev_flow = LIST_FIRST(&flow->dev_flows);
5723         if (!dev_flow)
5724                 return;
5725         /* E-Switch flow can't be expanded. */
5726         assert(!LIST_NEXT(dev_flow, next));
5727         LIST_REMOVE(dev_flow, next);
5728         rte_free(dev_flow);
5729 }
5730
5731 /**
5732  * Helper routine for figuring out the size required for a parse buffer.
5733  *
5734  * @param array
5735  *   Array of values to use.
5736  * @param idx
5737  *   Current location in array.
5738  * @param value
5739  *   Value to compare with.
5740  *
5741  * @return
5742  *   The maximum between the given value and the array value on index.
5743  */
5744 static uint16_t
5745 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5746 {
5747         return idx < 0 ? value : RTE_MAX(array[idx], value);
5748 }
5749
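/*
 * Worked example: with rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
 * TCA_FLOWER_ACT, TCA_OPTIONS } and idx == 1 the helper returns
 * RTE_MAX(rta_type[1], TCA_ACT_STATS) == TCA_ACT_STATS, i.e. the
 * attribute table is sized for the deepest attribute requested at the
 * current nesting level, while idx < 0 falls back to the caller's
 * default value.
 */
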
5750 /**
5751  * Parse rtnetlink message attributes filling the attribute table with the info
5752  * retrieved.
5753  *
5754  * @param tb
5755  *   Attribute table to be filled.
5756  * @param max
5757  *   Maximum entry in the attribute table.
5758  * @param rta
5759  *   The attributes section in the message to be parsed.
5760  * @param len
5761  *   The length of the attributes section in the message.
5762  */
5763 static void
5764 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5765                          struct rtattr *rta, int len)
5766 {
5767         unsigned short type;
5768         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5769         while (RTA_OK(rta, len)) {
5770                 type = rta->rta_type;
5771                 if (type <= max && !tb[type])
5772                         tb[type] = rta;
5773                 rta = RTA_NEXT(rta, len);
5774         }
5775 }
5776
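/*
 * Illustrative sketch: using the parser above to pick one attribute
 * out of a filter message, here the classifier kind string. Message
 * reception is omitted; nlh is assumed to point to a valid
 * RTM_NEWTFILTER reply and printf stands in for real processing.
 */
#if 0
	struct rtattr *tb[TCA_MAX + 1];
	struct tcmsg *t = NLMSG_DATA(nlh);
	int len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));

	flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
	if (tb[TCA_KIND])
		printf("filter kind: %s\n", (char *)RTA_DATA(tb[TCA_KIND]));
#endif
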
5777 /**
5778  * Extract flow counters from flower action.
5779  *
5780  * @param rta
5781  *   Flower action stats properties in the Netlink message received.
5782  * @param rta_type
5783  *   The backward sequence of rta_types, as written in the attribute table,
5784  *   that we need to traverse in order to reach the requested object.
5785  * @param idx
5786  *   Current location in rta_type table.
5787  * @param[out] data
5788  *   Data holding the count statistics of the rte_flow retrieved from
5789  *   the message.
5790  *
5791  * @return
5792  *   0 if data was found and retrieved, -1 otherwise.
5793  */
5794 static int
5795 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5796                                        uint16_t rta_type[], int idx,
5797                                        struct gnet_stats_basic *data)
5798 {
5799         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5800                                                  TCA_STATS_BASIC);
5801         struct rtattr *tbs[tca_stats_max + 1];
5802
5803         if (rta == NULL || idx < 0)
5804                 return -1;
5805         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5806                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
5807         switch (rta_type[idx]) {
5808         case TCA_STATS_BASIC:
5809                 if (tbs[TCA_STATS_BASIC]) {
5810                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5811                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5812                                sizeof(*data)));
5813                         return 0;
5814                 }
5815                 break;
5816         default:
5817                 break;
5818         }
5819         return -1;
5820 }
5821
5822 /**
5823  * Parse flower single action retrieving the requested action attribute,
5824  * if found.
5825  *
5826  * @param arg
5827  *   Flower action properties in the Netlink message received.
5828  * @param rta_type
5829  *   The backward sequence of rta_types, as written in the attribute table,
5830  *   that we need to traverse in order to reach the requested object.
5831  * @param idx
5832  *   Current location in rta_type table.
5833  * @param[out] data
5834  *   Count statistics retrieved from the message query.
5835  *
5836  * @return
5837  *   0 if data was found and retrieved, -1 otherwise.
5838  */
5839 static int
5840 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5841                                      uint16_t rta_type[], int idx, void *data)
5842 {
5843         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5844         struct rtattr *tb[tca_act_max + 1];
5845
5846         if (arg == NULL || idx < 0)
5847                 return -1;
5848         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5849                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5850         if (tb[TCA_ACT_KIND] == NULL)
5851                 return -1;
5852         switch (rta_type[idx]) {
5853         case TCA_ACT_STATS:
5854                 if (tb[TCA_ACT_STATS])
5855                         return flow_tcf_nl_action_stats_parse_and_get
5856                                         (tb[TCA_ACT_STATS],
5857                                          rta_type, --idx,
5858                                          (struct gnet_stats_basic *)data);
5859                 break;
5860         default:
5861                 break;
5862         }
5863         return -1;
5864 }
5865
5866 /**
5867  * Parse flower action section in the message retrieving the requested
5868  * attribute from the first action that provides it.
5869  *
5870  * @param arg
5871  *   Flower action section in the Netlink message received.
5872  * @param rta_type
5873  *   The backward sequence of rta_types, as written in the attribute table,
5874  *   that we need to traverse in order to reach the requested object.
5875  * @param idx
5876  *   Current location in rta_type table.
5877  * @param[out] data
5878  *   Data retrieved from the message query.
5879  *
5880  * @return
5881  *   0 if data was found and retrieved, -1 otherwise.
5882  */
5883 static int
5884 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5885                                  uint16_t rta_type[], int idx, void *data)
5886 {
5887         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5888         int i;
5889
5890         if (arg == NULL || idx < 0)
5891                 return -1;
5892         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5893                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
5894         switch (rta_type[idx]) {
5895         /*
5896          * Flow counters are stored in the actions defined by the flow
5897          * and not in the flow itself; therefore we need to traverse the
5898          * flower chain of actions in search of them.
5899          *
5900          * Note that the index is not decremented here.
5901          */
5902         case TCA_ACT_STATS:
5903                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5904                         if (tb[i] &&
5905                             !flow_tcf_nl_parse_one_action_and_get(tb[i],
5906                                                                   rta_type,
5907                                                                   idx, data))
5908                                 return 0;
5909                 }
5910                 break;
5911         default:
5912                 break;
5913         }
5914         return -1;
5915 }
5916
5917 /**
5918  * Parse flower classifier options in the message, retrieving the requested
5919  * attribute if found.
5920  *
5921  * @param opt
5922  *   Flower section in the Netlink message received.
5923  * @param rta_type
5924  *   The backward sequence of rta_types, as written in the attribute table,
5925  *   that we need to traverse in order to reach the requested object.
5926  * @param idx
5927  *   Current location in rta_type table.
5928  * @param[out] data
5929  *   Data retrieved from the message query.
5930  *
5931  * @return
5932  *   0 if data was found and retrieved, -1 otherwise.
5933  */
5934 static int
5935 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5936                                uint16_t rta_type[], int idx, void *data)
5937 {
5938         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5939                                                   TCA_FLOWER_ACT);
5940         struct rtattr *tb[tca_flower_max + 1];
5941
5942         if (!opt || idx < 0)
5943                 return -1;
5944         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5945                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
5946         switch (rta_type[idx]) {
5947         case TCA_FLOWER_ACT:
5948                 if (tb[TCA_FLOWER_ACT])
5949                         return flow_tcf_nl_action_parse_and_get
5950                                                         (tb[TCA_FLOWER_ACT],
5951                                                          rta_type, --idx, data);
5952                 break;
5953         default:
5954                 break;
5955         }
5956         return -1;
5957 }
5958
5959 /**
5960  * Parse Netlink reply on filter query, retrieving the flow counters.
5961  *
5962  * @param cnlh
5963  *   Message received from Netlink.
5964  * @param rta_type
5965  *   The backward sequence of rta_types, as written in the attribute table,
5966  *   that we need to traverse in order to reach the requested object.
5967  * @param idx
5968  *   Current location in rta_type table.
5969  * @param[out] data
5970  *   Data retrieved from the message query.
5971  *
5972  * @return
5973  *   0 if data was found and retrieved, -1 otherwise.
5974  */
5975 static int
5976 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5977                                  uint16_t rta_type[], int idx, void *data)
5978 {
5979         struct nlmsghdr *nlh = cnlh;
5980         struct tcmsg *t = NLMSG_DATA(nlh);
5981         int len = nlh->nlmsg_len;
5982         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5983         struct rtattr *tb[tca_max + 1];
5984
5985         if (idx < 0)
5986                 return -1;
5987         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5988             nlh->nlmsg_type != RTM_GETTFILTER &&
5989             nlh->nlmsg_type != RTM_DELTFILTER)
5990                 return -1;
5991         len -= NLMSG_LENGTH(sizeof(*t));
5992         if (len < 0)
5993                 return -1;
5994         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5995         /* Not a TC flower flow - bail out */
5996         if (!tb[TCA_KIND] ||
5997             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5998                 return -1;
5999         switch (rta_type[idx]) {
6000         case TCA_OPTIONS:
6001                 if (tb[TCA_OPTIONS])
6002                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
6003                                                               rta_type,
6004                                                               --idx, data);
6005                 break;
6006         default:
6007                 break;
6008         }
6009         return -1;
6010 }
6011
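/*
 * The helpers above descend the filter reply in this attribute
 * nesting order (the rta_type[] tables below are written backwards
 * relative to it):
 *
 *   TCA_OPTIONS           handled by flow_tcf_nl_filter_parse_and_get()
 *     TCA_FLOWER_ACT      handled by flow_tcf_nl_opts_parse_and_get()
 *       TCA_ACT_STATS     handled by flow_tcf_nl_action_parse_and_get()
 *         TCA_STATS_BASIC handled by flow_tcf_nl_action_stats_parse_and_get()
 */
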
6012 /**
6013  * A callback to parse Netlink reply on TC flower query.
6014  *
6015  * @param nlh
6016  *   Message received from Netlink.
6017  * @param[out] data
6018  *   Pointer to data area to be filled by the parsing routine.
6019  *   Assumed to be a pointer to struct flow_tcf_stats_basic.
6020  *
6021  * @return
6022  *   MNL_CB_OK value.
6023  */
6024 static int
6025 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6026 {
6027         /*
6028          * The backward sequence of rta_types to pass in order to get
6029          *  to the counters.
6030          */
6031         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6032                                 TCA_FLOWER_ACT, TCA_OPTIONS };
6033         struct flow_tcf_stats_basic *sb_data = data;
             /* Strip the const qualifier without an explicit cast. */
6034         union {
6035                 const struct nlmsghdr *c;
6036                 struct nlmsghdr *nc;
6037         } tnlh = { .c = nlh };
6038
6039         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6040                                               RTE_DIM(rta_type) - 1,
6041                                               (void *)&sb_data->counters))
6042                 sb_data->valid = true;
6043         return MNL_CB_OK;
6044 }
6045
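/*
 * The backward rta_type chain above mirrors the attribute nesting of the
 * kernel reply, outermost type last (a sketch, assuming the conventional
 * TC flower layout):
 *
 *	RTM_NEWTFILTER
 *	  TCA_OPTIONS
 *	    TCA_FLOWER_ACT
 *	      TCA_ACT_STATS (per action)
 *	        TCA_STATS_BASIC -> counters (cf. struct gnet_stats_basic
 *	                           from linux/gen_stats.h)
 */
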
6046 /**
6047  * Query a TC flower rule for its statistics via Netlink.
6048  *
6049  * @param[in] dev
6050  *   Pointer to Ethernet device.
6051  * @param[in] flow
6052  *   Pointer to the flow.
6053  * @param[out] data
6054  *   Data retrieved by the query.
6055  * @param[out] error
6056  *   Perform verbose error reporting if not NULL.
6057  *
6058  * @return
6059  *   0 on success, a negative errno value otherwise and rte_errno is set.
6060  */
6061 static int
6062 flow_tcf_query_count(struct rte_eth_dev *dev,
6063                      struct rte_flow *flow,
6064                      void *data,
6065                      struct rte_flow_error *error)
6066 {
6067         struct flow_tcf_stats_basic sb_data;
6068         struct rte_flow_query_count *qc = data;
6069         struct priv *priv = dev->data->dev_private;
6070         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6071         struct mnl_socket *nl = ctx->nl;
6072         struct mlx5_flow *dev_flow;
6073         struct nlmsghdr *nlh;
6074         uint32_t seq = ctx->seq++;
6075         ssize_t ret;

6076         assert(qc);
6077
6078         memset(&sb_data, 0, sizeof(sb_data));
6079         dev_flow = LIST_FIRST(&flow->dev_flows);
6080         /* E-Switch flow can't be expanded. */
6081         assert(!LIST_NEXT(dev_flow, next));
6082         if (!dev_flow->flow->counter)
6083                 goto notsup_exit;
6084         nlh = dev_flow->tcf.nlh;
6085         nlh->nlmsg_type = RTM_GETTFILTER;
6086         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6087         nlh->nlmsg_seq = seq;
6088         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
6089                 goto error_exit;
6090         do {
6091                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6092                 if (ret <= 0)
6093                         break;
6094                 ret = mnl_cb_run(ctx->buf, ret, seq,
6095                                  mnl_socket_get_portid(nl),
6096                                  flow_tcf_nl_message_get_stats_basic,
6097                                  (void *)&sb_data);
6098         } while (ret > 0);
6100         if (sb_data.valid) {
6101                 /* Return the delta from last reset. */
6102                 qc->hits_set = 1;
6103                 qc->bytes_set = 1;
6104                 qc->hits = sb_data.counters.packets - flow->counter->hits;
6105                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
6106                 if (qc->reset) {
6107                         flow->counter->hits = sb_data.counters.packets;
6108                         flow->counter->bytes = sb_data.counters.bytes;
6109                 }
6110                 return 0;
6111         }
6112         return rte_flow_error_set(error, EINVAL,
6113                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6114                                   NULL,
6115                                   "flow does not have counter");
6116 error_exit:
6117         return rte_flow_error_set
6118                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6119                          NULL, "netlink: failed to read flow rule counters");
6120 notsup_exit:
6121         return rte_flow_error_set
6122                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6123                          NULL, "counters are not available.");
6124 }
6125
6126 /**
6127  * Query a flow.
6128  *
6129  * @see rte_flow_query()
6130  * @see rte_flow_ops
6131  */
6132 static int
6133 flow_tcf_query(struct rte_eth_dev *dev,
6134                struct rte_flow *flow,
6135                const struct rte_flow_action *actions,
6136                void *data,
6137                struct rte_flow_error *error)
6138 {
6139         int ret = -EINVAL;
6140
6141         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6142                 switch (actions->type) {
6143                 case RTE_FLOW_ACTION_TYPE_VOID:
6144                         break;
6145                 case RTE_FLOW_ACTION_TYPE_COUNT:
6146                         ret = flow_tcf_query_count(dev, flow, data, error);
6147                         break;
6148                 default:
6149                         return rte_flow_error_set(error, ENOTSUP,
6150                                                   RTE_FLOW_ERROR_TYPE_ACTION,
6151                                                   actions,
6152                                                   "action not supported");
6153                 }
6154         }
6155         return ret;
6156 }
6157
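/*
 * Usage sketch (illustration only, not referenced by the driver): how an
 * application would read and reset the counters of a rule through the
 * generic rte_flow API, which lands in flow_tcf_query() above for
 * E-Switch rules. The "port_id" and "flow" handles are assumed to be
 * owned by the caller.
 */
static int __rte_unused
flow_tcf_query_count_usage(uint16_t port_id, struct rte_flow *flow,
			   uint64_t *hits, uint64_t *bytes)
{
	struct rte_flow_query_count qc = { .reset = 1 };
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	struct rte_flow_error error;
	int ret = rte_flow_query(port_id, flow, actions, &qc, &error);

	if (ret)
		return ret; /* rte_errno is set by the failing layer. */
	*hits = qc.hits_set ? qc.hits : 0;
	*bytes = qc.bytes_set ? qc.bytes : 0;
	return 0;
}
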
6158 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6159         .validate = flow_tcf_validate,
6160         .prepare = flow_tcf_prepare,
6161         .translate = flow_tcf_translate,
6162         .apply = flow_tcf_apply,
6163         .remove = flow_tcf_remove,
6164         .destroy = flow_tcf_destroy,
6165         .query = flow_tcf_query,
6166 };
6167
6168 /**
6169  * Create and configure a libmnl socket for Netlink flow rules.
6170  *
6171  * @return
6172  *   A valid libmnl socket object pointer on success, NULL otherwise and
6173  *   rte_errno is set.
6174  */
6175 static struct mnl_socket *
6176 flow_tcf_mnl_socket_create(void)
6177 {
6178         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6179
6180         if (nl) {
                     /* Request capped ACKs; the result is deliberately
                      * ignored since old kernels lack NETLINK_CAP_ACK.
                      */
6181                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6182                                       sizeof(int));
6183                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6184                         return nl;
6185         }
6186         rte_errno = errno;
6187         if (nl)
6188                 mnl_socket_close(nl);
6189         return NULL;
6190 }
6191
6192 /**
6193  * Destroy a libmnl socket.
6194  *
6195  * @param nl
6196  *   Libmnl socket of the @p NETLINK_ROUTE kind.
6197  */
6198 static void
6199 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6200 {
6201         if (nl)
6202                 mnl_socket_close(nl);
6203 }
6204
6205 /**
6206  * Initialize ingress qdisc of a given network interface.
6207  *
6208  * @param ctx
6209  *   Pointer to tc-flower context to use.
6210  * @param ifindex
6211  *   Index of network interface to initialize.
6212  * @param[out] error
6213  *   Perform verbose error reporting if not NULL.
6214  *
6215  * @return
6216  *   0 on success, a negative errno value otherwise and rte_errno is set.
6217  */
6218 int
6219 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6220                    unsigned int ifindex, struct rte_flow_error *error)
6221 {
6222         struct nlmsghdr *nlh;
6223         struct tcmsg *tcm;
6224         alignas(struct nlmsghdr)
6225         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6226                     SZ_NLATTR_STRZ_OF("ingress") +
6227                     MNL_BUF_EXTRA_SPACE];
6228
6229         /* Destroy existing ingress qdisc and everything attached to it. */
6230         nlh = mnl_nlmsg_put_header(buf);
6231         nlh->nlmsg_type = RTM_DELQDISC;
6232         nlh->nlmsg_flags = NLM_F_REQUEST;
6233         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6234         tcm->tcm_family = AF_UNSPEC;
6235         tcm->tcm_ifindex = ifindex;
6236         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6237         tcm->tcm_parent = TC_H_INGRESS;
6238         assert(sizeof(buf) >= nlh->nlmsg_len);
6239         /* Ignore errors when qdisc is already absent. */
6240         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6241             rte_errno != EINVAL && rte_errno != ENOENT)
6242                 return rte_flow_error_set(error, rte_errno,
6243                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6244                                           "netlink: failed to remove ingress"
6245                                           " qdisc");
6246         /* Create fresh ingress qdisc. */
6247         nlh = mnl_nlmsg_put_header(buf);
6248         nlh->nlmsg_type = RTM_NEWQDISC;
6249         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6250         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6251         tcm->tcm_family = AF_UNSPEC;
6252         tcm->tcm_ifindex = ifindex;
6253         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6254         tcm->tcm_parent = TC_H_INGRESS;
6255         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6256         assert(sizeof(buf) >= nlh->nlmsg_len);
6257         if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6258                 return rte_flow_error_set(error, rte_errno,
6259                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6260                                           "netlink: failed to create ingress"
6261                                           " qdisc");
6262         return 0;
6263 }
6264
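/*
 * For reference, the two Netlink requests above correspond to the
 * following iproute2 commands ("ethX" stands for the interface matching
 * @p ifindex):
 *
 *	tc qdisc del dev ethX ingress
 *	tc qdisc add dev ethX ingress
 */
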
6265 /**
6266  * Create libmnl context for Netlink flow rules.
6267  *
6268  * @return
6269  *   A valid tc-flower context pointer on success, NULL otherwise and
6270  *   rte_errno is set.
6271  */
6272 struct mlx5_flow_tcf_context *
6273 mlx5_flow_tcf_context_create(void)
6274 {
6275         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6276                                                         sizeof(*ctx),
6277                                                         sizeof(uint32_t));
6278         if (!ctx)
6279                 goto error;
6280         ctx->nl = flow_tcf_mnl_socket_create();
6281         if (!ctx->nl)
6282                 goto error;
6283         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6284         ctx->buf = rte_zmalloc(__func__,
6285                                ctx->buf_size, sizeof(uint32_t));
6286         if (!ctx->buf)
6287                 goto error;
6288         ctx->seq = random();
6289         return ctx;
6290 error:
6291         mlx5_flow_tcf_context_destroy(ctx);
6292         return NULL;
6293 }
6294
6295 /**
6296  * Destroy a libmnl context.
6297  *
6298  * @param ctx
6299  *   Pointer to tc-flower context to destroy.
6300  */
6301 void
6302 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6303 {
6304         if (!ctx)
6305                 return;
6306         flow_tcf_mnl_socket_destroy(ctx->nl);
6307         rte_free(ctx->buf);
6308         rte_free(ctx);
6309 }
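
/*
 * Lifecycle sketch (illustration only, assuming the prototypes exported
 * through mlx5_flow.h): a tc-flower context is created once, used to
 * (re)initialize the ingress qdisc of an interface, and destroyed when
 * the device is closed. The "ifindex" value is a caller-owned interface
 * index.
 */
static int __rte_unused
flow_tcf_context_usage(unsigned int ifindex, struct rte_flow_error *error)
{
	struct mlx5_flow_tcf_context *ctx = mlx5_flow_tcf_context_create();
	int ret;

	if (!ctx)
		return -rte_errno;
	ret = mlx5_flow_tcf_init(ctx, ifindex, error);
	mlx5_flow_tcf_context_destroy(ctx);
	return ret;
}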