net/mlx5: swap items/actions validations for E-Switch rules
[dpdk.git] drivers/net/mlx5/mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
18 #include <stdalign.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <sys/socket.h>
24
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
28 #include <rte_flow.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31
32 #include "mlx5.h"
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
35
36 #ifdef HAVE_TC_ACT_VLAN
37
38 #include <linux/tc_act/tc_vlan.h>
39
40 #else /* HAVE_TC_ACT_VLAN */
41
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
50
51 struct tc_vlan {
52         tc_gen;
53         int v_action;
54 };
55
56 #endif /* HAVE_TC_ACT_VLAN */
57
58 #ifdef HAVE_TC_ACT_PEDIT
59
60 #include <linux/tc_act/tc_pedit.h>
61
62 #else /* HAVE_TC_ACT_PEDIT */
63
64 enum {
65         TCA_PEDIT_UNSPEC,
66         TCA_PEDIT_TM,
67         TCA_PEDIT_PARMS,
68         TCA_PEDIT_PAD,
69         TCA_PEDIT_PARMS_EX,
70         TCA_PEDIT_KEYS_EX,
71         TCA_PEDIT_KEY_EX,
72         __TCA_PEDIT_MAX
73 };
74
75 enum {
76         TCA_PEDIT_KEY_EX_HTYPE = 1,
77         TCA_PEDIT_KEY_EX_CMD = 2,
78         __TCA_PEDIT_KEY_EX_MAX
79 };
80
81 enum pedit_header_type {
82         TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83         TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84         TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85         TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86         TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87         TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
88         __PEDIT_HDR_TYPE_MAX,
89 };
90
91 enum pedit_cmd {
92         TCA_PEDIT_KEY_EX_CMD_SET = 0,
93         TCA_PEDIT_KEY_EX_CMD_ADD = 1,
94         __PEDIT_CMD_MAX,
95 };
96
97 struct tc_pedit_key {
98         __u32 mask; /* AND */
99         __u32 val; /* XOR */
100         __u32 off; /* offset */
101         __u32 at;
102         __u32 offmask;
103         __u32 shift;
104 };
105
106 __extension__
107 struct tc_pedit_sel {
108         tc_gen;
109         unsigned char nkeys;
110         unsigned char flags;
111         struct tc_pedit_key keys[0];
112 };
113
114 #endif /* HAVE_TC_ACT_PEDIT */
115
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
117
118 #include <linux/tc_act/tc_tunnel_key.h>
119
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
122 #endif
123
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
126 #endif
127
128 #else /* HAVE_TC_ACT_TUNNEL_KEY */
129
130 #define TCA_ACT_TUNNEL_KEY 17
131 #define TCA_TUNNEL_KEY_ACT_SET 1
132 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
133 #define TCA_TUNNEL_KEY_PARMS 2
134 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
135 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
136 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
137 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
138 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
139 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
140 #define TCA_TUNNEL_KEY_NO_CSUM 10
141
142 struct tc_tunnel_key {
143         tc_gen;
144         int t_action;
145 };
146
147 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
148
149 /* Normally found in linux/netlink.h. */
150 #ifndef NETLINK_CAP_ACK
151 #define NETLINK_CAP_ACK 10
152 #endif
153
154 /* Normally found in linux/pkt_sched.h. */
155 #ifndef TC_H_MIN_INGRESS
156 #define TC_H_MIN_INGRESS 0xfff2u
157 #endif
158
159 /* Normally found in linux/pkt_cls.h. */
160 #ifndef TCA_CLS_FLAGS_SKIP_SW
161 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
162 #endif
163 #ifndef HAVE_TCA_CHAIN
164 #define TCA_CHAIN 11
165 #endif
166 #ifndef HAVE_TCA_FLOWER_ACT
167 #define TCA_FLOWER_ACT 3
168 #endif
169 #ifndef HAVE_TCA_FLOWER_FLAGS
170 #define TCA_FLOWER_FLAGS 22
171 #endif
172 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
173 #define TCA_FLOWER_KEY_ETH_TYPE 8
174 #endif
175 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
176 #define TCA_FLOWER_KEY_ETH_DST 4
177 #endif
178 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
179 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
180 #endif
181 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
182 #define TCA_FLOWER_KEY_ETH_SRC 6
183 #endif
184 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
185 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
186 #endif
187 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
188 #define TCA_FLOWER_KEY_IP_PROTO 9
189 #endif
190 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
191 #define TCA_FLOWER_KEY_IPV4_SRC 10
192 #endif
193 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
194 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
195 #endif
196 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
197 #define TCA_FLOWER_KEY_IPV4_DST 12
198 #endif
199 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
200 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
201 #endif
202 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
203 #define TCA_FLOWER_KEY_IPV6_SRC 14
204 #endif
205 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
206 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
207 #endif
208 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
209 #define TCA_FLOWER_KEY_IPV6_DST 16
210 #endif
211 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
212 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
213 #endif
214 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
215 #define TCA_FLOWER_KEY_TCP_SRC 18
216 #endif
217 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
218 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
219 #endif
220 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
221 #define TCA_FLOWER_KEY_TCP_DST 19
222 #endif
223 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
224 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
225 #endif
226 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
227 #define TCA_FLOWER_KEY_UDP_SRC 20
228 #endif
229 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
230 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
231 #endif
232 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
233 #define TCA_FLOWER_KEY_UDP_DST 21
234 #endif
235 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
236 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
237 #endif
238 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
239 #define TCA_FLOWER_KEY_VLAN_ID 23
240 #endif
241 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
242 #define TCA_FLOWER_KEY_VLAN_PRIO 24
243 #endif
244 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
245 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
246 #endif
247 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
248 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
249 #endif
250 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
251 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
252 #endif
253 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
254 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
255 #endif
256 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
257 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
258 #endif
259 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
260 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
261 #endif
262 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
263 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
264 #endif
265 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
266 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
267 #endif
268 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
269 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
270 #endif
271 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
272 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
273 #endif
274 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
275 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
276 #endif
277 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
278 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
279 #endif
280 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
281 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
282 #endif
283 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
284 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
285 #endif
286 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
287 #define TCA_FLOWER_KEY_TCP_FLAGS 71
288 #endif
289 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
290 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
291 #endif
292 #ifndef HAVE_TC_ACT_GOTO_CHAIN
293 #define TC_ACT_GOTO_CHAIN 0x20000000
294 #endif
295
296 #ifndef IPV6_ADDR_LEN
297 #define IPV6_ADDR_LEN 16
298 #endif
299
300 #ifndef IPV4_ADDR_LEN
301 #define IPV4_ADDR_LEN 4
302 #endif
303
304 #ifndef TP_PORT_LEN
305 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
306 #endif
307
308 #ifndef TTL_LEN
309 #define TTL_LEN 1
310 #endif
311
312 #ifndef TCA_ACT_MAX_PRIO
313 #define TCA_ACT_MAX_PRIO 32
314 #endif
315
316 /** UDP port range of VXLAN devices created by driver. */
317 #define MLX5_VXLAN_PORT_MIN 30000
318 #define MLX5_VXLAN_PORT_MAX 60000
319 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
320
321 /** Tunnel action type, used for @p type in header structure. */
322 enum flow_tcf_tunact_type {
323         FLOW_TCF_TUNACT_VXLAN_DECAP,
324         FLOW_TCF_TUNACT_VXLAN_ENCAP,
325 };
326
327 /** Flags used for @p mask in tunnel action encap descriptors. */
328 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
329 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
330 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
331 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
332 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
333 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
334 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
335 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
336 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
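/*
 * Editorial example (added, not in the original source): a typical IPv4/UDP
 * VXLAN encapsulation descriptor would combine several of these bits, e.g.
 * FLOW_TCF_ENCAP_ETH_DST | FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST |
 * FLOW_TCF_ENCAP_UDP_DST | FLOW_TCF_ENCAP_VXLAN_VNI.
 */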
337
338 /**
339  * Structure for holding netlink context.
340  * The message buffer size is MNL_SOCKET_BUFFER_SIZE.
341  * Using this (8KB) buffer size ensures that netlink messages will never be
342  * truncated.
343  */
344 struct mlx5_flow_tcf_context {
345         struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
346         uint32_t seq; /* Message sequence number. */
347         uint32_t buf_size; /* Message buffer size. */
348         uint8_t *buf; /* Message buffer. */
349 };
350
351 /**
352  * Neigh rule structure. The neigh rule is applied via Netlink to
353  * outer tunnel iface in order to provide destination MAC address
354  * for the VXLAN encapsulation. The neigh rule is implicitly related
355  * to the Flow itself and can be shared by multiple Flows.
356  */
357 struct tcf_neigh_rule {
358         LIST_ENTRY(tcf_neigh_rule) next;
359         uint32_t refcnt;
360         struct ether_addr eth;
361         uint16_t mask;
362         union {
363                 struct {
364                         rte_be32_t dst;
365                 } ipv4;
366                 struct {
367                         uint8_t dst[IPV6_ADDR_LEN];
368                 } ipv6;
369         };
370 };
371
372 /**
373  * Local rule structure. The local rule is applied via Netlink to
374  * outer tunnel iface in order to provide local and peer IP addresses
375  * of the VXLAN tunnel for encapsulation. The local rule is implicitly
376  * related to the Flow itself and can be shared by multiple Flows.
377  */
378 struct tcf_local_rule {
379         LIST_ENTRY(tcf_local_rule) next;
380         uint32_t refcnt;
381         uint16_t mask;
382         union {
383                 struct {
384                         rte_be32_t dst;
385                         rte_be32_t src;
386                 } ipv4;
387                 struct {
388                         uint8_t dst[IPV6_ADDR_LEN];
389                         uint8_t src[IPV6_ADDR_LEN];
390                 } ipv6;
391         };
392 };
393
394 /** VXLAN virtual netdev. */
395 struct tcf_vtep {
396         LIST_ENTRY(tcf_vtep) next;
397         LIST_HEAD(, tcf_neigh_rule) neigh;
398         LIST_HEAD(, tcf_local_rule) local;
399         uint32_t refcnt;
400         unsigned int ifindex; /**< Own interface index. */
401         unsigned int ifouter; /**< Index of device attached to. */
402         uint16_t port;
403         uint8_t created;
404 };
405
406 /** Tunnel descriptor header, common for all tunnel types. */
407 struct flow_tcf_tunnel_hdr {
408         uint32_t type; /**< Tunnel action type. */
409         struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
410         unsigned int ifindex_org; /**< Original dst/src interface */
411         unsigned int *ifindex_ptr; /**< Interface ptr in message. */
412 };
413
414 struct flow_tcf_vxlan_decap {
415         struct flow_tcf_tunnel_hdr hdr;
416         uint16_t udp_port;
417 };
418
419 struct flow_tcf_vxlan_encap {
420         struct flow_tcf_tunnel_hdr hdr;
421         uint32_t mask;
422         struct {
423                 struct ether_addr dst;
424                 struct ether_addr src;
425         } eth;
426         union {
427                 struct {
428                         rte_be32_t dst;
429                         rte_be32_t src;
430                 } ipv4;
431                 struct {
432                         uint8_t dst[IPV6_ADDR_LEN];
433                         uint8_t src[IPV6_ADDR_LEN];
434                 } ipv6;
435         };
436         struct {
437                 rte_be16_t src;
438                 rte_be16_t dst;
439         } udp;
440         struct {
441                 uint8_t vni[3];
442         } vxlan;
443 };
444
445 /** Structure used when extracting the values of a flow counter
446  * from a netlink message.
447  */
448 struct flow_tcf_stats_basic {
449         bool valid;
450         struct gnet_stats_basic counters;
451 };
452
453 /** Empty masks for known item types. */
454 static const union {
455         struct rte_flow_item_port_id port_id;
456         struct rte_flow_item_eth eth;
457         struct rte_flow_item_vlan vlan;
458         struct rte_flow_item_ipv4 ipv4;
459         struct rte_flow_item_ipv6 ipv6;
460         struct rte_flow_item_tcp tcp;
461         struct rte_flow_item_udp udp;
462         struct rte_flow_item_vxlan vxlan;
463 } flow_tcf_mask_empty;
464
465 /** Supported masks for known item types. */
466 static const struct {
467         struct rte_flow_item_port_id port_id;
468         struct rte_flow_item_eth eth;
469         struct rte_flow_item_vlan vlan;
470         struct rte_flow_item_ipv4 ipv4;
471         struct rte_flow_item_ipv6 ipv6;
472         struct rte_flow_item_tcp tcp;
473         struct rte_flow_item_udp udp;
474         struct rte_flow_item_vxlan vxlan;
475 } flow_tcf_mask_supported = {
476         .port_id = {
477                 .id = 0xffffffff,
478         },
479         .eth = {
480                 .type = RTE_BE16(0xffff),
481                 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
482                 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
483         },
484         .vlan = {
485                 /* PCP and VID only, no DEI. */
486                 .tci = RTE_BE16(0xefff),
487                 .inner_type = RTE_BE16(0xffff),
488         },
489         .ipv4.hdr = {
490                 .next_proto_id = 0xff,
491                 .src_addr = RTE_BE32(0xffffffff),
492                 .dst_addr = RTE_BE32(0xffffffff),
493         },
494         .ipv6.hdr = {
495                 .proto = 0xff,
496                 .src_addr =
497                         "\xff\xff\xff\xff\xff\xff\xff\xff"
498                         "\xff\xff\xff\xff\xff\xff\xff\xff",
499                 .dst_addr =
500                         "\xff\xff\xff\xff\xff\xff\xff\xff"
501                         "\xff\xff\xff\xff\xff\xff\xff\xff",
502         },
503         .tcp.hdr = {
504                 .src_port = RTE_BE16(0xffff),
505                 .dst_port = RTE_BE16(0xffff),
506                 .tcp_flags = 0xff,
507         },
508         .udp.hdr = {
509                 .src_port = RTE_BE16(0xffff),
510                 .dst_port = RTE_BE16(0xffff),
511         },
512         .vxlan = {
513                .vni = "\xff\xff\xff",
514         },
515 };
516
517 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
518 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
519 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
520 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
521 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
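/*
 * Editorial example (added, not in the original source), assuming the usual
 * 4-byte struct nlattr header and 4-byte MNL alignment:
 *   SZ_NLATTR_TYPE_OF(uint16_t) = MNL_ALIGN(4 + 2) = 8 bytes,
 *   SZ_NLATTR_TYPE_OF(uint32_t) = MNL_ALIGN(4 + 4) = 8 bytes,
 *   SZ_NLATTR_STRZ_OF("pedit")  = MNL_ALIGN(4 + 6) = 12 bytes.
 */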
522
523 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
524
525 /** DPDK port to network interface index (ifindex) conversion. */
526 struct flow_tcf_ptoi {
527         uint16_t port_id; /**< DPDK port ID. */
528         unsigned int ifindex; /**< Network interface index. */
529 };
530
531 /* Due to a driver/FW limitation. */
532 #define MLX5_TCF_GROUP_ID_MAX 3
533 #define MLX5_TCF_GROUP_PRIORITY_MAX 14
534
535 #define MLX5_TCF_FATE_ACTIONS \
536         (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
537          MLX5_FLOW_ACTION_JUMP)
538
539 #define MLX5_TCF_VLAN_ACTIONS \
540         (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
541          MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
542
543 #define MLX5_TCF_VXLAN_ACTIONS \
544         (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
545
546 #define MLX5_TCF_PEDIT_ACTIONS \
547         (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
548          MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
549          MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
550          MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
551          MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
552
553 #define MLX5_TCF_CONFIG_ACTIONS \
554         (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
555          MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
556          MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
557          (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
558
559 #define MAX_PEDIT_KEYS 128
560 #define SZ_PEDIT_KEY_VAL 4
561
562 #define NUM_OF_PEDIT_KEYS(sz) \
563         (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
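/*
 * Editorial example (added, not in the original source): with 4-byte keys,
 * NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN) = 4, NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) = 2
 * and NUM_OF_PEDIT_KEYS(TTL_LEN) = 1.
 */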
564
565 struct pedit_key_ex {
566         enum pedit_header_type htype;
567         enum pedit_cmd cmd;
568 };
569
570 struct pedit_parser {
571         struct tc_pedit_sel sel;
572         struct tc_pedit_key keys[MAX_PEDIT_KEYS];
573         struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
574 };
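/*
 * Editorial note (added, not in the original source): with MAX_PEDIT_KEYS of
 * 128 and 24-byte struct tc_pedit_key entries on a typical ABI, a
 * pedit_parser occupies roughly 4 KB; it is allocated on the stack in
 * flow_tcf_create_pedit_mnl_msg() below.
 */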
575
576 /**
577  * Create space for using the implicitly created TC flow counter.
578  *
579  * The function takes no parameters; the counter is created implicitly
580  * together with the TC flow rule itself.
581  *
582  * @return
583  *   A pointer to the counter data structure, NULL otherwise and
584  *   rte_errno is set.
585  */
586 static struct mlx5_flow_counter *
587 flow_tcf_counter_new(void)
588 {
589         struct mlx5_flow_counter *cnt;
590
591         /*
592          * E-Switch counters cannot be shared and their IDs are unknown,
593          * so all are currently returned with ID 0. In the future it may
594          * be better to switch to unique numbers.
595          */
596         struct mlx5_flow_counter tmpl = {
597                 .ref_cnt = 1,
598         };
599         cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
600         if (!cnt) {
601                 rte_errno = ENOMEM;
602                 return NULL;
603         }
604         *cnt = tmpl;
605         /* Implicit counter, do not add to list. */
606         return cnt;
607 }
608
609 /**
610  * Set pedit key of MAC address
611  *
612  * @param[in] actions
613  *   pointer to action specification
614  * @param[in,out] p_parser
615  *   pointer to pedit_parser
616  */
617 static void
618 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
619                            struct pedit_parser *p_parser)
620 {
621         int idx = p_parser->sel.nkeys;
622         uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
623                                         offsetof(struct ether_hdr, s_addr) :
624                                         offsetof(struct ether_hdr, d_addr);
625         const struct rte_flow_action_set_mac *conf =
626                 (const struct rte_flow_action_set_mac *)actions->conf;
627
628         p_parser->keys[idx].off = off;
629         p_parser->keys[idx].mask = ~UINT32_MAX;
630         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
631         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
632         memcpy(&p_parser->keys[idx].val,
633                 conf->mac_addr, SZ_PEDIT_KEY_VAL);
634         idx++;
635         p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
636         p_parser->keys[idx].mask = 0xFFFF0000;
637         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
638         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
639         memcpy(&p_parser->keys[idx].val,
640                 conf->mac_addr + SZ_PEDIT_KEY_VAL,
641                 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
642         p_parser->sel.nkeys = (++idx);
643 }
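/*
 * Editorial note (added, not in the original source): the 6-byte MAC address
 * does not fit a single 4-byte pedit key, hence the split above. The first
 * key rewrites bytes 0-3 (mask ~UINT32_MAX evaluates to 0, i.e. every bit of
 * those bytes is replaced), the second key carries only the remaining two
 * bytes and uses the 0xFFFF0000 mask so the unrelated half of that 32-bit
 * word is meant to stay untouched.
 */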
644
645 /**
646  * Set pedit key of decrease/set ttl
647  *
648  * @param[in] actions
649  *   pointer to action specification
650  * @param[in,out] p_parser
651  *   pointer to pedit_parser
652  * @param[in] item_flags
653  *   flags of all items presented
654  */
655 static void
656 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
657                                 struct pedit_parser *p_parser,
658                                 uint64_t item_flags)
659 {
660         int idx = p_parser->sel.nkeys;
661
662         p_parser->keys[idx].mask = 0xFFFFFF00;
663         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
664                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
665                 p_parser->keys[idx].off =
666                         offsetof(struct ipv4_hdr, time_to_live);
667         }
668         if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
669                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
670                 p_parser->keys[idx].off =
671                         offsetof(struct ipv6_hdr, hop_limits);
672         }
673         if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
674                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
675                 p_parser->keys[idx].val = 0x000000FF;
676         } else {
677                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
678                 p_parser->keys[idx].val =
679                         (__u32)((const struct rte_flow_action_set_ttl *)
680                          actions->conf)->ttl_value;
681         }
682         p_parser->sel.nkeys = (++idx);
683 }
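/*
 * Editorial note (added, not in the original source): for DEC_TTL the key
 * uses the ADD command with a value of 0xFF; adding 255 to the one-byte
 * TTL/hop-limit field is equivalent to decrementing it by one (mod 256),
 * and the 0xFFFFFF00 mask restricts the rewrite to that single byte.
 */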
684
685 /**
686  * Set pedit key of transport (TCP/UDP) port value
687  *
688  * @param[in] actions
689  *   pointer to action specification
690  * @param[in,out] p_parser
691  *   pointer to pedit_parser
692  * @param[in] item_flags
693  *   flags of all items presented
694  */
695 static void
696 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
697                                 struct pedit_parser *p_parser,
698                                 uint64_t item_flags)
699 {
700         int idx = p_parser->sel.nkeys;
701
702         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
703                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
704         if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
705                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
706         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
707         /* The offset of the src/dst port is the same for TCP and UDP. */
708         p_parser->keys[idx].off =
709                 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
710                 offsetof(struct tcp_hdr, src_port) :
711                 offsetof(struct tcp_hdr, dst_port);
712         p_parser->keys[idx].mask = 0xFFFF0000;
713         p_parser->keys[idx].val =
714                 (__u32)((const struct rte_flow_action_set_tp *)
715                                 actions->conf)->port;
716         p_parser->sel.nkeys = (++idx);
717 }
718
719 /**
720  * Set pedit key of ipv6 address
721  *
722  * @param[in] actions
723  *   pointer to action specification
724  * @param[in,out] p_parser
725  *   pointer to pedit_parser
726  */
727 static void
728 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
729                                  struct pedit_parser *p_parser)
730 {
731         int idx = p_parser->sel.nkeys;
732         int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
733         int off_base =
734                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
735                 offsetof(struct ipv6_hdr, src_addr) :
736                 offsetof(struct ipv6_hdr, dst_addr);
737         const struct rte_flow_action_set_ipv6 *conf =
738                 (const struct rte_flow_action_set_ipv6 *)actions->conf;
739
740         for (int i = 0; i < keys; i++, idx++) {
741                 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
742                 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
743                 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
744                 p_parser->keys[idx].mask = ~UINT32_MAX;
745                 memcpy(&p_parser->keys[idx].val,
746                         conf->ipv6_addr + i *  SZ_PEDIT_KEY_VAL,
747                         SZ_PEDIT_KEY_VAL);
748         }
749         p_parser->sel.nkeys += keys;
750 }
751
752 /**
753  * Set pedit key of ipv4 address
754  *
755  * @param[in] actions
756  *   pointer to action specification
757  * @param[in,out] p_parser
758  *   pointer to pedit_parser
759  */
760 static void
761 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
762                                  struct pedit_parser *p_parser)
763 {
764         int idx = p_parser->sel.nkeys;
765
766         p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
767         p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
768         p_parser->keys[idx].off =
769                 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
770                 offsetof(struct ipv4_hdr, src_addr) :
771                 offsetof(struct ipv4_hdr, dst_addr);
772         p_parser->keys[idx].mask = ~UINT32_MAX;
773         p_parser->keys[idx].val =
774                 ((const struct rte_flow_action_set_ipv4 *)
775                  actions->conf)->ipv4_addr;
776         p_parser->sel.nkeys = (++idx);
777 }
778
779 /**
780  * Create the pedit's netlink attributes in the pre-allocated netlink
781  * message buffer.
782  *
783  * @param[in,out] nl
784  *   Pointer to the pre-allocated netlink message buffer.
785  * @param[in,out] actions
786  *   Pointer to a pointer to the actions specification; advanced to the
787  *   last processed modify-header action on return.
788  * @param[in] item_flags
789  *   Flags of all items presented.
790  *
791  */
792 static void
793 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
794                               const struct rte_flow_action **actions,
795                               uint64_t item_flags)
796 {
797         struct pedit_parser p_parser;
798         struct nlattr *na_act_options;
799         struct nlattr *na_pedit_keys;
800
801         memset(&p_parser, 0, sizeof(p_parser));
802         mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
803         na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
804         /* All modify-header actions should be in one tc-pedit action. */
805         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
806                 switch ((*actions)->type) {
807                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
808                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
809                         flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
810                         break;
811                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
812                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
813                         flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
814                         break;
815                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
816                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
817                         flow_tcf_pedit_key_set_tp_port(*actions,
818                                                         &p_parser, item_flags);
819                         break;
820                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
821                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
822                         flow_tcf_pedit_key_set_dec_ttl(*actions,
823                                                         &p_parser, item_flags);
824                         break;
825                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
826                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
827                         flow_tcf_pedit_key_set_mac(*actions, &p_parser);
828                         break;
829                 default:
830                         goto pedit_mnl_msg_done;
831                 }
832         }
833 pedit_mnl_msg_done:
834         p_parser.sel.action = TC_ACT_PIPE;
835         mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
836                      sizeof(p_parser.sel) +
837                      p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
838                      &p_parser);
839         na_pedit_keys =
840                 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
841         for (int i = 0; i < p_parser.sel.nkeys; i++) {
842                 struct nlattr *na_pedit_key =
843                         mnl_attr_nest_start(nl,
844                                             TCA_PEDIT_KEY_EX | NLA_F_NESTED);
845                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
846                                  p_parser.keys_ex[i].htype);
847                 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
848                                  p_parser.keys_ex[i].cmd);
849                 mnl_attr_nest_end(nl, na_pedit_key);
850         }
851         mnl_attr_nest_end(nl, na_pedit_keys);
852         mnl_attr_nest_end(nl, na_act_options);
853         (*actions)--;
854 }
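/*
 * Editorial note (added, not in the original source): the final (*actions)--
 * leaves the caller's cursor on the last modify-header action that was
 * consumed, so the caller's own loop increment moves it to the first
 * unprocessed action. flow_tcf_get_pedit_actions_size() below follows the
 * same convention.
 */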
855
856 /**
857  * Calculate the maximum memory size of one TC-pedit action.
858  * One TC-pedit action can contain a set of keys, each defining
859  * a rewrite element (rte_flow action).
860  *
861  * @param[in,out] actions
862  *   Pointer to a pointer to the actions specification; advanced to the
863  *   last processed modify-header action on return.
864  * @param[in,out] action_flags
865  *   Pointer to the accumulated action flags, updated with the flags of
866  *   the processed actions.
867  * @return
868  *   Maximum memory size of one TC-pedit action.
869  */
870 static int
871 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
872                                 uint64_t *action_flags)
873 {
874         int pedit_size = 0;
875         int keys = 0;
876         uint64_t flags = 0;
877
878         pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
879                       SZ_NLATTR_STRZ_OF("pedit") +
880                       SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
881         for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
882                 switch ((*actions)->type) {
883                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
884                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
885                         flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
886                         break;
887                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
888                         keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
889                         flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
890                         break;
891                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
892                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
893                         flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
894                         break;
895                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
896                         keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
897                         flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
898                         break;
899                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
900                         /* TCP is the same as UDP. */
901                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
902                         flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
903                         break;
904                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
905                         /* TCP is the same as UDP. */
906                         keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
907                         flags |= MLX5_FLOW_ACTION_SET_TP_DST;
908                         break;
909                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
910                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
911                         flags |= MLX5_FLOW_ACTION_SET_TTL;
912                         break;
913                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
914                         keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
915                         flags |= MLX5_FLOW_ACTION_DEC_TTL;
916                         break;
917                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
918                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
919                         flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
920                         break;
921                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
922                         keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
923                         flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
924                         break;
925                 default:
926                         goto get_pedit_action_size_done;
927                 }
928         }
929 get_pedit_action_size_done:
930         /* TCA_PEDIT_PARMS_EX */
931         pedit_size +=
932                 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
933                                   keys * sizeof(struct tc_pedit_key));
934         pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
935         pedit_size += keys *
936                       /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
937                       (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
938                        SZ_NLATTR_DATA_OF(2));
939         (*action_flags) |= flags;
940         (*actions)--;
941         return pedit_size;
942 }
943
944 /**
945  * Retrieve mask for pattern item.
946  *
947  * This function does basic sanity checks on a pattern item in order to
948  * return the most appropriate mask for it.
949  *
950  * @param[in] item
951  *   Item specification.
952  * @param[in] mask_default
953  *   Default mask for pattern item as specified by the flow API.
954  * @param[in] mask_supported
955  *   Mask fields supported by the implementation.
956  * @param[in] mask_empty
957  *   Empty mask to return when there is no specification.
958  * @param[out] error
959  *   Perform verbose error reporting if not NULL.
960  *
961  * @return
962  *   Either @p item->mask or one of the mask parameters on success, NULL
963  *   otherwise and rte_errno is set.
964  */
965 static const void *
966 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
967                    const void *mask_supported, const void *mask_empty,
968                    size_t mask_size, struct rte_flow_error *error)
969 {
970         const uint8_t *mask;
971         size_t i;
972
973         /* item->last and item->mask cannot exist without item->spec. */
974         if (!item->spec && (item->mask || item->last)) {
975                 rte_flow_error_set(error, EINVAL,
976                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
977                                    "\"mask\" or \"last\" field provided without"
978                                    " a corresponding \"spec\"");
979                 return NULL;
980         }
981         /* No spec, no mask, no problem. */
982         if (!item->spec)
983                 return mask_empty;
984         mask = item->mask ? item->mask : mask_default;
985         assert(mask);
986         /*
987          * Single-pass check to make sure that:
988          * - Mask is supported, no bits are set outside mask_supported.
989          * - Both item->spec and item->last are included in mask.
990          */
991         for (i = 0; i != mask_size; ++i) {
992                 if (!mask[i])
993                         continue;
994                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
995                     ((const uint8_t *)mask_supported)[i]) {
996                         rte_flow_error_set(error, ENOTSUP,
997                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
998                                            "unsupported field found"
999                                            " in \"mask\"");
1000                         return NULL;
1001                 }
1002                 if (item->last &&
1003                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
1004                     (((const uint8_t *)item->last)[i] & mask[i])) {
1005                         rte_flow_error_set(error, EINVAL,
1006                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1007                                            item->last,
1008                                            "range between \"spec\" and \"last\""
1009                                            " not comprised in \"mask\"");
1010                         return NULL;
1011                 }
1012         }
1013         return mask;
1014 }
1015
1016 /**
1017  * Build a conversion table between port ID and ifindex.
1018  *
1019  * @param[in] dev
1020  *   Pointer to Ethernet device.
1021  * @param[out] ptoi
1022  *   Pointer to ptoi table.
1023  * @param[in] len
1024  *   Size of ptoi table provided.
1025  *
1026  * @return
1027  *   Size of ptoi table filled.
1028  */
1029 static unsigned int
1030 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1031                           unsigned int len)
1032 {
1033         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1034         uint16_t port_id[n + 1];
1035         unsigned int i;
1036         unsigned int own = 0;
1037
1038         /* At least one port is needed when no switch domain is present. */
1039         if (!n) {
1040                 n = 1;
1041                 port_id[0] = dev->data->port_id;
1042         } else {
1043                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1044         }
1045         if (n > len)
1046                 return 0;
1047         for (i = 0; i != n; ++i) {
1048                 struct rte_eth_dev_info dev_info;
1049
1050                 rte_eth_dev_info_get(port_id[i], &dev_info);
1051                 if (port_id[i] == dev->data->port_id)
1052                         own = i;
1053                 ptoi[i].port_id = port_id[i];
1054                 ptoi[i].ifindex = dev_info.if_index;
1055         }
1056         /* Ensure first entry of ptoi[] is the current device. */
1057         if (own) {
1058                 ptoi[n] = ptoi[0];
1059                 ptoi[0] = ptoi[own];
1060                 ptoi[own] = ptoi[n];
1061         }
1062         /* An entry with zero ifindex terminates ptoi[]. */
1063         ptoi[n].port_id = 0;
1064         ptoi[n].ifindex = 0;
1065         return n;
1066 }
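/*
 * Editorial example (added, not in the original source): for a switch domain
 * with ports 3 (ifindex 7) and 5 (ifindex 10) where port 5 is the caller,
 * the resulting table is { {5, 10}, {3, 7}, {0, 0} }: the caller's entry is
 * swapped to the front and a zero ifindex terminates the list.
 */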
1067
1068 /**
1069  * Verify the @p attr will be correctly understood by the E-switch.
1070  *
1071  * @param[in] attr
1072  *   Pointer to flow attributes
1073  * @param[out] error
1074  *   Pointer to error structure.
1075  *
1076  * @return
1077  *   0 on success, a negative errno value otherwise and rte_errno is set.
1078  */
1079 static int
1080 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1081                              struct rte_flow_error *error)
1082 {
1083         /*
1084          * Supported attributes: groups, some priorities and ingress only.
1085          * Group is supported only if the kernel supports chains. Transfer
1086          * is not checked here, as it is the caller's problem.
1087          */
1088         if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1089                 return rte_flow_error_set(error, ENOTSUP,
1090                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1091                                           "group ID larger than "
1092                                           RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1093                                           " isn't supported");
1094         else if (attr->group > 0 &&
1095                  attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1096                 return rte_flow_error_set(error, ENOTSUP,
1097                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1098                                           attr,
1099                                           "lowest priority level is "
1100                                           RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1101                                           " when group is configured");
1102         else if (attr->priority > 0xfffe)
1103                 return rte_flow_error_set(error, ENOTSUP,
1104                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1105                                           attr,
1106                                           "lowest priority level is 0xfffe");
1107         if (!attr->ingress)
1108                 return rte_flow_error_set(error, EINVAL,
1109                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1110                                           attr, "only ingress is supported");
1111         if (attr->egress)
1112                 return rte_flow_error_set(error, ENOTSUP,
1113                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1114                                           attr, "egress is not supported");
1115         return 0;
1116 }
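/*
 * Editorial example (added, not in the original source): attributes such as
 * group=2, priority=10, ingress=1 pass this check, while group=1 with
 * priority=15 is rejected because MLX5_TCF_GROUP_PRIORITY_MAX is 14, and any
 * egress attribute is rejected outright.
 */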
1117
1118 /**
1119  * Validate flow for E-Switch.
1120  *
1121  * @param[in] priv
1122  *   Pointer to the priv structure.
1123  * @param[in] attr
1124  *   Pointer to the flow attributes.
1125  * @param[in] items
1126  *   Pointer to the list of items.
1127  * @param[in] actions
1128  *   Pointer to the list of actions.
1129  * @param[out] error
1130  *   Pointer to the error structure.
1131  *
1132  * @return
1133  *   0 on success, a negative errno value otherwise and rte_errno is set.
1134  */
1135 static int
1136 flow_tcf_validate(struct rte_eth_dev *dev,
1137                   const struct rte_flow_attr *attr,
1138                   const struct rte_flow_item items[],
1139                   const struct rte_flow_action actions[],
1140                   struct rte_flow_error *error)
1141 {
1142         union {
1143                 const struct rte_flow_item_port_id *port_id;
1144                 const struct rte_flow_item_eth *eth;
1145                 const struct rte_flow_item_vlan *vlan;
1146                 const struct rte_flow_item_ipv4 *ipv4;
1147                 const struct rte_flow_item_ipv6 *ipv6;
1148                 const struct rte_flow_item_tcp *tcp;
1149                 const struct rte_flow_item_udp *udp;
1150         } spec, mask;
1151         union {
1152                 const struct rte_flow_action_port_id *port_id;
1153                 const struct rte_flow_action_jump *jump;
1154                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1155                 const struct rte_flow_action_of_set_vlan_vid *
1156                         of_set_vlan_vid;
1157                 const struct rte_flow_action_of_set_vlan_pcp *
1158                         of_set_vlan_pcp;
1159                 const struct rte_flow_action_set_ipv4 *set_ipv4;
1160                 const struct rte_flow_action_set_ipv6 *set_ipv6;
1161         } conf;
1162         uint64_t item_flags = 0;
1163         uint64_t action_flags = 0;
1164         uint8_t next_protocol = -1;
1165         unsigned int tcm_ifindex = 0;
1166         uint8_t pedit_validated = 0;
1167         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1168         struct rte_eth_dev *port_id_dev = NULL;
1169         bool in_port_id_set;
1170         int ret;
1171
1172         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1173                                                 PTOI_TABLE_SZ_MAX(dev)));
1174         ret = flow_tcf_validate_attributes(attr, error);
1175         if (ret < 0)
1176                 return ret;
1177         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1178                 unsigned int i;
1179                 uint64_t current_action_flag = 0;
1180
1181                 switch (actions->type) {
1182                 case RTE_FLOW_ACTION_TYPE_VOID:
1183                         break;
1184                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1185                         current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1186                         if (!actions->conf)
1187                                 break;
1188                         conf.port_id = actions->conf;
1189                         if (conf.port_id->original)
1190                                 i = 0;
1191                         else
1192                                 for (i = 0; ptoi[i].ifindex; ++i)
1193                                         if (ptoi[i].port_id == conf.port_id->id)
1194                                                 break;
1195                         if (!ptoi[i].ifindex)
1196                                 return rte_flow_error_set
1197                                         (error, ENODEV,
1198                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1199                                          conf.port_id,
1200                                          "missing data to convert port ID to"
1201                                          " ifindex");
1202                         port_id_dev = &rte_eth_devices[conf.port_id->id];
1203                         break;
1204                 case RTE_FLOW_ACTION_TYPE_JUMP:
1205                         current_action_flag = MLX5_FLOW_ACTION_JUMP;
1206                         if (!actions->conf)
1207                                 break;
1208                         conf.jump = actions->conf;
1209                         if (attr->group >= conf.jump->group)
1210                                 return rte_flow_error_set
1211                                         (error, ENOTSUP,
1212                                          RTE_FLOW_ERROR_TYPE_ACTION,
1213                                          actions,
1214                                          "can jump only to a group forward");
1215                         break;
1216                 case RTE_FLOW_ACTION_TYPE_DROP:
1217                         current_action_flag = MLX5_FLOW_ACTION_DROP;
1218                         break;
1219                 case RTE_FLOW_ACTION_TYPE_COUNT:
1220                         break;
1221                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1222                         current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1223                         break;
1224                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1225                         current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1226                         break;
1227                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1228                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1229                                 return rte_flow_error_set
1230                                         (error, ENOTSUP,
1231                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1232                                          "vlan modify is not supported,"
1233                                          " set action must follow push action");
1234                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1235                         break;
1236                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1237                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1238                                 return rte_flow_error_set
1239                                         (error, ENOTSUP,
1240                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
1241                                          "vlan modify is not supported,"
1242                                          " set action must follow push action");
1243                         current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1244                         break;
1245                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1246                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1247                         break;
1248                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1249                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1250                         break;
1251                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1252                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1253                         break;
1254                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1255                         current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1256                         break;
1257                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1258                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1259                         break;
1260                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1261                         current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1262                         break;
1263                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1264                         current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1265                         break;
1266                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1267                         current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1268                         break;
1269                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1270                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1271                         break;
1272                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1273                         current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1274                         break;
1275                 default:
1276                         return rte_flow_error_set(error, ENOTSUP,
1277                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1278                                                   actions,
1279                                                   "action not supported");
1280                 }
1281                 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1282                         if (!actions->conf)
1283                                 return rte_flow_error_set
1284                                         (error, EINVAL,
1285                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1286                                          actions,
1287                                          "action configuration not set");
1288                 }
1289                 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1290                     pedit_validated)
1291                         return rte_flow_error_set(error, ENOTSUP,
1292                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1293                                                   actions,
1294                                                   "set actions should be "
1295                                                   "listed successively");
1296                 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1297                     (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1298                         pedit_validated = 1;
1299                 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1300                     (action_flags & MLX5_TCF_FATE_ACTIONS))
1301                         return rte_flow_error_set(error, EINVAL,
1302                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1303                                                   actions,
1304                                                   "can't have multiple fate"
1305                                                   " actions");
1306                 action_flags |= current_action_flag;
1307         }
1308         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1309                 unsigned int i;
1310
1311                 switch (items->type) {
1312                 case RTE_FLOW_ITEM_TYPE_VOID:
1313                         break;
1314                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1315                         mask.port_id = flow_tcf_item_mask
1316                                 (items, &rte_flow_item_port_id_mask,
1317                                  &flow_tcf_mask_supported.port_id,
1318                                  &flow_tcf_mask_empty.port_id,
1319                                  sizeof(flow_tcf_mask_supported.port_id),
1320                                  error);
1321                         if (!mask.port_id)
1322                                 return -rte_errno;
1323                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1324                                 in_port_id_set = 1;
1325                                 break;
1326                         }
1327                         spec.port_id = items->spec;
1328                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1329                                 return rte_flow_error_set
1330                                         (error, ENOTSUP,
1331                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1332                                          mask.port_id,
1333                                          "no support for partial mask on"
1334                                          " \"id\" field");
1335                         if (!mask.port_id->id)
1336                                 i = 0;
1337                         else
1338                                 for (i = 0; ptoi[i].ifindex; ++i)
1339                                         if (ptoi[i].port_id == spec.port_id->id)
1340                                                 break;
1341                         if (!ptoi[i].ifindex)
1342                                 return rte_flow_error_set
1343                                         (error, ENODEV,
1344                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1345                                          spec.port_id,
1346                                          "missing data to convert port ID to"
1347                                          " ifindex");
1348                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1349                                 return rte_flow_error_set
1350                                         (error, ENOTSUP,
1351                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1352                                          spec.port_id,
1353                                          "cannot match traffic for"
1354                                          " several port IDs through"
1355                                          " a single flow rule");
1356                         tcm_ifindex = ptoi[i].ifindex;
1357                         in_port_id_set = 1;
1358                         break;
1359                 case RTE_FLOW_ITEM_TYPE_ETH:
1360                         ret = mlx5_flow_validate_item_eth(items, item_flags,
1361                                                           error);
1362                         if (ret < 0)
1363                                 return ret;
1364                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1365                         /* TODO:
1366                          * Redundant check due to different supported mask.
1367                          * Same for the rest of items.
1368                          */
1369                         mask.eth = flow_tcf_item_mask
1370                                 (items, &rte_flow_item_eth_mask,
1371                                  &flow_tcf_mask_supported.eth,
1372                                  &flow_tcf_mask_empty.eth,
1373                                  sizeof(flow_tcf_mask_supported.eth),
1374                                  error);
1375                         if (!mask.eth)
1376                                 return -rte_errno;
1377                         if (mask.eth->type && mask.eth->type !=
1378                             RTE_BE16(0xffff))
1379                                 return rte_flow_error_set
1380                                         (error, ENOTSUP,
1381                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1382                                          mask.eth,
1383                                          "no support for partial mask on"
1384                                          " \"type\" field");
1385                         break;
1386                 case RTE_FLOW_ITEM_TYPE_VLAN:
1387                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
1388                                                            error);
1389                         if (ret < 0)
1390                                 return ret;
1391                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
1392                         mask.vlan = flow_tcf_item_mask
1393                                 (items, &rte_flow_item_vlan_mask,
1394                                  &flow_tcf_mask_supported.vlan,
1395                                  &flow_tcf_mask_empty.vlan,
1396                                  sizeof(flow_tcf_mask_supported.vlan),
1397                                  error);
1398                         if (!mask.vlan)
1399                                 return -rte_errno;
1400                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
1401                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
1402                               RTE_BE16(0xe000)) ||
1403                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
1404                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
1405                               RTE_BE16(0x0fff)) ||
1406                             (mask.vlan->inner_type &&
1407                              mask.vlan->inner_type != RTE_BE16(0xffff)))
1408                                 return rte_flow_error_set
1409                                         (error, ENOTSUP,
1410                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1411                                          mask.vlan,
1412                                          "no support for partial masks on"
1413                                          " \"tci\" (PCP and VID parts) and"
1414                                          " \"inner_type\" fields");
1415                         break;
1416                 case RTE_FLOW_ITEM_TYPE_IPV4:
1417                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1418                                                            error);
1419                         if (ret < 0)
1420                                 return ret;
1421                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1422                         mask.ipv4 = flow_tcf_item_mask
1423                                 (items, &rte_flow_item_ipv4_mask,
1424                                  &flow_tcf_mask_supported.ipv4,
1425                                  &flow_tcf_mask_empty.ipv4,
1426                                  sizeof(flow_tcf_mask_supported.ipv4),
1427                                  error);
1428                         if (!mask.ipv4)
1429                                 return -rte_errno;
1430                         if (mask.ipv4->hdr.next_proto_id &&
1431                             mask.ipv4->hdr.next_proto_id != 0xff)
1432                                 return rte_flow_error_set
1433                                         (error, ENOTSUP,
1434                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1435                                          mask.ipv4,
1436                                          "no support for partial mask on"
1437                                          " \"hdr.next_proto_id\" field");
1438                         else if (mask.ipv4->hdr.next_proto_id)
1439                                 next_protocol =
1440                                         ((const struct rte_flow_item_ipv4 *)
1441                                          (items->spec))->hdr.next_proto_id;
1442                         break;
1443                 case RTE_FLOW_ITEM_TYPE_IPV6:
1444                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1445                                                            error);
1446                         if (ret < 0)
1447                                 return ret;
1448                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1449                         mask.ipv6 = flow_tcf_item_mask
1450                                 (items, &rte_flow_item_ipv6_mask,
1451                                  &flow_tcf_mask_supported.ipv6,
1452                                  &flow_tcf_mask_empty.ipv6,
1453                                  sizeof(flow_tcf_mask_supported.ipv6),
1454                                  error);
1455                         if (!mask.ipv6)
1456                                 return -rte_errno;
1457                         if (mask.ipv6->hdr.proto &&
1458                             mask.ipv6->hdr.proto != 0xff)
1459                                 return rte_flow_error_set
1460                                         (error, ENOTSUP,
1461                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1462                                          mask.ipv6,
1463                                          "no support for partial mask on"
1464                                          " \"hdr.proto\" field");
1465                         else if (mask.ipv6->hdr.proto)
1466                                 next_protocol =
1467                                         ((const struct rte_flow_item_ipv6 *)
1468                                          (items->spec))->hdr.proto;
1469                         break;
1470                 case RTE_FLOW_ITEM_TYPE_UDP:
1471                         ret = mlx5_flow_validate_item_udp(items, item_flags,
1472                                                           next_protocol, error);
1473                         if (ret < 0)
1474                                 return ret;
1475                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1476                         mask.udp = flow_tcf_item_mask
1477                                 (items, &rte_flow_item_udp_mask,
1478                                  &flow_tcf_mask_supported.udp,
1479                                  &flow_tcf_mask_empty.udp,
1480                                  sizeof(flow_tcf_mask_supported.udp),
1481                                  error);
1482                         if (!mask.udp)
1483                                 return -rte_errno;
1484                         break;
1485                 case RTE_FLOW_ITEM_TYPE_TCP:
1486                         ret = mlx5_flow_validate_item_tcp
1487                                              (items, item_flags,
1488                                               next_protocol,
1489                                               &flow_tcf_mask_supported.tcp,
1490                                               error);
1491                         if (ret < 0)
1492                                 return ret;
1493                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
1494                         mask.tcp = flow_tcf_item_mask
1495                                 (items, &rte_flow_item_tcp_mask,
1496                                  &flow_tcf_mask_supported.tcp,
1497                                  &flow_tcf_mask_empty.tcp,
1498                                  sizeof(flow_tcf_mask_supported.tcp),
1499                                  error);
1500                         if (!mask.tcp)
1501                                 return -rte_errno;
1502                         break;
1503                 default:
1504                         return rte_flow_error_set(error, ENOTSUP,
1505                                                   RTE_FLOW_ERROR_TYPE_ITEM,
1506                                                   NULL, "item not supported");
1507                 }
1508         }
1509         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
1510             (action_flags & MLX5_FLOW_ACTION_DROP))
1511                 return rte_flow_error_set(error, ENOTSUP,
1512                                           RTE_FLOW_ERROR_TYPE_ACTION,
1513                                           actions,
1514                                           "set action is not compatible with "
1515                                           "drop action");
1516         if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
1517             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
1518                 return rte_flow_error_set(error, ENOTSUP,
1519                                           RTE_FLOW_ERROR_TYPE_ACTION,
1520                                           actions,
1521                                           "set action must be followed by "
1522                                           "port_id action");
1523         if (action_flags &
1524            (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
1525                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
1526                         return rte_flow_error_set(error, EINVAL,
1527                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1528                                                   actions,
1529                                                   "no ipv4 item found in"
1530                                                   " pattern");
1531         }
1532         if (action_flags &
1533            (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
1534                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
1535                         return rte_flow_error_set(error, EINVAL,
1536                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1537                                                   actions,
1538                                                   "no ipv6 item found in"
1539                                                   " pattern");
1540         }
1541         if (action_flags &
1542            (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
1543                 if (!(item_flags &
1544                      (MLX5_FLOW_LAYER_OUTER_L4_UDP |
1545                       MLX5_FLOW_LAYER_OUTER_L4_TCP)))
1546                         return rte_flow_error_set(error, EINVAL,
1547                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1548                                                   actions,
1549                                                   "no TCP/UDP item found in"
1550                                                   " pattern");
1551         }
1552         /*
1553          * FW syndrome (0xA9C090):
1554          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
1555          *     forwarded to the uplink.
1556          */
1557         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
1558             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
1559             ((struct priv *)port_id_dev->data->dev_private)->representor)
1560                 return rte_flow_error_set(error, ENOTSUP,
1561                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
1562                                           "vlan push can only be applied"
1563                                           " when forwarding to uplink port");
1564         /*
1565          * FW syndrome (0x294609):
1566          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
1567          *     are supported only while forwarding to vport.
1568          */
1569         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
1570             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
1571                 return rte_flow_error_set(error, ENOTSUP,
1572                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
1573                                           "vlan actions are supported"
1574                                           " only with port_id action");
1575         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
1576                 return rte_flow_error_set(error, EINVAL,
1577                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
1578                                           "no fate action is found");
1579         if (action_flags &
1580            (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
1581                 if (!(item_flags &
1582                      (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
1583                       MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
1584                         return rte_flow_error_set(error, EINVAL,
1585                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1586                                                   actions,
1587                                                   "no IP found in pattern");
1588         }
1589         if (action_flags &
1590             (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
1591                 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
1592                         return rte_flow_error_set(error, ENOTSUP,
1593                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1594                                                   actions,
1595                                                   "no ethernet found in"
1596                                                   " pattern");
1597         }
1598         return 0;
1599 }
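/*
 * Editor's note: a minimal illustration (not part of the driver) of a
 * pattern/action pair accepted by the cross-checks above, assuming the flow
 * attributes themselves (transfer, group, priority) are valid; the source
 * address (10.0.0.1) and the destination port ID are made up:
 *
 *   struct rte_flow_item pattern[] = {
 *           { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *           { .type = RTE_FLOW_ITEM_TYPE_IPV4 },
 *           { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *   struct rte_flow_action_set_ipv4 new_src = {
 *           .ipv4_addr = RTE_BE32(0x0a000001),
 *   };
 *   struct rte_flow_action_port_id out = { .id = 1 };
 *   struct rte_flow_action actions[] = {
 *           { .type = RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, .conf = &new_src },
 *           { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &out },
 *           { .type = RTE_FLOW_ACTION_TYPE_END },
 *   };
 *
 * Removing the IPv4 item or replacing port_id with drop would trip the
 * "no ipv4 item found in pattern" or "set action is not compatible with
 * drop action" checks above, respectively.
 */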
1600
1601 /**
1602  * Calculate maximum size of memory for flow items of Linux TC flower and
1603  * extract specified items.
1604  *
 * @param[in] attr
 *   Pointer to the flow attributes.
1605  * @param[in] items
1606  *   Pointer to the list of items.
1607  * @param[out] item_flags
1608  *   Pointer to the detected items.
1609  *
1610  * @return
1611  *   Maximum size of memory for items.
1612  */
1613 static int
1614 flow_tcf_get_items_and_size(const struct rte_flow_attr *attr,
1615                             const struct rte_flow_item items[],
1616                             uint64_t *item_flags)
1617 {
1618         int size = 0;
1619         uint64_t flags = 0;
1620
1621         size += SZ_NLATTR_STRZ_OF("flower") +
1622                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
1623                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
1624         if (attr->group > 0)
1625                 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
1626         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1627                 switch (items->type) {
1628                 case RTE_FLOW_ITEM_TYPE_VOID:
1629                         break;
1630                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1631                         break;
1632                 case RTE_FLOW_ITEM_TYPE_ETH:
1633                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
1634                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
1635                                 /* dst/src MAC addr and mask. */
1636                         flags |= MLX5_FLOW_LAYER_OUTER_L2;
1637                         break;
1638                 case RTE_FLOW_ITEM_TYPE_VLAN:
1639                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
1640                                 SZ_NLATTR_TYPE_OF(uint16_t) +
1641                                 /* VLAN Ether type. */
1642                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
1643                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
1644                         flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
1645                         break;
1646                 case RTE_FLOW_ITEM_TYPE_IPV4:
1647                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
1648                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
1649                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
1650                                 /* dst/src IP addr and mask. */
1651                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1652                         break;
1653                 case RTE_FLOW_ITEM_TYPE_IPV6:
1654                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
1655                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
1656                                 SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
1657                                 /* dst/src IP addr and mask. */
1658                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1659                         break;
1660                 case RTE_FLOW_ITEM_TYPE_UDP:
1661                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
1662                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
1663                                 /* dst/src port and mask. */
1664                         flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1665                         break;
1666                 case RTE_FLOW_ITEM_TYPE_TCP:
1667                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
1668                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
1669                                 /* dst/src port and mask. */
1670                         flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
1671                         break;
1672                 default:
1673                         DRV_LOG(WARNING,
1674                                 "unsupported item %p type %d,"
1675                                 " items must be validated before flow creation",
1676                                 (const void *)items, items->type);
1677                         break;
1678                 }
1679         }
1680         *item_flags = flags;
1681         return size;
1682 }
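/*
 * Editor's note: a worked example of the accounting above, kept symbolic
 * because the SZ_NLATTR_* helpers are defined earlier in this file. For the
 * pattern "eth / ipv4 / end" in group 0 the returned upper bound is:
 *
 *   SZ_NLATTR_STRZ_OF("flower") + SZ_NLATTR_NEST +
 *   SZ_NLATTR_TYPE_OF(uint32_t) +                       common part
 *   SZ_NLATTR_TYPE_OF(uint16_t) +
 *   SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4 +             ETH item
 *   SZ_NLATTR_TYPE_OF(uint16_t) + SZ_NLATTR_TYPE_OF(uint8_t) +
 *   SZ_NLATTR_TYPE_OF(uint32_t) * 4                     IPV4 item
 *
 * and *item_flags is set to
 * MLX5_FLOW_LAYER_OUTER_L2 | MLX5_FLOW_LAYER_OUTER_L3_IPV4.
 */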
1683
1684 /**
1685  * Calculate maximum size of memory for flow actions of Linux TC flower and
1686  * extract specified actions.
1687  *
1688  * @param[in] actions
1689  *   Pointer to the list of actions.
1690  * @param[out] action_flags
1691  *   Pointer to the detected actions.
1692  *
1693  * @return
1694  *   Maximum size of memory for actions.
1695  */
1696 static int
1697 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
1698                               uint64_t *action_flags)
1699 {
1700         int size = 0;
1701         uint64_t flags = 0;
1702
1703         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
1704         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1705                 switch (actions->type) {
1706                 case RTE_FLOW_ACTION_TYPE_VOID:
1707                         break;
1708                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1709                         size += SZ_NLATTR_NEST + /* na_act_index. */
1710                                 SZ_NLATTR_STRZ_OF("mirred") +
1711                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
1712                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
1713                         flags |= MLX5_FLOW_ACTION_PORT_ID;
1714                         break;
1715                 case RTE_FLOW_ACTION_TYPE_JUMP:
1716                         size += SZ_NLATTR_NEST + /* na_act_index. */
1717                                 SZ_NLATTR_STRZ_OF("gact") +
1718                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
1719                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
1720                         flags |= MLX5_FLOW_ACTION_JUMP;
1721                         break;
1722                 case RTE_FLOW_ACTION_TYPE_DROP:
1723                         size += SZ_NLATTR_NEST + /* na_act_index. */
1724                                 SZ_NLATTR_STRZ_OF("gact") +
1725                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
1726                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
1727                         flags |= MLX5_FLOW_ACTION_DROP;
1728                         break;
1729                 case RTE_FLOW_ACTION_TYPE_COUNT:
1730                         break;
1731                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1732                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
1733                         goto action_of_vlan;
1734                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1735                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1736                         goto action_of_vlan;
1737                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1738                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1739                         goto action_of_vlan;
1740                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1741                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1742                         goto action_of_vlan;
1743 action_of_vlan:
1744                         size += SZ_NLATTR_NEST + /* na_act_index. */
1745                                 SZ_NLATTR_STRZ_OF("vlan") +
1746                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
1747                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
1748                                 SZ_NLATTR_TYPE_OF(uint16_t) +
1749                                 /* VLAN protocol. */
1750                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
1751                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
1752                         break;
1753                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1754                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1755                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1756                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1757                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1758                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1759                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1760                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1761                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1762                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1763                         size += flow_tcf_get_pedit_actions_size(&actions,
1764                                                                 &flags);
1765                         break;
1766                 default:
1767                         DRV_LOG(WARNING,
1768                                 "unsupported action %p type %d,"
1769                                 " actions must be validated before flow creation",
1770                                 (const void *)actions, actions->type);
1771                         break;
1772                 }
1773         }
1774         *action_flags = flags;
1775         return size;
1776 }
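/*
 * Editor's note: similarly, for "actions port_id / end" the estimate is the
 * TCA_FLOWER_ACT nest plus one mirred action entry:
 *
 *   SZ_NLATTR_NEST + SZ_NLATTR_NEST + SZ_NLATTR_STRZ_OF("mirred") +
 *   SZ_NLATTR_NEST + SZ_NLATTR_TYPE_OF(struct tc_mirred)
 *
 * with *action_flags = MLX5_FLOW_ACTION_PORT_ID. The pedit-style "set"
 * actions delegate their estimate to flow_tcf_get_pedit_actions_size().
 */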
1777
1778 /**
1779  * Brand rtnetlink buffer with unique handle.
1780  *
1781  * This handle should be unique for a given network interface to avoid
1782  * collisions.
1783  *
1784  * @param nlh
1785  *   Pointer to Netlink message.
1786  * @param handle
1787  *   Unique 32-bit handle to use.
1788  */
1789 static void
1790 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
1791 {
1792         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
1793
1794         tcm->tcm_handle = handle;
1795         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
1796                 (void *)nlh, handle);
1797 }
1798
1799 /**
1800  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
1801  * memory required, allocates the memory, initializes Netlink message headers
1802  * and sets a unique TC message handle.
1803  *
1804  * @param[in] attr
1805  *   Pointer to the flow attributes.
1806  * @param[in] items
1807  *   Pointer to the list of items.
1808  * @param[in] actions
1809  *   Pointer to the list of actions.
1810  * @param[out] item_flags
1811  *   Pointer to bit mask of all items detected.
1812  * @param[out] action_flags
1813  *   Pointer to bit mask of all actions detected.
1814  * @param[out] error
1815  *   Pointer to the error structure.
1816  *
1817  * @return
1818  *   Pointer to mlx5_flow object on success,
1819  *   otherwise NULL and rte_errno is set.
1820  */
1821 static struct mlx5_flow *
1822 flow_tcf_prepare(const struct rte_flow_attr *attr,
1823                  const struct rte_flow_item items[],
1824                  const struct rte_flow_action actions[],
1825                  uint64_t *item_flags, uint64_t *action_flags,
1826                  struct rte_flow_error *error)
1827 {
1828         size_t size = sizeof(struct mlx5_flow) +
1829                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
1830                       MNL_ALIGN(sizeof(struct tcmsg));
1831         struct mlx5_flow *dev_flow;
1832         struct nlmsghdr *nlh;
1833         struct tcmsg *tcm;
1834
1835         size += flow_tcf_get_items_and_size(attr, items, item_flags);
1836         size += flow_tcf_get_actions_and_size(actions, action_flags);
1837         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
1838         if (!dev_flow) {
1839                 rte_flow_error_set(error, ENOMEM,
1840                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1841                                    "not enough memory to create E-Switch flow");
1842                 return NULL;
1843         }
1844         nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
1845         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
1846         *dev_flow = (struct mlx5_flow){
1847                 .tcf = (struct mlx5_flow_tcf){
1848                         .nlh = nlh,
1849                         .tcm = tcm,
1850                 },
1851         };
1852         /*
1853          * Generate a reasonably unique handle based on the address of the
1854          * target buffer.
1855          *
1856          * This is straightforward on 32-bit systems where the flow pointer can
1857          * be used directly. Otherwise, the pointer is shifted right by log2
1858          * of the previous power of two of the allocated buffer size and the
1859          * least significant 32 bits of the result are used.
1860          */
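        /*
         * Editor's note: a worked example with a made-up size. On a 64-bit
         * system, if the buffer size computed above is 1400 bytes, then
         * rte_align32prevpow2(1400) == 1024 and rte_log2_u32(1024) == 10,
         * so the handle becomes the low 32 bits of
         * (uintptr_t)dev_flow >> 10. Two live buffers are at least "size"
         * bytes apart, which is what makes the shifted value "reasonably
         * unique" as described above.
         */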
1861         if (sizeof(dev_flow) <= 4)
1862                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
1863         else
1864                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
1865                                        rte_log2_u32(rte_align32prevpow2(size)));
1866         return dev_flow;
1867 }
1868
1869 /**
1870  * Make adjustments for supporting count actions.
1871  *
1872  * @param[in] dev
1873  *   Pointer to the Ethernet device structure.
1874  * @param[in] dev_flow
1875  *   Pointer to mlx5_flow.
1876  * @param[out] error
1877  *   Pointer to error structure.
1878  *
1879  * @return
1880  *   0 on success, a negative errno value otherwise and rte_errno is set.
1881  */
1882 static int
1883 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
1884                                   struct mlx5_flow *dev_flow,
1885                                   struct rte_flow_error *error)
1886 {
1887         struct rte_flow *flow = dev_flow->flow;
1888
1889         if (!flow->counter) {
1890                 flow->counter = flow_tcf_counter_new();
1891                 if (!flow->counter)
1892                         return rte_flow_error_set(error, rte_errno,
1893                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1894                                                   NULL,
1895                                                   "cannot get counter"
1896                                                   " context.");
1897         }
1898         return 0;
1899 }
1900
1901 /**
1902  * Translate flow for Linux TC flower and construct Netlink message.
1903  *
1904  * @param[in] dev
1905  *   Pointer to the Ethernet device structure.
1906  * @param[in, out] dev_flow
1907  *   Pointer to the sub flow.
1908  * @param[in] attr
1909  *   Pointer to the flow attributes.
1910  * @param[in] items
1911  *   Pointer to the list of items.
1912  * @param[in] actions
1913  *   Pointer to the list of actions.
1914  * @param[out] error
1915  *   Pointer to the error structure.
1916  *
1917  * @return
1918  *   0 on success, a negative errno value otherwise and rte_errno is set.
1919  */
1920 static int
1921 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
1922                    const struct rte_flow_attr *attr,
1923                    const struct rte_flow_item items[],
1924                    const struct rte_flow_action actions[],
1925                    struct rte_flow_error *error)
1926 {
1927         union {
1928                 const struct rte_flow_item_port_id *port_id;
1929                 const struct rte_flow_item_eth *eth;
1930                 const struct rte_flow_item_vlan *vlan;
1931                 const struct rte_flow_item_ipv4 *ipv4;
1932                 const struct rte_flow_item_ipv6 *ipv6;
1933                 const struct rte_flow_item_tcp *tcp;
1934                 const struct rte_flow_item_udp *udp;
1935         } spec, mask;
1936         union {
1937                 const struct rte_flow_action_port_id *port_id;
1938                 const struct rte_flow_action_jump *jump;
1939                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1940                 const struct rte_flow_action_of_set_vlan_vid *
1941                         of_set_vlan_vid;
1942                 const struct rte_flow_action_of_set_vlan_pcp *
1943                         of_set_vlan_pcp;
1944         } conf;
1945         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1946         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
1947         struct tcmsg *tcm = dev_flow->tcf.tcm;
1948         uint32_t na_act_index_cur;
1949         bool eth_type_set = 0;
1950         bool vlan_present = 0;
1951         bool vlan_eth_type_set = 0;
1952         bool ip_proto_set = 0;
1953         struct nlattr *na_flower;
1954         struct nlattr *na_flower_act;
1955         struct nlattr *na_vlan_id = NULL;
1956         struct nlattr *na_vlan_priority = NULL;
1957         uint64_t item_flags = 0;
1958         int ret;
1959
1960         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1961                                                 PTOI_TABLE_SZ_MAX(dev)));
1962         nlh = dev_flow->tcf.nlh;
1963         tcm = dev_flow->tcf.tcm;
1964         /* Prepare API must have been called beforehand. */
1965         assert(nlh != NULL && tcm != NULL);
1966         tcm->tcm_family = AF_UNSPEC;
1967         tcm->tcm_ifindex = ptoi[0].ifindex;
1968         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
1969         /*
1970          * Priority cannot be zero to prevent the kernel from picking one
1971          * automatically.
1972          */
1973         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
1974                                   RTE_BE16(ETH_P_ALL));
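        /*
         * Editor's note: tcm_info packs the TC filter preference into its
         * upper 16 bits and the match protocol into its lower 16 bits, so
         * attr->priority == 0 is roughly equivalent to
         * "tc filter add ... pref 1 protocol all flower ...".
         */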
1975         if (attr->group > 0)
1976                 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
1977         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
1978         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
1979         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
1980         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1981                 unsigned int i;
1982
1983                 switch (items->type) {
1984                 case RTE_FLOW_ITEM_TYPE_VOID:
1985                         break;
1986                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1987                         mask.port_id = flow_tcf_item_mask
1988                                 (items, &rte_flow_item_port_id_mask,
1989                                  &flow_tcf_mask_supported.port_id,
1990                                  &flow_tcf_mask_empty.port_id,
1991                                  sizeof(flow_tcf_mask_supported.port_id),
1992                                  error);
1993                         assert(mask.port_id);
1994                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
1995                                 break;
1996                         spec.port_id = items->spec;
1997                         if (!mask.port_id->id)
1998                                 i = 0;
1999                         else
2000                                 for (i = 0; ptoi[i].ifindex; ++i)
2001                                         if (ptoi[i].port_id == spec.port_id->id)
2002                                                 break;
2003                         assert(ptoi[i].ifindex);
2004                         tcm->tcm_ifindex = ptoi[i].ifindex;
2005                         break;
2006                 case RTE_FLOW_ITEM_TYPE_ETH:
2007                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
2008                         mask.eth = flow_tcf_item_mask
2009                                 (items, &rte_flow_item_eth_mask,
2010                                  &flow_tcf_mask_supported.eth,
2011                                  &flow_tcf_mask_empty.eth,
2012                                  sizeof(flow_tcf_mask_supported.eth),
2013                                  error);
2014                         assert(mask.eth);
2015                         if (mask.eth == &flow_tcf_mask_empty.eth)
2016                                 break;
2017                         spec.eth = items->spec;
2018                         if (mask.eth->type) {
2019                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
2020                                                  spec.eth->type);
2021                                 eth_type_set = 1;
2022                         }
2023                         if (!is_zero_ether_addr(&mask.eth->dst)) {
2024                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
2025                                              ETHER_ADDR_LEN,
2026                                              spec.eth->dst.addr_bytes);
2027                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
2028                                              ETHER_ADDR_LEN,
2029                                              mask.eth->dst.addr_bytes);
2030                         }
2031                         if (!is_zero_ether_addr(&mask.eth->src)) {
2032                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
2033                                              ETHER_ADDR_LEN,
2034                                              spec.eth->src.addr_bytes);
2035                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
2036                                              ETHER_ADDR_LEN,
2037                                              mask.eth->src.addr_bytes);
2038                         }
2039                         break;
2040                 case RTE_FLOW_ITEM_TYPE_VLAN:
2041                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2042                         mask.vlan = flow_tcf_item_mask
2043                                 (items, &rte_flow_item_vlan_mask,
2044                                  &flow_tcf_mask_supported.vlan,
2045                                  &flow_tcf_mask_empty.vlan,
2046                                  sizeof(flow_tcf_mask_supported.vlan),
2047                                  error);
2048                         assert(mask.vlan);
2049                         if (!eth_type_set)
2050                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
2051                                                  RTE_BE16(ETH_P_8021Q));
2052                         eth_type_set = 1;
2053                         vlan_present = 1;
2054                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
2055                                 break;
2056                         spec.vlan = items->spec;
2057                         if (mask.vlan->inner_type) {
2058                                 mnl_attr_put_u16(nlh,
2059                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
2060                                                  spec.vlan->inner_type);
2061                                 vlan_eth_type_set = 1;
2062                         }
2063                         if (mask.vlan->tci & RTE_BE16(0xe000))
2064                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
2065                                                 (rte_be_to_cpu_16
2066                                                  (spec.vlan->tci) >> 13) & 0x7);
2067                         if (mask.vlan->tci & RTE_BE16(0x0fff))
2068                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
2069                                                  rte_be_to_cpu_16
2070                                                  (spec.vlan->tci &
2071                                                   RTE_BE16(0x0fff)));
2072                         break;
2073                 case RTE_FLOW_ITEM_TYPE_IPV4:
2074                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2075                         mask.ipv4 = flow_tcf_item_mask
2076                                 (items, &rte_flow_item_ipv4_mask,
2077                                  &flow_tcf_mask_supported.ipv4,
2078                                  &flow_tcf_mask_empty.ipv4,
2079                                  sizeof(flow_tcf_mask_supported.ipv4),
2080                                  error);
2081                         assert(mask.ipv4);
2082                         if (!eth_type_set || !vlan_eth_type_set)
2083                                 mnl_attr_put_u16(nlh,
2084                                                  vlan_present ?
2085                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
2086                                                  TCA_FLOWER_KEY_ETH_TYPE,
2087                                                  RTE_BE16(ETH_P_IP));
2088                         eth_type_set = 1;
2089                         vlan_eth_type_set = 1;
2090                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
2091                                 break;
2092                         spec.ipv4 = items->spec;
2093                         if (mask.ipv4->hdr.next_proto_id) {
2094                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
2095                                                 spec.ipv4->hdr.next_proto_id);
2096                                 ip_proto_set = 1;
2097                         }
2098                         if (mask.ipv4->hdr.src_addr) {
2099                                 mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
2100                                                  spec.ipv4->hdr.src_addr);
2101                                 mnl_attr_put_u32(nlh,
2102                                                  TCA_FLOWER_KEY_IPV4_SRC_MASK,
2103                                                  mask.ipv4->hdr.src_addr);
2104                         }
2105                         if (mask.ipv4->hdr.dst_addr) {
2106                                 mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
2107                                                  spec.ipv4->hdr.dst_addr);
2108                                 mnl_attr_put_u32(nlh,
2109                                                  TCA_FLOWER_KEY_IPV4_DST_MASK,
2110                                                  mask.ipv4->hdr.dst_addr);
2111                         }
2112                         break;
2113                 case RTE_FLOW_ITEM_TYPE_IPV6:
2114                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2115                         mask.ipv6 = flow_tcf_item_mask
2116                                 (items, &rte_flow_item_ipv6_mask,
2117                                  &flow_tcf_mask_supported.ipv6,
2118                                  &flow_tcf_mask_empty.ipv6,
2119                                  sizeof(flow_tcf_mask_supported.ipv6),
2120                                  error);
2121                         assert(mask.ipv6);
2122                         if (!eth_type_set || !vlan_eth_type_set)
2123                                 mnl_attr_put_u16(nlh,
2124                                                  vlan_present ?
2125                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
2126                                                  TCA_FLOWER_KEY_ETH_TYPE,
2127                                                  RTE_BE16(ETH_P_IPV6));
2128                         eth_type_set = 1;
2129                         vlan_eth_type_set = 1;
2130                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
2131                                 break;
2132                         spec.ipv6 = items->spec;
2133                         if (mask.ipv6->hdr.proto) {
2134                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
2135                                                 spec.ipv6->hdr.proto);
2136                                 ip_proto_set = 1;
2137                         }
2138                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
2139                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
2140                                              sizeof(spec.ipv6->hdr.src_addr),
2141                                              spec.ipv6->hdr.src_addr);
2142                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
2143                                              sizeof(mask.ipv6->hdr.src_addr),
2144                                              mask.ipv6->hdr.src_addr);
2145                         }
2146                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
2147                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
2148                                              sizeof(spec.ipv6->hdr.dst_addr),
2149                                              spec.ipv6->hdr.dst_addr);
2150                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
2151                                              sizeof(mask.ipv6->hdr.dst_addr),
2152                                              mask.ipv6->hdr.dst_addr);
2153                         }
2154                         break;
2155                 case RTE_FLOW_ITEM_TYPE_UDP:
2156                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
2157                         mask.udp = flow_tcf_item_mask
2158                                 (items, &rte_flow_item_udp_mask,
2159                                  &flow_tcf_mask_supported.udp,
2160                                  &flow_tcf_mask_empty.udp,
2161                                  sizeof(flow_tcf_mask_supported.udp),
2162                                  error);
2163                         assert(mask.udp);
2164                         if (!ip_proto_set)
2165                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
2166                                                 IPPROTO_UDP);
2167                         if (mask.udp == &flow_tcf_mask_empty.udp)
2168                                 break;
2169                         spec.udp = items->spec;
2170                         if (mask.udp->hdr.src_port) {
2171                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
2172                                                  spec.udp->hdr.src_port);
2173                                 mnl_attr_put_u16(nlh,
2174                                                  TCA_FLOWER_KEY_UDP_SRC_MASK,
2175                                                  mask.udp->hdr.src_port);
2176                         }
2177                         if (mask.udp->hdr.dst_port) {
2178                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
2179                                                  spec.udp->hdr.dst_port);
2180                                 mnl_attr_put_u16(nlh,
2181                                                  TCA_FLOWER_KEY_UDP_DST_MASK,
2182                                                  mask.udp->hdr.dst_port);
2183                         }
2184                         break;
2185                 case RTE_FLOW_ITEM_TYPE_TCP:
2186                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
2187                         mask.tcp = flow_tcf_item_mask
2188                                 (items, &rte_flow_item_tcp_mask,
2189                                  &flow_tcf_mask_supported.tcp,
2190                                  &flow_tcf_mask_empty.tcp,
2191                                  sizeof(flow_tcf_mask_supported.tcp),
2192                                  error);
2193                         assert(mask.tcp);
2194                         if (!ip_proto_set)
2195                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
2196                                                 IPPROTO_TCP);
2197                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
2198                                 break;
2199                         spec.tcp = items->spec;
2200                         if (mask.tcp->hdr.src_port) {
2201                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
2202                                                  spec.tcp->hdr.src_port);
2203                                 mnl_attr_put_u16(nlh,
2204                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
2205                                                  mask.tcp->hdr.src_port);
2206                         }
2207                         if (mask.tcp->hdr.dst_port) {
2208                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
2209                                                  spec.tcp->hdr.dst_port);
2210                                 mnl_attr_put_u16(nlh,
2211                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
2212                                                  mask.tcp->hdr.dst_port);
2213                         }
2214                         if (mask.tcp->hdr.tcp_flags) {
2215                                 mnl_attr_put_u16
2216                                         (nlh,
2217                                          TCA_FLOWER_KEY_TCP_FLAGS,
2218                                          rte_cpu_to_be_16
2219                                                 (spec.tcp->hdr.tcp_flags));
2220                                 mnl_attr_put_u16
2221                                         (nlh,
2222                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
2223                                          rte_cpu_to_be_16
2224                                                 (mask.tcp->hdr.tcp_flags));
2225                         }
2226                         break;
2227                 default:
2228                         return rte_flow_error_set(error, ENOTSUP,
2229                                                   RTE_FLOW_ERROR_TYPE_ITEM,
2230                                                   NULL, "item not supported");
2231                 }
2232         }
2233         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
2234         na_act_index_cur = 1;
2235         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2236                 struct nlattr *na_act_index;
2237                 struct nlattr *na_act;
2238                 unsigned int vlan_act;
2239                 unsigned int i;
2240
2241                 switch (actions->type) {
2242                 case RTE_FLOW_ACTION_TYPE_VOID:
2243                         break;
2244                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2245                         conf.port_id = actions->conf;
2246                         if (conf.port_id->original)
2247                                 i = 0;
2248                         else
2249                                 for (i = 0; ptoi[i].ifindex; ++i)
2250                                         if (ptoi[i].port_id == conf.port_id->id)
2251                                                 break;
2252                         assert(ptoi[i].ifindex);
2253                         na_act_index =
2254                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
2255                         assert(na_act_index);
2256                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
2257                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
2258                         assert(na_act);
2259                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
2260                                      sizeof(struct tc_mirred),
2261                                      &(struct tc_mirred){
2262                                         .action = TC_ACT_STOLEN,
2263                                         .eaction = TCA_EGRESS_REDIR,
2264                                         .ifindex = ptoi[i].ifindex,
2265                                      });
2266                         mnl_attr_nest_end(nlh, na_act);
2267                         mnl_attr_nest_end(nlh, na_act_index);
2268                         break;
2269                 case RTE_FLOW_ACTION_TYPE_JUMP:
2270                         conf.jump = actions->conf;
2271                         na_act_index =
2272                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
2273                         assert(na_act_index);
2274                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
2275                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
2276                         assert(na_act);
2277                         mnl_attr_put(nlh, TCA_GACT_PARMS,
2278                                      sizeof(struct tc_gact),
2279                                      &(struct tc_gact){
2280                                         .action = TC_ACT_GOTO_CHAIN |
2281                                                   conf.jump->group,
2282                                      });
2283                         mnl_attr_nest_end(nlh, na_act);
2284                         mnl_attr_nest_end(nlh, na_act_index);
2285                         break;
2286                 case RTE_FLOW_ACTION_TYPE_DROP:
2287                         na_act_index =
2288                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
2289                         assert(na_act_index);
2290                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
2291                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
2292                         assert(na_act);
2293                         mnl_attr_put(nlh, TCA_GACT_PARMS,
2294                                      sizeof(struct tc_gact),
2295                                      &(struct tc_gact){
2296                                         .action = TC_ACT_SHOT,
2297                                      });
2298                         mnl_attr_nest_end(nlh, na_act);
2299                         mnl_attr_nest_end(nlh, na_act_index);
2300                         break;
2301                 case RTE_FLOW_ACTION_TYPE_COUNT:
2302                         /*
2303                          * Driver adds the count action implicitly for
2304                          * each rule it creates.
2305                          */
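                        /*
                         * Editor's note: the counter object appears to be
                         * allocated once per rte_flow and reference
                         * counted; flow_tcf_remove() below releases it
                         * when the reference count drops to zero.
                         */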
2306                         ret = flow_tcf_translate_action_count(dev,
2307                                                               dev_flow, error);
2308                         if (ret < 0)
2309                                 return ret;
2310                         break;
2311                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2312                         conf.of_push_vlan = NULL;
2313                         vlan_act = TCA_VLAN_ACT_POP;
2314                         goto action_of_vlan;
2315                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2316                         conf.of_push_vlan = actions->conf;
2317                         vlan_act = TCA_VLAN_ACT_PUSH;
2318                         goto action_of_vlan;
2319                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2320                         conf.of_set_vlan_vid = actions->conf;
2321                         if (na_vlan_id)
2322                                 goto override_na_vlan_id;
2323                         vlan_act = TCA_VLAN_ACT_MODIFY;
2324                         goto action_of_vlan;
2325                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2326                         conf.of_set_vlan_pcp = actions->conf;
2327                         if (na_vlan_priority)
2328                                 goto override_na_vlan_priority;
2329                         vlan_act = TCA_VLAN_ACT_MODIFY;
2330                         goto action_of_vlan;
2331 action_of_vlan:
2332                         na_act_index =
2333                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
2334                         assert(na_act_index);
2335                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
2336                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
2337                         assert(na_act);
2338                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
2339                                      sizeof(struct tc_vlan),
2340                                      &(struct tc_vlan){
2341                                         .action = TC_ACT_PIPE,
2342                                         .v_action = vlan_act,
2343                                      });
2344                         if (vlan_act == TCA_VLAN_ACT_POP) {
2345                                 mnl_attr_nest_end(nlh, na_act);
2346                                 mnl_attr_nest_end(nlh, na_act_index);
2347                                 break;
2348                         }
2349                         if (vlan_act == TCA_VLAN_ACT_PUSH)
2350                                 mnl_attr_put_u16(nlh,
2351                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
2352                                                  conf.of_push_vlan->ethertype);
2353                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
2354                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
2355                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
2356                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
2357                         mnl_attr_nest_end(nlh, na_act);
2358                         mnl_attr_nest_end(nlh, na_act_index);
2359                         if (actions->type ==
2360                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
2361 override_na_vlan_id:
2362                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
2363                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
2364                                         rte_be_to_cpu_16
2365                                         (conf.of_set_vlan_vid->vlan_vid);
2366                         } else if (actions->type ==
2367                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
2368 override_na_vlan_priority:
2369                                 na_vlan_priority->nla_type =
2370                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
2371                                 *(uint8_t *)mnl_attr_get_payload
2372                                         (na_vlan_priority) =
2373                                         conf.of_set_vlan_pcp->vlan_pcp;
2374                         }
2375                         break;
2376                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2377                 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2378                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2379                 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2380                 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2381                 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2382                 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2383                 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2384                 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2385                 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2386                         na_act_index =
2387                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
2388                         flow_tcf_create_pedit_mnl_msg(nlh,
2389                                                       &actions, item_flags);
2390                         mnl_attr_nest_end(nlh, na_act_index);
2391                         break;
2392                 default:
2393                         return rte_flow_error_set(error, ENOTSUP,
2394                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2395                                                   actions,
2396                                                   "action not supported");
2397                 }
2398         }
2399         assert(na_flower);
2400         assert(na_flower_act);
2401         mnl_attr_nest_end(nlh, na_flower_act);
2402         mnl_attr_nest_end(nlh, na_flower);
2403         return 0;
2404 }
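/*
 * Editor's note: an editorial sketch (not part of the driver) of the order
 * in which the routines above are meant to be invoked for a single E-Switch
 * rule, roughly mirroring the generic mlx5 flow layer; validation, error
 * handling and rollback are trimmed and the local names are made up:
 *
 *   uint64_t item_flags = 0;
 *   uint64_t action_flags = 0;
 *   struct mlx5_flow *dev_flow;
 *
 *   dev_flow = flow_tcf_prepare(attr, items, actions,
 *                               &item_flags, &action_flags, error);
 *   if (!dev_flow)
 *           return -rte_errno;
 *   dev_flow->flow = flow;
 *   LIST_INSERT_HEAD(&flow->dev_flows, dev_flow, next);
 *   if (flow_tcf_translate(dev, dev_flow, attr, items, actions, error) ||
 *       flow_tcf_apply(dev, flow, error))
 *           return -rte_errno;
 */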
2405
2406 /**
2407  * Send Netlink message with acknowledgment.
2408  *
2409  * @param ctx
2410  *   Flow context to use.
2411  * @param nlh
2412  *   Message to send. This function always raises the NLM_F_ACK flag before
2413  *   sending.
2414  *
2415  * @return
2416  *   0 on success, a negative errno value otherwise and rte_errno is set.
2417  */
2418 static int
2419 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *ctx, struct nlmsghdr *nlh)
2420 {
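        /*
         * Note: the answer buffer below is sized for the worst-case kernel
         * acknowledgment, i.e. a Netlink header plus a struct nlmsgerr (which
         * embeds the echoed request header) plus the remaining payload of the
         * original request, even though NETLINK_CAP_ACK normally suppresses
         * the echoed payload.
         */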
2421         alignas(struct nlmsghdr)
2422         uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
2423                     nlh->nlmsg_len - sizeof(*nlh)];
2424         uint32_t seq = ctx->seq++;
2425         struct mnl_socket *nl = ctx->nl;
2426         int ret;
2427
2428         nlh->nlmsg_flags |= NLM_F_ACK;
2429         nlh->nlmsg_seq = seq;
2430         ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
2431         if (ret != -1)
2432                 ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
2433         if (ret != -1)
2434                 ret = mnl_cb_run
2435                         (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
2436         if (ret > 0)
2437                 return 0;
2438         rte_errno = errno;
2439         return -rte_errno;
2440 }
2441
2442 /**
2443  * Apply flow to E-Switch by sending Netlink message.
2444  *
2445  * @param[in] dev
2446  *   Pointer to Ethernet device.
2447  * @param[in, out] flow
2448  *   Pointer to the sub flow.
2449  * @param[out] error
2450  *   Pointer to the error structure.
2451  *
2452  * @return
2453  *   0 on success, a negative errno value otherwise and rte_errno is set.
2454  */
2455 static int
2456 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
2457                struct rte_flow_error *error)
2458 {
2459         struct priv *priv = dev->data->dev_private;
2460         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
2461         struct mlx5_flow *dev_flow;
2462         struct nlmsghdr *nlh;
2463
2464         dev_flow = LIST_FIRST(&flow->dev_flows);
2465         /* E-Switch flow can't be expanded. */
2466         assert(!LIST_NEXT(dev_flow, next));
2467         nlh = dev_flow->tcf.nlh;
2468         nlh->nlmsg_type = RTM_NEWTFILTER;
2469         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
2470         if (!flow_tcf_nl_ack(ctx, nlh))
2471                 return 0;
2472         return rte_flow_error_set(error, rte_errno,
2473                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2474                                   "netlink: failed to create TC flow rule");
2475 }
2476
2477 /**
2478  * Remove flow from E-Switch by sending Netlink message.
2479  *
2480  * @param[in] dev
2481  *   Pointer to Ethernet device.
2482  * @param[in, out] flow
2483  *   Pointer to the sub flow.
2484  */
2485 static void
2486 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
2487 {
2488         struct priv *priv = dev->data->dev_private;
2489         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
2490         struct mlx5_flow *dev_flow;
2491         struct nlmsghdr *nlh;
2492
2493         if (!flow)
2494                 return;
2495         if (flow->counter) {
2496                 if (--flow->counter->ref_cnt == 0) {
2497                         rte_free(flow->counter);
2498                         flow->counter = NULL;
2499                 }
2500         }
2501         dev_flow = LIST_FIRST(&flow->dev_flows);
2502         if (!dev_flow)
2503                 return;
2504         /* E-Switch flow can't be expanded. */
2505         assert(!LIST_NEXT(dev_flow, next));
2506         nlh = dev_flow->tcf.nlh;
2507         nlh->nlmsg_type = RTM_DELTFILTER;
2508         nlh->nlmsg_flags = NLM_F_REQUEST;
2509         flow_tcf_nl_ack(ctx, nlh);
2510 }
2511
2512 /**
2513  * Remove flow from E-Switch and release resources of the device flow.
2514  *
2515  * @param[in] dev
2516  *   Pointer to Ethernet device.
2517  * @param[in, out] flow
2518  *   Pointer to the sub flow.
2519  */
2520 static void
2521 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
2522 {
2523         struct mlx5_flow *dev_flow;
2524
2525         if (!flow)
2526                 return;
2527         flow_tcf_remove(dev, flow);
2528         dev_flow = LIST_FIRST(&flow->dev_flows);
2529         if (!dev_flow)
2530                 return;
2531         /* E-Switch flow can't be expanded. */
2532         assert(!LIST_NEXT(dev_flow, next));
2533         LIST_REMOVE(dev_flow, next);
2534         rte_free(dev_flow);
2535 }
2536
2537 /**
2538  * Helper routine to compute the space required for a parse buffer.
2539  *
2540  * @param array
2541  *   Array of values to use.
2542  * @param idx
2543  *   Current location in the array.
2544  * @param value
2545  *   Value to compare with.
2546  *
2547  * @return
2548  *   The maximum of the given value and the array value at the given index.
2549  */
2550 static uint16_t
2551 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
2552 {
2553         return idx < 0 ? value : RTE_MAX(array[idx], value);
2554 }
2555
2556 /**
2557  * Parse rtnetlink message attributes, filling the attribute table with the
2558  * retrieved info.
2559  *
2560  * @param[out] tb
2561  *   Attribute table to be filled.
2562  * @param max
2563  *   Maximum attribute type (highest valid index) in the table.
2564  * @param rta
2565  *   The attributes section in the message to be parsed.
2566  * @param len
2567  *   The length of the attributes section in the message.
2568  */
2569 static void
2570 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
2571                          struct rtattr *rta, int len)
2572 {
2573         unsigned short type;
2574         memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
2575         while (RTA_OK(rta, len)) {
2576                 type = rta->rta_type;
2577                 if (type <= max && !tb[type])
2578                         tb[type] = rta;
2579                 rta = RTA_NEXT(rta, len);
2580         }
2581 }
2582
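/*
 * Illustrative sketch only (compiled out, not referenced by the driver):
 * how the two helpers above are meant to be combined. flow_tcf_arr_val_max()
 * bounds the attribute table for the level currently being parsed and
 * flow_tcf_nl_parse_rtattr() fills it from a nested attribute. The function
 * name below is hypothetical.
 */
#if 0
static void
flow_tcf_nl_parse_flower_opts_example(struct rtattr *opt)
{
	/* Backward sequence of nesting levels, innermost first. */
	uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
				TCA_FLOWER_ACT, TCA_OPTIONS };
	int idx = 2; /* Currently parsing the flower options (TCA_FLOWER_ACT). */
	int max = flow_tcf_arr_val_max(rta_type, idx, TCA_FLOWER_ACT);
	struct rtattr *tb[max + 1];

	flow_tcf_nl_parse_rtattr(tb, max, RTA_DATA(opt), RTA_PAYLOAD(opt));
	if (tb[TCA_FLOWER_ACT]) {
		/* Nested flower actions found, recurse one level with --idx. */
	}
}
#endif
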
2583 /**
2584  * Extract flow counters from flower action.
2585  *
2586  * @param rta
2587  *   Flower action stats properties in the Netlink message received.
2588  * @param rta_type
2589  *   The backward sequence of rta_types, as stored in the attribute table,
2590  *   to traverse in order to reach the requested object.
2591  * @param idx
2592  *   Current location in the rta_type table.
2593  * @param[out] data
2594  *   Data holding the count statistics of the rte_flow retrieved from
2595  *   the message.
2596  *
2597  * @return
2598  *   0 if data was found and retrieved, -1 otherwise.
2599  */
2600 static int
2601 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
2602                                        uint16_t rta_type[], int idx,
2603                                        struct gnet_stats_basic *data)
2604 {
2605         int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
2606                                                  TCA_STATS_BASIC);
2607         struct rtattr *tbs[tca_stats_max + 1];
2608
2609         if (rta == NULL || idx < 0)
2610                 return -1;
2611         flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
2612                                  RTA_DATA(rta), RTA_PAYLOAD(rta));
2613         switch (rta_type[idx]) {
2614         case TCA_STATS_BASIC:
2615                 if (tbs[TCA_STATS_BASIC]) {
2616                         memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
2617                                RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
2618                                sizeof(*data)));
2619                         return 0;
2620                 }
2621                 break;
2622         default:
2623                 break;
2624         }
2625         return -1;
2626 }
2627
2628 /**
2629  * Parse flower single action retrieving the requested action attribute,
2630  * if found.
2631  *
2632  * @param arg
2633  *   Flower action properties in the Netlink message received.
2634  * @param rta_type
2635  *   The backward sequence of rta_types, as stored in the attribute table,
2636  *   to traverse in order to reach the requested object.
2637  * @param idx
2638  *   Current location in the rta_type table.
2639  * @param[out] data
2640  *   Count statistics retrieved from the message query.
2641  *
2642  * @return
2643  *   0 if data was found and retrieved, -1 otherwise.
2644  */
2645 static int
2646 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
2647                                      uint16_t rta_type[], int idx, void *data)
2648 {
2649         int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
2650         struct rtattr *tb[tca_act_max + 1];
2651
2652         if (arg == NULL || idx < 0)
2653                 return -1;
2654         flow_tcf_nl_parse_rtattr(tb, tca_act_max,
2655                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
2656         if (tb[TCA_ACT_KIND] == NULL)
2657                 return -1;
2658         switch (rta_type[idx]) {
2659         case TCA_ACT_STATS:
2660                 if (tb[TCA_ACT_STATS])
2661                         return flow_tcf_nl_action_stats_parse_and_get
2662                                         (tb[TCA_ACT_STATS],
2663                                          rta_type, --idx,
2664                                          (struct gnet_stats_basic *)data);
2665                 break;
2666         default:
2667                 break;
2668         }
2669         return -1;
2670 }
2671
2672 /**
2673  * Parse the flower action section in the message, retrieving the requested
2674  * attribute from the first action that provides it.
2675  *
2676  * @param arg
2677  *   Flower action section in the Netlink message received.
2678  * @param rta_type
2679  *   The backward sequence of rta_types, as stored in the attribute table,
2680  *   to traverse in order to reach the requested object.
2681  * @param idx
2682  *   Current location in the rta_type table.
2683  * @param[out] data
2684  *   Data retrieved from the message query.
2685  *
2686  * @return
2687  *   0 if data was found and retrieved, -1 otherwise.
2688  */
2689 static int
2690 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
2691                                  uint16_t rta_type[], int idx, void *data)
2692 {
2693         struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
2694         int i;
2695
2696         if (arg == NULL || idx < 0)
2697                 return -1;
2698         flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
2699                                  RTA_DATA(arg), RTA_PAYLOAD(arg));
2700         switch (rta_type[idx]) {
2701         /*
2702          * flow counters are stored in the actions defined by the flow
2703          * and not in the flow itself, therefore we need to traverse the
2704          * flower chain of actions in search for them.
2705          *
2706          * Note that the index is not decremented here.
2707          */
2708         case TCA_ACT_STATS:
2709                 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
2710                         if (tb[i] &&
2711                         !flow_tcf_nl_parse_one_action_and_get(tb[i],
2712                                                               rta_type,
2713                                                               idx, data))
2714                                 return 0;
2715                 }
2716                 break;
2717         default:
2718                 break;
2719         }
2720         return -1;
2721 }
2722
2723 /**
2724  * Parse flower classifier options in the message, retrieving the requested
2725  * attribute if found.
2726  *
2727  * @param opt
2728  *   Flower section in the Netlink message received.
2729  * @param rta_type
2730  *   The backward sequence of rta_types, as stored in the attribute table,
2731  *   to traverse in order to reach the requested object.
2732  * @param idx
2733  *   Current location in the rta_type table.
2734  * @param[out] data
2735  *   Data retrieved from the message query.
2736  *
2737  * @return
2738  *   0 if data was found and retrieved, -1 otherwise.
2739  */
2740 static int
2741 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
2742                                uint16_t rta_type[], int idx, void *data)
2743 {
2744         int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
2745                                                   TCA_FLOWER_ACT);
2746         struct rtattr *tb[tca_flower_max + 1];
2747
2748         if (!opt || idx < 0)
2749                 return -1;
2750         flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
2751                                  RTA_DATA(opt), RTA_PAYLOAD(opt));
2752         switch (rta_type[idx]) {
2753         case TCA_FLOWER_ACT:
2754                 if (tb[TCA_FLOWER_ACT])
2755                         return flow_tcf_nl_action_parse_and_get
2756                                                         (tb[TCA_FLOWER_ACT],
2757                                                          rta_type, --idx, data);
2758                 break;
2759         default:
2760                 break;
2761         }
2762         return -1;
2763 }
2764
2765 /**
2766  * Parse Netlink reply on filter query, retrieving the flow counters.
2767  *
2768  * @param cnlh
2769  *   Message received from Netlink.
2770  * @param rta_type
2771  *   The backward sequence of rta_types, as stored in the attribute table,
2772  *   to traverse in order to reach the requested object.
2773  * @param idx
2774  *   Current location in the rta_type table.
2775  * @param[out] data
2776  *   Data retrieved from the message query.
2777  *
2778  * @return
2779  *   0 if data was found and retrieved, -1 otherwise.
2780  */
2781 static int
2782 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
2783                                  uint16_t rta_type[], int idx, void *data)
2784 {
2785         struct nlmsghdr *nlh = cnlh;
2786         struct tcmsg *t = NLMSG_DATA(nlh);
2787         int len = nlh->nlmsg_len;
2788         int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
2789         struct rtattr *tb[tca_max + 1];
2790
2791         if (idx < 0)
2792                 return -1;
2793         if (nlh->nlmsg_type != RTM_NEWTFILTER &&
2794             nlh->nlmsg_type != RTM_GETTFILTER &&
2795             nlh->nlmsg_type != RTM_DELTFILTER)
2796                 return -1;
2797         len -= NLMSG_LENGTH(sizeof(*t));
2798         if (len < 0)
2799                 return -1;
2800         flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
2801         /* Not a TC flower flow - bail out */
2802         if (!tb[TCA_KIND] ||
2803             strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
2804                 return -1;
2805         switch (rta_type[idx]) {
2806         case TCA_OPTIONS:
2807                 if (tb[TCA_OPTIONS])
2808                         return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
2809                                                               rta_type,
2810                                                               --idx, data);
2811                 break;
2812         default:
2813                 break;
2814         }
2815         return -1;
2816 }
2817
2818 /**
2819  * A callback to parse Netlink reply on TC flower query.
2820  *
2821  * @param nlh
2822  *   Message received from Netlink.
2823  * @param[out] data
2824  *   Pointer to the data area to be filled by the parsing routine,
2825  *   assumed to be a pointer to struct flow_tcf_stats_basic.
2826  *
2827  * @return
2828  *   MNL_CB_OK value.
2829  */
2830 static int
2831 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
2832 {
2833         /*
2834          * The backward sequence of rta_types to pass in order to get
2835          * to the counters.
2836          */
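        /*
         * The parsers above consume this array from the last entry to the
         * first: TCA_OPTIONS -> TCA_FLOWER_ACT -> TCA_ACT_STATS ->
         * TCA_STATS_BASIC.
         */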
2837         uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
2838                                 TCA_FLOWER_ACT, TCA_OPTIONS };
2839         struct flow_tcf_stats_basic *sb_data = data;
2840         union {
2841                 const struct nlmsghdr *c;
2842                 struct nlmsghdr *nc;
2843         } tnlh = { .c = nlh };
2844
2845         if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
2846                                               RTE_DIM(rta_type) - 1,
2847                                               (void *)&sb_data->counters))
2848                 sb_data->valid = true;
2849         return MNL_CB_OK;
2850 }
2851
2852 /**
2853  * Query a TC flower rule for its statistics via netlink.
2854  *
2855  * @param[in] dev
2856  *   Pointer to Ethernet device.
2857  * @param[in] flow
2858  *   Pointer to the sub flow.
2859  * @param[out] data
2860  *   Data retrieved by the query.
2861  * @param[out] error
2862  *   Perform verbose error reporting if not NULL.
2863  *
2864  * @return
2865  *   0 on success, a negative errno value otherwise and rte_errno is set.
2866  */
2867 static int
2868 flow_tcf_query_count(struct rte_eth_dev *dev,
2869                           struct rte_flow *flow,
2870                           void *data,
2871                           struct rte_flow_error *error)
2872 {
2873         struct flow_tcf_stats_basic sb_data = { 0 };
2874         struct rte_flow_query_count *qc = data;
2875         struct priv *priv = dev->data->dev_private;
2876         struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
2877         struct mnl_socket *nl = ctx->nl;
2878         struct mlx5_flow *dev_flow;
2879         struct nlmsghdr *nlh;
2880         uint32_t seq = priv->tcf_context->seq++;
2881         ssize_t ret;
2882         assert(qc);
2883
2884         dev_flow = LIST_FIRST(&flow->dev_flows);
2885         /* E-Switch flow can't be expanded. */
2886         assert(!LIST_NEXT(dev_flow, next));
2887         if (!dev_flow->flow->counter)
2888                 goto notsup_exit;
2889         nlh = dev_flow->tcf.nlh;
2890         nlh->nlmsg_type = RTM_GETTFILTER;
2891         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
2892         nlh->nlmsg_seq = seq;
2893         if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
2894                 goto error_exit;
2895         do {
2896                 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
2897                 if (ret <= 0)
2898                         break;
2899                 ret = mnl_cb_run(ctx->buf, ret, seq,
2900                                  mnl_socket_get_portid(nl),
2901                                  flow_tcf_nl_message_get_stats_basic,
2902                                  (void *)&sb_data);
2903         } while (ret > 0);
2904         /* Report the counters only if valid statistics were parsed. */
2905         if (sb_data.valid) {
2906                 /* Return the delta from last reset. */
2907                 qc->hits_set = 1;
2908                 qc->bytes_set = 1;
2909                 qc->hits = sb_data.counters.packets - flow->counter->hits;
2910                 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
2911                 if (qc->reset) {
2912                         flow->counter->hits = sb_data.counters.packets;
2913                         flow->counter->bytes = sb_data.counters.bytes;
2914                 }
2915                 return 0;
2916         }
2917         return rte_flow_error_set(error, EINVAL,
2918                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2919                                   NULL,
2920                                   "flow does not have counter");
2921 error_exit:
2922         return rte_flow_error_set
2923                         (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2924                          NULL, "netlink: failed to read flow rule counters");
2925 notsup_exit:
2926         return rte_flow_error_set
2927                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
2928                          NULL, "counters are not available.");
2929 }
2930
2931 /**
2932  * Query a flow.
2933  *
2934  * @see rte_flow_query()
2935  * @see rte_flow_ops
2936  */
2937 static int
2938 flow_tcf_query(struct rte_eth_dev *dev,
2939                struct rte_flow *flow,
2940                const struct rte_flow_action *actions,
2941                void *data,
2942                struct rte_flow_error *error)
2943 {
2944         int ret = -EINVAL;
2945
2946         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2947                 switch (actions->type) {
2948                 case RTE_FLOW_ACTION_TYPE_VOID:
2949                         break;
2950                 case RTE_FLOW_ACTION_TYPE_COUNT:
2951                         ret = flow_tcf_query_count(dev, flow, data, error);
2952                         break;
2953                 default:
2954                         return rte_flow_error_set(error, ENOTSUP,
2955                                                   RTE_FLOW_ERROR_TYPE_ACTION,
2956                                                   actions,
2957                                                   "action not supported");
2958                 }
2959         }
2960         return ret;
2961 }
2962
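/*
 * Illustrative sketch (compiled out): querying the COUNT action of an
 * E-Switch rule through the generic rte_flow API, which for rules handled
 * by this driver ends up in flow_tcf_query() above. The function name is
 * hypothetical; port_id and flow are assumed to come from the caller.
 */
#if 0
static void
flow_tcf_query_count_example(uint16_t port_id, struct rte_flow *flow)
{
	struct rte_flow_query_count qc = { .reset = 1 };
	const struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	struct rte_flow_error err;

	if (!rte_flow_query(port_id, flow, actions, &qc, &err) && qc.hits_set) {
		/* qc.hits/qc.bytes hold the delta since the last reset. */
	}
}
#endif
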
2963 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
2964         .validate = flow_tcf_validate,
2965         .prepare = flow_tcf_prepare,
2966         .translate = flow_tcf_translate,
2967         .apply = flow_tcf_apply,
2968         .remove = flow_tcf_remove,
2969         .destroy = flow_tcf_destroy,
2970         .query = flow_tcf_query,
2971 };
2972
2973 /**
2974  * Create and configure a libmnl socket for Netlink flow rules.
2975  *
2976  * @return
2977  *   A valid libmnl socket object pointer on success, NULL otherwise and
2978  *   rte_errno is set.
2979  */
2980 static struct mnl_socket *
2981 flow_tcf_mnl_socket_create(void)
2982 {
2983         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
2984
2985         if (nl) {
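                /*
                 * NETLINK_CAP_ACK requests the kernel to omit the original
                 * message payload from acknowledgments, keeping replies small.
                 */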
2986                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
2987                                       sizeof(int));
2988                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
2989                         return nl;
2990         }
2991         rte_errno = errno;
2992         if (nl)
2993                 mnl_socket_close(nl);
2994         return NULL;
2995 }
2996
2997 /**
2998  * Destroy a libmnl socket.
2999  *
3000  * @param nl
3001  *   Libmnl socket of the @p NETLINK_ROUTE kind.
3002  */
3003 static void
3004 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
3005 {
3006         if (nl)
3007                 mnl_socket_close(nl);
3008 }
3009
3010 /**
3011  * Initialize ingress qdisc of a given network interface.
3012  *
3013  * @param ctx
3014  *   Pointer to tc-flower context to use.
3015  * @param ifindex
3016  *   Index of network interface to initialize.
3017  * @param[out] error
3018  *   Perform verbose error reporting if not NULL.
3019  *
3020  * @return
3021  *   0 on success, a negative errno value otherwise and rte_errno is set.
3022  */
3023 int
3024 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
3025                    unsigned int ifindex, struct rte_flow_error *error)
3026 {
3027         struct nlmsghdr *nlh;
3028         struct tcmsg *tcm;
3029         alignas(struct nlmsghdr)
3030         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
3031
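        /*
         * The two requests built below are roughly equivalent to:
         *   tc qdisc del dev <ifname> ingress   (errors ignored if absent)
         *   tc qdisc add dev <ifname> ingress
         */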
3032         /* Destroy existing ingress qdisc and everything attached to it. */
3033         nlh = mnl_nlmsg_put_header(buf);
3034         nlh->nlmsg_type = RTM_DELQDISC;
3035         nlh->nlmsg_flags = NLM_F_REQUEST;
3036         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
3037         tcm->tcm_family = AF_UNSPEC;
3038         tcm->tcm_ifindex = ifindex;
3039         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
3040         tcm->tcm_parent = TC_H_INGRESS;
3041         /* Ignore errors when qdisc is already absent. */
3042         if (flow_tcf_nl_ack(ctx, nlh) &&
3043             rte_errno != EINVAL && rte_errno != ENOENT)
3044                 return rte_flow_error_set(error, rte_errno,
3045                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
3046                                           "netlink: failed to remove ingress"
3047                                           " qdisc");
3048         /* Create fresh ingress qdisc. */
3049         nlh = mnl_nlmsg_put_header(buf);
3050         nlh->nlmsg_type = RTM_NEWQDISC;
3051         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
3052         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
3053         tcm->tcm_family = AF_UNSPEC;
3054         tcm->tcm_ifindex = ifindex;
3055         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
3056         tcm->tcm_parent = TC_H_INGRESS;
3057         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
3058         if (flow_tcf_nl_ack(ctx, nlh))
3059                 return rte_flow_error_set(error, rte_errno,
3060                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
3061                                           "netlink: failed to create ingress"
3062                                           " qdisc");
3063         return 0;
3064 }
3065
3066 /**
3067  * Create a TC flower context for Netlink flow rules.
3068  *
3069  * @return
3070  *   A valid TC flower context pointer on success, NULL otherwise and
3071  *   rte_errno is set.
3072  */
3073 struct mlx5_flow_tcf_context *
3074 mlx5_flow_tcf_context_create(void)
3075 {
3076         struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
3077                                                         sizeof(*ctx),
3078                                                         sizeof(uint32_t));
3079         if (!ctx)
3080                 goto error;
3081         ctx->nl = flow_tcf_mnl_socket_create();
3082         if (!ctx->nl)
3083                 goto error;
3084         ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
3085         ctx->buf = rte_zmalloc(__func__,
3086                                ctx->buf_size, sizeof(uint32_t));
3087         if (!ctx->buf)
3088                 goto error;
3089         ctx->seq = random();
3090         return ctx;
3091 error:
3092         mlx5_flow_tcf_context_destroy(ctx);
3093         return NULL;
3094 }
3095
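/*
 * Illustrative sketch (compiled out): typical lifecycle of the TC flower
 * context, assuming ifindex refers to a valid network interface. The
 * function name is hypothetical.
 */
#if 0
static void
flow_tcf_context_lifecycle_example(unsigned int ifindex)
{
	struct rte_flow_error err;
	struct mlx5_flow_tcf_context *tcf = mlx5_flow_tcf_context_create();

	if (tcf && !mlx5_flow_tcf_init(tcf, ifindex, &err)) {
		/* E-Switch rules can now be offloaded through tcf. */
	}
	mlx5_flow_tcf_context_destroy(tcf);
}
#endif
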
3096 /**
3097  * Destroy a TC flower context.
3098  *
3099  * @param ctx
3100  *   Pointer to the TC flower context to be destroyed.
3101  */
3102 void
3103 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
3104 {
3105         if (!ctx)
3106                 return;
3107         flow_tcf_mnl_socket_destroy(ctx->nl);
3108         rte_free(ctx->buf);
3109         rte_free(ctx);
3110 }