1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
23 #include <sys/socket.h>
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
36 #ifdef HAVE_TC_ACT_VLAN
38 #include <linux/tc_act/tc_vlan.h>
40 #else /* HAVE_TC_ACT_VLAN */
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
56 #endif /* HAVE_TC_ACT_VLAN */
58 #ifdef HAVE_TC_ACT_PEDIT
60 #include <linux/tc_act/tc_pedit.h>
62 #else /* HAVE_TC_ACT_PEDIT */
76 TCA_PEDIT_KEY_EX_HTYPE = 1,
77 TCA_PEDIT_KEY_EX_CMD = 2,
78 __TCA_PEDIT_KEY_EX_MAX
81 enum pedit_header_type {
82 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83 TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84 TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85 TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86 TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87 TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
92 TCA_PEDIT_KEY_EX_CMD_SET = 0,
93 TCA_PEDIT_KEY_EX_CMD_ADD = 1,
100 __u32 off; /* offset */
107 struct tc_pedit_sel {
111 struct tc_pedit_key keys[0];
114 #endif /* HAVE_TC_ACT_PEDIT */
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
118 #include <linux/tc_act/tc_tunnel_key.h>
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
152 struct tc_tunnel_key {
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
159 /* Normally found in linux/netlink.h. */
160 #ifndef NETLINK_CAP_ACK
161 #define NETLINK_CAP_ACK 10
164 /* Normally found in linux/pkt_sched.h. */
165 #ifndef TC_H_MIN_INGRESS
166 #define TC_H_MIN_INGRESS 0xfff2u
169 /* Normally found in linux/pkt_cls.h. */
170 #ifndef TCA_CLS_FLAGS_SKIP_SW
171 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
173 #ifndef TCA_CLS_FLAGS_IN_HW
174 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
176 #ifndef HAVE_TCA_CHAIN
179 #ifndef HAVE_TCA_FLOWER_ACT
180 #define TCA_FLOWER_ACT 3
182 #ifndef HAVE_TCA_FLOWER_FLAGS
183 #define TCA_FLOWER_FLAGS 22
185 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
186 #define TCA_FLOWER_KEY_ETH_TYPE 8
188 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
189 #define TCA_FLOWER_KEY_ETH_DST 4
191 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
192 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
194 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
195 #define TCA_FLOWER_KEY_ETH_SRC 6
197 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
198 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
200 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
201 #define TCA_FLOWER_KEY_IP_PROTO 9
203 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
204 #define TCA_FLOWER_KEY_IPV4_SRC 10
206 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
207 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
209 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
210 #define TCA_FLOWER_KEY_IPV4_DST 12
212 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
213 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
215 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
216 #define TCA_FLOWER_KEY_IPV6_SRC 14
218 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
219 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
221 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
222 #define TCA_FLOWER_KEY_IPV6_DST 16
224 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
225 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
227 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
228 #define TCA_FLOWER_KEY_TCP_SRC 18
230 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
231 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
233 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
234 #define TCA_FLOWER_KEY_TCP_DST 19
236 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
237 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
239 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
240 #define TCA_FLOWER_KEY_UDP_SRC 20
242 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
243 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
245 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
246 #define TCA_FLOWER_KEY_UDP_DST 21
248 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
249 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
251 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
252 #define TCA_FLOWER_KEY_VLAN_ID 23
254 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
255 #define TCA_FLOWER_KEY_VLAN_PRIO 24
257 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
258 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
260 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
261 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
263 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
264 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
266 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
267 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
269 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
270 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
272 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
273 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
275 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
276 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
278 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
279 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
281 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
282 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
284 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
285 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
287 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
288 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
290 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
291 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
293 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
294 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
296 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
297 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
299 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
300 #define TCA_FLOWER_KEY_TCP_FLAGS 71
302 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
303 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
305 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
306 #define TCA_FLOWER_KEY_IP_TOS 73
308 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
309 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
311 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
312 #define TCA_FLOWER_KEY_IP_TTL 75
314 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
315 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
317 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
318 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
320 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
321 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
323 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
324 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
326 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
327 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
330 #ifndef HAVE_TC_ACT_GOTO_CHAIN
331 #define TC_ACT_GOTO_CHAIN 0x20000000
334 #ifndef IPV6_ADDR_LEN
335 #define IPV6_ADDR_LEN 16
338 #ifndef IPV4_ADDR_LEN
339 #define IPV4_ADDR_LEN 4
343 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
350 #ifndef TCA_ACT_MAX_PRIO
351 #define TCA_ACT_MAX_PRIO 32
354 /** Parameters of VXLAN devices created by driver. */
355 #define MLX5_VXLAN_DEFAULT_VNI 1
356 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
358 /** Tunnel action type, used for @p type in header structure. */
359 enum flow_tcf_tunact_type {
360 FLOW_TCF_TUNACT_VXLAN_DECAP,
361 FLOW_TCF_TUNACT_VXLAN_ENCAP,
364 /** Flags used for @p mask in tunnel action encap descriptors. */
365 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
366 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
367 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
368 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
369 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
370 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
371 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
372 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
373 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
376 * Structure for holding netlink context.
377 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
378 * Using this (8KB) buffer size ensures that netlink messages will never be
381 struct mlx5_flow_tcf_context {
382 struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
383 uint32_t seq; /* Message sequence number. */
384 uint32_t buf_size; /* Message buffer size. */
385 uint8_t *buf; /* Message buffer. */
389 * Neigh rule structure. The neigh rule is applied via Netlink to the
390 * outer tunnel iface in order to provide the destination MAC address
391 * for the VXLAN encapsulation. The neigh rule is implicitly related
392 * to the Flow itself and can be shared by multiple Flows.
394 struct tcf_neigh_rule {
395 LIST_ENTRY(tcf_neigh_rule) next;
397 struct ether_addr eth;
404 uint8_t dst[IPV6_ADDR_LEN];
410 * Local rule structure. The local rule is applied via Netlink to the
411 * outer tunnel iface in order to provide the local and peer IP addresses
412 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
413 * related to the Flow itself and can be shared by multiple Flows.
415 struct tcf_local_rule {
416 LIST_ENTRY(tcf_local_rule) next;
425 uint8_t dst[IPV6_ADDR_LEN];
426 uint8_t src[IPV6_ADDR_LEN];
431 /** Outer interface VXLAN encapsulation rules container. */
433 LIST_ENTRY(tcf_irule) next;
434 LIST_HEAD(, tcf_neigh_rule) neigh;
435 LIST_HEAD(, tcf_local_rule) local;
437 unsigned int ifouter; /**< Own interface index. */
440 /** VXLAN virtual netdev. */
442 LIST_ENTRY(tcf_vtep) next;
444 unsigned int ifindex; /**< Own interface index. */
449 /** Tunnel descriptor header, common for all tunnel types. */
450 struct flow_tcf_tunnel_hdr {
451 uint32_t type; /**< Tunnel action type. */
452 struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
453 unsigned int ifindex_org; /**< Original dst/src interface. */
454 unsigned int *ifindex_ptr; /**< Interface ptr in message. */
457 struct flow_tcf_vxlan_decap {
458 struct flow_tcf_tunnel_hdr hdr;
462 struct flow_tcf_vxlan_encap {
463 struct flow_tcf_tunnel_hdr hdr;
464 struct tcf_irule *iface;
467 struct ether_addr dst;
468 struct ether_addr src;
476 uint8_t dst[IPV6_ADDR_LEN];
477 uint8_t src[IPV6_ADDR_LEN];
489 /** Structure used when extracting flow counter values
490 * from a netlink message.
492 struct flow_tcf_stats_basic {
494 struct gnet_stats_basic counters;
497 /** Empty masks for known item types. */
499 struct rte_flow_item_port_id port_id;
500 struct rte_flow_item_eth eth;
501 struct rte_flow_item_vlan vlan;
502 struct rte_flow_item_ipv4 ipv4;
503 struct rte_flow_item_ipv6 ipv6;
504 struct rte_flow_item_tcp tcp;
505 struct rte_flow_item_udp udp;
506 struct rte_flow_item_vxlan vxlan;
507 } flow_tcf_mask_empty = {
511 /** Supported masks for known item types. */
512 static const struct {
513 struct rte_flow_item_port_id port_id;
514 struct rte_flow_item_eth eth;
515 struct rte_flow_item_vlan vlan;
516 struct rte_flow_item_ipv4 ipv4;
517 struct rte_flow_item_ipv6 ipv6;
518 struct rte_flow_item_tcp tcp;
519 struct rte_flow_item_udp udp;
520 struct rte_flow_item_vxlan vxlan;
521 } flow_tcf_mask_supported = {
526 .type = RTE_BE16(0xffff),
527 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
528 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
531 /* PCP and VID only, no DEI. */
532 .tci = RTE_BE16(0xefff),
533 .inner_type = RTE_BE16(0xffff),
536 .next_proto_id = 0xff,
537 .src_addr = RTE_BE32(0xffffffff),
538 .dst_addr = RTE_BE32(0xffffffff),
543 "\xff\xff\xff\xff\xff\xff\xff\xff"
544 "\xff\xff\xff\xff\xff\xff\xff\xff",
546 "\xff\xff\xff\xff\xff\xff\xff\xff"
547 "\xff\xff\xff\xff\xff\xff\xff\xff",
550 .src_port = RTE_BE16(0xffff),
551 .dst_port = RTE_BE16(0xffff),
555 .src_port = RTE_BE16(0xffff),
556 .dst_port = RTE_BE16(0xffff),
559 .vni = "\xff\xff\xff",
563 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
564 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
565 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
566 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
567 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
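/*
 * Worked example (assuming the standard 4-byte struct nlattr header and
 * 4-byte netlink alignment from libmnl):
 *
 *   SZ_NLATTR_HDR                     = MNL_ALIGN(4)     = 4
 *   SZ_NLATTR_TYPE_OF(uint8_t)        = MNL_ALIGN(4 + 1) = 8
 *   SZ_NLATTR_TYPE_OF(uint32_t)       = MNL_ALIGN(4 + 4) = 8
 *   SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) = MNL_ALIGN(4 + 6) = 12
 *   SZ_NLATTR_STRZ_OF("pedit")        = MNL_ALIGN(4 + 6) = 12
 *
 * These units are used below to estimate Netlink message sizes before
 * the flow is translated.
 */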
569 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
571 /** DPDK port to network interface index (ifindex) conversion. */
572 struct flow_tcf_ptoi {
573 uint16_t port_id; /**< DPDK port ID. */
574 unsigned int ifindex; /**< Network interface index. */
577 /* Due to a limitation on driver/FW. */
578 #define MLX5_TCF_GROUP_ID_MAX 3
581 * Due to a limitation of the driver/FW, the priority ranges from 1 to 16
582 * in the kernel. The priority in the rte_flow attribute starts from 0 and is
583 * incremented by 1 during translation. This is subject to change so that the
584 * max priority is determined by trial-and-error, as the Verbs driver does,
585 * once the restriction is lifted or the range is extended.
587 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
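/*
 * Example of the mapping described above: an rte_flow rule created with
 * attr->priority == 0 is translated to kernel TC priority 1, while
 * attr->priority == MLX5_TCF_GROUP_PRIORITY_MAX (15) maps to the kernel
 * maximum of 16.
 */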
589 #define MLX5_TCF_FATE_ACTIONS \
590 (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
591 MLX5_FLOW_ACTION_JUMP)
593 #define MLX5_TCF_VLAN_ACTIONS \
594 (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
595 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
597 #define MLX5_TCF_VXLAN_ACTIONS \
598 (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
600 #define MLX5_TCF_PEDIT_ACTIONS \
601 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
602 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
603 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
604 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
605 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
607 #define MLX5_TCF_CONFIG_ACTIONS \
608 (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
609 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
610 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
611 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
613 #define MAX_PEDIT_KEYS 128
614 #define SZ_PEDIT_KEY_VAL 4
616 #define NUM_OF_PEDIT_KEYS(sz) \
617 (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
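/*
 * For illustration, NUM_OF_PEDIT_KEYS() rounds a field size up to a whole
 * number of 32-bit pedit keys:
 *
 *   NUM_OF_PEDIT_KEYS(TP_PORT_LEN)    = NUM_OF_PEDIT_KEYS(2)  = 1
 *   NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN)  = NUM_OF_PEDIT_KEYS(4)  = 1
 *   NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) = NUM_OF_PEDIT_KEYS(6)  = 2
 *   NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN)  = NUM_OF_PEDIT_KEYS(16) = 4
 */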
619 struct pedit_key_ex {
620 enum pedit_header_type htype;
624 struct pedit_parser {
625 struct tc_pedit_sel sel;
626 struct tc_pedit_key keys[MAX_PEDIT_KEYS];
627 struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
631 * Create space for using the implicitly created TC flow counter.
634 * Pointer to the Ethernet device structure.
637 * A pointer to the counter data structure, NULL otherwise and
640 static struct mlx5_flow_counter *
641 flow_tcf_counter_new(void)
643 struct mlx5_flow_counter *cnt;
646 * E-Switch counters cannot be shared and their IDs are unknown,
647 * currently all counters are returned with ID 0.
648 * In the future it may be better to switch to unique numbers.
650 struct mlx5_flow_counter tmpl = {
653 cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
659 /* Implicit counter, do not add to list. */
664 * Set pedit keys for the MAC address action.
667 * Pointer to the action specification.
668 * @param[in,out] p_parser
669 * Pointer to the pedit parser.
672 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
673 struct pedit_parser *p_parser)
675 int idx = p_parser->sel.nkeys;
676 uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
677 offsetof(struct ether_hdr, s_addr) :
678 offsetof(struct ether_hdr, d_addr);
679 const struct rte_flow_action_set_mac *conf =
680 (const struct rte_flow_action_set_mac *)actions->conf;
682 p_parser->keys[idx].off = off;
683 p_parser->keys[idx].mask = ~UINT32_MAX;
684 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
685 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
686 memcpy(&p_parser->keys[idx].val,
687 conf->mac_addr, SZ_PEDIT_KEY_VAL);
689 p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
690 p_parser->keys[idx].mask = 0xFFFF0000;
691 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
692 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
693 memcpy(&p_parser->keys[idx].val,
694 conf->mac_addr + SZ_PEDIT_KEY_VAL,
695 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
696 p_parser->sel.nkeys = (++idx);
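	/*
	 * Illustration of the layout built above: a 6-byte MAC address does
	 * not fit in a single 32-bit pedit key, so it is programmed as two
	 * keys - a full 32-bit rewrite at the address offset, and a partial
	 * rewrite at offset + 4 where the 0xFFFF0000 mask tells the kernel
	 * pedit action to keep the adjacent 16 bits of that word intact.
	 */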
700 * Set the pedit key for the decrement/set TTL actions.
703 * Pointer to the action specification.
704 * @param[in,out] p_parser
705 * Pointer to the pedit parser.
706 * @param[in] item_flags
707 * Flags of all items present.
710 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
711 struct pedit_parser *p_parser,
714 int idx = p_parser->sel.nkeys;
716 p_parser->keys[idx].mask = 0xFFFFFF00;
717 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
718 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
719 p_parser->keys[idx].off =
720 offsetof(struct ipv4_hdr, time_to_live);
722 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
723 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
724 p_parser->keys[idx].off =
725 offsetof(struct ipv6_hdr, hop_limits);
727 if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
728 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
729 p_parser->keys[idx].val = 0x000000FF;
731 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
732 p_parser->keys[idx].val =
733 (__u32)((const struct rte_flow_action_set_ttl *)
734 actions->conf)->ttl_value;
736 p_parser->sel.nkeys = (++idx);
740 * Set the pedit key for the transport (TCP/UDP) port value.
743 * Pointer to the action specification.
744 * @param[in,out] p_parser
745 * Pointer to the pedit parser.
746 * @param[in] item_flags
747 * Flags of all items present.
750 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
751 struct pedit_parser *p_parser,
754 int idx = p_parser->sel.nkeys;
756 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
757 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
758 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
759 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
760 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
761 /* The offset of the src/dst port is the same for TCP and UDP. */
762 p_parser->keys[idx].off =
763 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
764 offsetof(struct tcp_hdr, src_port) :
765 offsetof(struct tcp_hdr, dst_port);
766 p_parser->keys[idx].mask = 0xFFFF0000;
767 p_parser->keys[idx].val =
768 (__u32)((const struct rte_flow_action_set_tp *)
769 actions->conf)->port;
770 p_parser->sel.nkeys = (++idx);
774 * Set pedit keys for an IPv6 address.
777 * Pointer to the action specification.
778 * @param[in,out] p_parser
779 * Pointer to the pedit parser.
782 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
783 struct pedit_parser *p_parser)
785 int idx = p_parser->sel.nkeys;
786 int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
788 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
789 offsetof(struct ipv6_hdr, src_addr) :
790 offsetof(struct ipv6_hdr, dst_addr);
791 const struct rte_flow_action_set_ipv6 *conf =
792 (const struct rte_flow_action_set_ipv6 *)actions->conf;
794 for (int i = 0; i < keys; i++, idx++) {
795 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
796 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
797 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
798 p_parser->keys[idx].mask = ~UINT32_MAX;
799 memcpy(&p_parser->keys[idx].val,
800 conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
803 p_parser->sel.nkeys += keys;
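	/*
	 * Illustration: a 16-byte IPv6 address maps to exactly four 32-bit
	 * pedit keys (NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN) == 4), each a full
	 * 32-bit rewrite at off_base, off_base + 4, off_base + 8 and
	 * off_base + 12 respectively.
	 */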
807 * Set the pedit key for an IPv4 address.
810 * Pointer to the action specification.
811 * @param[in,out] p_parser
812 * Pointer to the pedit parser.
815 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
816 struct pedit_parser *p_parser)
818 int idx = p_parser->sel.nkeys;
820 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
821 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
822 p_parser->keys[idx].off =
823 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
824 offsetof(struct ipv4_hdr, src_addr) :
825 offsetof(struct ipv4_hdr, dst_addr);
826 p_parser->keys[idx].mask = ~UINT32_MAX;
827 p_parser->keys[idx].val =
828 ((const struct rte_flow_action_set_ipv4 *)
829 actions->conf)->ipv4_addr;
830 p_parser->sel.nkeys = (++idx);
834 * Create the pedit netlink attributes in the netlink message
835 * using the pre-allocated message buffer.
838 * Pointer to the pre-allocated netlink message buffer.
839 * @param[in,out] actions
840 * Pointer to a pointer to the actions specification.
841 * @param[in,out] action_flags
842 * Pointer to the action flags.
843 * @param[in] item_flags
844 * Flags of all items present.
847 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
848 const struct rte_flow_action **actions,
851 struct pedit_parser p_parser;
852 struct nlattr *na_act_options;
853 struct nlattr *na_pedit_keys;
855 memset(&p_parser, 0, sizeof(p_parser));
856 mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
857 na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
858 /* All modify-header actions should be in one TC pedit action. */
859 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
860 switch ((*actions)->type) {
861 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
862 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
863 flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
865 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
866 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
867 flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
869 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
870 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
871 flow_tcf_pedit_key_set_tp_port(*actions,
872 &p_parser, item_flags);
874 case RTE_FLOW_ACTION_TYPE_SET_TTL:
875 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
876 flow_tcf_pedit_key_set_dec_ttl(*actions,
877 &p_parser, item_flags);
879 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
880 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
881 flow_tcf_pedit_key_set_mac(*actions, &p_parser);
884 goto pedit_mnl_msg_done;
888 p_parser.sel.action = TC_ACT_PIPE;
889 mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
890 sizeof(p_parser.sel) +
891 p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
894 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
895 for (int i = 0; i < p_parser.sel.nkeys; i++) {
896 struct nlattr *na_pedit_key =
897 mnl_attr_nest_start(nl,
898 TCA_PEDIT_KEY_EX | NLA_F_NESTED);
899 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
900 p_parser.keys_ex[i].htype);
901 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
902 p_parser.keys_ex[i].cmd);
903 mnl_attr_nest_end(nl, na_pedit_key);
905 mnl_attr_nest_end(nl, na_pedit_keys);
906 mnl_attr_nest_end(nl, na_act_options);
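	/*
	 * Resulting attribute layout (sketch of what the calls above emit):
	 *
	 *   TCA_ACT_KIND = "pedit"
	 *   TCA_ACT_OPTIONS (nested)
	 *     TCA_PEDIT_PARMS_EX (struct tc_pedit_sel + nkeys keys)
	 *     TCA_PEDIT_KEYS_EX (nested)
	 *       TCA_PEDIT_KEY_EX (nested)
	 *         TCA_PEDIT_KEY_EX_HTYPE
	 *         TCA_PEDIT_KEY_EX_CMD
	 *       ...one TCA_PEDIT_KEY_EX per key...
	 */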
911 * Calculate the maximum memory size of one TC pedit action.
912 * One TC pedit action can contain a set of keys, each defining
913 * a rewrite element (rte_flow action).
915 * @param[in,out] actions
916 * Actions specification.
917 * @param[in,out] action_flags
919 * @param[in,out] size
922 * Maximum memory size of one TC pedit action.
925 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
926 uint64_t *action_flags)
932 pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
933 SZ_NLATTR_STRZ_OF("pedit") +
934 SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
935 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
936 switch ((*actions)->type) {
937 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
938 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
939 flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
941 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
942 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
943 flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
945 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
946 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
947 flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
949 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
950 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
951 flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
953 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
954 /* TCP is the same as UDP. */
955 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
956 flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
958 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
959 /* TCP is the same as UDP. */
960 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
961 flags |= MLX5_FLOW_ACTION_SET_TP_DST;
963 case RTE_FLOW_ACTION_TYPE_SET_TTL:
964 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
965 flags |= MLX5_FLOW_ACTION_SET_TTL;
967 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
968 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
969 flags |= MLX5_FLOW_ACTION_DEC_TTL;
971 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
972 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
973 flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
975 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
976 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
977 flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
980 goto get_pedit_action_size_done;
983 get_pedit_action_size_done:
984 /* TCA_PEDIT_PARMS_EX */
986 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
987 keys * sizeof(struct tc_pedit_key));
988 pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
990 /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
991 (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
992 SZ_NLATTR_DATA_OF(2));
993 (*action_flags) |= flags;
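	/*
	 * Worked example (using the attribute sizes sketched above): a single
	 * SET_TTL action yields keys == 1, so the estimate covers the "pedit"
	 * kind string, the TCA_ACT_OPTIONS nest, TCA_PEDIT_PARMS_EX holding
	 * one struct tc_pedit_sel plus one struct tc_pedit_key, and a single
	 * nested TCA_PEDIT_KEY_EX with its two 16-bit attributes.
	 */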
999 * Retrieve mask for pattern item.
1001 * This function does basic sanity checks on a pattern item in order to
1002 * return the most appropriate mask for it.
1005 * Item specification.
1006 * @param[in] mask_default
1007 * Default mask for pattern item as specified by the flow API.
1008 * @param[in] mask_supported
1009 * Mask fields supported by the implementation.
1010 * @param[in] mask_empty
1011 * Empty mask to return when there is no specification.
1013 * Perform verbose error reporting if not NULL.
1016 * Either @p item->mask or one of the mask parameters on success, NULL
1017 * otherwise and rte_errno is set.
1020 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1021 const void *mask_supported, const void *mask_empty,
1022 size_t mask_size, struct rte_flow_error *error)
1024 const uint8_t *mask;
1027 /* item->last and item->mask cannot exist without item->spec. */
1028 if (!item->spec && (item->mask || item->last)) {
1029 rte_flow_error_set(error, EINVAL,
1030 RTE_FLOW_ERROR_TYPE_ITEM, item,
1031 "\"mask\" or \"last\" field provided without"
1032 " a corresponding \"spec\"");
1035 /* No spec, no mask, no problem. */
1038 mask = item->mask ? item->mask : mask_default;
1041 * Single-pass check to make sure that:
1042 * - Mask is supported, no bits are set outside mask_supported.
1043 * - Both item->spec and item->last are included in mask.
1045 for (i = 0; i != mask_size; ++i) {
1048 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1049 ((const uint8_t *)mask_supported)[i]) {
1050 rte_flow_error_set(error, ENOTSUP,
1051 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1052 "unsupported field found"
1057 (((const uint8_t *)item->spec)[i] & mask[i]) !=
1058 (((const uint8_t *)item->last)[i] & mask[i])) {
1059 rte_flow_error_set(error, EINVAL,
1060 RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1062 "range between \"spec\" and \"last\""
1063 " not comprised in \"mask\"");
1071 * Build a conversion table between port ID and ifindex.
1074 * Pointer to Ethernet device.
1076 * Pointer to ptoi table.
1078 * Size of ptoi table provided.
1081 * Size of ptoi table filled.
1084 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1087 unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1088 uint16_t port_id[n + 1];
1090 unsigned int own = 0;
1092 /* At least one port is needed when no switch domain is present. */
1095 port_id[0] = dev->data->port_id;
1097 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1101 for (i = 0; i != n; ++i) {
1102 struct rte_eth_dev_info dev_info;
1104 rte_eth_dev_info_get(port_id[i], &dev_info);
1105 if (port_id[i] == dev->data->port_id)
1107 ptoi[i].port_id = port_id[i];
1108 ptoi[i].ifindex = dev_info.if_index;
1110 /* Ensure first entry of ptoi[] is the current device. */
1113 ptoi[0] = ptoi[own];
1114 ptoi[own] = ptoi[n];
1116 /* An entry with zero ifindex terminates ptoi[]. */
1117 ptoi[n].port_id = 0;
1118 ptoi[n].ifindex = 0;
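	/*
	 * Example of a filled table (hypothetical port IDs and ifindexes)
	 * for a two-port setup:
	 *
	 *   ptoi[0] = { .port_id = 0, .ifindex = 7 }  <- always the caller
	 *   ptoi[1] = { .port_id = 1, .ifindex = 8 }
	 *   ptoi[2] = { .port_id = 0, .ifindex = 0 }  <- terminator
	 *
	 * Callers scan the table until they hit .ifindex == 0.
	 */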
1123 * Verify the @p attr will be correctly understood by the E-switch.
1126 * Pointer to flow attributes
1128 * Pointer to error structure.
1131 * 0 on success, a negative errno value otherwise and rte_errno is set.
1134 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1135 struct rte_flow_error *error)
1138 * Supported attributes: groups, some priorities and ingress only.
1139 * Group is supported only if the kernel supports chains. Transfer is
1140 * not checked here as it is the caller's responsibility.
1142 if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1143 return rte_flow_error_set(error, ENOTSUP,
1144 RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1145 "group ID larger than "
1146 RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1147 " isn't supported");
1148 else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1149 return rte_flow_error_set(error, ENOTSUP,
1150 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1152 "priority more than "
1153 RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1154 " is not supported");
1156 return rte_flow_error_set(error, EINVAL,
1157 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1158 attr, "only ingress is supported");
1160 return rte_flow_error_set(error, ENOTSUP,
1161 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1162 attr, "egress is not supported");
1167 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1168 * The routine checks the L2 fields to be used in encapsulation header.
1171 * Pointer to the item structure.
1173 * Pointer to the error structure.
1176 * 0 on success, a negative errno value otherwise and rte_errno is set.
1179 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1180 struct rte_flow_error *error)
1182 const struct rte_flow_item_eth *spec = item->spec;
1183 const struct rte_flow_item_eth *mask = item->mask;
1187 * Specification for L2 addresses can be empty
1188 * because they are optional and not required
1189 * directly by the tc rule. The kernel tries to
1190 * resolve them on its own
1195 /* If the mask is not specified, use the default one. */
1196 mask = &rte_flow_item_eth_mask;
1198 if (memcmp(&mask->dst,
1199 &flow_tcf_mask_empty.eth.dst,
1200 sizeof(flow_tcf_mask_empty.eth.dst))) {
1201 if (memcmp(&mask->dst,
1202 &rte_flow_item_eth_mask.dst,
1203 sizeof(rte_flow_item_eth_mask.dst)))
1204 return rte_flow_error_set
1206 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1207 "no support for partial mask on"
1208 " \"eth.dst\" field");
1210 if (memcmp(&mask->src,
1211 &flow_tcf_mask_empty.eth.src,
1212 sizeof(flow_tcf_mask_empty.eth.src))) {
1213 if (memcmp(&mask->src,
1214 &rte_flow_item_eth_mask.src,
1215 sizeof(rte_flow_item_eth_mask.src)))
1216 return rte_flow_error_set
1218 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1219 "no support for partial mask on"
1220 " \"eth.src\" field");
1222 if (mask->type != RTE_BE16(0x0000)) {
1223 if (mask->type != RTE_BE16(0xffff))
1224 return rte_flow_error_set
1226 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1227 "no support for partial mask on"
1228 " \"eth.type\" field");
1230 "outer ethernet type field"
1231 " cannot be forced for vxlan"
1232 " encapsulation, parameter ignored");
1238 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1239 * The routine checks the IPv4 fields to be used in encapsulation header.
1242 * Pointer to the item structure.
1244 * Pointer to the error structure.
1247 * 0 on success, a negative errno value otherwise and rte_errno is set.
1250 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1251 struct rte_flow_error *error)
1253 const struct rte_flow_item_ipv4 *spec = item->spec;
1254 const struct rte_flow_item_ipv4 *mask = item->mask;
1258 * Specification for IP addresses cannot be empty
1259 * because it is required by tunnel_key parameter.
1261 return rte_flow_error_set(error, EINVAL,
1262 RTE_FLOW_ERROR_TYPE_ITEM, item,
1263 "NULL outer ipv4 address"
1264 " specification for vxlan"
1268 mask = &rte_flow_item_ipv4_mask;
1269 if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1270 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1271 return rte_flow_error_set
1273 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1274 "no support for partial mask on"
1275 " \"ipv4.hdr.dst_addr\" field"
1276 " for vxlan encapsulation");
1277 /* More IPv4 address validations can be put here. */
1280 * The kernel uses the destination IP address to determine
1281 * the routing path and obtain the destination MAC
1282 * address, so the IP destination address must be
1283 * specified in the tc rule.
1285 return rte_flow_error_set(error, EINVAL,
1286 RTE_FLOW_ERROR_TYPE_ITEM, item,
1287 "outer ipv4 destination address"
1288 " must be specified for"
1289 " vxlan encapsulation");
1291 if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1292 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1293 return rte_flow_error_set
1295 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1296 "no support for partial mask on"
1297 " \"ipv4.hdr.src_addr\" field"
1298 " for vxlan encapsulation");
1299 /* More IPv4 address validations can be put here. */
1302 * Kernel uses the source IP address to select the
1303 * interface for egress encapsulated traffic, so
1304 * it must be specified in the tc rule.
1306 return rte_flow_error_set(error, EINVAL,
1307 RTE_FLOW_ERROR_TYPE_ITEM, item,
1308 "outer ipv4 source address"
1309 " must be specified for"
1310 " vxlan encapsulation");
1316 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1317 * The routine checks the IPv6 fields to be used in encapsulation header.
1320 * Pointer to the item structure.
1322 * Pointer to the error structure.
1325 * 0 on success, a negative errno value otherwise and rte_errno is set.
1328 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1329 struct rte_flow_error *error)
1331 const struct rte_flow_item_ipv6 *spec = item->spec;
1332 const struct rte_flow_item_ipv6 *mask = item->mask;
1336 * Specification for IP addresses cannot be empty
1337 * because it is required by tunnel_key parameter.
1339 return rte_flow_error_set(error, EINVAL,
1340 RTE_FLOW_ERROR_TYPE_ITEM, item,
1341 "NULL outer ipv6 address"
1342 " specification for"
1343 " vxlan encapsulation");
1346 mask = &rte_flow_item_ipv6_mask;
1347 if (memcmp(&mask->hdr.dst_addr,
1348 &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1350 if (memcmp(&mask->hdr.dst_addr,
1351 &rte_flow_item_ipv6_mask.hdr.dst_addr,
1353 return rte_flow_error_set
1355 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1356 "no support for partial mask on"
1357 " \"ipv6.hdr.dst_addr\" field"
1358 " for vxlan encapsulation");
1359 /* More IPv6 address validations can be put here. */
1362 * The kernel uses the destination IP address to determine
1363 * the routing path and obtain the destination MAC
1364 * address (neighbor or gateway), so the IP destination address
1365 * must be specified within the tc rule.
1367 return rte_flow_error_set(error, EINVAL,
1368 RTE_FLOW_ERROR_TYPE_ITEM, item,
1369 "outer ipv6 destination address"
1370 " must be specified for"
1371 " vxlan encapsulation");
1373 if (memcmp(&mask->hdr.src_addr,
1374 &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1376 if (memcmp(&mask->hdr.src_addr,
1377 &rte_flow_item_ipv6_mask.hdr.src_addr,
1379 return rte_flow_error_set
1381 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1382 "no support for partial mask on"
1383 " \"ipv6.hdr.src_addr\" field"
1384 " for vxlan encapsulation");
1385 /* More L3 address validation can be put here. */
1388 * Kernel uses the source IP address to select the
1389 * interface for egress encapsulated traffic, so
1390 * it must be specified in the tc rule.
1392 return rte_flow_error_set(error, EINVAL,
1393 RTE_FLOW_ERROR_TYPE_ITEM, item,
1394 "outer L3 source address"
1395 " must be specified for"
1396 " vxlan encapsulation");
1402 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1403 * The routine checks the UDP fields to be used in encapsulation header.
1406 * Pointer to the item structure.
1408 * Pointer to the error structure.
1411 * 0 on success, a negative errno value otherwise and rte_errno is set.
1414 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1415 struct rte_flow_error *error)
1417 const struct rte_flow_item_udp *spec = item->spec;
1418 const struct rte_flow_item_udp *mask = item->mask;
1422 * Specification for UDP ports cannot be empty
1423 * because it is required by tunnel_key parameter.
1425 return rte_flow_error_set(error, EINVAL,
1426 RTE_FLOW_ERROR_TYPE_ITEM, item,
1427 "NULL UDP port specification "
1428 " for vxlan encapsulation");
1431 mask = &rte_flow_item_udp_mask;
1432 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1433 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1434 return rte_flow_error_set
1436 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1437 "no support for partial mask on"
1438 " \"udp.hdr.dst_port\" field"
1439 " for vxlan encapsulation");
1440 if (!spec->hdr.dst_port)
1441 return rte_flow_error_set
1443 RTE_FLOW_ERROR_TYPE_ITEM, item,
1444 "outer UDP remote port cannot be"
1445 " 0 for vxlan encapsulation");
1447 return rte_flow_error_set(error, EINVAL,
1448 RTE_FLOW_ERROR_TYPE_ITEM, item,
1449 "outer UDP remote port"
1450 " must be specified for"
1451 " vxlan encapsulation");
1453 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1454 if (mask->hdr.src_port != RTE_BE16(0xffff))
1455 return rte_flow_error_set
1457 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1458 "no support for partial mask on"
1459 " \"udp.hdr.src_port\" field"
1460 " for vxlan encapsulation");
1462 "outer UDP source port cannot be"
1463 " forced for vxlan encapsulation,"
1464 " parameter ignored");
1470 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1471 * The routine checks the VNI field to be used in the encapsulation header.
1474 * Pointer to the item structure.
1476 * Pointer to the error structure.
1479 * 0 on success, a negative errno value otherwise and rte_errno is set.
1482 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1483 struct rte_flow_error *error)
1485 const struct rte_flow_item_vxlan *spec = item->spec;
1486 const struct rte_flow_item_vxlan *mask = item->mask;
1489 /* Outer VNI is required by tunnel_key parameter. */
1490 return rte_flow_error_set(error, EINVAL,
1491 RTE_FLOW_ERROR_TYPE_ITEM, item,
1492 "NULL VNI specification"
1493 " for vxlan encapsulation");
1496 mask = &rte_flow_item_vxlan_mask;
1497 if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1498 return rte_flow_error_set(error, EINVAL,
1499 RTE_FLOW_ERROR_TYPE_ITEM, item,
1500 "outer VNI must be specified "
1501 "for vxlan encapsulation");
1502 if (mask->vni[0] != 0xff ||
1503 mask->vni[1] != 0xff ||
1504 mask->vni[2] != 0xff)
1505 return rte_flow_error_set(error, ENOTSUP,
1506 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1507 "no support for partial mask on"
1508 " \"vxlan.vni\" field");
1510 if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1511 return rte_flow_error_set(error, EINVAL,
1512 RTE_FLOW_ERROR_TYPE_ITEM, item,
1513 "vxlan vni cannot be 0");
1518 * Validate VXLAN_ENCAP action item list for E-Switch.
1519 * The routine checks items to be used in encapsulation header.
1522 * Pointer to the VXLAN_ENCAP action structure.
1524 * Pointer to the error structure.
1527 * 0 on success, a negative errno value otherwise and rte_errno is set.
1530 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1531 struct rte_flow_error *error)
1533 const struct rte_flow_item *items;
1535 uint32_t item_flags = 0;
1538 return rte_flow_error_set(error, EINVAL,
1539 RTE_FLOW_ERROR_TYPE_ACTION, action,
1540 "Missing vxlan tunnel"
1541 " action configuration");
1542 items = ((const struct rte_flow_action_vxlan_encap *)
1543 action->conf)->definition;
1545 return rte_flow_error_set(error, EINVAL,
1546 RTE_FLOW_ERROR_TYPE_ACTION, action,
1547 "Missing vxlan tunnel"
1548 " encapsulation parameters");
1549 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1550 switch (items->type) {
1551 case RTE_FLOW_ITEM_TYPE_VOID:
1553 case RTE_FLOW_ITEM_TYPE_ETH:
1554 ret = mlx5_flow_validate_item_eth(items, item_flags,
1558 ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1561 item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1564 case RTE_FLOW_ITEM_TYPE_IPV4:
1565 ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1569 ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1572 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1574 case RTE_FLOW_ITEM_TYPE_IPV6:
1575 ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1579 ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1582 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1584 case RTE_FLOW_ITEM_TYPE_UDP:
1585 ret = mlx5_flow_validate_item_udp(items, item_flags,
1589 ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1592 item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1594 case RTE_FLOW_ITEM_TYPE_VXLAN:
1595 ret = mlx5_flow_validate_item_vxlan(items,
1599 ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1602 item_flags |= MLX5_FLOW_LAYER_VXLAN;
1605 return rte_flow_error_set
1607 RTE_FLOW_ERROR_TYPE_ITEM, items,
1608 "vxlan encap item not supported");
1611 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1612 return rte_flow_error_set(error, EINVAL,
1613 RTE_FLOW_ERROR_TYPE_ACTION, action,
1614 "no outer IP layer found"
1615 " for vxlan encapsulation");
1616 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1617 return rte_flow_error_set(error, EINVAL,
1618 RTE_FLOW_ERROR_TYPE_ACTION, action,
1619 "no outer UDP layer found"
1620 " for vxlan encapsulation");
1621 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1622 return rte_flow_error_set(error, EINVAL,
1623 RTE_FLOW_ERROR_TYPE_ACTION, action,
1624 "no VXLAN VNI found"
1625 " for vxlan encapsulation");
1630 * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1631 * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1634 * Outer UDP layer item (if any, NULL otherwise).
1636 * Pointer to the error structure.
1639 * 0 on success, a negative errno value otherwise and rte_errno is set.
1642 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1643 struct rte_flow_error *error)
1645 const struct rte_flow_item_udp *spec = udp->spec;
1646 const struct rte_flow_item_udp *mask = udp->mask;
1650 * Specification for UDP ports cannot be empty
1651 * because it is required as decap parameter.
1653 return rte_flow_error_set(error, EINVAL,
1654 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1655 "NULL UDP port specification"
1656 " for VXLAN decapsulation");
1658 mask = &rte_flow_item_udp_mask;
1659 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1660 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1661 return rte_flow_error_set
1663 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1664 "no support for partial mask on"
1665 " \"udp.hdr.dst_port\" field");
1666 if (!spec->hdr.dst_port)
1667 return rte_flow_error_set
1669 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1670 "zero decap local UDP port");
1672 return rte_flow_error_set(error, EINVAL,
1673 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1674 "outer UDP destination port must be "
1675 "specified for vxlan decapsulation");
1677 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1678 if (mask->hdr.src_port != RTE_BE16(0xffff))
1679 return rte_flow_error_set
1681 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1682 "no support for partial mask on"
1683 " \"udp.hdr.src_port\" field");
1685 "outer UDP local port cannot be "
1686 "forced for VXLAN encapsulation, "
1687 "parameter ignored");
1693 * Validate flow for E-Switch.
1696 * Pointer to the priv structure.
1698 * Pointer to the flow attributes.
1700 * Pointer to the list of items.
1701 * @param[in] actions
1702 * Pointer to the list of actions.
1704 * Pointer to the error structure.
1707 * 0 on success, a negative errno value otherwise and rte_errno is set.
1710 flow_tcf_validate(struct rte_eth_dev *dev,
1711 const struct rte_flow_attr *attr,
1712 const struct rte_flow_item items[],
1713 const struct rte_flow_action actions[],
1714 struct rte_flow_error *error)
1717 const struct rte_flow_item_port_id *port_id;
1718 const struct rte_flow_item_eth *eth;
1719 const struct rte_flow_item_vlan *vlan;
1720 const struct rte_flow_item_ipv4 *ipv4;
1721 const struct rte_flow_item_ipv6 *ipv6;
1722 const struct rte_flow_item_tcp *tcp;
1723 const struct rte_flow_item_udp *udp;
1724 const struct rte_flow_item_vxlan *vxlan;
1727 const struct rte_flow_action_port_id *port_id;
1728 const struct rte_flow_action_jump *jump;
1729 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1730 const struct rte_flow_action_of_set_vlan_vid *
1732 const struct rte_flow_action_of_set_vlan_pcp *
1734 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1735 const struct rte_flow_action_set_ipv4 *set_ipv4;
1736 const struct rte_flow_action_set_ipv6 *set_ipv6;
1738 const struct rte_flow_item *outer_udp = NULL;
1739 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1740 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1741 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1742 uint64_t item_flags = 0;
1743 uint64_t action_flags = 0;
1744 uint8_t next_protocol = 0xff;
1745 unsigned int tcm_ifindex = 0;
1746 uint8_t pedit_validated = 0;
1747 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1748 struct rte_eth_dev *port_id_dev = NULL;
1749 bool in_port_id_set;
1752 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1753 PTOI_TABLE_SZ_MAX(dev)));
1754 ret = flow_tcf_validate_attributes(attr, error);
1757 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1759 uint64_t current_action_flag = 0;
1761 switch (actions->type) {
1762 case RTE_FLOW_ACTION_TYPE_VOID:
1764 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1765 current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1768 conf.port_id = actions->conf;
1769 if (conf.port_id->original)
1772 for (i = 0; ptoi[i].ifindex; ++i)
1773 if (ptoi[i].port_id == conf.port_id->id)
1775 if (!ptoi[i].ifindex)
1776 return rte_flow_error_set
1778 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1780 "missing data to convert port ID to"
1782 port_id_dev = &rte_eth_devices[conf.port_id->id];
1784 case RTE_FLOW_ACTION_TYPE_JUMP:
1785 current_action_flag = MLX5_FLOW_ACTION_JUMP;
1788 conf.jump = actions->conf;
1789 if (attr->group >= conf.jump->group)
1790 return rte_flow_error_set
1792 RTE_FLOW_ERROR_TYPE_ACTION,
1794 "can jump only to a group forward");
1796 case RTE_FLOW_ACTION_TYPE_DROP:
1797 current_action_flag = MLX5_FLOW_ACTION_DROP;
1799 case RTE_FLOW_ACTION_TYPE_COUNT:
1801 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1802 current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1804 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1805 rte_be16_t ethertype;
1807 current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1810 conf.of_push_vlan = actions->conf;
1811 ethertype = conf.of_push_vlan->ethertype;
1812 if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1813 ethertype != RTE_BE16(ETH_P_8021AD))
1814 return rte_flow_error_set
1816 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1817 "vlan push TPID must be "
1818 "802.1Q or 802.1AD");
1821 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1822 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1823 return rte_flow_error_set
1825 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1826 "vlan modify is not supported,"
1827 " set action must follow push action");
1828 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1830 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1831 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1832 return rte_flow_error_set
1834 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1835 "vlan modify is not supported,"
1836 " set action must follow push action");
1837 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1839 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1840 current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1842 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1843 ret = flow_tcf_validate_vxlan_encap(actions, error);
1846 current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1848 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1849 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1851 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1852 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1854 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1855 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1857 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1858 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1860 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1861 current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1863 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1864 current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1866 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1867 current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1869 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1870 current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1872 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1873 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1875 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1876 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1879 return rte_flow_error_set(error, ENOTSUP,
1880 RTE_FLOW_ERROR_TYPE_ACTION,
1882 "action not supported");
1884 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1886 return rte_flow_error_set
1888 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1890 "action configuration not set");
1892 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1894 return rte_flow_error_set(error, ENOTSUP,
1895 RTE_FLOW_ERROR_TYPE_ACTION,
1897 "set actions should be "
1898 "listed successively");
1899 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1900 (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1901 pedit_validated = 1;
1902 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1903 (action_flags & MLX5_TCF_FATE_ACTIONS))
1904 return rte_flow_error_set(error, EINVAL,
1905 RTE_FLOW_ERROR_TYPE_ACTION,
1907 "can't have multiple fate"
1909 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1910 (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1911 return rte_flow_error_set(error, EINVAL,
1912 RTE_FLOW_ERROR_TYPE_ACTION,
1914 "can't have multiple vxlan"
1916 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1917 (action_flags & MLX5_TCF_VLAN_ACTIONS))
1918 return rte_flow_error_set(error, ENOTSUP,
1919 RTE_FLOW_ERROR_TYPE_ACTION,
1921 "can't have vxlan and vlan"
1922 " actions in the same rule");
1923 action_flags |= current_action_flag;
1925 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1928 switch (items->type) {
1929 case RTE_FLOW_ITEM_TYPE_VOID:
1931 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1932 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1933 return rte_flow_error_set
1935 RTE_FLOW_ERROR_TYPE_ITEM, items,
1936 "inner tunnel port id"
1937 " item is not supported");
1938 mask.port_id = flow_tcf_item_mask
1939 (items, &rte_flow_item_port_id_mask,
1940 &flow_tcf_mask_supported.port_id,
1941 &flow_tcf_mask_empty.port_id,
1942 sizeof(flow_tcf_mask_supported.port_id),
1946 if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1950 spec.port_id = items->spec;
1951 if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1952 return rte_flow_error_set
1954 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1956 "no support for partial mask on"
1958 if (!mask.port_id->id)
1961 for (i = 0; ptoi[i].ifindex; ++i)
1962 if (ptoi[i].port_id == spec.port_id->id)
1964 if (!ptoi[i].ifindex)
1965 return rte_flow_error_set
1967 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1969 "missing data to convert port ID to"
1971 if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
1972 return rte_flow_error_set
1974 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
1976 "cannot match traffic for"
1977 " several port IDs through"
1978 " a single flow rule");
1979 tcm_ifindex = ptoi[i].ifindex;
1982 case RTE_FLOW_ITEM_TYPE_ETH:
1983 ret = mlx5_flow_validate_item_eth(items, item_flags,
1987 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
1988 MLX5_FLOW_LAYER_INNER_L2 :
1989 MLX5_FLOW_LAYER_OUTER_L2;
1991 * Redundant check due to a different supported mask.
1992 * The same applies to the rest of the items.
1994 mask.eth = flow_tcf_item_mask
1995 (items, &rte_flow_item_eth_mask,
1996 &flow_tcf_mask_supported.eth,
1997 &flow_tcf_mask_empty.eth,
1998 sizeof(flow_tcf_mask_supported.eth),
2002 if (mask.eth->type && mask.eth->type !=
2004 return rte_flow_error_set
2006 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2008 "no support for partial mask on"
2010 assert(items->spec);
2011 spec.eth = items->spec;
2012 if (mask.eth->type &&
2013 (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2014 inner_etype != RTE_BE16(ETH_P_ALL) &&
2015 inner_etype != spec.eth->type)
2016 return rte_flow_error_set
2018 RTE_FLOW_ERROR_TYPE_ITEM,
2020 "inner eth_type conflict");
2021 if (mask.eth->type &&
2022 !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2023 outer_etype != RTE_BE16(ETH_P_ALL) &&
2024 outer_etype != spec.eth->type)
2025 return rte_flow_error_set
2027 RTE_FLOW_ERROR_TYPE_ITEM,
2029 "outer eth_type conflict");
2030 if (mask.eth->type) {
2031 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2032 inner_etype = spec.eth->type;
2034 outer_etype = spec.eth->type;
2037 case RTE_FLOW_ITEM_TYPE_VLAN:
2038 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2039 return rte_flow_error_set
2041 RTE_FLOW_ERROR_TYPE_ITEM, items,
2043 " is not supported");
2044 ret = mlx5_flow_validate_item_vlan(items, item_flags,
2048 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2049 mask.vlan = flow_tcf_item_mask
2050 (items, &rte_flow_item_vlan_mask,
2051 &flow_tcf_mask_supported.vlan,
2052 &flow_tcf_mask_empty.vlan,
2053 sizeof(flow_tcf_mask_supported.vlan),
2057 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2058 (mask.vlan->tci & RTE_BE16(0xe000)) !=
2059 RTE_BE16(0xe000)) ||
2060 (mask.vlan->tci & RTE_BE16(0x0fff) &&
2061 (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2062 RTE_BE16(0x0fff)) ||
2063 (mask.vlan->inner_type &&
2064 mask.vlan->inner_type != RTE_BE16(0xffff)))
2065 return rte_flow_error_set
2067 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2069 "no support for partial masks on"
2070 " \"tci\" (PCP and VID parts) and"
2071 " \"inner_type\" fields");
2072 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2073 outer_etype != RTE_BE16(ETH_P_8021Q))
2074 return rte_flow_error_set
2076 RTE_FLOW_ERROR_TYPE_ITEM,
2078 "outer eth_type conflict,"
2080 outer_etype = RTE_BE16(ETH_P_8021Q);
2081 assert(items->spec);
2082 spec.vlan = items->spec;
2083 if (mask.vlan->inner_type &&
2084 vlan_etype != RTE_BE16(ETH_P_ALL) &&
2085 vlan_etype != spec.vlan->inner_type)
2086 return rte_flow_error_set
2088 RTE_FLOW_ERROR_TYPE_ITEM,
2090 "vlan eth_type conflict");
2091 if (mask.vlan->inner_type)
2092 vlan_etype = spec.vlan->inner_type;
2094 case RTE_FLOW_ITEM_TYPE_IPV4:
2095 ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2099 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2100 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2101 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2102 mask.ipv4 = flow_tcf_item_mask
2103 (items, &rte_flow_item_ipv4_mask,
2104 &flow_tcf_mask_supported.ipv4,
2105 &flow_tcf_mask_empty.ipv4,
2106 sizeof(flow_tcf_mask_supported.ipv4),
2110 if (mask.ipv4->hdr.next_proto_id &&
2111 mask.ipv4->hdr.next_proto_id != 0xff)
2112 return rte_flow_error_set
2114 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2116 "no support for partial mask on"
2117 " \"hdr.next_proto_id\" field");
2118 else if (mask.ipv4->hdr.next_proto_id)
2120 ((const struct rte_flow_item_ipv4 *)
2121 (items->spec))->hdr.next_proto_id;
2122 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2123 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2124 inner_etype != RTE_BE16(ETH_P_IP))
2125 return rte_flow_error_set
2127 RTE_FLOW_ERROR_TYPE_ITEM,
2129 "inner eth_type conflict,"
2130 " IPv4 is required");
2131 inner_etype = RTE_BE16(ETH_P_IP);
2132 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2133 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2134 vlan_etype != RTE_BE16(ETH_P_IP))
2135 return rte_flow_error_set
2137 RTE_FLOW_ERROR_TYPE_ITEM,
2139 "vlan eth_type conflict,"
2140 " IPv4 is required");
2141 vlan_etype = RTE_BE16(ETH_P_IP);
2143 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2144 outer_etype != RTE_BE16(ETH_P_IP))
2145 return rte_flow_error_set
2147 RTE_FLOW_ERROR_TYPE_ITEM,
2149 "eth_type conflict,"
2150 " IPv4 is required");
2151 outer_etype = RTE_BE16(ETH_P_IP);
2154 case RTE_FLOW_ITEM_TYPE_IPV6:
2155 ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2159 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2160 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2161 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2162 mask.ipv6 = flow_tcf_item_mask
2163 (items, &rte_flow_item_ipv6_mask,
2164 &flow_tcf_mask_supported.ipv6,
2165 &flow_tcf_mask_empty.ipv6,
2166 sizeof(flow_tcf_mask_supported.ipv6),
2170 if (mask.ipv6->hdr.proto &&
2171 mask.ipv6->hdr.proto != 0xff)
2172 return rte_flow_error_set
2174 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2176 "no support for partial mask on"
2177 " \"hdr.proto\" field");
2178 else if (mask.ipv6->hdr.proto)
2180 ((const struct rte_flow_item_ipv6 *)
2181 (items->spec))->hdr.proto;
2182 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2183 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2184 inner_etype != RTE_BE16(ETH_P_IPV6))
2185 return rte_flow_error_set
2187 RTE_FLOW_ERROR_TYPE_ITEM,
2189 "inner eth_type conflict,"
2190 " IPv6 is required");
2191 inner_etype = RTE_BE16(ETH_P_IPV6);
2192 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2193 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2194 vlan_etype != RTE_BE16(ETH_P_IPV6))
2195 return rte_flow_error_set
2197 RTE_FLOW_ERROR_TYPE_ITEM,
2199 "vlan eth_type conflict,"
2200 " IPv6 is required");
2201 vlan_etype = RTE_BE16(ETH_P_IPV6);
2203 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2204 outer_etype != RTE_BE16(ETH_P_IPV6))
2205 return rte_flow_error_set
2207 RTE_FLOW_ERROR_TYPE_ITEM,
2209 "eth_type conflict,"
2210 " IPv6 is required");
2211 outer_etype = RTE_BE16(ETH_P_IPV6);
2214 case RTE_FLOW_ITEM_TYPE_UDP:
2215 ret = mlx5_flow_validate_item_udp(items, item_flags,
2216 next_protocol, error);
2219 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2220 MLX5_FLOW_LAYER_INNER_L4_UDP :
2221 MLX5_FLOW_LAYER_OUTER_L4_UDP;
2222 mask.udp = flow_tcf_item_mask
2223 (items, &rte_flow_item_udp_mask,
2224 &flow_tcf_mask_supported.udp,
2225 &flow_tcf_mask_empty.udp,
2226 sizeof(flow_tcf_mask_supported.udp),
2231 * Save the presumed outer UDP item for an extra check
2232 * in case a tunnel item is found later in the list.
2234 if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2237 case RTE_FLOW_ITEM_TYPE_TCP:
2238 ret = mlx5_flow_validate_item_tcp
2241 &flow_tcf_mask_supported.tcp,
2245 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2246 MLX5_FLOW_LAYER_INNER_L4_TCP :
2247 MLX5_FLOW_LAYER_OUTER_L4_TCP;
2248 mask.tcp = flow_tcf_item_mask
2249 (items, &rte_flow_item_tcp_mask,
2250 &flow_tcf_mask_supported.tcp,
2251 &flow_tcf_mask_empty.tcp,
2252 sizeof(flow_tcf_mask_supported.tcp),
2257 case RTE_FLOW_ITEM_TYPE_VXLAN:
2258 if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2259 return rte_flow_error_set
2261 RTE_FLOW_ERROR_TYPE_ITEM, items,
2262 "vxlan tunnel over vlan"
2263 " is not supported");
2264 ret = mlx5_flow_validate_item_vxlan(items,
2268 item_flags |= MLX5_FLOW_LAYER_VXLAN;
2269 mask.vxlan = flow_tcf_item_mask
2270 (items, &rte_flow_item_vxlan_mask,
2271 &flow_tcf_mask_supported.vxlan,
2272 &flow_tcf_mask_empty.vxlan,
2273 sizeof(flow_tcf_mask_supported.vxlan), error);
2276 if (mask.vxlan->vni[0] != 0xff ||
2277 mask.vxlan->vni[1] != 0xff ||
2278 mask.vxlan->vni[2] != 0xff)
2279 return rte_flow_error_set
2281 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2283 "no support for partial or "
2284 "empty mask on \"vxlan.vni\" field");
2286 * The VNI item implies a VXLAN tunnel; it requires that
2287 * at least the outer destination UDP port be specified
2288 * without wildcards so the kernel can select the virtual
2289 * VXLAN device by port. Also an outer IPv4 or IPv6 item
2290 * must be specified (wildcards or even a zero mask are
2291 * allowed) to let the driver know the tunnel IP version
2292 * and process UDP traffic correctly.
2295 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2296 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2297 return rte_flow_error_set
2299 RTE_FLOW_ERROR_TYPE_ACTION,
2301 "no outer IP pattern found"
2302 " for vxlan tunnel");
2303 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2304 return rte_flow_error_set
2306 RTE_FLOW_ERROR_TYPE_ACTION,
2308 "no outer UDP pattern found"
2309 " for vxlan tunnel");
2311 * All items preceding the tunnel item become outer
2312 * ones and we should do extra validation for them
2313 * due to tc limitations on tunnel outer parameters.
2314 * Currently only the outer UDP item requires an extra
2315 * check, use the saved pointer instead of rescanning the item list.
2318 ret = flow_tcf_validate_vxlan_decap_udp
2322 /* Reset L4 protocol for inner parameters. */
2323 next_protocol = 0xff;
2326 return rte_flow_error_set(error, ENOTSUP,
2327 RTE_FLOW_ERROR_TYPE_ITEM,
2328 items, "item not supported");
2331 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2332 (action_flags & MLX5_FLOW_ACTION_DROP))
2333 return rte_flow_error_set(error, ENOTSUP,
2334 RTE_FLOW_ERROR_TYPE_ACTION,
2336 "set action is not compatible with "
2338 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2339 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2340 return rte_flow_error_set(error, ENOTSUP,
2341 RTE_FLOW_ERROR_TYPE_ACTION,
2343 "set action must be followed by "
2346 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2347 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2348 return rte_flow_error_set(error, EINVAL,
2349 RTE_FLOW_ERROR_TYPE_ACTION,
2351 "no ipv4 item found in"
2355 (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2356 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2357 return rte_flow_error_set(error, EINVAL,
2358 RTE_FLOW_ERROR_TYPE_ACTION,
2360 "no ipv6 item found in"
2364 (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2366 (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2367 MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2368 return rte_flow_error_set(error, EINVAL,
2369 RTE_FLOW_ERROR_TYPE_ACTION,
2371 "no TCP/UDP item found in"
2375 * FW syndrome (0xA9C090):
2376 * set_flow_table_entry: push vlan action fte in fdb can ONLY be
2377 * forwarded to the uplink.
2379 if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2380 (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2381 ((struct priv *)port_id_dev->data->dev_private)->representor)
2382 return rte_flow_error_set(error, ENOTSUP,
2383 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2384 "vlan push can only be applied"
2385 " when forwarding to uplink port");
2387 * FW syndrome (0x294609):
2388 * set_flow_table_entry: modify/pop/push actions in fdb flow table
2389 * are supported only while forwarding to vport.
2391 if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2392 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2393 return rte_flow_error_set(error, ENOTSUP,
2394 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2395 "vlan actions are supported"
2396 " only with port_id action");
2397 if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2398 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2399 return rte_flow_error_set(error, ENOTSUP,
2400 RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2401 "vxlan actions are supported"
2402 " only with port_id action");
2403 if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2404 return rte_flow_error_set(error, EINVAL,
2405 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2406 "no fate action is found");
2408 (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2410 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2411 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2412 return rte_flow_error_set(error, EINVAL,
2413 RTE_FLOW_ERROR_TYPE_ACTION,
2415 "no IP found in pattern");
2418 (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2419 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2420 return rte_flow_error_set(error, ENOTSUP,
2421 RTE_FLOW_ERROR_TYPE_ACTION,
2423 "no ethernet found in"
2426 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2427 !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2428 return rte_flow_error_set(error, EINVAL,
2429 RTE_FLOW_ERROR_TYPE_ACTION,
2431 "no VNI pattern found"
2432 " for vxlan decap action");
2433 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2434 (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2435 return rte_flow_error_set(error, EINVAL,
2436 RTE_FLOW_ERROR_TYPE_ACTION,
2438 "vxlan encap not supported"
2439 " for tunneled traffic");
2444 * Calculate maximum size of memory for flow items of Linux TC flower.
2447 * Pointer to the flow attributes.
2449 * Pointer to the list of items.
2450 * @param[out] action_flags
2451 * Pointer to the detected actions.
2454 * Maximum size of memory for items.
2457 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2458 const struct rte_flow_item items[],
2459 uint64_t *action_flags)
2463 size += SZ_NLATTR_STRZ_OF("flower") +
2464 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2465 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2466 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2467 if (attr->group > 0)
2468 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2469 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2470 switch (items->type) {
2471 case RTE_FLOW_ITEM_TYPE_VOID:
2473 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2475 case RTE_FLOW_ITEM_TYPE_ETH:
2476 size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2477 /* dst/src MAC addr and mask. */
2479 case RTE_FLOW_ITEM_TYPE_VLAN:
2480 size += SZ_NLATTR_TYPE_OF(uint16_t) +
2481 /* VLAN Ether type. */
2482 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2483 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2485 case RTE_FLOW_ITEM_TYPE_IPV4:
2486 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2487 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2488 /* dst/src IP addr and mask. */
2490 case RTE_FLOW_ITEM_TYPE_IPV6:
2491 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2492 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2493 /* dst/src IP addr and mask. */
2495 case RTE_FLOW_ITEM_TYPE_UDP:
2496 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2497 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2498 /* dst/src port and mask. */
2500 case RTE_FLOW_ITEM_TYPE_TCP:
2501 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2502 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2503 /* dst/src port and mask. */
2505 case RTE_FLOW_ITEM_TYPE_VXLAN:
2506 size += SZ_NLATTR_TYPE_OF(uint32_t);
2508 * There might be no VXLAN decap action in the action
2509 * list, nonetheless the VXLAN tunnel flow requires
2510 * the decap structure to be correctly applied to the
2511 * VXLAN device, so set the flag to create the structure.
2512 * The translation routine will not put the decap action
2513 * in the Netlink message if there is no actual action
2516 *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2520 "unsupported item %p type %d,"
2521 " items must be validated before flow creation",
2522 (const void *)items, items->type);
2530 * Calculate the size of memory needed to store the VXLAN encapsulation
2531 * related items in the Netlink message buffer. Items list
2532 * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2533 * The item list should be validated.
2536 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2537 * List of pattern items to scan data from.
2540 * The size the part of Netlink message buffer to store the
2541 * VXLAN encapsulation item attributes.
2544 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2546 const struct rte_flow_item *items;
2549 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2550 assert(action->conf);
2552 items = ((const struct rte_flow_action_vxlan_encap *)
2553 action->conf)->definition;
2555 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2556 switch (items->type) {
2557 case RTE_FLOW_ITEM_TYPE_VOID:
2559 case RTE_FLOW_ITEM_TYPE_ETH:
2560 /* This item does not require message buffer. */
2562 case RTE_FLOW_ITEM_TYPE_IPV4:
2563 size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2565 case RTE_FLOW_ITEM_TYPE_IPV6:
2566 size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2568 case RTE_FLOW_ITEM_TYPE_UDP: {
2569 const struct rte_flow_item_udp *udp = items->mask;
2571 size += SZ_NLATTR_TYPE_OF(uint16_t);
2572 if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2573 size += SZ_NLATTR_TYPE_OF(uint16_t);
2576 case RTE_FLOW_ITEM_TYPE_VXLAN:
2577 size += SZ_NLATTR_TYPE_OF(uint32_t);
2582 "unsupported item %p type %d,"
2583 " items must be validated"
2584 " before flow creation",
2585 (const void *)items, items->type);
2593 * Calculate maximum size of memory for flow actions of Linux TC flower and
2594 * extract specified actions.
2596 * @param[in] actions
2597 * Pointer to the list of actions.
2598 * @param[out] action_flags
2599 * Pointer to the detected actions.
2602 * Maximum size of memory for actions.
2605 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2606 uint64_t *action_flags)
2611 size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2612 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2613 switch (actions->type) {
2614 case RTE_FLOW_ACTION_TYPE_VOID:
2616 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2617 size += SZ_NLATTR_NEST + /* na_act_index. */
2618 SZ_NLATTR_STRZ_OF("mirred") +
2619 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2620 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2621 flags |= MLX5_FLOW_ACTION_PORT_ID;
2623 case RTE_FLOW_ACTION_TYPE_JUMP:
2624 size += SZ_NLATTR_NEST + /* na_act_index. */
2625 SZ_NLATTR_STRZ_OF("gact") +
2626 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2627 SZ_NLATTR_TYPE_OF(struct tc_gact);
2628 flags |= MLX5_FLOW_ACTION_JUMP;
2630 case RTE_FLOW_ACTION_TYPE_DROP:
2631 size += SZ_NLATTR_NEST + /* na_act_index. */
2632 SZ_NLATTR_STRZ_OF("gact") +
2633 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2634 SZ_NLATTR_TYPE_OF(struct tc_gact);
2635 flags |= MLX5_FLOW_ACTION_DROP;
2637 case RTE_FLOW_ACTION_TYPE_COUNT:
2639 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2640 flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2641 goto action_of_vlan;
2642 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2643 flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2644 goto action_of_vlan;
2645 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2646 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2647 goto action_of_vlan;
2648 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2649 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2650 goto action_of_vlan;
2652 size += SZ_NLATTR_NEST + /* na_act_index. */
2653 SZ_NLATTR_STRZ_OF("vlan") +
2654 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2655 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2656 SZ_NLATTR_TYPE_OF(uint16_t) +
2657 /* VLAN protocol. */
2658 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2659 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2661 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2662 size += SZ_NLATTR_NEST + /* na_act_index. */
2663 SZ_NLATTR_STRZ_OF("tunnel_key") +
2664 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2665 SZ_NLATTR_TYPE_OF(uint8_t);
2666 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2667 size += flow_tcf_vxlan_encap_size(actions) +
2668 RTE_ALIGN_CEIL /* preceding encap params. */
2669 (sizeof(struct flow_tcf_vxlan_encap),
2671 flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2673 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2674 size += SZ_NLATTR_NEST + /* na_act_index. */
2675 SZ_NLATTR_STRZ_OF("tunnel_key") +
2676 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2677 SZ_NLATTR_TYPE_OF(uint8_t);
2678 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2679 size += RTE_ALIGN_CEIL /* preceding decap params. */
2680 (sizeof(struct flow_tcf_vxlan_decap),
2682 flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2684 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2685 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2686 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2687 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2688 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2689 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2690 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2691 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2692 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2693 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2694 size += flow_tcf_get_pedit_actions_size(&actions,
2699 "unsupported action %p type %d,"
2700 " items must be validated before flow creation",
2701 (const void *)actions, actions->type);
2705 *action_flags = flags;
2710 * Brand rtnetlink buffer with unique handle.
2712 * This handle should be unique for a given network interface to avoid
2716 * Pointer to Netlink message.
2718 * Unique 32-bit handle to use.
2721 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2723 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2725 tcm->tcm_handle = handle;
2726 DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2727 (void *)nlh, handle);
2731 * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2732 * memory required, allocates the memory, initializes Netlink message headers
2733 * and sets a unique TC message handle.
2736 * Pointer to the flow attributes.
2738 * Pointer to the list of items.
2739 * @param[in] actions
2740 * Pointer to the list of actions.
2742 * Pointer to the error structure.
2745 * Pointer to mlx5_flow object on success,
2746 * otherwise NULL and rte_errno is set.
2748 static struct mlx5_flow *
2749 flow_tcf_prepare(const struct rte_flow_attr *attr,
2750 const struct rte_flow_item items[],
2751 const struct rte_flow_action actions[],
2752 struct rte_flow_error *error)
2754 size_t size = RTE_ALIGN_CEIL
2755 (sizeof(struct mlx5_flow),
2756 alignof(struct flow_tcf_tunnel_hdr)) +
2757 MNL_ALIGN(sizeof(struct nlmsghdr)) +
2758 MNL_ALIGN(sizeof(struct tcmsg));
2759 struct mlx5_flow *dev_flow;
2760 uint64_t action_flags = 0;
2761 struct nlmsghdr *nlh;
2763 uint8_t *sp, *tun = NULL;
2765 size += flow_tcf_get_items_size(attr, items, &action_flags);
2766 size += flow_tcf_get_actions_and_size(actions, &action_flags);
2767 dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2769 rte_flow_error_set(error, ENOMEM,
2770 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2771 "not enough memory to create E-Switch flow");
2774 sp = (uint8_t *)(dev_flow + 1);
2775 if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2777 (sp, alignof(struct flow_tcf_tunnel_hdr));
2779 sp += RTE_ALIGN_CEIL
2780 (sizeof(struct flow_tcf_vxlan_encap),
2783 size -= RTE_ALIGN_CEIL
2784 (sizeof(struct flow_tcf_vxlan_encap),
2787 } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2789 (sp, alignof(struct flow_tcf_tunnel_hdr));
2791 sp += RTE_ALIGN_CEIL
2792 (sizeof(struct flow_tcf_vxlan_decap),
2795 size -= RTE_ALIGN_CEIL
2796 (sizeof(struct flow_tcf_vxlan_decap),
2800 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2802 nlh = mnl_nlmsg_put_header(sp);
2803 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2804 *dev_flow = (struct mlx5_flow){
2805 .tcf = (struct mlx5_flow_tcf){
2807 .nlsize = size - RTE_ALIGN_CEIL
2808 (sizeof(struct mlx5_flow),
2809 alignof(struct flow_tcf_tunnel_hdr)),
2811 .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2816 if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2817 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2818 else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2819 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2821 * Generate a reasonably unique handle based on the address of the
2824 * This is straightforward on 32-bit systems where the flow pointer can
2825 * be used directly. Otherwise, its least significant part is taken
2826 * after shifting it right by log2 of the previous power of two of the buffer size.
2829 if (sizeof(dev_flow) <= 4)
2830 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2832 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2833 rte_log2_u32(rte_align32prevpow2(size)));
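/*
 * Worked example with hypothetical numbers: on a 64-bit system with a
 * computed buffer size of 1000 bytes, rte_align32prevpow2(1000) is 512
 * and rte_log2_u32(512) is 9, so the handle passed to flow_tcf_nl_brand()
 * is the flow pointer shifted right by 9 bits and truncated to 32 bits.
 */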
2838 * Make adjustments for supporting count actions.
2841 * Pointer to the Ethernet device structure.
2842 * @param[in] dev_flow
2843 * Pointer to mlx5_flow.
2845 * Pointer to error structure.
2848 * 0 On success else a negative errno value is returned and rte_errno is set.
2851 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2852 struct mlx5_flow *dev_flow,
2853 struct rte_flow_error *error)
2855 struct rte_flow *flow = dev_flow->flow;
2857 if (!flow->counter) {
2858 flow->counter = flow_tcf_counter_new();
2860 return rte_flow_error_set(error, rte_errno,
2861 RTE_FLOW_ERROR_TYPE_ACTION,
2863 "cannot get counter"
2870 * Convert VXLAN VNI to 32-bit integer.
2873 * VXLAN VNI in 24-bit wire format.
2876 * VXLAN VNI as a 32-bit integer value in network endian.
2878 static inline rte_be32_t
2879 vxlan_vni_as_be32(const uint8_t vni[3])
2885 .vni = { 0, vni[0], vni[1], vni[2] },
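/*
 * Worked example (values are illustrative): a VNI of { 0x12, 0x34, 0x56 }
 * produces the union bytes { 0x00, 0x12, 0x34, 0x56 }, i.e. the 32-bit
 * value 0x00123456 already in network byte order, as expected by the
 * TCA_FLOWER_KEY_ENC_KEY_ID and TCA_TUNNEL_KEY_ENC_KEY_ID attributes.
 */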
2891 * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2892 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2893 * in the encapsulation parameters structure. The item must be prevalidated,
2894 * no validation checks are performed by this function.
2897 * RTE_FLOW_ITEM_TYPE_ETH entry specification.
2899 * RTE_FLOW_ITEM_TYPE_ETH entry mask.
2901 * Structure to fill the gathered MAC address data.
2904 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2905 const struct rte_flow_item_eth *mask,
2906 struct flow_tcf_vxlan_encap *encap)
2908 /* Item must be validated before. No redundant checks. */
2910 if (!mask || !memcmp(&mask->dst,
2911 &rte_flow_item_eth_mask.dst,
2912 sizeof(rte_flow_item_eth_mask.dst))) {
2914 * Ethernet addresses are not supported by
2915 * tc as tunnel_key parameters. Destination
2916 * address is needed to form encap packet
2917 * header and retrieved by kernel from
2918 * implicit sources (ARP table, etc),
2919 * address masks are not supported at all.
2921 encap->eth.dst = spec->dst;
2922 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2924 if (!mask || !memcmp(&mask->src,
2925 &rte_flow_item_eth_mask.src,
2926 sizeof(rte_flow_item_eth_mask.src))) {
2928 * Ethernet addresses are not supported by
2929 * tc as tunnel_key parameters. Source ethernet
2930 * address is ignored anyway.
2932 encap->eth.src = spec->src;
2933 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2938 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2939 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2940 * in the encapsulation parameters structure. The item must be prevalidated,
2941 * no validation checks are performed by this function.
2944 * RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2946 * Structure to fill the gathered IPV4 address data.
2949 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2950 struct flow_tcf_vxlan_encap *encap)
2952 /* Item must be validated before. No redundant checks. */
2954 encap->ipv4.dst = spec->hdr.dst_addr;
2955 encap->ipv4.src = spec->hdr.src_addr;
2956 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2957 FLOW_TCF_ENCAP_IPV4_DST;
2961 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
2962 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
2963 * in the encapsulation parameters structure. The item must be prevalidated,
2964 * no validation checks are performed by this function.
2967 * RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
2969 * Structure to fill the gathered IPV6 address data.
2972 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
2973 struct flow_tcf_vxlan_encap *encap)
2975 /* Item must be validated before. No redundant checks. */
2977 memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
2978 memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
2979 encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
2980 FLOW_TCF_ENCAP_IPV6_DST;
2984 * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
2985 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
2986 * in the encapsulation parameters structure. The item must be prevalidated,
2987 * no validation checks are performed by this function.
2990 * RTE_FLOW_ITEM_TYPE_UDP entry specification.
2992 * RTE_FLOW_ITEM_TYPE_UDP entry mask.
2994 * Structure to fill the gathered UDP port data.
2997 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
2998 const struct rte_flow_item_udp *mask,
2999 struct flow_tcf_vxlan_encap *encap)
3002 encap->udp.dst = spec->hdr.dst_port;
3003 encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3004 if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3005 encap->udp.src = spec->hdr.src_port;
3006 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
3011 * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3012 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3013 * in the encapsulation parameters structure. The item must be prevalidated,
3014 * no validation checks are performed by this function.
3017 * RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3019 * Structure to fill the gathered VNI address data.
3022 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3023 struct flow_tcf_vxlan_encap *encap)
3025 /* Item must be validated beforehand. No redundant checks. */
3027 memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3028 encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3032 * Populate consolidated encapsulation object from list of pattern items.
3034 * Helper function to process configuration of action such as
3035 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3036 * validated; there is no way to return a meaningful error.
3039 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3040 * List of pattern items to gather data from.
3042 * Structure to fill gathered data.
3045 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3046 struct flow_tcf_vxlan_encap *encap)
3049 const struct rte_flow_item_eth *eth;
3050 const struct rte_flow_item_ipv4 *ipv4;
3051 const struct rte_flow_item_ipv6 *ipv6;
3052 const struct rte_flow_item_udp *udp;
3053 const struct rte_flow_item_vxlan *vxlan;
3055 const struct rte_flow_item *items;
3057 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3058 assert(action->conf);
3060 items = ((const struct rte_flow_action_vxlan_encap *)
3061 action->conf)->definition;
3063 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3064 switch (items->type) {
3065 case RTE_FLOW_ITEM_TYPE_VOID:
3067 case RTE_FLOW_ITEM_TYPE_ETH:
3068 mask.eth = items->mask;
3069 spec.eth = items->spec;
3070 flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3073 case RTE_FLOW_ITEM_TYPE_IPV4:
3074 spec.ipv4 = items->spec;
3075 flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap);
3077 case RTE_FLOW_ITEM_TYPE_IPV6:
3078 spec.ipv6 = items->spec;
3079 flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap);
3081 case RTE_FLOW_ITEM_TYPE_UDP:
3082 mask.udp = items->mask;
3083 spec.udp = items->spec;
3084 flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3087 case RTE_FLOW_ITEM_TYPE_VXLAN:
3088 spec.vxlan = items->spec;
3089 flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3094 "unsupported item %p type %d,"
3095 " items must be validated"
3096 " before flow creation",
3097 (const void *)items, items->type);
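/*
 * Illustrative sketch (hypothetical application-side data, not part of the
 * driver): an encap definition list that flow_tcf_vxlan_encap_parse() above
 * would consume for RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
 *
 *   struct rte_flow_item_eth eth = {
 *       .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
 *   };
 *   struct rte_flow_item_ipv4 ipv4 = {
 *       .hdr = {
 *           .src_addr = RTE_BE32(0x0a000001), /* 10.0.0.1 */
 *           .dst_addr = RTE_BE32(0x0a000002), /* 10.0.0.2 */
 *       },
 *   };
 *   struct rte_flow_item_udp udp = { .hdr.dst_port = RTE_BE16(4789) };
 *   struct rte_flow_item_vxlan vxlan = { .vni = { 0, 0, 42 } };
 *   struct rte_flow_item definition[] = {
 *       { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &eth },
 *       { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ipv4 },
 *       { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &udp },
 *       { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vxlan },
 *       { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *
 * The parser copies the addresses, ports and VNI it finds into
 * struct flow_tcf_vxlan_encap and sets the matching FLOW_TCF_ENCAP_* bits
 * in the mask field.
 */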
3105 * Translate flow for Linux TC flower and construct Netlink message.
3108 * Pointer to the priv structure.
3109 * @param[in, out] flow
3110 * Pointer to the sub flow.
3112 * Pointer to the flow attributes.
3114 * Pointer to the list of items.
3115 * @param[in] actions
3116 * Pointer to the list of actions.
3118 * Pointer to the error structure.
3121 * 0 on success, a negative errno value otherwise and rte_errno is set.
3124 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3125 const struct rte_flow_attr *attr,
3126 const struct rte_flow_item items[],
3127 const struct rte_flow_action actions[],
3128 struct rte_flow_error *error)
3131 const struct rte_flow_item_port_id *port_id;
3132 const struct rte_flow_item_eth *eth;
3133 const struct rte_flow_item_vlan *vlan;
3134 const struct rte_flow_item_ipv4 *ipv4;
3135 const struct rte_flow_item_ipv6 *ipv6;
3136 const struct rte_flow_item_tcp *tcp;
3137 const struct rte_flow_item_udp *udp;
3138 const struct rte_flow_item_vxlan *vxlan;
3141 const struct rte_flow_action_port_id *port_id;
3142 const struct rte_flow_action_jump *jump;
3143 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3144 const struct rte_flow_action_of_set_vlan_vid *
3146 const struct rte_flow_action_of_set_vlan_pcp *
3150 struct flow_tcf_tunnel_hdr *hdr;
3151 struct flow_tcf_vxlan_decap *vxlan;
3156 struct flow_tcf_tunnel_hdr *hdr;
3157 struct flow_tcf_vxlan_encap *vxlan;
3161 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3162 struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3163 struct tcmsg *tcm = dev_flow->tcf.tcm;
3164 uint32_t na_act_index_cur;
3165 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3166 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3167 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3168 bool ip_proto_set = 0;
3169 bool tunnel_outer = 0;
3170 struct nlattr *na_flower;
3171 struct nlattr *na_flower_act;
3172 struct nlattr *na_vlan_id = NULL;
3173 struct nlattr *na_vlan_priority = NULL;
3174 uint64_t item_flags = 0;
3177 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3178 PTOI_TABLE_SZ_MAX(dev)));
3179 if (dev_flow->tcf.tunnel) {
3180 switch (dev_flow->tcf.tunnel->type) {
3181 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3182 decap.vxlan = dev_flow->tcf.vxlan_decap;
3185 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3186 encap.vxlan = dev_flow->tcf.vxlan_encap;
3188 /* New tunnel actions can be added here. */
3194 nlh = dev_flow->tcf.nlh;
3195 tcm = dev_flow->tcf.tcm;
3196 /* Prepare API must have been called beforehand. */
3197 assert(nlh != NULL && tcm != NULL);
3198 tcm->tcm_family = AF_UNSPEC;
3199 tcm->tcm_ifindex = ptoi[0].ifindex;
3200 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3202 * Priority cannot be zero to prevent the kernel from picking one
3205 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3206 if (attr->group > 0)
3207 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3208 mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3209 na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3210 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3213 switch (items->type) {
3214 case RTE_FLOW_ITEM_TYPE_VOID:
3216 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3217 mask.port_id = flow_tcf_item_mask
3218 (items, &rte_flow_item_port_id_mask,
3219 &flow_tcf_mask_supported.port_id,
3220 &flow_tcf_mask_empty.port_id,
3221 sizeof(flow_tcf_mask_supported.port_id),
3223 assert(mask.port_id);
3224 if (mask.port_id == &flow_tcf_mask_empty.port_id)
3226 spec.port_id = items->spec;
3227 if (!mask.port_id->id)
3230 for (i = 0; ptoi[i].ifindex; ++i)
3231 if (ptoi[i].port_id == spec.port_id->id)
3233 assert(ptoi[i].ifindex);
3234 tcm->tcm_ifindex = ptoi[i].ifindex;
3236 case RTE_FLOW_ITEM_TYPE_ETH:
3237 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3238 MLX5_FLOW_LAYER_INNER_L2 :
3239 MLX5_FLOW_LAYER_OUTER_L2;
3240 mask.eth = flow_tcf_item_mask
3241 (items, &rte_flow_item_eth_mask,
3242 &flow_tcf_mask_supported.eth,
3243 &flow_tcf_mask_empty.eth,
3244 sizeof(flow_tcf_mask_supported.eth),
3247 if (mask.eth == &flow_tcf_mask_empty.eth)
3249 spec.eth = items->spec;
3250 if (mask.eth->type) {
3251 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3252 inner_etype = spec.eth->type;
3254 outer_etype = spec.eth->type;
3258 "outer L2 addresses cannot be"
3259 " forced is outer ones for tunnel,"
3260 " parameter is ignored");
3263 if (!is_zero_ether_addr(&mask.eth->dst)) {
3264 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3266 spec.eth->dst.addr_bytes);
3267 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3269 mask.eth->dst.addr_bytes);
3271 if (!is_zero_ether_addr(&mask.eth->src)) {
3272 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3274 spec.eth->src.addr_bytes);
3275 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3277 mask.eth->src.addr_bytes);
3279 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3281 case RTE_FLOW_ITEM_TYPE_VLAN:
3284 assert(!tunnel_outer);
3285 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3286 mask.vlan = flow_tcf_item_mask
3287 (items, &rte_flow_item_vlan_mask,
3288 &flow_tcf_mask_supported.vlan,
3289 &flow_tcf_mask_empty.vlan,
3290 sizeof(flow_tcf_mask_supported.vlan),
3293 if (mask.vlan == &flow_tcf_mask_empty.vlan)
3295 spec.vlan = items->spec;
3296 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3297 outer_etype == RTE_BE16(ETH_P_8021Q));
3298 outer_etype = RTE_BE16(ETH_P_8021Q);
3299 if (mask.vlan->inner_type)
3300 vlan_etype = spec.vlan->inner_type;
3301 if (mask.vlan->tci & RTE_BE16(0xe000))
3302 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3304 (rte_be_to_cpu_16(spec.vlan->tci) >> 13) & 0x7);
3305 if (mask.vlan->tci & RTE_BE16(0x0fff))
3306 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3310 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3312 case RTE_FLOW_ITEM_TYPE_IPV4:
3313 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3314 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3315 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3316 mask.ipv4 = flow_tcf_item_mask
3317 (items, &rte_flow_item_ipv4_mask,
3318 &flow_tcf_mask_supported.ipv4,
3319 &flow_tcf_mask_empty.ipv4,
3320 sizeof(flow_tcf_mask_supported.ipv4),
3323 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3324 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3325 inner_etype == RTE_BE16(ETH_P_IP));
3326 inner_etype = RTE_BE16(ETH_P_IP);
3327 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3328 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3329 vlan_etype == RTE_BE16(ETH_P_IP));
3330 vlan_etype = RTE_BE16(ETH_P_IP);
3332 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3333 outer_etype == RTE_BE16(ETH_P_IP));
3334 outer_etype = RTE_BE16(ETH_P_IP);
3336 spec.ipv4 = items->spec;
3337 if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3339 * No way to set IP protocol for outer tunnel
3340 * layers. Usually it is fixed, for example,
3341 * to UDP for VXLAN/GPE.
3343 assert(spec.ipv4); /* Mask is not empty. */
3344 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3345 spec.ipv4->hdr.next_proto_id);
3348 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3349 (!mask.ipv4->hdr.src_addr &&
3350 !mask.ipv4->hdr.dst_addr)) {
3354 * For tunnel outer we must set outer IP key
3355 * anyway, even if the specification/mask is
3356 * empty. There is no other way to tell the
3357 * kernel about the outer layer protocol.
3360 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3361 mask.ipv4->hdr.src_addr);
3363 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3364 mask.ipv4->hdr.src_addr);
3365 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3368 if (mask.ipv4->hdr.src_addr) {
3370 (nlh, tunnel_outer ?
3371 TCA_FLOWER_KEY_ENC_IPV4_SRC :
3372 TCA_FLOWER_KEY_IPV4_SRC,
3373 spec.ipv4->hdr.src_addr);
3375 (nlh, tunnel_outer ?
3376 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3377 TCA_FLOWER_KEY_IPV4_SRC_MASK,
3378 mask.ipv4->hdr.src_addr);
3380 if (mask.ipv4->hdr.dst_addr) {
3382 (nlh, tunnel_outer ?
3383 TCA_FLOWER_KEY_ENC_IPV4_DST :
3384 TCA_FLOWER_KEY_IPV4_DST,
3385 spec.ipv4->hdr.dst_addr);
3387 (nlh, tunnel_outer ?
3388 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3389 TCA_FLOWER_KEY_IPV4_DST_MASK,
3390 mask.ipv4->hdr.dst_addr);
3392 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3394 case RTE_FLOW_ITEM_TYPE_IPV6: {
3395 bool ipv6_src, ipv6_dst;
3397 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3398 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3399 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3400 mask.ipv6 = flow_tcf_item_mask
3401 (items, &rte_flow_item_ipv6_mask,
3402 &flow_tcf_mask_supported.ipv6,
3403 &flow_tcf_mask_empty.ipv6,
3404 sizeof(flow_tcf_mask_supported.ipv6),
3407 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3408 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3409 inner_etype == RTE_BE16(ETH_P_IPV6));
3410 inner_etype = RTE_BE16(ETH_P_IPV6);
3411 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3412 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3413 vlan_etype == RTE_BE16(ETH_P_IPV6));
3414 vlan_etype = RTE_BE16(ETH_P_IPV6);
3416 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3417 outer_etype == RTE_BE16(ETH_P_IPV6));
3418 outer_etype = RTE_BE16(ETH_P_IPV6);
3420 spec.ipv6 = items->spec;
3421 if (!tunnel_outer && mask.ipv6->hdr.proto) {
3423 * No way to set IP protocol for outer tunnel
3424 * layers. Usually it is fixed, for example,
3425 * to UDP for VXLAN/GPE.
3427 assert(spec.ipv6); /* Mask is not empty. */
3428 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3429 spec.ipv6->hdr.proto);
3432 ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3433 (mask.ipv6->hdr.dst_addr);
3434 ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3435 (mask.ipv6->hdr.src_addr);
3436 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3437 (!ipv6_dst && !ipv6_src)) {
3441 * For tunnel outer we must set outer IP key
3442 * anyway, even if the specification/mask is
3443 * empty. There is no other way to tell the
3444 * kernel about the outer layer protocol.
3447 TCA_FLOWER_KEY_ENC_IPV6_SRC,
3449 mask.ipv6->hdr.src_addr);
3451 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3453 mask.ipv6->hdr.src_addr);
3454 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3458 mnl_attr_put(nlh, tunnel_outer ?
3459 TCA_FLOWER_KEY_ENC_IPV6_SRC :
3460 TCA_FLOWER_KEY_IPV6_SRC,
3462 spec.ipv6->hdr.src_addr);
3463 mnl_attr_put(nlh, tunnel_outer ?
3464 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3465 TCA_FLOWER_KEY_IPV6_SRC_MASK,
3467 mask.ipv6->hdr.src_addr);
3470 mnl_attr_put(nlh, tunnel_outer ?
3471 TCA_FLOWER_KEY_ENC_IPV6_DST :
3472 TCA_FLOWER_KEY_IPV6_DST,
3474 spec.ipv6->hdr.dst_addr);
3475 mnl_attr_put(nlh, tunnel_outer ?
3476 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3477 TCA_FLOWER_KEY_IPV6_DST_MASK,
3479 mask.ipv6->hdr.dst_addr);
3481 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3484 case RTE_FLOW_ITEM_TYPE_UDP:
3485 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3486 MLX5_FLOW_LAYER_INNER_L4_UDP :
3487 MLX5_FLOW_LAYER_OUTER_L4_UDP;
3488 mask.udp = flow_tcf_item_mask
3489 (items, &rte_flow_item_udp_mask,
3490 &flow_tcf_mask_supported.udp,
3491 &flow_tcf_mask_empty.udp,
3492 sizeof(flow_tcf_mask_supported.udp),
3495 spec.udp = items->spec;
3496 if (!tunnel_outer) {
3499 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3501 if (mask.udp == &flow_tcf_mask_empty.udp)
3504 assert(mask.udp != &flow_tcf_mask_empty.udp);
3505 decap.vxlan->udp_port =
3507 (spec.udp->hdr.dst_port);
3509 if (mask.udp->hdr.src_port) {
3511 (nlh, tunnel_outer ?
3512 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3513 TCA_FLOWER_KEY_UDP_SRC,
3514 spec.udp->hdr.src_port);
3516 (nlh, tunnel_outer ?
3517 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3518 TCA_FLOWER_KEY_UDP_SRC_MASK,
3519 mask.udp->hdr.src_port);
3521 if (mask.udp->hdr.dst_port) {
3523 (nlh, tunnel_outer ?
3524 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3525 TCA_FLOWER_KEY_UDP_DST,
3526 spec.udp->hdr.dst_port);
3528 (nlh, tunnel_outer ?
3529 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3530 TCA_FLOWER_KEY_UDP_DST_MASK,
3531 mask.udp->hdr.dst_port);
3533 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3535 case RTE_FLOW_ITEM_TYPE_TCP:
3536 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3537 MLX5_FLOW_LAYER_INNER_L4_TCP :
3538 MLX5_FLOW_LAYER_OUTER_L4_TCP;
3539 mask.tcp = flow_tcf_item_mask
3540 (items, &rte_flow_item_tcp_mask,
3541 &flow_tcf_mask_supported.tcp,
3542 &flow_tcf_mask_empty.tcp,
3543 sizeof(flow_tcf_mask_supported.tcp),
3547 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3549 if (mask.tcp == &flow_tcf_mask_empty.tcp)
3551 spec.tcp = items->spec;
3552 if (mask.tcp->hdr.src_port) {
3553 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3554 spec.tcp->hdr.src_port);
3555 mnl_attr_put_u16(nlh,
3556 TCA_FLOWER_KEY_TCP_SRC_MASK,
3557 mask.tcp->hdr.src_port);
3559 if (mask.tcp->hdr.dst_port) {
3560 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3561 spec.tcp->hdr.dst_port);
3562 mnl_attr_put_u16(nlh,
3563 TCA_FLOWER_KEY_TCP_DST_MASK,
3564 mask.tcp->hdr.dst_port);
3566 if (mask.tcp->hdr.tcp_flags) {
3569 TCA_FLOWER_KEY_TCP_FLAGS,
3571 (spec.tcp->hdr.tcp_flags));
3574 TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3576 (mask.tcp->hdr.tcp_flags));
3578 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3580 case RTE_FLOW_ITEM_TYPE_VXLAN:
3581 assert(decap.vxlan);
3583 item_flags |= MLX5_FLOW_LAYER_VXLAN;
3584 spec.vxlan = items->spec;
3585 mnl_attr_put_u32(nlh,
3586 TCA_FLOWER_KEY_ENC_KEY_ID,
3587 vxlan_vni_as_be32(spec.vxlan->vni));
3588 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3591 return rte_flow_error_set(error, ENOTSUP,
3592 RTE_FLOW_ERROR_TYPE_ITEM,
3593 NULL, "item not supported");
3597 * Set the ether_type flower key and tc rule protocol:
3598 * - if there is neither VLAN nor VXLAN the key is taken from
3599 * the eth item directly or deduced from the L3 items.
3600 * - if there is a vlan item then the key is fixed to 802.1q.
3601 * - if there is a vxlan item then the key is set to the inner tunnel type.
3602 * - simultaneous vlan and vxlan items are prohibited.
3604 if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3605 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3607 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3608 if (inner_etype != RTE_BE16(ETH_P_ALL))
3609 mnl_attr_put_u16(nlh,
3610 TCA_FLOWER_KEY_ETH_TYPE,
3613 mnl_attr_put_u16(nlh,
3614 TCA_FLOWER_KEY_ETH_TYPE,
3616 if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3617 vlan_etype != RTE_BE16(ETH_P_ALL))
3618 mnl_attr_put_u16(nlh,
3619 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3622 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3624 na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3625 na_act_index_cur = 1;
3626 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3627 struct nlattr *na_act_index;
3628 struct nlattr *na_act;
3629 unsigned int vlan_act;
3632 switch (actions->type) {
3633 case RTE_FLOW_ACTION_TYPE_VOID:
3635 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3636 conf.port_id = actions->conf;
3637 if (conf.port_id->original)
3640 for (i = 0; ptoi[i].ifindex; ++i)
3641 if (ptoi[i].port_id == conf.port_id->id)
3643 assert(ptoi[i].ifindex);
3645 mnl_attr_nest_start(nlh, na_act_index_cur++);
3646 assert(na_act_index);
3647 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3648 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3651 assert(dev_flow->tcf.tunnel);
3652 dev_flow->tcf.tunnel->ifindex_ptr =
3653 &((struct tc_mirred *)
3654 mnl_attr_get_payload
3655 (mnl_nlmsg_get_payload_tail
3658 mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3659 sizeof(struct tc_mirred),
3660 &(struct tc_mirred){
3661 .action = TC_ACT_STOLEN,
3662 .eaction = TCA_EGRESS_REDIR,
3663 .ifindex = ptoi[i].ifindex,
3665 mnl_attr_nest_end(nlh, na_act);
3666 mnl_attr_nest_end(nlh, na_act_index);
3668 case RTE_FLOW_ACTION_TYPE_JUMP:
3669 conf.jump = actions->conf;
3671 mnl_attr_nest_start(nlh, na_act_index_cur++);
3672 assert(na_act_index);
3673 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3674 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3676 mnl_attr_put(nlh, TCA_GACT_PARMS,
3677 sizeof(struct tc_gact),
3679 .action = TC_ACT_GOTO_CHAIN |
3682 mnl_attr_nest_end(nlh, na_act);
3683 mnl_attr_nest_end(nlh, na_act_index);
3685 case RTE_FLOW_ACTION_TYPE_DROP:
3687 mnl_attr_nest_start(nlh, na_act_index_cur++);
3688 assert(na_act_index);
3689 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3690 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3692 mnl_attr_put(nlh, TCA_GACT_PARMS,
3693 sizeof(struct tc_gact),
3695 .action = TC_ACT_SHOT,
3697 mnl_attr_nest_end(nlh, na_act);
3698 mnl_attr_nest_end(nlh, na_act_index);
3700 case RTE_FLOW_ACTION_TYPE_COUNT:
3702 * Driver adds the count action implicitly for
3703 * each rule it creates.
3705 ret = flow_tcf_translate_action_count(dev,
3710 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3711 conf.of_push_vlan = NULL;
3712 vlan_act = TCA_VLAN_ACT_POP;
3713 goto action_of_vlan;
3714 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3715 conf.of_push_vlan = actions->conf;
3716 vlan_act = TCA_VLAN_ACT_PUSH;
3717 goto action_of_vlan;
3718 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3719 conf.of_set_vlan_vid = actions->conf;
3721 goto override_na_vlan_id;
3722 vlan_act = TCA_VLAN_ACT_MODIFY;
3723 goto action_of_vlan;
3724 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3725 conf.of_set_vlan_pcp = actions->conf;
3726 if (na_vlan_priority)
3727 goto override_na_vlan_priority;
3728 vlan_act = TCA_VLAN_ACT_MODIFY;
3729 goto action_of_vlan;
3732 mnl_attr_nest_start(nlh, na_act_index_cur++);
3733 assert(na_act_index);
3734 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3735 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3737 mnl_attr_put(nlh, TCA_VLAN_PARMS,
3738 sizeof(struct tc_vlan),
3740 .action = TC_ACT_PIPE,
3741 .v_action = vlan_act,
3743 if (vlan_act == TCA_VLAN_ACT_POP) {
3744 mnl_attr_nest_end(nlh, na_act);
3745 mnl_attr_nest_end(nlh, na_act_index);
3748 if (vlan_act == TCA_VLAN_ACT_PUSH)
3749 mnl_attr_put_u16(nlh,
3750 TCA_VLAN_PUSH_VLAN_PROTOCOL,
3751 conf.of_push_vlan->ethertype);
3752 na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3753 mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3754 na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3755 mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3756 mnl_attr_nest_end(nlh, na_act);
3757 mnl_attr_nest_end(nlh, na_act_index);
3758 if (actions->type ==
3759 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3760 override_na_vlan_id:
3761 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3762 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3764 (conf.of_set_vlan_vid->vlan_vid);
3765 } else if (actions->type ==
3766 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3767 override_na_vlan_priority:
3768 na_vlan_priority->nla_type =
3769 TCA_VLAN_PUSH_VLAN_PRIORITY;
3770 *(uint8_t *)mnl_attr_get_payload
3771 (na_vlan_priority) =
3772 conf.of_set_vlan_pcp->vlan_pcp;
3775 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3776 assert(decap.vxlan);
3777 assert(dev_flow->tcf.tunnel);
3778 dev_flow->tcf.tunnel->ifindex_ptr =
3779 (unsigned int *)&tcm->tcm_ifindex;
3781 mnl_attr_nest_start(nlh, na_act_index_cur++);
3782 assert(na_act_index);
3783 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3784 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3786 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3787 sizeof(struct tc_tunnel_key),
3788 &(struct tc_tunnel_key){
3789 .action = TC_ACT_PIPE,
3790 .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3792 mnl_attr_nest_end(nlh, na_act);
3793 mnl_attr_nest_end(nlh, na_act_index);
3794 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3796 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3797 assert(encap.vxlan);
3798 flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3800 mnl_attr_nest_start(nlh, na_act_index_cur++);
3801 assert(na_act_index);
3802 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3803 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3805 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3806 sizeof(struct tc_tunnel_key),
3807 &(struct tc_tunnel_key){
3808 .action = TC_ACT_PIPE,
3809 .t_action = TCA_TUNNEL_KEY_ACT_SET,
3811 if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3812 mnl_attr_put_u16(nlh,
3813 TCA_TUNNEL_KEY_ENC_DST_PORT,
3814 encap.vxlan->udp.dst);
3815 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3816 mnl_attr_put_u32(nlh,
3817 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3818 encap.vxlan->ipv4.src);
3819 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3820 mnl_attr_put_u32(nlh,
3821 TCA_TUNNEL_KEY_ENC_IPV4_DST,
3822 encap.vxlan->ipv4.dst);
3823 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3825 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3826 sizeof(encap.vxlan->ipv6.src),
3827 &encap.vxlan->ipv6.src);
3828 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3830 TCA_TUNNEL_KEY_ENC_IPV6_DST,
3831 sizeof(encap.vxlan->ipv6.dst),
3832 &encap.vxlan->ipv6.dst);
3833 if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3834 mnl_attr_put_u32(nlh,
3835 TCA_TUNNEL_KEY_ENC_KEY_ID,
3837 (encap.vxlan->vxlan.vni));
3838 mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3839 mnl_attr_nest_end(nlh, na_act);
3840 mnl_attr_nest_end(nlh, na_act_index);
3841 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3843 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3844 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3845 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3846 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3847 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3848 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3849 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3850 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3851 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3852 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3854 mnl_attr_nest_start(nlh, na_act_index_cur++);
3855 flow_tcf_create_pedit_mnl_msg(nlh,
3856 &actions, item_flags);
3857 mnl_attr_nest_end(nlh, na_act_index);
3860 return rte_flow_error_set(error, ENOTSUP,
3861 RTE_FLOW_ERROR_TYPE_ACTION,
3863 "action not supported");
3867 assert(na_flower_act);
3868 mnl_attr_nest_end(nlh, na_flower_act);
3869 dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3870 (mnl_nlmsg_get_payload_tail(nlh));
3871 mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3872 0 : TCA_CLS_FLAGS_SKIP_SW);
3873 mnl_attr_nest_end(nlh, na_flower);
3874 if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
3875 dev_flow->tcf.tunnel->ifindex_org =
3876 *dev_flow->tcf.tunnel->ifindex_ptr;
3877 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
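/*
 * Roughly equivalent (illustrative only) tc command for the message built
 * above when matching outer IPv4/UDP and redirecting to a peer port,
 * assuming priority 0, group 0 and no tunnel actions; interface names are
 * hypothetical:
 *
 *   tc filter add dev pf0vf1 ingress prio 1 protocol ip \
 *       flower skip_sw ip_proto udp dst_port 4789 \
 *       action mirred egress redirect dev pf0
 */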
3882 * Send Netlink message with acknowledgment.
3885 * Flow context to use.
3887 * Message to send. This function always raises the NLM_F_ACK flag before
3890 * Callback handler for received message.
3892 * Context pointer for callback handler.
3895 * 0 on success, a negative errno value otherwise and rte_errno is set.
3898 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
3899 struct nlmsghdr *nlh,
3900 mnl_cb_t cb, void *arg)
3902 unsigned int portid = mnl_socket_get_portid(tcf->nl);
3903 uint32_t seq = tcf->seq++;
3909 /* seq 0 is reserved for kernel event-driven notifications. */
3912 nlh->nlmsg_seq = seq;
3913 nlh->nlmsg_flags |= NLM_F_ACK;
3914 ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
3916 /* Message send error occurred. */
3920 nlh = (struct nlmsghdr *)(tcf->buf);
3922 * The following loop postpones non-fatal errors until multipart
3923 * messages are complete.
3926 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
3930 * In case of overflow keep receiving until the
3931 * end of the multipart message. We may lose part
3932 * of the reply messages but still mark and return an error.
3934 if (err != ENOSPC ||
3935 !(nlh->nlmsg_flags & NLM_F_MULTI) ||
3936 nlh->nlmsg_type == NLMSG_DONE)
3939 ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
3942 * libmnl returns 0 if DONE or
3943 * success ACK message found.
3949 * ACK message with error found
3950 * or some error occurred.
3955 /* We should continue receiving. */
3964 #define MNL_BUF_EXTRA_SPACE 16
3965 #define MNL_REQUEST_SIZE_MIN 256
3966 #define MNL_REQUEST_SIZE_MAX 2048
3967 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
3968 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
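/*
 * For example, with a typical 4 KiB page size the clamp above resolves to
 * MNL_REQUEST_SIZE_MAX: RTE_MIN(RTE_MAX(4096, 256), 2048) == 2048.
 */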
3970 /* Data structures used by flow_tcf_xxx_cb() routines. */
3971 struct tcf_nlcb_buf {
3972 LIST_ENTRY(tcf_nlcb_buf) next;
3974 alignas(struct nlmsghdr)
3975 uint8_t msg[]; /**< Netlink message data. */
3978 struct tcf_nlcb_context {
3979 unsigned int ifindex; /**< Base interface index. */
3981 LIST_HEAD(, tcf_nlcb_buf) nlbuf;
3985 * Allocate space for netlink command in buffer list
3987 * @param[in, out] ctx
3988 * Pointer to callback context with command buffers list.
3990 * Required size of data buffer to be allocated.
3993 * Pointer to allocated memory, aligned as message header.
3994 * NULL if some error occurred.
3996 static struct nlmsghdr *
3997 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
3999 struct tcf_nlcb_buf *buf;
4000 struct nlmsghdr *nlh;
4002 size = NLMSG_ALIGN(size);
4003 buf = LIST_FIRST(&ctx->nlbuf);
4004 if (buf && (buf->size + size) <= ctx->bufsize) {
4005 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4009 if (size > ctx->bufsize) {
4010 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4013 buf = rte_malloc(__func__,
4014 ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4015 alignof(struct tcf_nlcb_buf));
4017 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4020 LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4022 nlh = (struct nlmsghdr *)&buf->msg[0];
4027 * Send the buffers with prepared netlink commands. Scans the list and
4028 * sends all found buffers. Buffers are sent and freed regardless of the
4029 * outcome in order to prevent memory leaks if any command fails.
4032 * Context object initialized by mlx5_flow_tcf_context_create().
4033 * @param[in, out] ctx
4034 * Pointer to callback context with command buffers list.
4037 * Zero value on success, negative errno value otherwise
4038 * and rte_errno is set.
4041 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4042 struct tcf_nlcb_context *ctx)
4044 struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4048 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4049 struct nlmsghdr *nlh;
4053 while (msg < bc->size) {
4055 * Send Netlink commands from the buffer one by one.
4056 * If we sent multiple rule deletion commands in one
4057 * Netlink message and some error occurred, it could
4058 * cause multiple ACK error messages and break the
4059 * sequence numbers of the Netlink communication,
4060 * because we expect only one ACK reply.
4062 assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4063 nlh = (struct nlmsghdr *)&bc->msg[msg];
4064 assert((bc->size - msg) >= nlh->nlmsg_len);
4065 msg += nlh->nlmsg_len;
4066 rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4069 "netlink: cleanup error %d", rc);
4077 LIST_INIT(&ctx->nlbuf);
4082 * Collect local IP address rules with scope link attribute on specified
4083 * network device. This is a callback routine called by libmnl mnl_cb_run()
4084 * in a loop for every message in a received packet.
4087 * Pointer to reply header.
4088 * @param[in, out] arg
4089 * Opaque data pointer for this callback.
4092 * A positive, nonzero value on success, negative errno value otherwise
4093 * and rte_errno is set.
4096 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4098 struct tcf_nlcb_context *ctx = arg;
4099 struct nlmsghdr *cmd;
4100 struct ifaddrmsg *ifa;
4102 struct nlattr *na_local = NULL;
4103 struct nlattr *na_peer = NULL;
4104 unsigned char family;
4107 if (nlh->nlmsg_type != RTM_NEWADDR) {
4111 ifa = mnl_nlmsg_get_payload(nlh);
4112 family = ifa->ifa_family;
4113 if (ifa->ifa_index != ctx->ifindex ||
4114 ifa->ifa_scope != RT_SCOPE_LINK ||
4115 !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4116 (family != AF_INET && family != AF_INET6))
4118 mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4119 switch (mnl_attr_get_type(na)) {
4127 if (na_local && na_peer)
4130 if (!na_local || !na_peer)
4132 /* Local rule found with scope link, permanent and assigned peer. */
4133 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4134 MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4135 (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4136 : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4137 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4142 cmd = mnl_nlmsg_put_header(cmd);
4143 cmd->nlmsg_type = RTM_DELADDR;
4144 cmd->nlmsg_flags = NLM_F_REQUEST;
4145 ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4146 ifa->ifa_flags = IFA_F_PERMANENT;
4147 ifa->ifa_scope = RT_SCOPE_LINK;
4148 ifa->ifa_index = ctx->ifindex;
4149 if (family == AF_INET) {
4150 ifa->ifa_family = AF_INET;
4151 ifa->ifa_prefixlen = 32;
4152 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4153 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4155 ifa->ifa_family = AF_INET6;
4156 ifa->ifa_prefixlen = 128;
4157 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4158 mnl_attr_get_payload(na_local));
4159 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4160 mnl_attr_get_payload(na_peer));
4162 assert(size == cmd->nlmsg_len);
4167 * Cleanup the local IP addresses on outer interface.
4170 * Context object initialized by mlx5_flow_tcf_context_create().
4171 * @param[in] ifindex
4172 * Network interface index to perform cleanup on.
4175 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4176 unsigned int ifindex)
4178 struct nlmsghdr *nlh;
4179 struct ifaddrmsg *ifa;
4180 struct tcf_nlcb_context ctx = {
4182 .bufsize = MNL_REQUEST_SIZE,
4183 .nlbuf = LIST_HEAD_INITIALIZER(),
4189 * Seek and destroy leftover local IP addresses with
4190 * the matching property "scope link".
4192 nlh = mnl_nlmsg_put_header(tcf->buf);
4193 nlh->nlmsg_type = RTM_GETADDR;
4194 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4195 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4196 ifa->ifa_family = AF_UNSPEC;
4197 ifa->ifa_index = ifindex;
4198 ifa->ifa_scope = RT_SCOPE_LINK;
4199 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4201 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4202 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4204 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
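/*
 * The effect is roughly analogous to "ip address flush dev <ifname>
 * scope link" restricted to permanent addresses with an assigned peer;
 * <ifname> is illustrative, the driver issues the equivalent RTM_DELADDR
 * requests directly over the libmnl socket.
 */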
4208 * Collect permanent neigh rules on the specified network device.
4209 * This is a callback routine called by libmnl mnl_cb_run() in a loop for
4210 * every message in a received packet.
4213 * Pointer to reply header.
4214 * @param[in, out] arg
4215 * Opaque data pointer for this callback.
4218 * A positive, nonzero value on success, negative errno value otherwise
4219 * and rte_errno is set.
4222 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4224 struct tcf_nlcb_context *ctx = arg;
4225 struct nlmsghdr *cmd;
4228 struct nlattr *na_ip = NULL;
4229 struct nlattr *na_mac = NULL;
4230 unsigned char family;
4233 if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4237 ndm = mnl_nlmsg_get_payload(nlh);
4238 family = ndm->ndm_family;
4239 if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4240 !(ndm->ndm_state & NUD_PERMANENT) ||
4241 (family != AF_INET && family != AF_INET6))
4243 mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4244 switch (mnl_attr_get_type(na)) {
4252 if (na_mac && na_ip)
4255 if (!na_mac || !na_ip)
4257 /* Neigh rule with permanent attribute found. */
4258 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4259 MNL_ALIGN(sizeof(struct ndmsg)) +
4260 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4261 (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4262 : SZ_NLATTR_TYPE_OF(uint32_t));
4263 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4268 cmd = mnl_nlmsg_put_header(cmd);
4269 cmd->nlmsg_type = RTM_DELNEIGH;
4270 cmd->nlmsg_flags = NLM_F_REQUEST;
4271 ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4272 ndm->ndm_ifindex = ctx->ifindex;
4273 ndm->ndm_state = NUD_PERMANENT;
4276 if (family == AF_INET) {
4277 ndm->ndm_family = AF_INET;
4278 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4280 ndm->ndm_family = AF_INET6;
4281 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4282 mnl_attr_get_payload(na_ip));
4284 mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4285 mnl_attr_get_payload(na_mac));
4286 assert(size == cmd->nlmsg_len);
4291 * Cleanup the neigh rules on outer interface.
4294 * Context object initialized by mlx5_flow_tcf_context_create().
4295 * @param[in] ifindex
4296 Network interface index to perform cleanup.
4299 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4300 unsigned int ifindex)
4302 struct nlmsghdr *nlh;
4304 struct tcf_nlcb_context ctx = {
4306 .bufsize = MNL_REQUEST_SIZE,
4307 .nlbuf = LIST_HEAD_INITIALIZER(),
4312 /* Seek and destroy leftovers of neigh rules. */
4313 nlh = mnl_nlmsg_put_header(tcf->buf);
4314 nlh->nlmsg_type = RTM_GETNEIGH;
4315 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4316 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4317 ndm->ndm_family = AF_UNSPEC;
4318 ndm->ndm_ifindex = ifindex;
4319 ndm->ndm_state = NUD_PERMANENT;
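/*
 * The dump reply is processed by flow_tcf_collect_neigh_cb(), which queues
 * an RTM_DELNEIGH command for every matching permanent entry.
 */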
4320 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4322 DRV_LOG(WARNING, "netlink: query neigh list error %d", ret);
4323 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4325 DRV_LOG(WARNING, "netlink: neigh delete error %d", ret);
4329 * Collect indices of VXLAN encap/decap interfaces associated with device.
4330 * This is a callback routine called by libmnl mnl_cb_run() in a loop for
4331 * every message in the received packet.
4334 * Pointer to reply header.
4335 * @param[in, out] arg
4336 * Opaque data pointer for this callback.
4339 * A positive, nonzero value on success, negative errno value otherwise
4340 * and rte_errno is set.
4343 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4345 struct tcf_nlcb_context *ctx = arg;
4346 struct nlmsghdr *cmd;
4347 struct ifinfomsg *ifm;
4349 struct nlattr *na_info = NULL;
4350 struct nlattr *na_vxlan = NULL;
4352 unsigned int vxindex;
4355 if (nlh->nlmsg_type != RTM_NEWLINK) {
4359 ifm = mnl_nlmsg_get_payload(nlh);
4360 if (!ifm->ifi_index) {
4364 mnl_attr_for_each(na, nlh, sizeof(*ifm))
4365 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4371 mnl_attr_for_each_nested(na, na_info) {
4372 switch (mnl_attr_get_type(na)) {
4373 case IFLA_INFO_KIND:
4374 if (!strncmp("vxlan", mnl_attr_get_str(na),
4375 mnl_attr_get_len(na)))
4378 case IFLA_INFO_DATA:
4382 if (found && na_vxlan)
4385 if (!found || !na_vxlan)
4388 mnl_attr_for_each_nested(na, na_vxlan) {
4389 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4390 mnl_attr_get_u32(na) == ctx->ifindex) {
4397 /* Attached VXLAN device found, store the command to delete. */
4398 vxindex = ifm->ifi_index;
4399 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4400 MNL_ALIGN(sizeof(struct ifinfomsg));
4401 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4406 cmd = mnl_nlmsg_put_header(cmd);
4407 cmd->nlmsg_type = RTM_DELLINK;
4408 cmd->nlmsg_flags = NLM_F_REQUEST;
4409 ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4410 ifm->ifi_family = AF_UNSPEC;
4411 ifm->ifi_index = vxindex;
4412 assert(size == cmd->nlmsg_len);
4417 * Cleanup the outer interface. Removes all VXLAN devices attached to
4418 * the specified index and flushes the neigh and local IP databases.
4422 * Context object initialized by mlx5_flow_tcf_context_create().
4423 * @param[in] ifindex
4424 Network interface index to perform cleanup.
4427 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4428 unsigned int ifindex)
4430 struct nlmsghdr *nlh;
4431 struct ifinfomsg *ifm;
4432 struct tcf_nlcb_context ctx = {
4434 .bufsize = MNL_REQUEST_SIZE,
4435 .nlbuf = LIST_HEAD_INITIALIZER(),
4441 * Seek and destroy leftover VXLAN encap/decap interfaces with
4442 * matching properties.
4444 nlh = mnl_nlmsg_put_header(tcf->buf);
4445 nlh->nlmsg_type = RTM_GETLINK;
4446 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4447 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4448 ifm->ifi_family = AF_UNSPEC;
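/*
 * The dump reply is processed by flow_tcf_collect_vxlan_cb(), which queues
 * an RTM_DELLINK command for each VXLAN device attached to ifindex.
 */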
4449 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4451 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4452 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4454 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4458 * Emit Netlink message to add/remove local address to the outer device.
4459 * The address being added is visible within the link only (scope link).
4461 * Note that an implicit route is maintained by the kernel due to the
4462 * presence of a peer address (IFA_ADDRESS).
4464 * These rules are used for encapsulation only and allow assigning
4465 * the outer tunnel source IP address.
4468 * Libmnl socket context object.
4470 * Encapsulation properties (source address and its peer).
4471 * @param[in] ifindex
4472 Network interface index to apply the rule to.
4474 * Toggle between add and remove.
4476 * Perform verbose error reporting if not NULL.
4479 * 0 on success, a negative errno value otherwise and rte_errno is set.
4482 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4483 const struct flow_tcf_vxlan_encap *encap,
4484 unsigned int ifindex,
4486 struct rte_flow_error *error)
4488 struct nlmsghdr *nlh;
4489 struct ifaddrmsg *ifa;
4490 alignas(struct nlmsghdr)
4491 uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4493 nlh = mnl_nlmsg_put_header(buf);
4494 nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4496 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4498 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4499 ifa->ifa_flags = IFA_F_PERMANENT;
4500 ifa->ifa_scope = RT_SCOPE_LINK;
4501 ifa->ifa_index = ifindex;
4502 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4503 ifa->ifa_family = AF_INET;
4504 ifa->ifa_prefixlen = 32;
4505 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4506 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4507 mnl_attr_put_u32(nlh, IFA_ADDRESS,
4508 encap->ipv4.dst);
4509 } else {
4510 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4511 ifa->ifa_family = AF_INET6;
4512 ifa->ifa_prefixlen = 128;
4513 mnl_attr_put(nlh, IFA_LOCAL,
4514 sizeof(encap->ipv6.src),
4516 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4517 mnl_attr_put(nlh, IFA_ADDRESS,
4518 sizeof(encap->ipv6.dst),
4521 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4523 return rte_flow_error_set(error, rte_errno,
4524 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4525 "netlink: cannot complete IFA request"
4530 * Emit Netlink message to add/remove neighbor.
4533 * Libmnl socket context object.
4535 * Encapsulation properties (destination address).
4536 * @param[in] ifindex
4537 * Network interface.
4539 * Toggle between add and remove.
4541 * Perform verbose error reporting if not NULL.
4544 * 0 on success, a negative errno value otherwise and rte_errno is set.
4547 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4548 const struct flow_tcf_vxlan_encap *encap,
4549 unsigned int ifindex,
4551 struct rte_flow_error *error)
4553 struct nlmsghdr *nlh;
4555 alignas(struct nlmsghdr)
4556 uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4558 nlh = mnl_nlmsg_put_header(buf);
4559 nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4561 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4563 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4564 ndm->ndm_ifindex = ifindex;
4565 ndm->ndm_state = NUD_PERMANENT;
4568 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4569 ndm->ndm_family = AF_INET;
4570 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4571 } else {
4572 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4573 ndm->ndm_family = AF_INET6;
4574 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4577 if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4578 DRV_LOG(WARNING,
4579 "outer ethernet source address cannot be "
4580 "forced for VXLAN encapsulation");
4581 if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4582 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4584 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4586 return rte_flow_error_set(error, rte_errno,
4587 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4588 "netlink: cannot complete ND request"
4593 * Manage the local IP addresses and their peer IP addresses on the
4594 * outer interface for encapsulation purposes. The kernel looks up the
4595 * egress device for tunnel traffic using the outer source
4596 * IP; this IP must be assigned to the outer network device, otherwise
4597 * the kernel rejects the rule.
4599 * Adds or removes the addresses using a Netlink command equivalent to:
4600 * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4602 * The addresses are local to the netdev ("scope link"), which reduces
4603 * the risk of conflicts. Note that an implicit route is maintained by
4604 * the kernel due to the presence of a peer address (IFA_ADDRESS).
4607 * Libmnl socket context object.
4609 Object that contains the rule database and ifouter index.
4610 * @param[in] dev_flow
4611 * Flow object, contains the tunnel parameters (for encap only).
4613 * Toggle between add and remove.
4615 * Perform verbose error reporting if not NULL.
4618 * 0 on success, a negative errno value otherwise and rte_errno is set.
4621 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4622 struct tcf_irule *iface,
4623 struct mlx5_flow *dev_flow,
4625 struct rte_flow_error *error)
4627 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4628 struct tcf_local_rule *rule = NULL;
4632 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4633 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4634 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4635 LIST_FOREACH(rule, &iface->local, next) {
4636 if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4637 encap->ipv4.src == rule->ipv4.src &&
4638 encap->ipv4.dst == rule->ipv4.dst) {
4643 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4644 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4645 LIST_FOREACH(rule, &iface->local, next) {
4646 if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4647 !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4648 sizeof(encap->ipv6.src)) &&
4649 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4650 sizeof(encap->ipv6.dst))) {
4660 if (!rule->refcnt || !--rule->refcnt) {
4661 LIST_REMOVE(rule, next);
4662 return flow_tcf_rule_local(tcf, encap,
4663 iface->ifouter, false, error);
4668 DRV_LOG(WARNING, "disabling nonexistent local rule");
4669 rte_flow_error_set(error, ENOENT,
4670 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4671 "disabling not existing local rule");
4674 rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4675 alignof(struct tcf_local_rule));
4677 rte_flow_error_set(error, ENOMEM,
4678 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4679 "unable to allocate memory for local rule");
4682 *rule = (struct tcf_local_rule){.refcnt = 0,
4685 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4686 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4687 | FLOW_TCF_ENCAP_IPV4_DST;
4688 rule->ipv4.src = encap->ipv4.src;
4689 rule->ipv4.dst = encap->ipv4.dst;
4691 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4692 | FLOW_TCF_ENCAP_IPV6_DST;
4693 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4694 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4696 ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4702 LIST_INSERT_HEAD(&iface->local, rule, next);
4707 * Manage the neigh database of destination MAC/IP addresses; the kernel
4708 * uses it to determine the destination MAC address for the encapsulation
4709 * header. Adds or removes the entries using a Netlink command equivalent to:
4710 * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4713 * Libmnl socket context object.
4715 Object that contains the rule database and ifouter index.
4716 * @param[in] dev_flow
4717 * Flow object, contains the tunnel parameters (for encap only).
4719 * Toggle between add and remove.
4721 * Perform verbose error reporting if not NULL.
4724 * 0 on success, a negative errno value otherwise and rte_errno is set.
4727 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4728 struct tcf_irule *iface,
4729 struct mlx5_flow *dev_flow,
4731 struct rte_flow_error *error)
4733 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4734 struct tcf_neigh_rule *rule = NULL;
4738 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4739 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4740 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4741 LIST_FOREACH(rule, &iface->neigh, next) {
4742 if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4743 encap->ipv4.dst == rule->ipv4.dst) {
4748 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4749 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4750 LIST_FOREACH(rule, &iface->neigh, next) {
4751 if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4752 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4753 sizeof(encap->ipv6.dst))) {
4759 if (memcmp(&encap->eth.dst, &rule->eth,
4760 sizeof(encap->eth.dst))) {
4761 DRV_LOG(WARNING, "Destination MAC differs"
4763 rte_flow_error_set(error, EEXIST,
4764 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4765 NULL, "Different MAC address"
4766 " neigh rule for the same"
4774 if (!rule->refcnt || !--rule->refcnt) {
4775 LIST_REMOVE(rule, next);
4776 return flow_tcf_rule_neigh(tcf, encap,
4783 DRV_LOG(WARNING, "Disabling nonexistent neigh rule");
4784 rte_flow_error_set(error, ENOENT,
4785 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4786 "unable to allocate memory for neigh rule");
4789 rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4790 alignof(struct tcf_neigh_rule));
4792 rte_flow_error_set(error, ENOMEM,
4793 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4794 "unable to allocate memory for neigh rule");
4797 *rule = (struct tcf_neigh_rule){.refcnt = 0,
4800 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4801 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4802 rule->ipv4.dst = encap->ipv4.dst;
4804 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4805 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4807 memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4808 ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4814 LIST_INSERT_HEAD(&iface->neigh, rule, next);
4818 /* VXLAN encap rule database for outer interfaces. */
4819 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4821 /* VTEP device list is shared between PMD port instances. */
4822 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4823 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4826 * Acquire the VXLAN encap rules container for the specified interface.
4827 * First looks for the container in the list of existing ones; creates
4828 * and initializes a new container if none is found.
4831 * Context object initialized by mlx5_flow_tcf_context_create().
4832 * @param[in] ifouter
4833 * Network interface index to create VXLAN encap rules on.
4835 * Perform verbose error reporting if not NULL.
4837 * Rule container pointer on success,
4838 * NULL otherwise and rte_errno is set.
4840 static struct tcf_irule*
4841 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4842 unsigned int ifouter,
4843 struct rte_flow_error *error)
4845 struct tcf_irule *iface;
4847 /* Check whether the container for encap rules already exists. */
4849 LIST_FOREACH(iface, &iface_list_vxlan, next) {
4850 if (iface->ifouter == ifouter)
4854 /* Container already exists, just increment the reference. */
4858 /* Not found, we should create the new container. */
4859 iface = rte_zmalloc(__func__, sizeof(*iface),
4860 alignof(struct tcf_irule));
4862 rte_flow_error_set(error, ENOMEM,
4863 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4864 "unable to allocate memory for container");
4867 *iface = (struct tcf_irule){
4868 .local = LIST_HEAD_INITIALIZER(),
4869 .neigh = LIST_HEAD_INITIALIZER(),
4873 /* Perform interface cleanup for the newly created container. */
4874 flow_tcf_encap_iface_cleanup(tcf, ifouter);
4875 flow_tcf_encap_local_cleanup(tcf, ifouter);
4876 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
4877 LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
4882 * Releases VXLAN encap rules container by pointer. Decrements the
4883 * reference counter and deletes the container if the counter reaches zero.
4886 * VXLAN rule container pointer to release.
4889 flow_tcf_encap_irule_release(struct tcf_irule *iface)
4891 assert(iface->refcnt);
4892 if (--iface->refcnt == 0) {
4893 /* Reference counter is zero, delete the container. */
4894 assert(LIST_EMPTY(&iface->local));
4895 assert(LIST_EMPTY(&iface->neigh));
4896 LIST_REMOVE(iface, next);
4902 * Deletes VTEP network device.
4905 * Context object initialized by mlx5_flow_tcf_context_create().
4907 Object representing the network device to delete. Memory
4908 allocated for this object is freed by this routine.
4911 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
4912 struct tcf_vtep *vtep)
4914 struct nlmsghdr *nlh;
4915 struct ifinfomsg *ifm;
4916 alignas(struct nlmsghdr)
4917 uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
4918 MNL_BUF_EXTRA_SPACE];
4921 assert(!vtep->refcnt);
4922 /* Delete only ifaces that we actually created. */
4923 if (vtep->created && vtep->ifindex) {
4924 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
4925 nlh = mnl_nlmsg_put_header(buf);
4926 nlh->nlmsg_type = RTM_DELLINK;
4927 nlh->nlmsg_flags = NLM_F_REQUEST;
4928 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4929 ifm->ifi_family = AF_UNSPEC;
4930 ifm->ifi_index = vtep->ifindex;
4931 assert(sizeof(buf) >= nlh->nlmsg_len);
4932 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4934 DRV_LOG(WARNING, "netlink: error deleting vxlan"
4935 " encap/decap ifindex %u",
4942 * Creates VTEP network device.
4945 * Context object initialized by mlx5_flow_tcf_context_create().
4947 * UDP port of created VTEP device.
4949 * Perform verbose error reporting if not NULL.
4952 * Pointer to created device structure on success,
4953 * NULL otherwise and rte_errno is set.
4955 static struct tcf_vtep*
4956 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
4957 uint16_t port, struct rte_flow_error *error)
4959 struct tcf_vtep *vtep;
4960 struct nlmsghdr *nlh;
4961 struct ifinfomsg *ifm;
4962 char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
4963 alignas(struct nlmsghdr)
4964 uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
4965 SZ_NLATTR_DATA_OF(sizeof(name)) +
4966 SZ_NLATTR_NEST * 2 +
4967 SZ_NLATTR_STRZ_OF("vxlan") +
4968 SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
4969 SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
4970 SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
4971 MNL_BUF_EXTRA_SPACE];
4972 struct nlattr *na_info;
4973 struct nlattr *na_vxlan;
4974 rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
4977 vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
4979 rte_flow_error_set(error, ENOMEM,
4980 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4981 "unable to allocate memory for VTEP");
4984 *vtep = (struct tcf_vtep){
4987 memset(buf, 0, sizeof(buf));
4988 nlh = mnl_nlmsg_put_header(buf);
4989 nlh->nlmsg_type = RTM_NEWLINK;
4990 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
4991 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4992 ifm->ifi_family = AF_UNSPEC;
4995 ifm->ifi_flags = IFF_UP;
4996 ifm->ifi_change = 0xffffffff;
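/*
 * The request built below is roughly equivalent to:
 *   ip link add <name> type vxlan dstport <port> external nolearning
 * where the "external" (metadata collection) part is requested only when
 * the kernel supports IFLA_VXLAN_COLLECT_METADATA.
 */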
4997 snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
4998 mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
4999 na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5001 mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5002 na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5004 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5006 * RH 7.2 does not support metadata for the tunnel device.
5007 * It does not matter because we are going to use the
5008 * hardware offload provided by the mlx5 driver.
5010 mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5012 mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5013 mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5014 mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5015 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5017 * We must specify the VNI explicitly if metadata is not supported.
5018 * Note that the VNI is transferred in native endianness.
5020 mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5022 mnl_attr_nest_end(nlh, na_vxlan);
5023 mnl_attr_nest_end(nlh, na_info);
5024 assert(sizeof(buf) >= nlh->nlmsg_len);
5025 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5028 "netlink: VTEP %s create failure (%d)",
5030 if (rte_errno != EEXIST)
5032 * Some unhandled error occurred or device is
5033 * for encapsulation and cannot be shared.
5039 * Mark the device we actually created.
5040 * We should explicitly delete it
5041 * when we do not need it anymore.
5044 /* Try to get the ifindex of the created or pre-existing device. */
5045 ret = if_nametoindex(name);
5048 "VTEP %s failed to get index (%d)", name, errno);
5051 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5052 "netlink: failed to retrieve VTEP ifindex");
5055 vtep->ifindex = ret;
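/* Bring the device up, roughly equivalent to: ip link set <name> up */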
5056 memset(buf, 0, sizeof(buf));
5057 nlh = mnl_nlmsg_put_header(buf);
5058 nlh->nlmsg_type = RTM_NEWLINK;
5059 nlh->nlmsg_flags = NLM_F_REQUEST;
5060 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5061 ifm->ifi_family = AF_UNSPEC;
5063 ifm->ifi_index = vtep->ifindex;
5064 ifm->ifi_flags = IFF_UP;
5065 ifm->ifi_change = IFF_UP;
5066 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5068 rte_flow_error_set(error, -errno,
5069 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5070 "netlink: failed to set VTEP link up");
5071 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5075 ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5077 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5080 DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5084 flow_tcf_vtep_delete(tcf, vtep);
5092 * Acquire target interface index for VXLAN tunneling decapsulation.
5093 * In order to share the UDP port with the other interfaces, the
5094 * VXLAN device is created without being attached to any interface (if created).
5097 * Context object initialized by mlx5_flow_tcf_context_create().
5098 * @param[in] dev_flow
5099 * Flow tcf object with tunnel structure pointer set.
5101 * Perform verbose error reporting if not NULL.
5103 * Interface descriptor pointer on success,
5104 * NULL otherwise and rte_errno is set.
5106 static struct tcf_vtep*
5107 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5108 struct mlx5_flow *dev_flow,
5109 struct rte_flow_error *error)
5111 struct tcf_vtep *vtep;
5112 uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5114 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5115 if (vtep->port == port)
5119 /* Device exists, just increment the reference counter. */
5121 assert(vtep->ifindex);
5124 /* No decapsulation device exists, try to create the new one. */
5125 vtep = flow_tcf_vtep_create(tcf, port, error);
5127 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5132 * Acquire the target interface index for VXLAN tunneling encapsulation.
5135 * Context object initialized by mlx5_flow_tcf_context_create().
5136 * @param[in] ifouter
5137 * Network interface index to attach VXLAN encap device to.
5138 * @param[in] dev_flow
5139 * Flow tcf object with tunnel structure pointer set.
5141 * Perform verbose error reporting if not NULL.
5143 * Interface descriptor pointer on success,
5144 * NULL otherwise and rte_errno is set.
5146 static struct tcf_vtep*
5147 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5148 unsigned int ifouter,
5149 struct mlx5_flow *dev_flow,
5150 struct rte_flow_error *error)
5152 static uint16_t port;
5153 struct tcf_vtep *vtep;
5154 struct tcf_irule *iface;
5158 /* Check whether the VTEP for the specified port already exists. */
5159 port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5160 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5161 if (vtep->port == port)
5165 /* VTEP already exists, just increment the reference. */
5168 /* Not found, we should create the new VTEP. */
5169 vtep = flow_tcf_vtep_create(tcf, port, error);
5172 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5174 assert(vtep->ifindex);
5175 iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5177 if (--vtep->refcnt == 0)
5178 flow_tcf_vtep_delete(tcf, vtep);
5181 dev_flow->tcf.vxlan_encap->iface = iface;
5182 /* Create local ipaddr with peer to specify the outer IPs. */
5183 ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5185 /* Create neigh rule to specify outer destination MAC. */
5186 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5188 flow_tcf_encap_local(tcf, iface,
5189 dev_flow, false, error);
5192 dev_flow->tcf.vxlan_encap->iface = NULL;
5193 flow_tcf_encap_irule_release(iface);
5194 if (--vtep->refcnt == 0)
5195 flow_tcf_vtep_delete(tcf, vtep);
5202 * Acquires target interface index for tunneling of any type.
5203 * Creates a new VTEP if needed.
5206 * Context object initialized by mlx5_flow_tcf_context_create().
5207 * @param[in] ifouter
5208 * Network interface index to create VXLAN encap rules on.
5209 * @param[in] dev_flow
5210 * Flow tcf object with tunnel structure pointer set.
5212 * Perform verbose error reporting if not NULL.
5214 * Interface descriptor pointer on success,
5215 * NULL otherwise and rte_errno is set.
5217 static struct tcf_vtep*
5218 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5219 unsigned int ifouter,
5220 struct mlx5_flow *dev_flow,
5221 struct rte_flow_error *error)
5223 struct tcf_vtep *vtep = NULL;
5225 assert(dev_flow->tcf.tunnel);
5226 pthread_mutex_lock(&vtep_list_mutex);
5227 switch (dev_flow->tcf.tunnel->type) {
5228 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5229 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5232 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5233 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5236 rte_flow_error_set(error, ENOTSUP,
5237 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5238 "unsupported tunnel type");
5241 pthread_mutex_unlock(&vtep_list_mutex);
5246 * Release the tunneling interface (VTEP). Decrements the reference
5247 * counter and actually removes the device if the counter reaches zero.
5250 * Context object initialized by mlx5_flow_tcf_context_create().
5252 * VTEP device descriptor structure.
5253 * @param[in] dev_flow
5254 * Flow tcf object with tunnel structure pointer set.
5257 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5258 struct tcf_vtep *vtep,
5259 struct mlx5_flow *dev_flow)
5261 assert(dev_flow->tcf.tunnel);
5262 pthread_mutex_lock(&vtep_list_mutex);
5263 switch (dev_flow->tcf.tunnel->type) {
5264 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5266 case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5267 struct tcf_irule *iface;
5269 /* Remove the encap ancillary rules first. */
5270 iface = dev_flow->tcf.vxlan_encap->iface;
5272 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5273 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5274 flow_tcf_encap_irule_release(iface);
5275 dev_flow->tcf.vxlan_encap->iface = NULL;
5280 DRV_LOG(WARNING, "Unsupported tunnel type");
5283 assert(vtep->refcnt);
5284 if (--vtep->refcnt == 0) {
5285 LIST_REMOVE(vtep, next);
5286 flow_tcf_vtep_delete(tcf, vtep);
5288 pthread_mutex_unlock(&vtep_list_mutex);
5291 struct tcf_nlcb_query {
5294 uint32_t flags_valid:1;
5298 * Collect queried rule attributes. This is a callback routine called by
5299 * libmnl mnl_cb_run() in a loop for every message in the received packet.
5300 * The current implementation collects the flower flags only.
5303 * Pointer to reply header.
5304 * @param[in, out] arg
5305 * Context pointer for this callback.
5308 * A positive, nonzero value on success (required by libmnl
5309 * to continue messages processing).
5312 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5314 struct tcf_nlcb_query *query = arg;
5315 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5316 struct nlattr *na, *na_opt;
5317 bool flower = false;
5319 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5320 tcm->tcm_handle != query->handle)
5322 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5323 switch (mnl_attr_get_type(na)) {
5325 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5326 /* Not flower filter, drop entire message. */
5333 /* Not flower options, drop entire message. */
5336 /* Check nested flower options. */
5337 mnl_attr_for_each_nested(na_opt, na) {
5338 switch (mnl_attr_get_type(na_opt)) {
5339 case TCA_FLOWER_FLAGS:
5340 query->flags_valid = 1;
5342 mnl_attr_get_u32(na_opt);
5353 * Query the flags of a TC flower rule via netlink.
5356 * Context object initialized by mlx5_flow_tcf_context_create().
5357 * @param[in] dev_flow
5358 * Pointer to the flow.
5359 * @param[out] pflags
5360 * pointer to the data retrieved by the query.
5363 * 0 on success, a negative errno value otherwise.
5366 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5367 struct mlx5_flow *dev_flow,
5370 struct nlmsghdr *nlh;
5372 struct tcf_nlcb_query query = {
5373 .handle = dev_flow->tcf.tcm->tcm_handle,
5376 nlh = mnl_nlmsg_put_header(tcf->buf);
5377 nlh->nlmsg_type = RTM_GETTFILTER;
5378 nlh->nlmsg_flags = NLM_F_REQUEST;
5379 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5380 memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5382 * Ignore the Netlink error for filter query operations.
5383 * The reply length is sent by the kernel as errno.
5384 * Just check that we got the flags option.
5386 flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5387 if (!query.flags_valid) {
5391 *pflags = query.tc_flags;
5396 * Query and check the in_hw set for specified rule.
5399 * Context object initialized by mlx5_flow_tcf_context_create().
5400 * @param[in] dev_flow
5401 * Pointer to the flow to check.
5404 * 0 on success, a negative errno value otherwise.
5407 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5408 struct mlx5_flow *dev_flow)
5413 ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5416 return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5420 * Remove flow from E-Switch by sending Netlink message.
5423 * Pointer to Ethernet device.
5424 * @param[in, out] flow
5425 * Pointer to the sub flow.
5428 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5430 struct priv *priv = dev->data->dev_private;
5431 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5432 struct mlx5_flow *dev_flow;
5433 struct nlmsghdr *nlh;
5437 dev_flow = LIST_FIRST(&flow->dev_flows);
5440 /* E-Switch flow can't be expanded. */
5441 assert(!LIST_NEXT(dev_flow, next));
5442 if (dev_flow->tcf.applied) {
5443 nlh = dev_flow->tcf.nlh;
5444 nlh->nlmsg_type = RTM_DELTFILTER;
5445 nlh->nlmsg_flags = NLM_F_REQUEST;
5446 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5447 if (dev_flow->tcf.tunnel) {
5448 assert(dev_flow->tcf.tunnel->vtep);
5449 flow_tcf_vtep_release(ctx,
5450 dev_flow->tcf.tunnel->vtep,
5452 dev_flow->tcf.tunnel->vtep = NULL;
5454 dev_flow->tcf.applied = 0;
5459 * Apply flow to E-Switch by sending Netlink message.
5462 * Pointer to Ethernet device.
5463 * @param[in, out] flow
5464 * Pointer to the sub flow.
5466 * Pointer to the error structure.
5469 * 0 on success, a negative errno value otherwise and rte_errno is set.
5472 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5473 struct rte_flow_error *error)
5475 struct priv *priv = dev->data->dev_private;
5476 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5477 struct mlx5_flow *dev_flow;
5478 struct nlmsghdr *nlh;
5480 dev_flow = LIST_FIRST(&flow->dev_flows);
5481 /* E-Switch flow can't be expanded. */
5482 assert(!LIST_NEXT(dev_flow, next));
5483 if (dev_flow->tcf.applied)
5485 nlh = dev_flow->tcf.nlh;
5486 nlh->nlmsg_type = RTM_NEWTFILTER;
5487 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5488 if (dev_flow->tcf.tunnel) {
5490 * Replace the interface index, target for
5491 * encapsulation, source for decapsulation.
5493 assert(!dev_flow->tcf.tunnel->vtep);
5494 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5495 /* Acquire actual VTEP device when rule is being applied. */
5496 dev_flow->tcf.tunnel->vtep =
5497 flow_tcf_vtep_acquire(ctx,
5498 dev_flow->tcf.tunnel->ifindex_org,
5500 if (!dev_flow->tcf.tunnel->vtep)
5502 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5503 dev_flow->tcf.tunnel->vtep->ifindex,
5504 dev_flow->tcf.tunnel->ifindex_org);
5505 *dev_flow->tcf.tunnel->ifindex_ptr =
5506 dev_flow->tcf.tunnel->vtep->ifindex;
5508 if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5509 dev_flow->tcf.applied = 1;
5510 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5511 return 0;
5513 * Rule was applied without the skip_sw flag set.
5514 * We should check whether the rule was actually
5515 * accepted by the hardware (check the in_hw flag).
5517 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5518 flow_tcf_remove(dev, flow);
5519 return rte_flow_error_set
5521 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5522 "netlink: rule has no in_hw flag set");
5526 if (dev_flow->tcf.tunnel) {
5527 /* Rollback the VTEP configuration if rule apply failed. */
5528 assert(dev_flow->tcf.tunnel->vtep);
5529 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5531 dev_flow->tcf.tunnel->vtep = NULL;
5533 return rte_flow_error_set(error, rte_errno,
5534 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5535 "netlink: failed to create TC flow rule");
5539 * Remove flow from E-Switch and release resources of the device flow.
5542 * Pointer to Ethernet device.
5543 * @param[in, out] flow
5544 * Pointer to the sub flow.
5547 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5549 struct mlx5_flow *dev_flow;
5553 flow_tcf_remove(dev, flow);
5554 if (flow->counter) {
5555 if (--flow->counter->ref_cnt == 0) {
5556 rte_free(flow->counter);
5557 flow->counter = NULL;
5560 dev_flow = LIST_FIRST(&flow->dev_flows);
5563 /* E-Switch flow can't be expanded. */
5564 assert(!LIST_NEXT(dev_flow, next));
5565 LIST_REMOVE(dev_flow, next);
5570 * Helper routine for figuring the space size required for a parse buffer.
5573 * array of values to use.
5575 * Current location in array.
5577 * Value to compare with.
5580 * The maximum between the given value and the array value on index.
5583 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5585 return idx < 0 ? (value) : RTE_MAX((array)[idx], value);
5589 * Parse rtnetlink message attributes filling the attribute table with the info
5593 * Attribute table to be filled.
5595 Maximum entry in the attribute table.
5597 * The attributes section in the message to be parsed.
5599 * The length of the attributes section in the message.
5602 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5603 struct rtattr *rta, int len)
5605 unsigned short type;
5606 memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5607 while (RTA_OK(rta, len)) {
5608 type = rta->rta_type;
5609 if (type <= max && !tb[type])
5610 tb[type] = rta;
5611 rta = RTA_NEXT(rta, len);
5616 * Extract flow counters from flower action.
5619 * flower action stats properties in the Netlink message received.
5621 * The backward sequence of rta_types, as written in the attribute table,
5622 that we need to traverse in order to get to the requested object.
5624 * Current location in rta_type table.
5626 * data holding the count statistics of the rte_flow retrieved from
5630 * 0 if data was found and retrieved, -1 otherwise.
5633 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5634 uint16_t rta_type[], int idx,
5635 struct gnet_stats_basic *data)
5637 int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5639 struct rtattr *tbs[tca_stats_max + 1];
5641 if (rta == NULL || idx < 0)
5643 flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5644 RTA_DATA(rta), RTA_PAYLOAD(rta));
5645 switch (rta_type[idx]) {
5646 case TCA_STATS_BASIC:
5647 if (tbs[TCA_STATS_BASIC]) {
5648 memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5649 RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5661 * Parse a single flower action, retrieving the requested action attribute,
5665 * flower action properties in the Netlink message received.
5667 * The backward sequence of rta_types, as written in the attribute table,
5668 that we need to traverse in order to get to the requested object.
5670 * Current location in rta_type table.
5672 * Count statistics retrieved from the message query.
5675 * 0 if data was found and retrieved, -1 otherwise.
5678 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5679 uint16_t rta_type[], int idx, void *data)
5681 int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5682 struct rtattr *tb[tca_act_max + 1];
5684 if (arg == NULL || idx < 0)
5686 flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5687 RTA_DATA(arg), RTA_PAYLOAD(arg));
5688 if (tb[TCA_ACT_KIND] == NULL)
5690 switch (rta_type[idx]) {
5692 if (tb[TCA_ACT_STATS])
5693 return flow_tcf_nl_action_stats_parse_and_get
5696 (struct gnet_stats_basic *)data);
5705 * Parse flower action section in the message retrieving the requested
5706 * attribute from the first action that provides it.
5709 * flower section in the Netlink message received.
5711 * The backward sequence of rta_types, as written in the attribute table,
5712 that we need to traverse in order to get to the requested object.
5714 * Current location in rta_type table.
5716 * data retrieved from the message query.
5719 * 0 if data was found and retrieved, -1 otherwise.
5722 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5723 uint16_t rta_type[], int idx, void *data)
5725 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5728 if (arg == NULL || idx < 0)
5730 flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5731 RTA_DATA(arg), RTA_PAYLOAD(arg));
5732 switch (rta_type[idx]) {
5734 * Flow counters are stored in the actions defined by the flow
5735 * and not in the flow itself; therefore we need to traverse the
5736 * flower chain of actions in search of them.
5738 * Note that the index is not decremented here.
5741 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5743 !flow_tcf_nl_parse_one_action_and_get(tb[i],
5756 * Parse flower classifier options in the message, retrieving the requested
5757 * attribute if found.
5760 * flower section in the Netlink message received.
5762 * The backward sequence of rta_types, as written in the attribute table,
5763 that we need to traverse in order to get to the requested object.
5765 * Current location in rta_type table.
5767 * data retrieved from the message query.
5770 * 0 if data was found and retrieved, -1 otherwise.
5773 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5774 uint16_t rta_type[], int idx, void *data)
5776 int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5778 struct rtattr *tb[tca_flower_max + 1];
5780 if (!opt || idx < 0)
5782 flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5783 RTA_DATA(opt), RTA_PAYLOAD(opt));
5784 switch (rta_type[idx]) {
5785 case TCA_FLOWER_ACT:
5786 if (tb[TCA_FLOWER_ACT])
5787 return flow_tcf_nl_action_parse_and_get
5788 (tb[TCA_FLOWER_ACT],
5789 rta_type, --idx, data);
5798 * Parse Netlink reply on filter query, retrieving the flow counters.
5801 * Message received from Netlink.
5803 * The backward sequence of rta_types, as written in the attribute table,
5804 that we need to traverse in order to get to the requested object.
5806 * Current location in rta_type table.
5808 * data retrieved from the message query.
5811 * 0 if data was found and retrieved, -1 otherwise.
5814 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5815 uint16_t rta_type[], int idx, void *data)
5817 struct nlmsghdr *nlh = cnlh;
5818 struct tcmsg *t = NLMSG_DATA(nlh);
5819 int len = nlh->nlmsg_len;
5820 int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5821 struct rtattr *tb[tca_max + 1];
5825 if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5826 nlh->nlmsg_type != RTM_GETTFILTER &&
5827 nlh->nlmsg_type != RTM_DELTFILTER)
5829 len -= NLMSG_LENGTH(sizeof(*t));
5832 flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5833 /* Not a TC flower flow - bail out */
5834 if (!tb[TCA_KIND] ||
5835 strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5837 switch (rta_type[idx]) {
5839 if (tb[TCA_OPTIONS])
5840 return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
5851 * A callback to parse Netlink reply on TC flower query.
5854 * Message received from Netlink.
5856 * Pointer to data area to be filled by the parsing routine.
5857 Assumed to be a pointer to struct flow_tcf_stats_basic.
5863 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
5866 * The backward sequence of rta_types to pass in order to get
5869 uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
5870 TCA_FLOWER_ACT, TCA_OPTIONS };
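/*
 * The walk starts at the last entry (TCA_OPTIONS) and the index is
 * decremented at every nesting level down to TCA_STATS_BASIC.
 */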
5871 struct flow_tcf_stats_basic *sb_data = data;
5873 const struct nlmsghdr *c;
5874 struct nlmsghdr *nc;
5875 } tnlh = { .c = nlh };
5877 if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
5878 RTE_DIM(rta_type) - 1,
5879 (void *)&sb_data->counters))
5880 sb_data->valid = true;
5885 * Query a TC flower rule for its statistics via netlink.
5888 * Pointer to Ethernet device.
5890 * Pointer to the sub flow.
5892 * data retrieved by the query.
5894 * Perform verbose error reporting if not NULL.
5897 * 0 on success, a negative errno value otherwise and rte_errno is set.
5900 flow_tcf_query_count(struct rte_eth_dev *dev,
5901 struct rte_flow *flow,
5903 struct rte_flow_error *error)
5905 struct flow_tcf_stats_basic sb_data;
5906 struct rte_flow_query_count *qc = data;
5907 struct priv *priv = dev->data->dev_private;
5908 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5909 struct mnl_socket *nl = ctx->nl;
5910 struct mlx5_flow *dev_flow;
5911 struct nlmsghdr *nlh;
5912 uint32_t seq = priv->tcf_context->seq++;
5916 memset(&sb_data, 0, sizeof(sb_data));
5917 dev_flow = LIST_FIRST(&flow->dev_flows);
5918 /* E-Switch flow can't be expanded. */
5919 assert(!LIST_NEXT(dev_flow, next));
5920 if (!dev_flow->flow->counter)
5922 nlh = dev_flow->tcf.nlh;
5923 nlh->nlmsg_type = RTM_GETTFILTER;
5924 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
5925 nlh->nlmsg_seq = seq;
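/*
 * The rule's own Netlink message is reused as a query; the kernel replies
 * with the filter and its statistics, which are parsed by
 * flow_tcf_nl_message_get_stats_basic().
 */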
5926 if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
5929 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
5932 ret = mnl_cb_run(ctx->buf, ret, seq,
5933 mnl_socket_get_portid(nl),
5934 flow_tcf_nl_message_get_stats_basic,
5938 if (sb_data.valid) {
5939 /* Return the delta from last reset. */
5942 qc->hits = sb_data.counters.packets - flow->counter->hits;
5943 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
5945 flow->counter->hits = sb_data.counters.packets;
5946 flow->counter->bytes = sb_data.counters.bytes;
5950 return rte_flow_error_set(error, EINVAL,
5951 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5953 "flow does not have counter");
5955 return rte_flow_error_set
5956 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5957 NULL, "netlink: failed to read flow rule counters");
5959 return rte_flow_error_set
5960 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
5961 NULL, "counters are not available.");
5967 * @see rte_flow_query()
5971 flow_tcf_query(struct rte_eth_dev *dev,
5972 struct rte_flow *flow,
5973 const struct rte_flow_action *actions,
5975 struct rte_flow_error *error)
5979 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
5980 switch (actions->type) {
5981 case RTE_FLOW_ACTION_TYPE_VOID:
5983 case RTE_FLOW_ACTION_TYPE_COUNT:
5984 ret = flow_tcf_query_count(dev, flow, data, error);
5987 return rte_flow_error_set(error, ENOTSUP,
5988 RTE_FLOW_ERROR_TYPE_ACTION,
5990 "action not supported");
5996 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
5997 .validate = flow_tcf_validate,
5998 .prepare = flow_tcf_prepare,
5999 .translate = flow_tcf_translate,
6000 .apply = flow_tcf_apply,
6001 .remove = flow_tcf_remove,
6002 .destroy = flow_tcf_destroy,
6003 .query = flow_tcf_query,
6007 * Create and configure a libmnl socket for Netlink flow rules.
6010 * A valid libmnl socket object pointer on success, NULL otherwise and
6013 static struct mnl_socket *
6014 flow_tcf_mnl_socket_create(void)
6016 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6019 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6021 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6026 mnl_socket_close(nl);
6031 * Destroy a libmnl socket.
6034 * Libmnl socket of the @p NETLINK_ROUTE kind.
6037 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6040 mnl_socket_close(nl);
6044 * Initialize ingress qdisc of a given network interface.
6047 * Pointer to tc-flower context to use.
6049 * Index of network interface to initialize.
6051 * Perform verbose error reporting if not NULL.
6054 * 0 on success, a negative errno value otherwise and rte_errno is set.
6057 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6058 unsigned int ifindex, struct rte_flow_error *error)
6060 struct nlmsghdr *nlh;
6062 alignas(struct nlmsghdr)
6063 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6064 SZ_NLATTR_STRZ_OF("ingress") +
6065 MNL_BUF_EXTRA_SPACE];
6067 /* Destroy existing ingress qdisc and everything attached to it. */
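/* Roughly equivalent to: tc qdisc del dev <ifname> ingress */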
6068 nlh = mnl_nlmsg_put_header(buf);
6069 nlh->nlmsg_type = RTM_DELQDISC;
6070 nlh->nlmsg_flags = NLM_F_REQUEST;
6071 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6072 tcm->tcm_family = AF_UNSPEC;
6073 tcm->tcm_ifindex = ifindex;
6074 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6075 tcm->tcm_parent = TC_H_INGRESS;
6076 assert(sizeof(buf) >= nlh->nlmsg_len);
6077 /* Ignore errors when qdisc is already absent. */
6078 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6079 rte_errno != EINVAL && rte_errno != ENOENT)
6080 return rte_flow_error_set(error, rte_errno,
6081 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6082 "netlink: failed to remove ingress"
6084 /* Create fresh ingress qdisc. */
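/* Roughly equivalent to: tc qdisc add dev <ifname> ingress */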
6085 nlh = mnl_nlmsg_put_header(buf);
6086 nlh->nlmsg_type = RTM_NEWQDISC;
6087 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6088 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6089 tcm->tcm_family = AF_UNSPEC;
6090 tcm->tcm_ifindex = ifindex;
6091 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6092 tcm->tcm_parent = TC_H_INGRESS;
6093 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6094 assert(sizeof(buf) >= nlh->nlmsg_len);
6095 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6096 return rte_flow_error_set(error, rte_errno,
6097 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6098 "netlink: failed to create ingress"
6104 * Create libmnl context for Netlink flow rules.
6107 * A valid tcf context object pointer on success, NULL otherwise and
6110 struct mlx5_flow_tcf_context *
6111 mlx5_flow_tcf_context_create(void)
6113 struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6118 ctx->nl = flow_tcf_mnl_socket_create();
6121 ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6122 ctx->buf = rte_zmalloc(__func__,
6123 ctx->buf_size, sizeof(uint32_t));
6126 ctx->seq = random();
6129 mlx5_flow_tcf_context_destroy(ctx);
6134 * Destroy a libmnl context.
6137 * Context object initialized by mlx5_flow_tcf_context_create().
6140 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6144 flow_tcf_mnl_socket_destroy(ctx->nl);