1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
23 #include <sys/socket.h>
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
36 #ifdef HAVE_TC_ACT_VLAN
38 #include <linux/tc_act/tc_vlan.h>
40 #else /* HAVE_TC_ACT_VLAN */
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
56 #endif /* HAVE_TC_ACT_VLAN */
58 #ifdef HAVE_TC_ACT_PEDIT
60 #include <linux/tc_act/tc_pedit.h>
62 #else /* HAVE_TC_ACT_PEDIT */
/* Local fallback pedit definitions, used when the system header above is not included. */
76 TCA_PEDIT_KEY_EX_HTYPE = 1,
77 TCA_PEDIT_KEY_EX_CMD = 2,
78 __TCA_PEDIT_KEY_EX_MAX
81 enum pedit_header_type {
82 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83 TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84 TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85 TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86 TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87 TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
92 TCA_PEDIT_KEY_EX_CMD_SET = 0,
93 TCA_PEDIT_KEY_EX_CMD_ADD = 1,
100 __u32 off; /* offset */
107 struct tc_pedit_sel {
111 struct tc_pedit_key keys[0];
114 #endif /* HAVE_TC_ACT_PEDIT */
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
118 #include <linux/tc_act/tc_tunnel_key.h>
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
152 struct tc_tunnel_key {
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
159 /* Normally found in linux/netlink.h. */
160 #ifndef NETLINK_CAP_ACK
161 #define NETLINK_CAP_ACK 10
164 /* Normally found in linux/pkt_sched.h. */
165 #ifndef TC_H_MIN_INGRESS
166 #define TC_H_MIN_INGRESS 0xfff2u
169 /* Normally found in linux/pkt_cls.h. */
170 #ifndef TCA_CLS_FLAGS_SKIP_SW
171 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
173 #ifndef TCA_CLS_FLAGS_IN_HW
174 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
176 #ifndef HAVE_TCA_CHAIN
179 #ifndef HAVE_TCA_FLOWER_ACT
180 #define TCA_FLOWER_ACT 3
182 #ifndef HAVE_TCA_FLOWER_FLAGS
183 #define TCA_FLOWER_FLAGS 22
185 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
186 #define TCA_FLOWER_KEY_ETH_TYPE 8
188 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
189 #define TCA_FLOWER_KEY_ETH_DST 4
191 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
192 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
194 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
195 #define TCA_FLOWER_KEY_ETH_SRC 6
197 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
198 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
200 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
201 #define TCA_FLOWER_KEY_IP_PROTO 9
203 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
204 #define TCA_FLOWER_KEY_IPV4_SRC 10
206 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
207 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
209 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
210 #define TCA_FLOWER_KEY_IPV4_DST 12
212 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
213 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
215 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
216 #define TCA_FLOWER_KEY_IPV6_SRC 14
218 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
219 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
221 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
222 #define TCA_FLOWER_KEY_IPV6_DST 16
224 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
225 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
227 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
228 #define TCA_FLOWER_KEY_TCP_SRC 18
230 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
231 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
233 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
234 #define TCA_FLOWER_KEY_TCP_DST 19
236 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
237 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
239 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
240 #define TCA_FLOWER_KEY_UDP_SRC 20
242 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
243 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
245 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
246 #define TCA_FLOWER_KEY_UDP_DST 21
248 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
249 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
251 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
252 #define TCA_FLOWER_KEY_VLAN_ID 23
254 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
255 #define TCA_FLOWER_KEY_VLAN_PRIO 24
257 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
258 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
260 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
261 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
263 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
264 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
266 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
267 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
269 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
270 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
272 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
273 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
275 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
276 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
278 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
279 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
281 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
282 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
284 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
285 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
287 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
288 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
290 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
291 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
293 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
294 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
296 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
297 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
299 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
300 #define TCA_FLOWER_KEY_TCP_FLAGS 71
302 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
303 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
305 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
306 #define TCA_FLOWER_KEY_IP_TOS 73
308 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
309 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
311 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
312 #define TCA_FLOWER_KEY_IP_TTL 75
314 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
315 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
317 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
318 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
320 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
321 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
323 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
324 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
326 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
327 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
330 #ifndef HAVE_TC_ACT_GOTO_CHAIN
331 #define TC_ACT_GOTO_CHAIN 0x20000000
334 #ifndef IPV6_ADDR_LEN
335 #define IPV6_ADDR_LEN 16
338 #ifndef IPV4_ADDR_LEN
339 #define IPV4_ADDR_LEN 4
343 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
350 #ifndef TCA_ACT_MAX_PRIO
351 #define TCA_ACT_MAX_PRIO 32
354 /** Parameters of VXLAN devices created by driver. */
355 #define MLX5_VXLAN_DEFAULT_VNI 1
356 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
358 /** Tunnel action type, used for @p type in header structure. */
359 enum flow_tcf_tunact_type {
360 FLOW_TCF_TUNACT_VXLAN_DECAP,
361 FLOW_TCF_TUNACT_VXLAN_ENCAP,
364 /** Flags used for @p mask in tunnel action encap descriptors. */
365 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
366 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
367 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
368 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
369 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
370 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
371 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
372 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
373 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
374 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
375 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
378 * Structure for holding netlink context.
379 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
380 * Using this (8KB) buffer size ensures that netlink messages will never be
383 struct mlx5_flow_tcf_context {
384 struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
385 uint32_t seq; /* Message sequence number. */
386 uint32_t buf_size; /* Message buffer size. */
387 uint8_t *buf; /* Message buffer. */
391 * Neigh rule structure. The neigh rule is applied via Netlink to
392 * outer tunnel iface in order to provide destination MAC address
393 * for the VXLAN encapsulation. The neigh rule is implicitly related
394 * to the Flow itself and can be shared by multiple Flows.
396 struct tcf_neigh_rule {
397 LIST_ENTRY(tcf_neigh_rule) next;
399 struct ether_addr eth;
406 uint8_t dst[IPV6_ADDR_LEN];
412 * Local rule structure. The local rule is applied via Netlink to
413 * outer tunnel iface in order to provide local and peer IP addresses
414 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
415 * related to the Flow itself and can be shared by multiple Flows.
417 struct tcf_local_rule {
418 LIST_ENTRY(tcf_local_rule) next;
427 uint8_t dst[IPV6_ADDR_LEN];
428 uint8_t src[IPV6_ADDR_LEN];
433 /** Outer interface VXLAN encapsulation rules container. */
435 LIST_ENTRY(tcf_irule) next;
436 LIST_HEAD(, tcf_neigh_rule) neigh;
437 LIST_HEAD(, tcf_local_rule) local;
439 unsigned int ifouter; /**< Own interface index. */
442 /** VXLAN virtual netdev. */
444 LIST_ENTRY(tcf_vtep) next;
446 unsigned int ifindex; /**< Own interface index. */
451 /** Tunnel descriptor header, common for all tunnel types. */
452 struct flow_tcf_tunnel_hdr {
453 uint32_t type; /**< Tunnel action type. */
454 struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
455 unsigned int ifindex_org; /**< Original dst/src interface */
456 unsigned int *ifindex_ptr; /**< Interface ptr in message. */
459 struct flow_tcf_vxlan_decap {
460 struct flow_tcf_tunnel_hdr hdr;
464 struct flow_tcf_vxlan_encap {
465 struct flow_tcf_tunnel_hdr hdr;
466 struct tcf_irule *iface;
471 struct ether_addr dst;
472 struct ether_addr src;
480 uint8_t dst[IPV6_ADDR_LEN];
481 uint8_t src[IPV6_ADDR_LEN];
493 /** Structure used when extracting the values of a flow counters
494 * from a netlink message.
496 struct flow_tcf_stats_basic {
498 struct gnet_stats_basic counters;
501 /** Empty masks for known item types. */
503 struct rte_flow_item_port_id port_id;
504 struct rte_flow_item_eth eth;
505 struct rte_flow_item_vlan vlan;
506 struct rte_flow_item_ipv4 ipv4;
507 struct rte_flow_item_ipv6 ipv6;
508 struct rte_flow_item_tcp tcp;
509 struct rte_flow_item_udp udp;
510 struct rte_flow_item_vxlan vxlan;
511 } flow_tcf_mask_empty = {
515 /** Supported masks for known item types. */
516 static const struct {
517 struct rte_flow_item_port_id port_id;
518 struct rte_flow_item_eth eth;
519 struct rte_flow_item_vlan vlan;
520 struct rte_flow_item_ipv4 ipv4;
521 struct rte_flow_item_ipv6 ipv6;
522 struct rte_flow_item_tcp tcp;
523 struct rte_flow_item_udp udp;
524 struct rte_flow_item_vxlan vxlan;
525 } flow_tcf_mask_supported = {
530 .type = RTE_BE16(0xffff),
531 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
532 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
535 /* PCP and VID only, no DEI. */
536 .tci = RTE_BE16(0xefff),
537 .inner_type = RTE_BE16(0xffff),
540 .next_proto_id = 0xff,
541 .time_to_live = 0xff,
542 .type_of_service = 0xff,
543 .src_addr = RTE_BE32(0xffffffff),
544 .dst_addr = RTE_BE32(0xffffffff),
548 .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_FL_SHIFT),
551 "\xff\xff\xff\xff\xff\xff\xff\xff"
552 "\xff\xff\xff\xff\xff\xff\xff\xff",
554 "\xff\xff\xff\xff\xff\xff\xff\xff"
555 "\xff\xff\xff\xff\xff\xff\xff\xff",
558 .src_port = RTE_BE16(0xffff),
559 .dst_port = RTE_BE16(0xffff),
563 .src_port = RTE_BE16(0xffff),
564 .dst_port = RTE_BE16(0xffff),
567 .vni = "\xff\xff\xff",
/* Netlink attribute size helpers; sizes are aligned with MNL_ALIGN(). */
/* Aligned size of a bare attribute header. */
571 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
/* A nest is accounted as a header only. */
572 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
/* Attribute header followed by len bytes of payload. */
573 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
/* Attribute whose payload is one object of type typ. */
574 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
/* Attribute whose payload is the string str plus its terminating NUL. */
575 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
577 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
579 /** DPDK port to network interface index (ifindex) conversion. */
580 struct flow_tcf_ptoi {
581 uint16_t port_id; /**< DPDK port ID. */
582 unsigned int ifindex; /**< Network interface index. */
585 /* Due to a limitation on driver/FW. */
586 #define MLX5_TCF_GROUP_ID_MAX 3
589 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
590 * Priority in rte_flow attribute starts from 0 and is incremented by 1 in
591 * translation. This is subject to be changed to determine the max priority
592 * based on trial-and-error like Verbs driver once the restriction is lifted or
593 * the range is extended.
595 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
597 #define MLX5_TCF_FATE_ACTIONS \
598 (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
599 MLX5_FLOW_ACTION_JUMP)
601 #define MLX5_TCF_VLAN_ACTIONS \
602 (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
603 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
605 #define MLX5_TCF_VXLAN_ACTIONS \
606 (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
608 #define MLX5_TCF_PEDIT_ACTIONS \
609 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
610 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
611 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
612 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
613 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
615 #define MLX5_TCF_CONFIG_ACTIONS \
616 (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
617 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
618 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
619 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
/* Capacity of the keys[] and keys_ex[] arrays in struct pedit_parser. */
621 #define MAX_PEDIT_KEYS 128
/* Number of bytes carried by one pedit key value. */
622 #define SZ_PEDIT_KEY_VAL 4
/* Number of pedit keys needed to cover sz bytes, rounded up to whole keys. */
624 #define NUM_OF_PEDIT_KEYS(sz) \
625 (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
627 struct pedit_key_ex {
628 enum pedit_header_type htype;
632 struct pedit_parser {
633 struct tc_pedit_sel sel;
634 struct tc_pedit_key keys[MAX_PEDIT_KEYS];
635 struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
639 * Create space for using the implicitly created TC flow counter.
642 * Pointer to the Ethernet device structure.
645 * A pointer to the counter data structure, NULL otherwise and
648 static struct mlx5_flow_counter *
649 flow_tcf_counter_new(void)
651 struct mlx5_flow_counter *cnt;
654 * eswitch counter cannot be shared and its id is unknown.
655 * currently returning all with id 0.
656 * in the future maybe better to switch to unique numbers.
658 struct mlx5_flow_counter tmpl = {
/* Reserve storage for exactly one counter object; the allocation is labelled with this function's name. */
661 cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
667 /* Implicit counter, do not add to list. */
672 * Set pedit key of MAC address
675 * pointer to action specification
676 * @param[in,out] p_parser
677 * pointer to pedit_parser
680 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
681 struct pedit_parser *p_parser)
/* New keys are written after the ones already counted in the selector. */
683 int idx = p_parser->sel.nkeys;
/* Offset of the MAC field in the Ethernet header: source for SET_MAC_SRC, destination for any other action type. */
684 uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
685 offsetof(struct ether_hdr, s_addr) :
686 offsetof(struct ether_hdr, d_addr);
687 const struct rte_flow_action_set_mac *conf =
688 (const struct rte_flow_action_set_mac *)actions->conf;
/*
 * First key: the leading SZ_PEDIT_KEY_VAL bytes of the address.
 * The mask ~UINT32_MAX evaluates to zero; presumably a zero mask means the
 * whole 32-bit word is replaced -- confirm against tc-pedit mask semantics.
 */
690 p_parser->keys[idx].off = off;
691 p_parser->keys[idx].mask = ~UINT32_MAX;
692 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
693 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
694 memcpy(&p_parser->keys[idx].val,
695 conf->mac_addr, SZ_PEDIT_KEY_VAL);
/*
 * Second key: the remaining ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL bytes of the
 * address, located SZ_PEDIT_KEY_VAL bytes further into the header.
 * NOTE(review): an increment of idx is expected between the two keys so the
 * first one is not overwritten -- confirm it is present.
 */
697 p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
698 p_parser->keys[idx].mask = 0xFFFF0000;
699 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
700 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
701 memcpy(&p_parser->keys[idx].val,
702 conf->mac_addr + SZ_PEDIT_KEY_VAL,
703 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
/* Store the new key count back into the selector. */
704 p_parser->sel.nkeys = (++idx);
708 * Set pedit key of decrease/set ttl
711 * pointer to action specification
712 * @param[in,out] p_parser
713 * pointer to pedit_parser
714 * @param[in] item_flags
715 * flags of all items presented
718 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
719 struct pedit_parser *p_parser,
/* The single key produced here is written at the current end of the key list. */
722 int idx = p_parser->sel.nkeys;
/* NOTE(review): mask 0xFFFFFF00 presumably restricts the rewrite to one byte of the 32-bit word -- confirm against tc-pedit mask semantics. */
724 p_parser->keys[idx].mask = 0xFFFFFF00;
/* Header type and field offset follow the outer L3 layer flagged in item_flags: IPv4 TTL or IPv6 hop limit. */
725 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
726 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
727 p_parser->keys[idx].off =
728 offsetof(struct ipv4_hdr, time_to_live);
730 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
731 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
732 p_parser->keys[idx].off =
733 offsetof(struct ipv6_hdr, hop_limits);
/* DEC_TTL is encoded as an ADD command with value 0xFF (i.e. -1 modulo 256 on a one-byte field). */
735 if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
736 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
737 p_parser->keys[idx].val = 0x000000FF;
/* Otherwise the value is SET from ttl_value in the action configuration. */
739 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
740 p_parser->keys[idx].val =
741 (__u32)((const struct rte_flow_action_set_ttl *)
742 actions->conf)->ttl_value;
/* Account for the key just written. */
744 p_parser->sel.nkeys = (++idx);
748 * Set pedit key of transport (TCP/UDP) port value
751 * pointer to action specification
752 * @param[in,out] p_parser
753 * pointer to pedit_parser
754 * @param[in] item_flags
755 * flags of all items presented
758 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
759 struct pedit_parser *p_parser,
/* The single key produced here is written at the current end of the key list. */
762 int idx = p_parser->sel.nkeys;
/* Header type comes from the outer L4 flags; the TCP assignment runs last, so it wins if both flags are set. */
764 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
765 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
766 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
767 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
768 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
769 /* offset of src/dst port is same for TCP and UDP */
770 p_parser->keys[idx].off =
771 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
772 offsetof(struct tcp_hdr, src_port) :
773 offsetof(struct tcp_hdr, dst_port);
/* NOTE(review): mask 0xFFFF0000 presumably limits the rewrite to the 16-bit port within the 32-bit word -- confirm against tc-pedit mask semantics. */
774 p_parser->keys[idx].mask = 0xFFFF0000;
/* The port value is taken as-is from the action configuration. */
775 p_parser->keys[idx].val =
776 (__u32)((const struct rte_flow_action_set_tp *)
777 actions->conf)->port;
/* Account for the key just written. */
778 p_parser->sel.nkeys = (++idx);
782 * Set pedit key of ipv6 address
785 * pointer to action specification
786 * @param[in,out] p_parser
787 * pointer to pedit_parser
790 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
791 struct pedit_parser *p_parser)
/* Keys are appended starting at the current end of the key list. */
793 int idx = p_parser->sel.nkeys;
/* Number of SZ_PEDIT_KEY_VAL-byte keys needed to cover an IPV6_ADDR_LEN-byte address. */
794 int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
796 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
797 offsetof(struct ipv6_hdr, src_addr) :
798 offsetof(struct ipv6_hdr, dst_addr);
799 const struct rte_flow_action_set_ipv6 *conf =
800 (const struct rte_flow_action_set_ipv6 *)actions->conf;
/* One SET key per SZ_PEDIT_KEY_VAL-byte chunk of the address, at consecutive header offsets. */
802 for (int i = 0; i < keys; i++, idx++) {
803 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
804 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
805 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
806 p_parser->keys[idx].mask = ~UINT32_MAX;
807 memcpy(&p_parser->keys[idx].val,
808 conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
/* Account for all keys written by the loop. */
811 p_parser->sel.nkeys += keys;
815 * Set pedit key of ipv4 address
818 * pointer to action specification
819 * @param[in,out] p_parser
820 * pointer to pedit_parser
823 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
824 struct pedit_parser *p_parser)
/* The single key produced here is written at the current end of the key list. */
826 int idx = p_parser->sel.nkeys;
828 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
829 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
/* Source address offset for SET_IPV4_SRC, destination address offset for any other action type. */
830 p_parser->keys[idx].off =
831 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
832 offsetof(struct ipv4_hdr, src_addr) :
833 offsetof(struct ipv4_hdr, dst_addr);
/* ~UINT32_MAX evaluates to zero. */
834 p_parser->keys[idx].mask = ~UINT32_MAX;
/* The address is taken as-is from the action configuration. */
835 p_parser->keys[idx].val =
836 ((const struct rte_flow_action_set_ipv4 *)
837 actions->conf)->ipv4_addr;
/* Account for the key just written. */
838 p_parser->sel.nkeys = (++idx);
842 * Create the pedit netlink attributes in the netlink message,
843 * using the pre-allocated message buffer
846 * pointer to pre-allocated netlink message buffer
847 * @param[in,out] actions
848 * pointer to pointer of actions specification.
849 * @param[in,out] action_flags
850 * pointer to actions flags
851 * @param[in] item_flags
852 * flags of all items presented
855 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
856 const struct rte_flow_action **actions,
859 struct pedit_parser p_parser;
860 struct nlattr *na_act_options;
861 struct nlattr *na_pedit_keys;
/* Start from an empty selector and key set. */
863 memset(&p_parser, 0, sizeof(p_parser));
/* Name the action kind and open the nested options attribute. */
864 mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
865 na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
866 /* All modify-header actions should be in one tc-pedit action. */
867 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
/* Each supported action type appends its keys to p_parser through the matching helper. */
868 switch ((*actions)->type) {
869 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
870 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
871 flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
873 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
874 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
875 flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
877 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
878 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
879 flow_tcf_pedit_key_set_tp_port(*actions,
880 &p_parser, item_flags);
882 case RTE_FLOW_ACTION_TYPE_SET_TTL:
883 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
884 flow_tcf_pedit_key_set_dec_ttl(*actions,
885 &p_parser, item_flags);
887 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
888 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
889 flow_tcf_pedit_key_set_mac(*actions, &p_parser);
892 goto pedit_mnl_msg_done;
/* The TCA_PEDIT_PARMS_EX payload size is the selector plus sel.nkeys key entries. */
896 p_parser.sel.action = TC_ACT_PIPE;
897 mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
898 sizeof(p_parser.sel) +
899 p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
902 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
/* One nested TCA_PEDIT_KEY_EX per key, carrying its header type and command as 16-bit attributes. */
903 for (int i = 0; i < p_parser.sel.nkeys; i++) {
904 struct nlattr *na_pedit_key =
905 mnl_attr_nest_start(nl,
906 TCA_PEDIT_KEY_EX | NLA_F_NESTED);
907 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
908 p_parser.keys_ex[i].htype);
909 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
910 p_parser.keys_ex[i].cmd);
911 mnl_attr_nest_end(nl, na_pedit_key);
/* Close the keys nest, then the enclosing options nest. */
913 mnl_attr_nest_end(nl, na_pedit_keys);
914 mnl_attr_nest_end(nl, na_act_options);
919 * Calculate max memory size of one TC-pedit action.
920 * One TC-pedit action can contain set of keys each defining
921 * a rewrite element (rte_flow action)
923 * @param[in,out] actions
924 * actions specification.
925 * @param[in,out] action_flags
927 * @param[in,out] size
930 * Max memory size of one TC-pedit action
933 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
934 uint64_t *action_flags)
/* Fixed part: action index nest, the "pedit" kind string and the options nest. */
940 pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
941 SZ_NLATTR_STRZ_OF("pedit") +
942 SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
/* Walk the action list, counting the keys each rewrite action needs and recording its flag. */
943 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
944 switch ((*actions)->type) {
945 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
946 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
947 flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
949 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
950 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
951 flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
953 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
954 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
955 flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
957 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
958 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
959 flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
961 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
962 /* TCP is the same as UDP. */
963 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
964 flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
966 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
967 /* TCP is the same as UDP. */
968 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
969 flags |= MLX5_FLOW_ACTION_SET_TP_DST;
971 case RTE_FLOW_ACTION_TYPE_SET_TTL:
972 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
973 flags |= MLX5_FLOW_ACTION_SET_TTL;
975 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
976 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
977 flags |= MLX5_FLOW_ACTION_DEC_TTL;
979 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
980 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
981 flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
983 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
984 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
985 flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
/* Jump out of the scan to the size computation below. */
988 goto get_pedit_action_size_done;
991 get_pedit_action_size_done:
992 /* TCA_PEDIT_PARMS_EX */
994 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
995 keys * sizeof(struct tc_pedit_key));
996 pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
998 /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
999 (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
1000 SZ_NLATTR_DATA_OF(2));
/* Merge the flags gathered during the scan into the caller's flag word. */
1001 (*action_flags) |= flags;
1007 * Retrieve mask for pattern item.
1009 * This function does basic sanity checks on a pattern item in order to
1010 * return the most appropriate mask for it.
1013 * Item specification.
1014 * @param[in] mask_default
1015 * Default mask for pattern item as specified by the flow API.
1016 * @param[in] mask_supported
1017 * Mask fields supported by the implementation.
1018 * @param[in] mask_empty
1019 * Empty mask to return when there is no specification.
1021 * Perform verbose error reporting if not NULL.
1024 * Either @p item->mask or one of the mask parameters on success, NULL
1025 * otherwise and rte_errno is set.
1028 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1029 const void *mask_supported, const void *mask_empty,
1030 size_t mask_size, struct rte_flow_error *error)
1032 const uint8_t *mask;
1035 /* item->last and item->mask cannot exist without item->spec. */
1036 if (!item->spec && (item->mask || item->last)) {
1037 rte_flow_error_set(error, EINVAL,
1038 RTE_FLOW_ERROR_TYPE_ITEM, item,
1039 "\"mask\" or \"last\" field provided without"
1040 " a corresponding \"spec\"");
1043 /* No spec, no mask, no problem. */
/* Use the item's own mask when it has one, otherwise the flow API default. */
1046 mask = item->mask ? item->mask : mask_default;
1049 * Single-pass check to make sure that:
1050 * - Mask is supported, no bits are set outside mask_supported.
1051 * - Both item->spec and item->last are included in mask.
1053 for (i = 0; i != mask_size; ++i) {
/* OR-ing in a bit that mask_supported lacks changes the byte, which flags an unsupported field. */
1056 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1057 ((const uint8_t *)mask_supported)[i]) {
1058 rte_flow_error_set(error, ENOTSUP,
1059 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1060 "unsupported field found"
1065 (((const uint8_t *)item->spec)[i] & mask[i]) !=
1066 (((const uint8_t *)item->last)[i] & mask[i])) {
1067 rte_flow_error_set(error, EINVAL,
1068 RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1070 "range between \"spec\" and \"last\""
1071 " not comprised in \"mask\"");
1079 * Build a conversion table between port ID and ifindex.
1082 * Pointer to Ethernet device.
1084 * Pointer to ptoi table.
1086 * Size of ptoi table provided.
1089 * Size of ptoi table filled.
1092 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
/* First call passes no output array; its result sizes the local port_id[] array (one extra slot is reserved). */
/* NOTE(review): presumably this returns the number of ports sharing dev->device -- confirm against mlx5_dev_to_port_id(). */
1095 unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1096 uint16_t port_id[n + 1];
1098 unsigned int own = 0;
1100 /* At least one port is needed when no switch domain is present. */
1103 port_id[0] = dev->data->port_id;
/* Second call fills port_id[]; the count is clamped so it never exceeds the value obtained above. */
1105 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
/* Record the port ID and the interface index reported by rte_eth_dev_info_get() for each port. */
1109 for (i = 0; i != n; ++i) {
1110 struct rte_eth_dev_info dev_info;
1112 rte_eth_dev_info_get(port_id[i], &dev_info);
1113 if (port_id[i] == dev->data->port_id)
1115 ptoi[i].port_id = port_id[i];
1116 ptoi[i].ifindex = dev_info.if_index;
1118 /* Ensure first entry of ptoi[] is the current device. */
1121 ptoi[0] = ptoi[own];
1122 ptoi[own] = ptoi[n];
1124 /* An entry with zero ifindex terminates ptoi[]. */
1125 ptoi[n].port_id = 0;
1126 ptoi[n].ifindex = 0;
1131 * Verify the @p attr will be correctly understood by the E-switch.
1134 * Pointer to flow attributes
1136 * Pointer to error structure.
1139 * 0 on success, a negative errno value otherwise and rte_errno is set.
1142 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1143 struct rte_flow_error *error)
1146 * Supported attributes: groups, some priorities and ingress only.
1147 * group is supported only if kernel supports chain. Don't care about
1148 * transfer as it is the caller's problem.
/* The group limit is checked before the priority limit; the first one exceeded is reported as ENOTSUP. */
1150 if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1151 return rte_flow_error_set(error, ENOTSUP,
1152 RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1153 "group ID larger than "
1154 RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1155 " isn't supported");
1156 else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1157 return rte_flow_error_set(error, ENOTSUP,
1158 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1160 "priority more than "
1161 RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1162 " is not supported");
/* Direction errors use different codes: EINVAL for "only ingress is supported", ENOTSUP for "egress is not supported". */
1164 return rte_flow_error_set(error, EINVAL,
1165 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1166 attr, "only ingress is supported");
1168 return rte_flow_error_set(error, ENOTSUP,
1169 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1170 attr, "egress is not supported");
1175 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1176 * The routine checks the L2 fields to be used in encapsulation header.
1179 * Pointer to the item structure.
1181 * Pointer to the error structure.
1184 * 0 on success, a negative errno value otherwise and rte_errno is set.
1187 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1188 struct rte_flow_error *error)
1190 const struct rte_flow_item_eth *spec = item->spec;
1191 const struct rte_flow_item_eth *mask = item->mask;
1195 * Specification for L2 addresses can be empty
1196 * because these ones are optional and not
1197 * required directly by tc rule. Kernel tries
1198 * to resolve these ones on its own
1203 /* If mask is not specified use the default one. */
1204 mask = &rte_flow_item_eth_mask;
/* Destination MAC mask: either all-zero (field unused) or identical to the default mask; anything else is a partial mask. */
1206 if (memcmp(&mask->dst,
1207 &flow_tcf_mask_empty.eth.dst,
1208 sizeof(flow_tcf_mask_empty.eth.dst))) {
1209 if (memcmp(&mask->dst,
1210 &rte_flow_item_eth_mask.dst,
1211 sizeof(rte_flow_item_eth_mask.dst)))
1212 return rte_flow_error_set
1214 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1215 "no support for partial mask on"
1216 " \"eth.dst\" field");
/* Source MAC mask: same rule as for the destination. */
1218 if (memcmp(&mask->src,
1219 &flow_tcf_mask_empty.eth.src,
1220 sizeof(flow_tcf_mask_empty.eth.src))) {
1221 if (memcmp(&mask->src,
1222 &rte_flow_item_eth_mask.src,
1223 sizeof(rte_flow_item_eth_mask.src)))
1224 return rte_flow_error_set
1226 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1227 "no support for partial mask on"
1228 " \"eth.type\" field");
/* EtherType mask: only 0x0000 or 0xffff pass the partial-mask check. */
1230 if (mask->type != RTE_BE16(0x0000)) {
1231 if (mask->type != RTE_BE16(0xffff))
1232 return rte_flow_error_set
1234 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1235 "no support for partial mask on"
1236 " \"eth.type\" field");
1238 "outer ethernet type field"
1239 " cannot be forced for vxlan"
1240 " encapsulation, parameter ignored");
1246 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1247 * The routine checks the IPv4 fields to be used in encapsulation header.
1250 * Pointer to the item structure.
1252 * Pointer to the error structure.
1255 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Outer IPv4 item check for a VXLAN_ENCAP definition.
 *
 * Visible behaviour:
 *  - an EINVAL error ("NULL outer ipv4 address specification ...") exists
 *    for the case where the item carries no specification;
 *  - mask is pointed at rte_flow_item_ipv4_mask (presumably only when
 *    item->mask is NULL - the guard is not visible here);
 *  - hdr.dst_addr and hdr.src_addr: a non-zero mask other than
 *    0xffffffff is reported as an unsupported partial mask, and a
 *    separate EINVAL error states that the address "must be specified";
 *  - hdr.type_of_service and hdr.time_to_live: a non-zero mask other
 *    than 0xff is rejected with ENOTSUP.
 *
 * NOTE(review): the short lines of this routine (braces, the conditions
 * in front of the EINVAL returns, the errno argument of the partial-mask
 * address errors, the final return) are missing from this excerpt, so the
 * exact branch layout must be confirmed against the complete file.
 */
1258 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1259 struct rte_flow_error *error)
1261 const struct rte_flow_item_ipv4 *spec = item->spec;
1262 const struct rte_flow_item_ipv4 *mask = item->mask;
1266 * Specification for IP addresses cannot be empty
1267 * because it is required by tunnel_key parameter.
1269 return rte_flow_error_set(error, EINVAL,
1270 RTE_FLOW_ERROR_TYPE_ITEM, item,
1271 "NULL outer ipv4 address"
1272 " specification for vxlan"
1276 mask = &rte_flow_item_ipv4_mask;
/* Destination address: only an all-ones mask passes the partial-mask test. */
1277 if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1278 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1279 return rte_flow_error_set
1281 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1282 "no support for partial mask on"
1283 " \"ipv4.hdr.dst_addr\" field"
1284 " for vxlan encapsulation");
1285 /* More IPv4 address validations can be put here. */
1288 * Kernel uses the destination IP address to determine
1289 * the routing path and obtain the MAC destination
1290 * address, so IP destination address must be
1291 * specified in the tc rule.
1293 return rte_flow_error_set(error, EINVAL,
1294 RTE_FLOW_ERROR_TYPE_ITEM, item,
1295 "outer ipv4 destination address"
1296 " must be specified for"
1297 " vxlan encapsulation");
/* Source address: same pattern as the destination address above. */
1299 if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1300 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1301 return rte_flow_error_set
1303 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1304 "no support for partial mask on"
1305 " \"ipv4.hdr.src_addr\" field"
1306 " for vxlan encapsulation");
1307 /* More IPv4 address validations can be put here. */
1310 * Kernel uses the source IP address to select the
1311 * interface for egress encapsulated traffic, so
1312 * it must be specified in the tc rule.
1314 return rte_flow_error_set(error, EINVAL,
1315 RTE_FLOW_ERROR_TYPE_ITEM, item,
1316 "outer ipv4 source address"
1317 " must be specified for"
1318 " vxlan encapsulation");
/* TOS and TTL masks may be zero or 0xff; anything in between is refused. */
1320 if (mask->hdr.type_of_service &&
1321 mask->hdr.type_of_service != 0xff)
1322 return rte_flow_error_set(error, ENOTSUP,
1323 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1324 "no support for partial mask on"
1325 " \"ipv4.hdr.type_of_service\" field"
1326 " for vxlan encapsulation");
1327 if (mask->hdr.time_to_live &&
1328 mask->hdr.time_to_live != 0xff)
1329 return rte_flow_error_set(error, ENOTSUP,
1330 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1331 "no support for partial mask on"
1332 " \"ipv4.hdr.time_to_live\" field"
1333 " for vxlan encapsulation");
1338 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1339 * The routine checks the IPv6 fields to be used in encapsulation header.
1342 * Pointer to the item structure.
1344 * Pointer to the error structure.
1347 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Outer IPv6 item check for a VXLAN_ENCAP definition.
 *
 * Visible behaviour:
 *  - an EINVAL error ("NULL outer ipv6 address specification ...") exists
 *    for the case where the item carries no specification;
 *  - mask is pointed at rte_flow_item_ipv6_mask (presumably only when
 *    item->mask is NULL - the guard is not visible here);
 *  - hdr.dst_addr and hdr.src_addr masks are compared with memcmp()
 *    against the empty mask (flow_tcf_mask_empty.ipv6) and against the
 *    default mask (rte_flow_item_ipv6_mask); a mismatch with the default
 *    one is reported as an unsupported partial mask, and a separate
 *    EINVAL error states that the address "must be specified";
 *  - the traffic-class bits are extracted from hdr.vtc_flow with
 *    IPV6_HDR_TC_SHIFT into msk6 and must be 0 or 0xff;
 *  - hdr.hop_limits mask must be 0 or 0xff.
 *
 * NOTE(review): the size arguments of the memcmp() calls, the declaration
 * of msk6, the braces and the final return are missing from this excerpt;
 * confirm them against the complete file.
 * NOTE(review): the source-address EINVAL message reads "outer L3 source
 * address" whereas the destination one reads "outer ipv6 destination
 * address" - the wording looks inconsistent.
 */
1350 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1351 struct rte_flow_error *error)
1353 const struct rte_flow_item_ipv6 *spec = item->spec;
1354 const struct rte_flow_item_ipv6 *mask = item->mask;
1359 * Specification for IP addresses cannot be empty
1360 * because it is required by tunnel_key parameter.
1362 return rte_flow_error_set(error, EINVAL,
1363 RTE_FLOW_ERROR_TYPE_ITEM, item,
1364 "NULL outer ipv6 address"
1365 " specification for"
1366 " vxlan encapsulation");
1369 mask = &rte_flow_item_ipv6_mask;
/* Destination address mask: compared with the empty and the default mask. */
1370 if (memcmp(&mask->hdr.dst_addr,
1371 &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1373 if (memcmp(&mask->hdr.dst_addr,
1374 &rte_flow_item_ipv6_mask.hdr.dst_addr,
1376 return rte_flow_error_set
1378 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1379 "no support for partial mask on"
1380 " \"ipv6.hdr.dst_addr\" field"
1381 " for vxlan encapsulation");
1382 /* More IPv6 address validations can be put here. */
1385 * Kernel uses the destination IP address to determine
1386 * the routing path and obtain the MAC destination
1387 * address (heigh or gate), so IP destination address
1388 * must be specified within the tc rule.
1390 return rte_flow_error_set(error, EINVAL,
1391 RTE_FLOW_ERROR_TYPE_ITEM, item,
1392 "outer ipv6 destination address"
1393 " must be specified for"
1394 " vxlan encapsulation");
/* Source address mask: same pattern as the destination address above. */
1396 if (memcmp(&mask->hdr.src_addr,
1397 &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1399 if (memcmp(&mask->hdr.src_addr,
1400 &rte_flow_item_ipv6_mask.hdr.src_addr,
1402 return rte_flow_error_set
1404 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1405 "no support for partial mask on"
1406 " \"ipv6.hdr.src_addr\" field"
1407 " for vxlan encapsulation");
1408 /* More L3 address validation can be put here. */
1411 * Kernel uses the source IP address to select the
1412 * interface for egress encapsulated traffic, so
1413 * it must be specified in the tc rule.
1415 return rte_flow_error_set(error, EINVAL,
1416 RTE_FLOW_ERROR_TYPE_ITEM, item,
1417 "outer L3 source address"
1418 " must be specified for"
1419 " vxlan encapsulation");
/* Isolate the 8 traffic-class bits of the vtc_flow mask (host byte order). */
1421 msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1422 IPV6_HDR_TC_SHIFT) & 0xff;
1423 if (msk6 && msk6 != 0xff)
1424 return rte_flow_error_set(error, ENOTSUP,
1425 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1426 "no support for partial mask on"
1427 " \"ipv6.hdr.vtc_flow.tos\" field"
1428 " for vxlan encapsulation");
1429 if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1430 return rte_flow_error_set(error, ENOTSUP,
1431 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1432 "no support for partial mask on"
1433 " \"ipv6.hdr.hop_limits\" field"
1434 " for vxlan encapsulation");
1439 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1440 * The routine checks the UDP fields to be used in encapsulation header.
1443 * Pointer to the item structure.
1445 * Pointer to the error structure.
1448 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Outer UDP item check for a VXLAN_ENCAP definition.
 *
 * Visible behaviour:
 *  - an EINVAL error ("NULL UDP port specification ...") exists for the
 *    case where the item carries no specification;
 *  - mask is pointed at rte_flow_item_udp_mask (presumably only when
 *    item->mask is NULL - the guard is not visible here);
 *  - hdr.dst_port: a non-zero mask other than 0xffff is an unsupported
 *    partial mask, a zero spec->hdr.dst_port is refused, and a separate
 *    EINVAL error states that the remote port "must be specified";
 *  - hdr.src_port: a non-zero mask other than 0xffff is an unsupported
 *    partial mask; the last three string fragments are presumably the
 *    text of a warning emitted when a full source-port mask is given
 *    (the call that takes them is not visible here).
 *
 * NOTE(review): braces, errno arguments of the wrapped
 * rte_flow_error_set() calls and the final return are missing from this
 * excerpt; confirm them against the complete file.
 * NOTE(review): "NULL UDP port specification " followed by
 * " for vxlan encapsulation" concatenates to a message with two blanks.
 */
1451 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1452 struct rte_flow_error *error)
1454 const struct rte_flow_item_udp *spec = item->spec;
1455 const struct rte_flow_item_udp *mask = item->mask;
1459 * Specification for UDP ports cannot be empty
1460 * because it is required by tunnel_key parameter.
1462 return rte_flow_error_set(error, EINVAL,
1463 RTE_FLOW_ERROR_TYPE_ITEM, item,
1464 "NULL UDP port specification "
1465 " for vxlan encapsulation");
1468 mask = &rte_flow_item_udp_mask;
/* Destination (remote) port: full mask and non-zero value are expected. */
1469 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1470 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1471 return rte_flow_error_set
1473 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1474 "no support for partial mask on"
1475 " \"udp.hdr.dst_port\" field"
1476 " for vxlan encapsulation");
1477 if (!spec->hdr.dst_port)
1478 return rte_flow_error_set
1480 RTE_FLOW_ERROR_TYPE_ITEM, item,
1481 "outer UDP remote port cannot be"
1482 " 0 for vxlan encapsulation");
1484 return rte_flow_error_set(error, EINVAL,
1485 RTE_FLOW_ERROR_TYPE_ITEM, item,
1486 "outer UDP remote port"
1487 " must be specified for"
1488 " vxlan encapsulation");
/* Source port: only the mask is checked; the value itself is not used here. */
1490 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1491 if (mask->hdr.src_port != RTE_BE16(0xffff))
1492 return rte_flow_error_set
1494 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1495 "no support for partial mask on"
1496 " \"udp.hdr.src_port\" field"
1497 " for vxlan encapsulation");
1499 "outer UDP source port cannot be"
1500 " forced for vxlan encapsulation,"
1501 " parameter ignored");
1507 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1508 * The routine checks the VNI field to be used in encapsulation header.
1511 * Pointer to the item structure.
1513 * Pointer to the error structure.
1516 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * VXLAN item (VNI) check for a VXLAN_ENCAP definition.
 *
 * Visible behaviour:
 *  - an EINVAL error ("NULL VNI specification ...") exists for the case
 *    where the item carries no specification;
 *  - mask is pointed at rte_flow_item_vxlan_mask (presumably only when
 *    item->mask is NULL - the guard is not visible here);
 *  - an all-zero VNI mask is refused with EINVAL ("must be specified");
 *  - any VNI mask byte different from 0xff is refused with ENOTSUP
 *    (partial mask);
 *  - an all-zero VNI value in the specification is refused with EINVAL.
 *
 * NOTE(review): braces, the guards in front of the first return and of
 * the default-mask assignment, and the final return are missing from
 * this excerpt; confirm them against the complete file.
 */
1519 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1520 struct rte_flow_error *error)
1522 const struct rte_flow_item_vxlan *spec = item->spec;
1523 const struct rte_flow_item_vxlan *mask = item->mask;
1526 /* Outer VNI is required by tunnel_key parameter. */
1527 return rte_flow_error_set(error, EINVAL,
1528 RTE_FLOW_ERROR_TYPE_ITEM, item,
1529 "NULL VNI specification"
1530 " for vxlan encapsulation");
1533 mask = &rte_flow_item_vxlan_mask;
/* The 24-bit VNI is stored as three bytes; each one is tested separately. */
1534 if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1535 return rte_flow_error_set(error, EINVAL,
1536 RTE_FLOW_ERROR_TYPE_ITEM, item,
1537 "outer VNI must be specified "
1538 "for vxlan encapsulation");
1539 if (mask->vni[0] != 0xff ||
1540 mask->vni[1] != 0xff ||
1541 mask->vni[2] != 0xff)
1542 return rte_flow_error_set(error, ENOTSUP,
1543 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1544 "no support for partial mask on"
1545 " \"vxlan.vni\" field");
1547 if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1548 return rte_flow_error_set(error, EINVAL,
1549 RTE_FLOW_ERROR_TYPE_ITEM, item,
1550 "vxlan vni cannot be 0");
1555 * Validate VXLAN_ENCAP action item list for E-Switch.
1556 * The routine checks items to be used in encapsulation header.
1559 * Pointer to the VXLAN_ENCAP action structure.
1561 * Pointer to the error structure.
1564 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Walks the item list attached to a VXLAN_ENCAP action and validates
 * every entry.
 *
 * Visible behaviour:
 *  - EINVAL errors exist for a missing action configuration and for
 *    missing encapsulation parameters (the item list taken from the
 *    "definition" member of struct rte_flow_action_vxlan_encap);
 *  - the list is scanned up to RTE_FLOW_ITEM_TYPE_END; ETH, IPV4, IPV6,
 *    UDP and VXLAN items are passed first to the generic
 *    mlx5_flow_validate_item_*() routine and then to the matching
 *    flow_tcf_validate_vxlan_encap_*() routine, after which the
 *    corresponding MLX5_FLOW_LAYER_* bit is added to item_flags;
 *  - any other item type (VOID has its own case label) ends in the
 *    "vxlan encap item not supported" error;
 *  - after the scan, an outer L3 layer, an outer UDP layer and the VXLAN
 *    layer must all have been seen, otherwise EINVAL is returned.
 *
 * NOTE(review): the checks of "ret" after each call, the break
 * statements, the declaration of "ret" and the final return are missing
 * from this excerpt; confirm them against the complete file.
 */
1567 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1568 struct rte_flow_error *error)
1570 const struct rte_flow_item *items;
1572 uint32_t item_flags = 0;
1575 return rte_flow_error_set(error, EINVAL,
1576 RTE_FLOW_ERROR_TYPE_ACTION, action,
1577 "Missing vxlan tunnel"
1578 " action configuration");
1579 items = ((const struct rte_flow_action_vxlan_encap *)
1580 action->conf)->definition;
1582 return rte_flow_error_set(error, EINVAL,
1583 RTE_FLOW_ERROR_TYPE_ACTION, action,
1584 "Missing vxlan tunnel"
1585 " encapsulation parameters");
/* Per-item validation; item_flags accumulates the layers met so far. */
1586 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1587 switch (items->type) {
1588 case RTE_FLOW_ITEM_TYPE_VOID:
1590 case RTE_FLOW_ITEM_TYPE_ETH:
1591 ret = mlx5_flow_validate_item_eth(items, item_flags,
1595 ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1598 item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1601 case RTE_FLOW_ITEM_TYPE_IPV4:
1602 ret = mlx5_flow_validate_item_ipv4
1604 &flow_tcf_mask_supported.ipv4, error);
1607 ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1610 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1612 case RTE_FLOW_ITEM_TYPE_IPV6:
1613 ret = mlx5_flow_validate_item_ipv6
1615 &flow_tcf_mask_supported.ipv6, error);
1618 ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1621 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1623 case RTE_FLOW_ITEM_TYPE_UDP:
1624 ret = mlx5_flow_validate_item_udp(items, item_flags,
1628 ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1631 item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1633 case RTE_FLOW_ITEM_TYPE_VXLAN:
1634 ret = mlx5_flow_validate_item_vxlan(items,
1638 ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1641 item_flags |= MLX5_FLOW_LAYER_VXLAN;
1644 return rte_flow_error_set
1646 RTE_FLOW_ERROR_TYPE_ITEM, items,
1647 "vxlan encap item not supported");
/* Mandatory layers of the encapsulation header: IP, UDP and VXLAN. */
1650 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1651 return rte_flow_error_set(error, EINVAL,
1652 RTE_FLOW_ERROR_TYPE_ACTION, action,
1653 "no outer IP layer found"
1654 " for vxlan encapsulation");
1655 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1656 return rte_flow_error_set(error, EINVAL,
1657 RTE_FLOW_ERROR_TYPE_ACTION, action,
1658 "no outer UDP layer found"
1659 " for vxlan encapsulation");
1660 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1661 return rte_flow_error_set(error, EINVAL,
1662 RTE_FLOW_ERROR_TYPE_ACTION, action,
1663 "no VXLAN VNI found"
1664 " for vxlan encapsulation");
1669 * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1670 * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1673 * Outer UDP layer item (if any, NULL otherwise).
1675 * Pointer to the error structure.
1678 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Extra check of the outer UDP pattern item used with a VXLAN tunnel
 * item (decapsulation side).
 *
 * Visible behaviour:
 *  - an EINVAL error ("NULL UDP port specification for VXLAN
 *    decapsulation") exists for the case where the item carries no
 *    specification;
 *  - mask is pointed at rte_flow_item_udp_mask (presumably only when
 *    udp->mask is NULL - the guard is not visible here);
 *  - hdr.dst_port: a non-zero mask other than 0xffff is an unsupported
 *    partial mask, a zero spec->hdr.dst_port is refused ("zero decap
 *    local UDP port"), and a separate EINVAL error states that the
 *    destination port "must be specified";
 *  - hdr.src_port: a non-zero mask other than 0xffff is an unsupported
 *    partial mask; the last three string fragments are presumably the
 *    text of a warning emitted for a full source-port mask (the call
 *    that takes them is not visible here).
 *
 * NOTE(review): that trailing text says "VXLAN encapsulation" although
 * this routine validates decapsulation - looks like a copy/paste slip in
 * the message.
 * NOTE(review): udp is dereferenced unconditionally in the two
 * initialisers below, so the caller has to pass a non-NULL item.
 * NOTE(review): braces, errno arguments of the wrapped
 * rte_flow_error_set() calls and the final return are missing from this
 * excerpt; confirm them against the complete file.
 */
1681 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1682 struct rte_flow_error *error)
1684 const struct rte_flow_item_udp *spec = udp->spec;
1685 const struct rte_flow_item_udp *mask = udp->mask;
1689 * Specification for UDP ports cannot be empty
1690 * because it is required as decap parameter.
1692 return rte_flow_error_set(error, EINVAL,
1693 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1694 "NULL UDP port specification"
1695 " for VXLAN decapsulation");
1697 mask = &rte_flow_item_udp_mask;
/* Destination (local) port: full mask and non-zero value are expected. */
1698 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1699 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1700 return rte_flow_error_set
1702 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1703 "no support for partial mask on"
1704 " \"udp.hdr.dst_port\" field");
1705 if (!spec->hdr.dst_port)
1706 return rte_flow_error_set
1708 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1709 "zero decap local UDP port");
1711 return rte_flow_error_set(error, EINVAL,
1712 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1713 "outer UDP destination port must be "
1714 "specified for vxlan decapsulation");
/* Source port: only the mask is checked; the value itself is not used here. */
1716 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1717 if (mask->hdr.src_port != RTE_BE16(0xffff))
1718 return rte_flow_error_set
1720 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1721 "no support for partial mask on"
1722 " \"udp.hdr.src_port\" field");
1724 "outer UDP local port cannot be "
1725 "forced for VXLAN encapsulation, "
1726 "parameter ignored");
1732 * Validate flow for E-Switch.
1735 * Pointer to the Ethernet device structure.
1737 * Pointer to the flow attributes.
1739 * Pointer to the list of items.
1740 * @param[in] actions
1741 * Pointer to the list of actions.
1743 * Pointer to the error structure.
1746 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Top-level E-Switch (TC flower) validation of a flow rule.
 *
 * Visible structure:
 *  1. a port-ID-to-ifindex table (ptoi) is built for the device and the
 *     attributes are passed to flow_tcf_validate_attributes();
 *  2. the action list is scanned up to RTE_FLOW_ACTION_TYPE_END; every
 *     supported action sets current_action_flag, a few of them run
 *     extra checks (port lookup in ptoi, forward-only jump, VLAN TPID,
 *     set-VLAN after push-VLAN, VXLAN encap item list), then
 *     combination rules are applied (single fate action, single VXLAN
 *     action, no VXLAN together with VLAN, "set" actions listed
 *     successively) before the flag is merged into action_flags;
 *  3. the pattern list is scanned up to RTE_FLOW_ITEM_TYPE_END; each
 *     item goes through the generic mlx5_flow_validate_item_*() routine
 *     and flow_tcf_item_mask(), partial masks are refused, and the
 *     outer/VLAN/inner ether types implied by the items are tracked in
 *     outer_etype, vlan_etype and inner_etype to detect conflicts;
 *  4. cross checks between action_flags and item_flags close the
 *     routine ("set" actions need the matching pattern layer and a
 *     port_id action, VLAN/VXLAN actions need port_id, a fate action is
 *     mandatory, decap needs a VNI pattern, encap is refused for
 *     tunneled traffic).
 *
 * NOTE(review): all short lines (braces, break statements, checks of
 * "ret", several "if" heads and rte_flow_error_set() argument lines, the
 * union wrappers around the spec/mask/conf pointer declarations, the
 * final return) are missing from this excerpt, so the description above
 * is limited to what the remaining lines show.
 * NOTE(review): port_id_dev is taken from
 * rte_eth_devices[conf.port_id->id] right after code that branches on
 * conf.port_id->original - confirm that "id" is meaningful on that path.
 * NOTE(review): some of the final checks pass "actions" as the error
 * cause; by then the scan loop has advanced it to the END entry.
 * NOTE(review): the "no outer IP/UDP pattern found for vxlan tunnel"
 * errors in the VXLAN item case use RTE_FLOW_ERROR_TYPE_ACTION although
 * they concern pattern items.
 */
1749 flow_tcf_validate(struct rte_eth_dev *dev,
1750 const struct rte_flow_attr *attr,
1751 const struct rte_flow_item items[],
1752 const struct rte_flow_action actions[],
1753 struct rte_flow_error *error)
1756 const struct rte_flow_item_port_id *port_id;
1757 const struct rte_flow_item_eth *eth;
1758 const struct rte_flow_item_vlan *vlan;
1759 const struct rte_flow_item_ipv4 *ipv4;
1760 const struct rte_flow_item_ipv6 *ipv6;
1761 const struct rte_flow_item_tcp *tcp;
1762 const struct rte_flow_item_udp *udp;
1763 const struct rte_flow_item_vxlan *vxlan;
1766 const struct rte_flow_action_port_id *port_id;
1767 const struct rte_flow_action_jump *jump;
1768 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1769 const struct rte_flow_action_of_set_vlan_vid *
1771 const struct rte_flow_action_of_set_vlan_pcp *
1773 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1774 const struct rte_flow_action_set_ipv4 *set_ipv4;
1775 const struct rte_flow_action_set_ipv6 *set_ipv6;
1777 const struct rte_flow_item *outer_udp = NULL;
1778 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1779 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1780 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1781 uint64_t item_flags = 0;
1782 uint64_t action_flags = 0;
1783 uint8_t next_protocol = 0xff;
1784 unsigned int tcm_ifindex = 0;
1785 uint8_t pedit_validated = 0;
1786 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1787 struct rte_eth_dev *port_id_dev = NULL;
1788 bool in_port_id_set;
1791 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1792 PTOI_TABLE_SZ_MAX(dev)));
1793 ret = flow_tcf_validate_attributes(attr, error);
/* Action scan: one flag per action, merged into action_flags at the end. */
1796 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1798 uint64_t current_action_flag = 0;
1800 switch (actions->type) {
1801 case RTE_FLOW_ACTION_TYPE_VOID:
1803 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1804 current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1807 conf.port_id = actions->conf;
1808 if (conf.port_id->original)
1811 for (i = 0; ptoi[i].ifindex; ++i)
1812 if (ptoi[i].port_id == conf.port_id->id)
1814 if (!ptoi[i].ifindex)
1815 return rte_flow_error_set
1817 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1819 "missing data to convert port ID to"
1821 port_id_dev = &rte_eth_devices[conf.port_id->id];
1823 case RTE_FLOW_ACTION_TYPE_JUMP:
1824 current_action_flag = MLX5_FLOW_ACTION_JUMP;
1827 conf.jump = actions->conf;
1828 if (attr->group >= conf.jump->group)
1829 return rte_flow_error_set
1831 RTE_FLOW_ERROR_TYPE_ACTION,
1833 "can jump only to a group forward");
1835 case RTE_FLOW_ACTION_TYPE_DROP:
1836 current_action_flag = MLX5_FLOW_ACTION_DROP;
1838 case RTE_FLOW_ACTION_TYPE_COUNT:
1840 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1841 current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1843 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1844 rte_be16_t ethertype;
1846 current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1849 conf.of_push_vlan = actions->conf;
1850 ethertype = conf.of_push_vlan->ethertype;
1851 if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1852 ethertype != RTE_BE16(ETH_P_8021AD))
1853 return rte_flow_error_set
1855 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1856 "vlan push TPID must be "
1857 "802.1Q or 802.1AD");
1860 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1861 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1862 return rte_flow_error_set
1864 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1865 "vlan modify is not supported,"
1866 " set action must follow push action");
1867 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1869 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1870 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1871 return rte_flow_error_set
1873 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1874 "vlan modify is not supported,"
1875 " set action must follow push action");
1876 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1878 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1879 current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1881 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1882 ret = flow_tcf_validate_vxlan_encap(actions, error);
1885 current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1887 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1888 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1890 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1891 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1893 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1894 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1896 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1897 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1899 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1900 current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1902 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1903 current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1905 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1906 current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1908 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1909 current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1911 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1912 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1914 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1915 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1918 return rte_flow_error_set(error, ENOTSUP,
1919 RTE_FLOW_ERROR_TYPE_ACTION,
1921 "action not supported");
/* Rules on how the current action may combine with the ones seen before. */
1923 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1925 return rte_flow_error_set
1927 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1929 "action configuration not set");
1931 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1933 return rte_flow_error_set(error, ENOTSUP,
1934 RTE_FLOW_ERROR_TYPE_ACTION,
1936 "set actions should be "
1937 "listed successively");
1938 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1939 (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1940 pedit_validated = 1;
1941 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1942 (action_flags & MLX5_TCF_FATE_ACTIONS))
1943 return rte_flow_error_set(error, EINVAL,
1944 RTE_FLOW_ERROR_TYPE_ACTION,
1946 "can't have multiple fate"
1948 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1949 (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1950 return rte_flow_error_set(error, EINVAL,
1951 RTE_FLOW_ERROR_TYPE_ACTION,
1953 "can't have multiple vxlan"
1955 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1956 (action_flags & MLX5_TCF_VLAN_ACTIONS))
1957 return rte_flow_error_set(error, ENOTSUP,
1958 RTE_FLOW_ERROR_TYPE_ACTION,
1960 "can't have vxlan and vlan"
1961 " actions in the same rule");
1962 action_flags |= current_action_flag;
/* Pattern scan: item_flags records the layers, *_etype the implied types. */
1964 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1967 switch (items->type) {
1968 case RTE_FLOW_ITEM_TYPE_VOID:
1970 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1971 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1972 return rte_flow_error_set
1974 RTE_FLOW_ERROR_TYPE_ITEM, items,
1975 "inner tunnel port id"
1976 " item is not supported");
1977 mask.port_id = flow_tcf_item_mask
1978 (items, &rte_flow_item_port_id_mask,
1979 &flow_tcf_mask_supported.port_id,
1980 &flow_tcf_mask_empty.port_id,
1981 sizeof(flow_tcf_mask_supported.port_id),
1985 if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1989 spec.port_id = items->spec;
1990 if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1991 return rte_flow_error_set
1993 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1995 "no support for partial mask on"
1997 if (!mask.port_id->id)
2000 for (i = 0; ptoi[i].ifindex; ++i)
2001 if (ptoi[i].port_id == spec.port_id->id)
2003 if (!ptoi[i].ifindex)
2004 return rte_flow_error_set
2006 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2008 "missing data to convert port ID to"
2010 if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2011 return rte_flow_error_set
2013 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2015 "cannot match traffic for"
2016 " several port IDs through"
2017 " a single flow rule");
2018 tcm_ifindex = ptoi[i].ifindex;
2021 case RTE_FLOW_ITEM_TYPE_ETH:
2022 ret = mlx5_flow_validate_item_eth(items, item_flags,
2026 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2027 MLX5_FLOW_LAYER_INNER_L2 :
2028 MLX5_FLOW_LAYER_OUTER_L2;
2030 * Redundant check due to different supported mask.
2031 * Same for the rest of items.
2033 mask.eth = flow_tcf_item_mask
2034 (items, &rte_flow_item_eth_mask,
2035 &flow_tcf_mask_supported.eth,
2036 &flow_tcf_mask_empty.eth,
2037 sizeof(flow_tcf_mask_supported.eth),
2041 if (mask.eth->type && mask.eth->type !=
2043 return rte_flow_error_set
2045 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2047 "no support for partial mask on"
2049 assert(items->spec);
2050 spec.eth = items->spec;
2051 if (mask.eth->type &&
2052 (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2053 inner_etype != RTE_BE16(ETH_P_ALL) &&
2054 inner_etype != spec.eth->type)
2055 return rte_flow_error_set
2057 RTE_FLOW_ERROR_TYPE_ITEM,
2059 "inner eth_type conflict");
2060 if (mask.eth->type &&
2061 !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2062 outer_etype != RTE_BE16(ETH_P_ALL) &&
2063 outer_etype != spec.eth->type)
2064 return rte_flow_error_set
2066 RTE_FLOW_ERROR_TYPE_ITEM,
2068 "outer eth_type conflict");
2069 if (mask.eth->type) {
2070 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2071 inner_etype = spec.eth->type;
2073 outer_etype = spec.eth->type;
2076 case RTE_FLOW_ITEM_TYPE_VLAN:
2077 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2078 return rte_flow_error_set
2080 RTE_FLOW_ERROR_TYPE_ITEM, items,
2082 " is not supported");
2083 ret = mlx5_flow_validate_item_vlan(items, item_flags,
2087 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2088 mask.vlan = flow_tcf_item_mask
2089 (items, &rte_flow_item_vlan_mask,
2090 &flow_tcf_mask_supported.vlan,
2091 &flow_tcf_mask_empty.vlan,
2092 sizeof(flow_tcf_mask_supported.vlan),
2096 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2097 (mask.vlan->tci & RTE_BE16(0xe000)) !=
2098 RTE_BE16(0xe000)) ||
2099 (mask.vlan->tci & RTE_BE16(0x0fff) &&
2100 (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2101 RTE_BE16(0x0fff)) ||
2102 (mask.vlan->inner_type &&
2103 mask.vlan->inner_type != RTE_BE16(0xffff)))
2104 return rte_flow_error_set
2106 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2108 "no support for partial masks on"
2109 " \"tci\" (PCP and VID parts) and"
2110 " \"inner_type\" fields");
2111 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2112 outer_etype != RTE_BE16(ETH_P_8021Q))
2113 return rte_flow_error_set
2115 RTE_FLOW_ERROR_TYPE_ITEM,
2117 "outer eth_type conflict,"
2119 outer_etype = RTE_BE16(ETH_P_8021Q);
2120 assert(items->spec);
2121 spec.vlan = items->spec;
2122 if (mask.vlan->inner_type &&
2123 vlan_etype != RTE_BE16(ETH_P_ALL) &&
2124 vlan_etype != spec.vlan->inner_type)
2125 return rte_flow_error_set
2127 RTE_FLOW_ERROR_TYPE_ITEM,
2129 "vlan eth_type conflict");
2130 if (mask.vlan->inner_type)
2131 vlan_etype = spec.vlan->inner_type;
2133 case RTE_FLOW_ITEM_TYPE_IPV4:
2134 ret = mlx5_flow_validate_item_ipv4
2136 &flow_tcf_mask_supported.ipv4, error);
2139 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2140 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2141 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2142 mask.ipv4 = flow_tcf_item_mask
2143 (items, &rte_flow_item_ipv4_mask,
2144 &flow_tcf_mask_supported.ipv4,
2145 &flow_tcf_mask_empty.ipv4,
2146 sizeof(flow_tcf_mask_supported.ipv4),
2150 if (mask.ipv4->hdr.next_proto_id &&
2151 mask.ipv4->hdr.next_proto_id != 0xff)
2152 return rte_flow_error_set
2154 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2156 "no support for partial mask on"
2157 " \"hdr.next_proto_id\" field");
2158 else if (mask.ipv4->hdr.next_proto_id)
2160 ((const struct rte_flow_item_ipv4 *)
2161 (items->spec))->hdr.next_proto_id;
2162 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2163 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2164 inner_etype != RTE_BE16(ETH_P_IP))
2165 return rte_flow_error_set
2167 RTE_FLOW_ERROR_TYPE_ITEM,
2169 "inner eth_type conflict,"
2170 " IPv4 is required");
2171 inner_etype = RTE_BE16(ETH_P_IP);
2172 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2173 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2174 vlan_etype != RTE_BE16(ETH_P_IP))
2175 return rte_flow_error_set
2177 RTE_FLOW_ERROR_TYPE_ITEM,
2179 "vlan eth_type conflict,"
2180 " IPv4 is required");
2181 vlan_etype = RTE_BE16(ETH_P_IP);
2183 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2184 outer_etype != RTE_BE16(ETH_P_IP))
2185 return rte_flow_error_set
2187 RTE_FLOW_ERROR_TYPE_ITEM,
2189 "eth_type conflict,"
2190 " IPv4 is required");
2191 outer_etype = RTE_BE16(ETH_P_IP);
2194 case RTE_FLOW_ITEM_TYPE_IPV6:
2195 ret = mlx5_flow_validate_item_ipv6
2197 &flow_tcf_mask_supported.ipv6, error);
2200 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2201 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2202 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2203 mask.ipv6 = flow_tcf_item_mask
2204 (items, &rte_flow_item_ipv6_mask,
2205 &flow_tcf_mask_supported.ipv6,
2206 &flow_tcf_mask_empty.ipv6,
2207 sizeof(flow_tcf_mask_supported.ipv6),
2211 if (mask.ipv6->hdr.proto &&
2212 mask.ipv6->hdr.proto != 0xff)
2213 return rte_flow_error_set
2215 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2217 "no support for partial mask on"
2218 " \"hdr.proto\" field");
2219 else if (mask.ipv6->hdr.proto)
2221 ((const struct rte_flow_item_ipv6 *)
2222 (items->spec))->hdr.proto;
2223 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2224 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2225 inner_etype != RTE_BE16(ETH_P_IPV6))
2226 return rte_flow_error_set
2228 RTE_FLOW_ERROR_TYPE_ITEM,
2230 "inner eth_type conflict,"
2231 " IPv6 is required");
2232 inner_etype = RTE_BE16(ETH_P_IPV6);
2233 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2234 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2235 vlan_etype != RTE_BE16(ETH_P_IPV6))
2236 return rte_flow_error_set
2238 RTE_FLOW_ERROR_TYPE_ITEM,
2240 "vlan eth_type conflict,"
2241 " IPv6 is required");
2242 vlan_etype = RTE_BE16(ETH_P_IPV6);
2244 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2245 outer_etype != RTE_BE16(ETH_P_IPV6))
2246 return rte_flow_error_set
2248 RTE_FLOW_ERROR_TYPE_ITEM,
2250 "eth_type conflict,"
2251 " IPv6 is required");
2252 outer_etype = RTE_BE16(ETH_P_IPV6);
2255 case RTE_FLOW_ITEM_TYPE_UDP:
2256 ret = mlx5_flow_validate_item_udp(items, item_flags,
2257 next_protocol, error);
2260 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2261 MLX5_FLOW_LAYER_INNER_L4_UDP :
2262 MLX5_FLOW_LAYER_OUTER_L4_UDP;
2263 mask.udp = flow_tcf_item_mask
2264 (items, &rte_flow_item_udp_mask,
2265 &flow_tcf_mask_supported.udp,
2266 &flow_tcf_mask_empty.udp,
2267 sizeof(flow_tcf_mask_supported.udp),
2272 * Save the presumed outer UDP item for extra check
2273 * if the tunnel item will be found later in the list.
2275 if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2278 case RTE_FLOW_ITEM_TYPE_TCP:
2279 ret = mlx5_flow_validate_item_tcp
2282 &flow_tcf_mask_supported.tcp,
2286 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2287 MLX5_FLOW_LAYER_INNER_L4_TCP :
2288 MLX5_FLOW_LAYER_OUTER_L4_TCP;
2289 mask.tcp = flow_tcf_item_mask
2290 (items, &rte_flow_item_tcp_mask,
2291 &flow_tcf_mask_supported.tcp,
2292 &flow_tcf_mask_empty.tcp,
2293 sizeof(flow_tcf_mask_supported.tcp),
2298 case RTE_FLOW_ITEM_TYPE_VXLAN:
2299 if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2300 return rte_flow_error_set
2302 RTE_FLOW_ERROR_TYPE_ITEM, items,
2303 "vxlan tunnel over vlan"
2304 " is not supported");
2305 ret = mlx5_flow_validate_item_vxlan(items,
2309 item_flags |= MLX5_FLOW_LAYER_VXLAN;
2310 mask.vxlan = flow_tcf_item_mask
2311 (items, &rte_flow_item_vxlan_mask,
2312 &flow_tcf_mask_supported.vxlan,
2313 &flow_tcf_mask_empty.vxlan,
2314 sizeof(flow_tcf_mask_supported.vxlan), error);
2317 if (mask.vxlan->vni[0] != 0xff ||
2318 mask.vxlan->vni[1] != 0xff ||
2319 mask.vxlan->vni[2] != 0xff)
2320 return rte_flow_error_set
2322 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2324 "no support for partial or "
2325 "empty mask on \"vxlan.vni\" field");
2327 * The VNI item assumes the VXLAN tunnel, it requires
2328 * at least the outer destination UDP port must be
2329 * specified without wildcards to allow kernel select
2330 * the virtual VXLAN device by port. Also outer IPv4
2331 * or IPv6 item must be specified (wilcards or even
2332 * zero mask are allowed) to let driver know the tunnel
2333 * IP version and process UDP traffic correctly.
2336 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2337 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2338 return rte_flow_error_set
2340 RTE_FLOW_ERROR_TYPE_ACTION,
2342 "no outer IP pattern found"
2343 " for vxlan tunnel");
2344 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2345 return rte_flow_error_set
2347 RTE_FLOW_ERROR_TYPE_ACTION,
2349 "no outer UDP pattern found"
2350 " for vxlan tunnel");
2352 * All items preceding the tunnel item become outer
2353 * ones and we should do extra validation for them
2354 * due to tc limitations for tunnel outer parameters.
2355 * Currently only outer UDP item requres extra check,
2356 * use the saved pointer instead of item list rescan.
2359 ret = flow_tcf_validate_vxlan_decap_udp
2363 /* Reset L4 protocol for inner parameters. */
2364 next_protocol = 0xff;
2367 return rte_flow_error_set(error, ENOTSUP,
2368 RTE_FLOW_ERROR_TYPE_ITEM,
2369 items, "item not supported");
/* Cross checks between the collected action flags and pattern layers. */
2372 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2373 (action_flags & MLX5_FLOW_ACTION_DROP))
2374 return rte_flow_error_set(error, ENOTSUP,
2375 RTE_FLOW_ERROR_TYPE_ACTION,
2377 "set action is not compatible with "
2379 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2380 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2381 return rte_flow_error_set(error, ENOTSUP,
2382 RTE_FLOW_ERROR_TYPE_ACTION,
2384 "set action must be followed by "
2387 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2388 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2389 return rte_flow_error_set(error, EINVAL,
2390 RTE_FLOW_ERROR_TYPE_ACTION,
2392 "no ipv4 item found in"
2396 (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2397 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2398 return rte_flow_error_set(error, EINVAL,
2399 RTE_FLOW_ERROR_TYPE_ACTION,
2401 "no ipv6 item found in"
2405 (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2407 (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2408 MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2409 return rte_flow_error_set(error, EINVAL,
2410 RTE_FLOW_ERROR_TYPE_ACTION,
2412 "no TCP/UDP item found in"
2416 * FW syndrome (0xA9C090):
2417 * set_flow_table_entry: push vlan action fte in fdb can ONLY be
2418 * forward to the uplink.
2420 if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2421 (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2422 ((struct priv *)port_id_dev->data->dev_private)->representor)
2423 return rte_flow_error_set(error, ENOTSUP,
2424 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2425 "vlan push can only be applied"
2426 " when forwarding to uplink port");
2428 * FW syndrome (0x294609):
2429 * set_flow_table_entry: modify/pop/push actions in fdb flow table
2430 * are supported only while forwarding to vport.
2432 if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2433 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2434 return rte_flow_error_set(error, ENOTSUP,
2435 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2436 "vlan actions are supported"
2437 " only with port_id action");
2438 if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2439 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2440 return rte_flow_error_set(error, ENOTSUP,
2441 RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2442 "vxlan actions are supported"
2443 " only with port_id action");
2444 if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2445 return rte_flow_error_set(error, EINVAL,
2446 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2447 "no fate action is found");
2449 (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2451 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2452 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2453 return rte_flow_error_set(error, EINVAL,
2454 RTE_FLOW_ERROR_TYPE_ACTION,
2456 "no IP found in pattern");
2459 (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2460 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2461 return rte_flow_error_set(error, ENOTSUP,
2462 RTE_FLOW_ERROR_TYPE_ACTION,
2464 "no ethernet found in"
2467 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2468 !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2469 return rte_flow_error_set(error, EINVAL,
2470 RTE_FLOW_ERROR_TYPE_ACTION,
2472 "no VNI pattern found"
2473 " for vxlan decap action");
2474 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2475 (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2476 return rte_flow_error_set(error, EINVAL,
2477 RTE_FLOW_ERROR_TYPE_ACTION,
2479 "vxlan encap not supported"
2480 " for tunneled traffic");
2485 * Calculate maximum size of memory for flow items of Linux TC flower.
2488 * Pointer to the flow attributes.
2490 * Pointer to the list of items.
2491 * @param[out] action_flags
2492 * Pointer to the detected actions.
2495 * Maximum size of memory for items.
/*
 * Walks the pattern up to RTE_FLOW_ITEM_TYPE_END and adds up, per item type,
 * the Netlink attribute space the flower message may need. A VXLAN item also
 * raises MLX5_FLOW_ACTION_VXLAN_DECAP in *action_flags.
 */
2498 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2499 const struct rte_flow_item items[],
2500 uint64_t *action_flags)
/* Fixed part of the message, counted once regardless of the pattern. */
2504 size += SZ_NLATTR_STRZ_OF("flower") +
2505 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2506 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2507 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
/* The chain attribute is reserved only for a non-zero group. */
2508 if (attr->group > 0)
2509 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2510 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2511 switch (items->type) {
2512 case RTE_FLOW_ITEM_TYPE_VOID:
2514 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2516 case RTE_FLOW_ITEM_TYPE_ETH:
2517 size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2518 /* dst/src MAC addr and mask. */
2520 case RTE_FLOW_ITEM_TYPE_VLAN:
2521 size += SZ_NLATTR_TYPE_OF(uint16_t) +
2522 /* VLAN Ether type. */
2523 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2524 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2526 case RTE_FLOW_ITEM_TYPE_IPV4: {
/* The item mask may be NULL, hence the pointer checks below. */
2527 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
/* Address attributes are always counted, whatever the mask says. */
2529 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2530 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2531 /* dst/src IP addr and mask. */
/* TTL and TOS pairs are counted only when the mask selects the field. */
2532 if (ipv4 && ipv4->hdr.time_to_live)
2533 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2534 if (ipv4 && ipv4->hdr.type_of_service)
2535 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2538 case RTE_FLOW_ITEM_TYPE_IPV6: {
/* The item mask may be NULL, hence the pointer checks below. */
2539 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2541 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2542 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2543 /* dst/src IP addr and mask. */
/* Hop limit pair, counted only when the mask selects it. */
2544 if (ipv6 && ipv6->hdr.hop_limits)
2545 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
/* Traffic class pair: the 8 bits at IPV6_HDR_TC_SHIFT inside vtc_flow. */
2546 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2547 (0xfful << IPV6_HDR_TC_SHIFT)))
2548 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2551 case RTE_FLOW_ITEM_TYPE_UDP:
2552 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2553 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2554 /* dst/src port and mask. */
2556 case RTE_FLOW_ITEM_TYPE_TCP:
2557 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2558 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2559 /* dst/src port and mask. */
2561 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* One 32-bit attribute is reserved for the VXLAN item. */
2562 size += SZ_NLATTR_TYPE_OF(uint32_t);
2564 * There might be no VXLAN decap action in the action
2565 * list, nonetheless the VXLAN tunnel flow requires
2566 * the decap structure to be correctly applied to
2567 * VXLAN device, set the flag to create the structure.
2568 * Translation routine will not put the decap action
2569 * in tne Netlink message if there is no actual action
2572 *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
/* Item types not listed above are only reported, no size is added. */
2576 "unsupported item %p type %d,"
2577 " items must be validated before flow creation",
2578 (const void *)items, items->type);
2586 * Calculate size of memory to store the VXLAN encapsulation
2587 * related items in the Netlink message buffer. Items list
2588 * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2589 * The item list should be validated.
2592 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2593 * List of pattern items to scan data from.
2596 * The size the part of Netlink message buffer to store the
2597 * VXLAN encapsulation item attributes.
/*
 * Sums the Netlink attribute space needed by the items listed in the
 * "definition" of a VXLAN_ENCAP action configuration.
 */
2600 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2602 const struct rte_flow_item *items;
2605 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2606 assert(action->conf);
/* The encapsulation header is described as a nested item list. */
2608 items = ((const struct rte_flow_action_vxlan_encap *)
2609 action->conf)->definition;
2611 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2612 switch (items->type) {
2613 case RTE_FLOW_ITEM_TYPE_VOID:
2615 case RTE_FLOW_ITEM_TYPE_ETH:
2616 /* This item does not require message buffer. */
2618 case RTE_FLOW_ITEM_TYPE_IPV4: {
/* The item mask may be NULL, hence the pointer checks below. */
2619 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
/* Two address-sized attributes are always counted. */
2621 size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
/* TTL and TOS space is added only when the mask selects the field. */
2622 if (ipv4 && ipv4->hdr.time_to_live)
2623 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2624 if (ipv4 && ipv4->hdr.type_of_service)
2625 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2628 case RTE_FLOW_ITEM_TYPE_IPV6: {
/* The item mask may be NULL, hence the pointer checks below. */
2629 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
/* Two address-sized attributes are always counted. */
2631 size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
/* Hop limit and traffic class space is added only when masked in. */
2632 if (ipv6 && ipv6->hdr.hop_limits)
2633 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2634 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2635 (0xfful << IPV6_HDR_TC_SHIFT)))
2636 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2639 case RTE_FLOW_ITEM_TYPE_UDP: {
2640 const struct rte_flow_item_udp *udp = items->mask;
/* One 16-bit attribute is always counted for the UDP item. */
2642 size += SZ_NLATTR_TYPE_OF(uint16_t);
/* A second one is added when there is no mask or it selects src_port. */
2643 if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2644 size += SZ_NLATTR_TYPE_OF(uint16_t);
2647 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* One 32-bit attribute is reserved for the VXLAN item. */
2648 size += SZ_NLATTR_TYPE_OF(uint32_t);
/* Item types not listed above are only reported, no size is added. */
2653 "unsupported item %p type %d,"
2654 " items must be validated"
2655 " before flow creation",
2656 (const void *)items, items->type);
2664 * Calculate maximum size of memory for flow actions of Linux TC flower and
2665 * extract specified actions.
2667 * @param[in] actions
2668 * Pointer to the list of actions.
2669 * @param[out] action_flags
2670 * Pointer to the detected actions.
2673 * Maximum size of memory for actions.
/*
 * Walks the action list up to RTE_FLOW_ACTION_TYPE_END, adds up the Netlink
 * attribute space each action may need and collects one MLX5_FLOW_ACTION_*
 * bit per detected action. The bits are stored to *action_flags at the end.
 */
2676 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2677 uint64_t *action_flags)
2682 size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2683 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2684 switch (actions->type) {
2685 case RTE_FLOW_ACTION_TYPE_VOID:
2687 case RTE_FLOW_ACTION_TYPE_PORT_ID:
/* Port redirection is sized as a "mirred" action. */
2688 size += SZ_NLATTR_NEST + /* na_act_index. */
2689 SZ_NLATTR_STRZ_OF("mirred") +
2690 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2691 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2692 flags |= MLX5_FLOW_ACTION_PORT_ID;
2694 case RTE_FLOW_ACTION_TYPE_JUMP:
/* Jump and drop are both sized as a "gact" action. */
2695 size += SZ_NLATTR_NEST + /* na_act_index. */
2696 SZ_NLATTR_STRZ_OF("gact") +
2697 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2698 SZ_NLATTR_TYPE_OF(struct tc_gact);
2699 flags |= MLX5_FLOW_ACTION_JUMP;
2701 case RTE_FLOW_ACTION_TYPE_DROP:
2702 size += SZ_NLATTR_NEST + /* na_act_index. */
2703 SZ_NLATTR_STRZ_OF("gact") +
2704 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2705 SZ_NLATTR_TYPE_OF(struct tc_gact);
2706 flags |= MLX5_FLOW_ACTION_DROP;
2708 case RTE_FLOW_ACTION_TYPE_COUNT:
/* The four VLAN actions set their own flag and share one size estimate. */
2710 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2711 flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2712 goto action_of_vlan;
2713 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2714 flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2715 goto action_of_vlan;
2716 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2717 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2718 goto action_of_vlan;
2719 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2720 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2721 goto action_of_vlan;
/* Shared estimate for a "vlan" action with protocol, ID and priority. */
2723 size += SZ_NLATTR_NEST + /* na_act_index. */
2724 SZ_NLATTR_STRZ_OF("vlan") +
2725 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2726 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2727 SZ_NLATTR_TYPE_OF(uint16_t) +
2728 /* VLAN protocol. */
2729 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2730 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2732 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
/* Encap is sized as a "tunnel_key" action. */
2733 size += SZ_NLATTR_NEST + /* na_act_index. */
2734 SZ_NLATTR_STRZ_OF("tunnel_key") +
2735 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2736 SZ_NLATTR_TYPE_OF(uint8_t);
2737 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
/* Adds the encap item attributes and an aligned encap parameter block. */
2738 size += flow_tcf_vxlan_encap_size(actions) +
2739 RTE_ALIGN_CEIL /* preceding encap params. */
2740 (sizeof(struct flow_tcf_vxlan_encap),
2742 flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2744 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
/* Decap is sized as a "tunnel_key" action plus a decap parameter block. */
2745 size += SZ_NLATTR_NEST + /* na_act_index. */
2746 SZ_NLATTR_STRZ_OF("tunnel_key") +
2747 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2748 SZ_NLATTR_TYPE_OF(uint8_t);
2749 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2750 size += RTE_ALIGN_CEIL /* preceding decap params. */
2751 (sizeof(struct flow_tcf_vxlan_decap),
2753 flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
/* All header-rewrite actions are sized by one shared helper. */
2755 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2756 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2757 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2758 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2759 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2760 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2761 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2762 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2763 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2764 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
/*
 * NOTE(review): the helper receives the address of the action cursor, so
 * it presumably advances over consecutive rewrite actions - confirm.
 */
2765 size += flow_tcf_get_pedit_actions_size(&actions,
/* Action types not listed above are only reported, no size is added. */
2770 "unsupported action %p type %d,"
2771 " items must be validated before flow creation",
2772 (const void *)actions, actions->type);
/*
 * NOTE(review): this store replaces whatever the caller passed in. The
 * initial value of "flags" is not visible here; flow_tcf_prepare() passes
 * flags already raised by the items sizing, so confirm it starts from
 * *action_flags.
 */
2776 *action_flags = flags;
2781 * Brand rtnetlink buffer with unique handle.
2783 * This handle should be unique for a given network interface to avoid
2787 * Pointer to Netlink message.
2789 * Unique 32-bit handle to use.
2792 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
2794 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2796 tcm->tcm_handle = handle;
2797 DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2798 (void *)nlh, handle);
2802 * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2803 * memory required, allocates the memory, initializes Netlink message headers
2804 * and set unique TC message handle.
2807 * Pointer to the flow attributes.
2809 * Pointer to the list of items.
2810 * @param[in] actions
2811 * Pointer to the list of actions.
2813 * Pointer to the error structure.
2816 * Pointer to mlx5_flow object on success,
2817 * otherwise NULL and rte_errno is set.
/*
 * Allocates one zeroed buffer holding, in order: the mlx5_flow object, an
 * optional VXLAN encap/decap parameter block, and the Netlink message
 * (nlmsghdr + tcmsg + attributes). The message is then branded with a
 * handle derived from the buffer address.
 */
2819 static struct mlx5_flow *
2820 flow_tcf_prepare(const struct rte_flow_attr *attr,
2821 const struct rte_flow_item items[],
2822 const struct rte_flow_action actions[],
2823 struct rte_flow_error *error)
/* Base size: aligned flow object plus the two Netlink headers. */
2825 size_t size = RTE_ALIGN_CEIL
2826 (sizeof(struct mlx5_flow),
2827 alignof(struct flow_tcf_tunnel_hdr)) +
2828 MNL_ALIGN(sizeof(struct nlmsghdr)) +
2829 MNL_ALIGN(sizeof(struct tcmsg));
2830 struct mlx5_flow *dev_flow;
2831 uint64_t action_flags = 0;
2832 struct nlmsghdr *nlh;
2834 uint8_t *sp, *tun = NULL;
/* Items sizing may raise the decap flag; actions sizing fills the rest. */
2836 size += flow_tcf_get_items_size(attr, items, &action_flags);
2837 size += flow_tcf_get_actions_and_size(actions, &action_flags);
2838 dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2840 rte_flow_error_set(error, ENOMEM,
2841 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2842 "not enough memory to create E-Switch flow");
/* "sp" walks the space that follows the flow object. */
2845 sp = (uint8_t *)(dev_flow + 1);
/*
 * A tunnel parameter block, when needed, is carved out first. Its size
 * is removed from "size" again so it is not counted as message space.
 */
2846 if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2848 (sp, alignof(struct flow_tcf_tunnel_hdr));
2850 sp += RTE_ALIGN_CEIL
2851 (sizeof(struct flow_tcf_vxlan_encap),
2854 size -= RTE_ALIGN_CEIL
2855 (sizeof(struct flow_tcf_vxlan_encap),
2858 } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2860 (sp, alignof(struct flow_tcf_tunnel_hdr));
2862 sp += RTE_ALIGN_CEIL
2863 (sizeof(struct flow_tcf_vxlan_decap),
2866 size -= RTE_ALIGN_CEIL
2867 (sizeof(struct flow_tcf_vxlan_decap),
2871 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
/* The Netlink message headers are built in place at "sp". */
2873 nlh = mnl_nlmsg_put_header(sp);
2874 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2875 *dev_flow = (struct mlx5_flow){
2876 .tcf = (struct mlx5_flow_tcf){
/* nlsize excludes the aligned flow object at the start of the buffer. */
2878 .nlsize = size - RTE_ALIGN_CEIL
2879 (sizeof(struct mlx5_flow),
2880 alignof(struct flow_tcf_tunnel_hdr)),
2882 .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
/*
 * NOTE(review): the parameter block above is chosen with ENCAP checked
 * first, while the type below checks DECAP first. The two agree only if
 * both flags are never set together - confirm validation guarantees it.
 */
2887 if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2888 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2889 else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2890 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2892 * Generate a reasonably unique handle based on the address of the
2895 * This is straightforward on 32-bit systems where the flow pointer can
2896 * be used directly. Otherwise, its least significant part is taken
2897 * after shifting it by the previous power of two of the pointed buffer
/* Pointer width decides whether the address fits the 32-bit handle as is. */
2900 if (sizeof(dev_flow) <= 4)
2901 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2903 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2904 rte_log2_u32(rte_align32prevpow2(size)));
2909 * Make adjustments for supporting count actions.
2912 * Pointer to the Ethernet device structure.
2913 * @param[in] dev_flow
2914 * Pointer to mlx5_flow.
2916 * Pointer to error structure.
2919 * 0 On success else a negative errno value is returned and rte_errno is set.
/*
 * Makes sure the parent rte_flow of this device flow owns a counter.
 * The "dev" argument is unused.
 */
2922 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2923 struct mlx5_flow *dev_flow,
2924 struct rte_flow_error *error)
/* The counter is stored on the parent flow, not on the device flow. */
2926 struct rte_flow *flow = dev_flow->flow;
/* A counter is requested only when the flow has none yet. */
2928 if (!flow->counter) {
2929 flow->counter = flow_tcf_counter_new();
/* The error is reported with rte_errno as its code. */
2931 return rte_flow_error_set(error, rte_errno,
2932 RTE_FLOW_ERROR_TYPE_ACTION,
2934 "cannot get counter"
2941 * Convert VXLAN VNI to 32-bit integer.
2944 * VXLAN VNI in 24-bit wire format.
2947 * VXLAN VNI as a 32-bit integer value in network endian.
/*
 * Widens a 3-byte VNI to 4 bytes by placing a zero byte in front of it.
 * The byte order of the input is kept.
 */
2949 static inline rte_be32_t
2950 vxlan_vni_as_be32(const uint8_t vni[3])
/* Byte 0 is the zero padding, bytes 1..3 are the VNI as given. */
2956 .vni = { 0, vni[0], vni[1], vni[2] },
2962 * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2963 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2964 * in the encapsulation parameters structure. The item must be prevalidated,
2965 * no validation checks are performed by this function.
2968 * RTE_FLOW_ITEM_TYPE_ETH entry specification.
2970 * RTE_FLOW_ITEM_TYPE_ETH entry mask.
2972 * Structure to fill the gathered MAC address data.
/*
 * Copies the destination and source MAC addresses from the ETH item spec
 * into "encap" and marks each copied field in encap->mask.
 */
2975 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2976 const struct rte_flow_item_eth *mask,
2977 struct flow_tcf_vxlan_encap *encap)
2979 /* Item must be validated before. No redundant checks. */
/*
 * The address is taken when no mask is given, or when the mask field
 * equals the corresponding field of rte_flow_item_eth_mask.
 */
2981 if (!mask || !memcmp(&mask->dst,
2982 &rte_flow_item_eth_mask.dst,
2983 sizeof(rte_flow_item_eth_mask.dst))) {
2985 * Ethernet addresses are not supported by
2986 * tc as tunnel_key parameters. Destination
2987 * address is needed to form encap packet
2988 * header and retrieved by kernel from
2989 * implicit sources (ARP table, etc),
2990 * address masks are not supported at all.
2992 encap->eth.dst = spec->dst;
2993 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
/* Same rule for the source address. */
2995 if (!mask || !memcmp(&mask->src,
2996 &rte_flow_item_eth_mask.src,
2997 sizeof(rte_flow_item_eth_mask.src))) {
2999 * Ethernet addresses are not supported by
3000 * tc as tunnel_key parameters. Source ethernet
3001 * address is ignored anyway.
3003 encap->eth.src = spec->src;
3004 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
3009 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
3010 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
3011 * in the encapsulation parameters structure. The item must be prevalidated,
3012 * no validation checks are performed by this function.
3015 * RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
3017 * RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
3019 * Structure to fill the gathered IPV4 address data.
/*
 * Copies both IPv4 addresses from the item spec into "encap". TOS and TTL
 * are copied only when a mask is given and selects them. Every copied
 * field is marked in encap->mask.
 */
3022 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
3023 const struct rte_flow_item_ipv4 *mask,
3024 struct flow_tcf_vxlan_encap *encap)
3026 /* Item must be validated before. No redundant checks. */
/* Addresses are taken unconditionally; the mask is not consulted. */
3028 encap->ipv4.dst = spec->hdr.dst_addr;
3029 encap->ipv4.src = spec->hdr.src_addr;
3030 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
3031 FLOW_TCF_ENCAP_IPV4_DST;
/* The mask may be NULL, so each optional field checks it first. */
3032 if (mask && mask->hdr.type_of_service) {
3033 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3034 encap->ip_tos = spec->hdr.type_of_service;
3036 if (mask && mask->hdr.time_to_live) {
3037 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3038 encap->ip_ttl_hop = spec->hdr.time_to_live;
3043 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3044 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3045 * in the encapsulation parameters structure. The item must be prevalidated,
3046 * no any validation checks performed by function.
3049 * RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3051 * RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3053 * Structure to fill the gathered IPV6 address data.
3056 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3057 const struct rte_flow_item_ipv6 *mask,
3058 struct flow_tcf_vxlan_encap *encap)
3060 /* Item must be validated before. No redundant checks. */
3062 memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3063 memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3064 encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3065 FLOW_TCF_ENCAP_IPV6_DST;
3067 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3068 IPV6_HDR_TC_SHIFT) & 0xff) {
3069 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3070 encap->ip_tos = (rte_be_to_cpu_32
3071 (spec->hdr.vtc_flow) >>
3072 IPV6_HDR_TC_SHIFT) & 0xff;
3074 if (mask->hdr.hop_limits) {
3075 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3076 encap->ip_ttl_hop = spec->hdr.hop_limits;
3082 * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3083 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3084 * in the encapsulation parameters structure. The item must be prevalidated,
3085 * no any validation checks performed by function.
3088 * RTE_FLOW_ITEM_TYPE_UDP entry specification.
3090 * RTE_FLOW_ITEM_TYPE_UDP entry mask.
3092 * Structure to fill the gathered UDP port data.
3095 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3096 const struct rte_flow_item_udp *mask,
3097 struct flow_tcf_vxlan_encap *encap)
3100 encap->udp.dst = spec->hdr.dst_port;
3101 encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3102 if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3103 encap->udp.src = spec->hdr.src_port;
3104 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC;
3109 * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3110 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3111 * in the encapsulation parameters structure. The item must be prevalidated,
3112 * no validation checks are performed by this function.
3115 * RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3117 * Structure to fill the gathered VNI address data.
/*
 * Copies the VNI bytes from the VXLAN item spec into "encap" and marks the
 * VNI as present in encap->mask. No mask argument is taken.
 */
3120 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3121 struct flow_tcf_vxlan_encap *encap)
3123 /* Item must be validated before. Do not redundant checks. */
/* The copy length is the size of the destination field. */
3125 memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3126 encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3130 * Populate consolidated encapsulation object from list of pattern items.
3132 * Helper function to process configuration of action such as
3133 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3134 * validated, there is no way to return a meaningful error.
3137 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3138 * List of pattern items to gather data from.
3140 * Structure to fill gathered data.
/*
 * Walks the item list in the "definition" of a VXLAN_ENCAP action and hands
 * each item's spec (and mask, where used) to the matching per-protocol
 * helper, which fills "encap".
 */
3143 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3144 struct flow_tcf_vxlan_encap *encap)
/* Typed views of the current item's spec and mask pointers. */
3147 const struct rte_flow_item_eth *eth;
3148 const struct rte_flow_item_ipv4 *ipv4;
3149 const struct rte_flow_item_ipv6 *ipv6;
3150 const struct rte_flow_item_udp *udp;
3151 const struct rte_flow_item_vxlan *vxlan;
3153 const struct rte_flow_item *items;
3155 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3156 assert(action->conf);
/* The encapsulation header is described as a nested item list. */
3158 items = ((const struct rte_flow_action_vxlan_encap *)
3159 action->conf)->definition;
3161 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3162 switch (items->type) {
3163 case RTE_FLOW_ITEM_TYPE_VOID:
3165 case RTE_FLOW_ITEM_TYPE_ETH:
/* The mask is passed through unchanged and may be NULL. */
3166 mask.eth = items->mask;
3167 spec.eth = items->spec;
3168 flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3171 case RTE_FLOW_ITEM_TYPE_IPV4:
3172 spec.ipv4 = items->spec;
3173 mask.ipv4 = items->mask;
3174 flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3177 case RTE_FLOW_ITEM_TYPE_IPV6:
3178 spec.ipv6 = items->spec;
3179 mask.ipv6 = items->mask;
3180 flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3183 case RTE_FLOW_ITEM_TYPE_UDP:
3184 mask.udp = items->mask;
3185 spec.udp = items->spec;
3186 flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3189 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* Only the spec is used for the VXLAN item. */
3190 spec.vxlan = items->spec;
3191 flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
/* Item types not listed above are only reported. */
3196 "unsupported item %p type %d,"
3197 " items must be validated"
3198 " before flow creation",
3199 (const void *)items, items->type);
3207 * Translate flow for Linux TC flower and construct Netlink message.
3210 * Pointer to the priv structure.
3211 * @param[in, out] flow
3212 * Pointer to the sub flow.
3214 * Pointer to the flow attributes.
3216 * Pointer to the list of items.
3217 * @param[in] actions
3218 * Pointer to the list of actions.
3220 * Pointer to the error structure.
3223 * 0 on success, a negative errno value otherwise and rte_errno is set.
3226 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3227 const struct rte_flow_attr *attr,
3228 const struct rte_flow_item items[],
3229 const struct rte_flow_action actions[],
3230 struct rte_flow_error *error)
3233 const struct rte_flow_item_port_id *port_id;
3234 const struct rte_flow_item_eth *eth;
3235 const struct rte_flow_item_vlan *vlan;
3236 const struct rte_flow_item_ipv4 *ipv4;
3237 const struct rte_flow_item_ipv6 *ipv6;
3238 const struct rte_flow_item_tcp *tcp;
3239 const struct rte_flow_item_udp *udp;
3240 const struct rte_flow_item_vxlan *vxlan;
3243 const struct rte_flow_action_port_id *port_id;
3244 const struct rte_flow_action_jump *jump;
3245 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3246 const struct rte_flow_action_of_set_vlan_vid *
3248 const struct rte_flow_action_of_set_vlan_pcp *
3252 struct flow_tcf_tunnel_hdr *hdr;
3253 struct flow_tcf_vxlan_decap *vxlan;
3258 struct flow_tcf_tunnel_hdr *hdr;
3259 struct flow_tcf_vxlan_encap *vxlan;
3263 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3264 struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3265 struct tcmsg *tcm = dev_flow->tcf.tcm;
3266 uint32_t na_act_index_cur;
3267 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3268 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3269 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3270 bool ip_proto_set = 0;
3271 bool tunnel_outer = 0;
3272 struct nlattr *na_flower;
3273 struct nlattr *na_flower_act;
3274 struct nlattr *na_vlan_id = NULL;
3275 struct nlattr *na_vlan_priority = NULL;
3276 uint64_t item_flags = 0;
3279 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3280 PTOI_TABLE_SZ_MAX(dev)));
3281 if (dev_flow->tcf.tunnel) {
3282 switch (dev_flow->tcf.tunnel->type) {
3283 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3284 decap.vxlan = dev_flow->tcf.vxlan_decap;
3287 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3288 encap.vxlan = dev_flow->tcf.vxlan_encap;
3290 /* New tunnel actions can be added here. */
3296 nlh = dev_flow->tcf.nlh;
3297 tcm = dev_flow->tcf.tcm;
3298 /* Prepare API must have been called beforehand. */
3299 assert(nlh != NULL && tcm != NULL);
3300 tcm->tcm_family = AF_UNSPEC;
3301 tcm->tcm_ifindex = ptoi[0].ifindex;
3302 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3304 * Priority cannot be zero to prevent the kernel from picking one
3307 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3308 if (attr->group > 0)
3309 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3310 mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3311 na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3312 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3315 switch (items->type) {
3316 case RTE_FLOW_ITEM_TYPE_VOID:
3318 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3319 mask.port_id = flow_tcf_item_mask
3320 (items, &rte_flow_item_port_id_mask,
3321 &flow_tcf_mask_supported.port_id,
3322 &flow_tcf_mask_empty.port_id,
3323 sizeof(flow_tcf_mask_supported.port_id),
3325 assert(mask.port_id);
3326 if (mask.port_id == &flow_tcf_mask_empty.port_id)
3328 spec.port_id = items->spec;
3329 if (!mask.port_id->id)
3332 for (i = 0; ptoi[i].ifindex; ++i)
3333 if (ptoi[i].port_id == spec.port_id->id)
3335 assert(ptoi[i].ifindex);
3336 tcm->tcm_ifindex = ptoi[i].ifindex;
3338 case RTE_FLOW_ITEM_TYPE_ETH:
3339 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3340 MLX5_FLOW_LAYER_INNER_L2 :
3341 MLX5_FLOW_LAYER_OUTER_L2;
3342 mask.eth = flow_tcf_item_mask
3343 (items, &rte_flow_item_eth_mask,
3344 &flow_tcf_mask_supported.eth,
3345 &flow_tcf_mask_empty.eth,
3346 sizeof(flow_tcf_mask_supported.eth),
3349 if (mask.eth == &flow_tcf_mask_empty.eth)
3351 spec.eth = items->spec;
3352 if (mask.eth->type) {
3353 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3354 inner_etype = spec.eth->type;
3356 outer_etype = spec.eth->type;
3360 "outer L2 addresses cannot be"
3361 " forced is outer ones for tunnel,"
3362 " parameter is ignored");
3365 if (!is_zero_ether_addr(&mask.eth->dst)) {
3366 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3368 spec.eth->dst.addr_bytes);
3369 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3371 mask.eth->dst.addr_bytes);
3373 if (!is_zero_ether_addr(&mask.eth->src)) {
3374 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3376 spec.eth->src.addr_bytes);
3377 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3379 mask.eth->src.addr_bytes);
3381 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3383 case RTE_FLOW_ITEM_TYPE_VLAN:
3386 assert(!tunnel_outer);
3387 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3388 mask.vlan = flow_tcf_item_mask
3389 (items, &rte_flow_item_vlan_mask,
3390 &flow_tcf_mask_supported.vlan,
3391 &flow_tcf_mask_empty.vlan,
3392 sizeof(flow_tcf_mask_supported.vlan),
3395 if (mask.vlan == &flow_tcf_mask_empty.vlan)
3397 spec.vlan = items->spec;
3398 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3399 outer_etype == RTE_BE16(ETH_P_8021Q));
3400 outer_etype = RTE_BE16(ETH_P_8021Q);
3401 if (mask.vlan->inner_type)
3402 vlan_etype = spec.vlan->inner_type;
3403 if (mask.vlan->tci & RTE_BE16(0xe000))
3404 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3406 (spec.vlan->tci) >> 13) & 0x7);
3407 if (mask.vlan->tci & RTE_BE16(0x0fff))
3408 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3412 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3414 case RTE_FLOW_ITEM_TYPE_IPV4:
3415 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3416 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3417 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3418 mask.ipv4 = flow_tcf_item_mask
3419 (items, &rte_flow_item_ipv4_mask,
3420 &flow_tcf_mask_supported.ipv4,
3421 &flow_tcf_mask_empty.ipv4,
3422 sizeof(flow_tcf_mask_supported.ipv4),
3425 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3426 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3427 inner_etype == RTE_BE16(ETH_P_IP));
3428 inner_etype = RTE_BE16(ETH_P_IP);
3429 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3430 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3431 vlan_etype == RTE_BE16(ETH_P_IP));
3432 vlan_etype = RTE_BE16(ETH_P_IP);
3434 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3435 outer_etype == RTE_BE16(ETH_P_IP));
3436 outer_etype = RTE_BE16(ETH_P_IP);
3438 spec.ipv4 = items->spec;
3439 if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3441 * No way to set IP protocol for outer tunnel
3442 * layers. Usually it is fixed, for example,
3443 * to UDP for VXLAN/GPE.
3445 assert(spec.ipv4); /* Mask is not empty. */
3446 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3447 spec.ipv4->hdr.next_proto_id);
3450 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3451 (!mask.ipv4->hdr.src_addr &&
3452 !mask.ipv4->hdr.dst_addr)) {
3456 * For tunnel outer we must set outer IP key
3457 * anyway, even if the specification/mask is
3458 * empty. There is no other way to tell
3459 * kernel about the outer layer protocol.
3462 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3463 mask.ipv4->hdr.src_addr);
3465 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3466 mask.ipv4->hdr.src_addr);
3467 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3470 if (mask.ipv4->hdr.src_addr) {
3472 (nlh, tunnel_outer ?
3473 TCA_FLOWER_KEY_ENC_IPV4_SRC :
3474 TCA_FLOWER_KEY_IPV4_SRC,
3475 spec.ipv4->hdr.src_addr);
3477 (nlh, tunnel_outer ?
3478 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3479 TCA_FLOWER_KEY_IPV4_SRC_MASK,
3480 mask.ipv4->hdr.src_addr);
3482 if (mask.ipv4->hdr.dst_addr) {
3484 (nlh, tunnel_outer ?
3485 TCA_FLOWER_KEY_ENC_IPV4_DST :
3486 TCA_FLOWER_KEY_IPV4_DST,
3487 spec.ipv4->hdr.dst_addr);
3489 (nlh, tunnel_outer ?
3490 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3491 TCA_FLOWER_KEY_IPV4_DST_MASK,
3492 mask.ipv4->hdr.dst_addr);
3494 if (mask.ipv4->hdr.time_to_live) {
3496 (nlh, tunnel_outer ?
3497 TCA_FLOWER_KEY_ENC_IP_TTL :
3498 TCA_FLOWER_KEY_IP_TTL,
3499 spec.ipv4->hdr.time_to_live);
3501 (nlh, tunnel_outer ?
3502 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3503 TCA_FLOWER_KEY_IP_TTL_MASK,
3504 mask.ipv4->hdr.time_to_live);
3506 if (mask.ipv4->hdr.type_of_service) {
3508 (nlh, tunnel_outer ?
3509 TCA_FLOWER_KEY_ENC_IP_TOS :
3510 TCA_FLOWER_KEY_IP_TOS,
3511 spec.ipv4->hdr.type_of_service);
3513 (nlh, tunnel_outer ?
3514 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3515 TCA_FLOWER_KEY_IP_TOS_MASK,
3516 mask.ipv4->hdr.type_of_service);
3518 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3520 case RTE_FLOW_ITEM_TYPE_IPV6: {
3521 bool ipv6_src, ipv6_dst;
3524 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3525 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3526 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3527 mask.ipv6 = flow_tcf_item_mask
3528 (items, &rte_flow_item_ipv6_mask,
3529 &flow_tcf_mask_supported.ipv6,
3530 &flow_tcf_mask_empty.ipv6,
3531 sizeof(flow_tcf_mask_supported.ipv6),
3534 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3535 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3536 inner_etype == RTE_BE16(ETH_P_IPV6));
3537 inner_etype = RTE_BE16(ETH_P_IPV6);
3538 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3539 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3540 vlan_etype == RTE_BE16(ETH_P_IPV6));
3541 vlan_etype = RTE_BE16(ETH_P_IPV6);
3543 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3544 outer_etype == RTE_BE16(ETH_P_IPV6));
3545 outer_etype = RTE_BE16(ETH_P_IPV6);
3547 spec.ipv6 = items->spec;
3548 if (!tunnel_outer && mask.ipv6->hdr.proto) {
3550 * No way to set IP protocol for outer tunnel
3551 * layers. Usually it is fixed, for example,
3552 * to UDP for VXLAN/GPE.
3554 assert(spec.ipv6); /* Mask is not empty. */
3555 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3556 spec.ipv6->hdr.proto);
3559 ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3560 (mask.ipv6->hdr.dst_addr);
3561 ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3562 (mask.ipv6->hdr.src_addr);
3563 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3564 (!ipv6_dst && !ipv6_src)) {
3568 * For tunnel outer we must set outer IP key
3569 * anyway, even if the specification/mask is
3570 * empty. There is no other way to tell
3571 * kernel about the outer layer protocol.
3574 TCA_FLOWER_KEY_ENC_IPV6_SRC,
3576 mask.ipv6->hdr.src_addr);
3578 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3580 mask.ipv6->hdr.src_addr);
3581 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3585 mnl_attr_put(nlh, tunnel_outer ?
3586 TCA_FLOWER_KEY_ENC_IPV6_SRC :
3587 TCA_FLOWER_KEY_IPV6_SRC,
3589 spec.ipv6->hdr.src_addr);
3590 mnl_attr_put(nlh, tunnel_outer ?
3591 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3592 TCA_FLOWER_KEY_IPV6_SRC_MASK,
3594 mask.ipv6->hdr.src_addr);
3597 mnl_attr_put(nlh, tunnel_outer ?
3598 TCA_FLOWER_KEY_ENC_IPV6_DST :
3599 TCA_FLOWER_KEY_IPV6_DST,
3601 spec.ipv6->hdr.dst_addr);
3602 mnl_attr_put(nlh, tunnel_outer ?
3603 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3604 TCA_FLOWER_KEY_IPV6_DST_MASK,
3606 mask.ipv6->hdr.dst_addr);
3608 if (mask.ipv6->hdr.hop_limits) {
3610 (nlh, tunnel_outer ?
3611 TCA_FLOWER_KEY_ENC_IP_TTL :
3612 TCA_FLOWER_KEY_IP_TTL,
3613 spec.ipv6->hdr.hop_limits);
3615 (nlh, tunnel_outer ?
3616 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3617 TCA_FLOWER_KEY_IP_TTL_MASK,
3618 mask.ipv6->hdr.hop_limits);
3620 msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3621 IPV6_HDR_TC_SHIFT) & 0xff;
3623 tos6 = (rte_be_to_cpu_32
3624 (spec.ipv6->hdr.vtc_flow) >>
3625 IPV6_HDR_TC_SHIFT) & 0xff;
3627 (nlh, tunnel_outer ?
3628 TCA_FLOWER_KEY_ENC_IP_TOS :
3629 TCA_FLOWER_KEY_IP_TOS, tos6);
3631 (nlh, tunnel_outer ?
3632 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3633 TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3635 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3638 case RTE_FLOW_ITEM_TYPE_UDP:
3639 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3640 MLX5_FLOW_LAYER_INNER_L4_UDP :
3641 MLX5_FLOW_LAYER_OUTER_L4_UDP;
3642 mask.udp = flow_tcf_item_mask
3643 (items, &rte_flow_item_udp_mask,
3644 &flow_tcf_mask_supported.udp,
3645 &flow_tcf_mask_empty.udp,
3646 sizeof(flow_tcf_mask_supported.udp),
3649 spec.udp = items->spec;
3650 if (!tunnel_outer) {
3653 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3655 if (mask.udp == &flow_tcf_mask_empty.udp)
3658 assert(mask.udp != &flow_tcf_mask_empty.udp);
3659 decap.vxlan->udp_port =
3661 (spec.udp->hdr.dst_port);
3663 if (mask.udp->hdr.src_port) {
3665 (nlh, tunnel_outer ?
3666 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3667 TCA_FLOWER_KEY_UDP_SRC,
3668 spec.udp->hdr.src_port);
3670 (nlh, tunnel_outer ?
3671 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3672 TCA_FLOWER_KEY_UDP_SRC_MASK,
3673 mask.udp->hdr.src_port);
3675 if (mask.udp->hdr.dst_port) {
3677 (nlh, tunnel_outer ?
3678 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3679 TCA_FLOWER_KEY_UDP_DST,
3680 spec.udp->hdr.dst_port);
3682 (nlh, tunnel_outer ?
3683 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3684 TCA_FLOWER_KEY_UDP_DST_MASK,
3685 mask.udp->hdr.dst_port);
3687 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3689 case RTE_FLOW_ITEM_TYPE_TCP:
3690 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3691 MLX5_FLOW_LAYER_INNER_L4_TCP :
3692 MLX5_FLOW_LAYER_OUTER_L4_TCP;
3693 mask.tcp = flow_tcf_item_mask
3694 (items, &rte_flow_item_tcp_mask,
3695 &flow_tcf_mask_supported.tcp,
3696 &flow_tcf_mask_empty.tcp,
3697 sizeof(flow_tcf_mask_supported.tcp),
3701 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3703 if (mask.tcp == &flow_tcf_mask_empty.tcp)
3705 spec.tcp = items->spec;
3706 if (mask.tcp->hdr.src_port) {
3707 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3708 spec.tcp->hdr.src_port);
3709 mnl_attr_put_u16(nlh,
3710 TCA_FLOWER_KEY_TCP_SRC_MASK,
3711 mask.tcp->hdr.src_port);
3713 if (mask.tcp->hdr.dst_port) {
3714 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3715 spec.tcp->hdr.dst_port);
3716 mnl_attr_put_u16(nlh,
3717 TCA_FLOWER_KEY_TCP_DST_MASK,
3718 mask.tcp->hdr.dst_port);
3720 if (mask.tcp->hdr.tcp_flags) {
3723 TCA_FLOWER_KEY_TCP_FLAGS,
3725 (spec.tcp->hdr.tcp_flags));
3728 TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3730 (mask.tcp->hdr.tcp_flags));
3732 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3734 case RTE_FLOW_ITEM_TYPE_VXLAN:
3735 assert(decap.vxlan);
3737 item_flags |= MLX5_FLOW_LAYER_VXLAN;
3738 spec.vxlan = items->spec;
3739 mnl_attr_put_u32(nlh,
3740 TCA_FLOWER_KEY_ENC_KEY_ID,
3741 vxlan_vni_as_be32(spec.vxlan->vni));
3742 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3745 return rte_flow_error_set(error, ENOTSUP,
3746 RTE_FLOW_ERROR_TYPE_ITEM,
3747 NULL, "item not supported");
3751 * Set the ether_type flower key and tc rule protocol:
3752 * - if there is neither VLAN nor VXLAN the key is taken from
3753 * eth item directly or deduced from L3 items.
3754 * - if there is vlan item then key is fixed to 802.1q.
3755 * - if there is vxlan item then key is set to inner tunnel type.
3756 * - simultaneous vlan and vxlan items are prohibited.
3758 if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3759 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3761 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3762 if (inner_etype != RTE_BE16(ETH_P_ALL))
3763 mnl_attr_put_u16(nlh,
3764 TCA_FLOWER_KEY_ETH_TYPE,
3767 mnl_attr_put_u16(nlh,
3768 TCA_FLOWER_KEY_ETH_TYPE,
3770 if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3771 vlan_etype != RTE_BE16(ETH_P_ALL))
3772 mnl_attr_put_u16(nlh,
3773 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3776 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3778 na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3779 na_act_index_cur = 1;
3780 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3781 struct nlattr *na_act_index;
3782 struct nlattr *na_act;
3783 unsigned int vlan_act;
3786 switch (actions->type) {
3787 case RTE_FLOW_ACTION_TYPE_VOID:
3789 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3790 conf.port_id = actions->conf;
3791 if (conf.port_id->original)
3794 for (i = 0; ptoi[i].ifindex; ++i)
3795 if (ptoi[i].port_id == conf.port_id->id)
3797 assert(ptoi[i].ifindex);
3799 mnl_attr_nest_start(nlh, na_act_index_cur++);
3800 assert(na_act_index);
3801 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3802 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3805 assert(dev_flow->tcf.tunnel);
3806 dev_flow->tcf.tunnel->ifindex_ptr =
3807 &((struct tc_mirred *)
3808 mnl_attr_get_payload
3809 (mnl_nlmsg_get_payload_tail
3812 mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3813 sizeof(struct tc_mirred),
3814 &(struct tc_mirred){
3815 .action = TC_ACT_STOLEN,
3816 .eaction = TCA_EGRESS_REDIR,
3817 .ifindex = ptoi[i].ifindex,
3819 mnl_attr_nest_end(nlh, na_act);
3820 mnl_attr_nest_end(nlh, na_act_index);
3822 case RTE_FLOW_ACTION_TYPE_JUMP:
3823 conf.jump = actions->conf;
3825 mnl_attr_nest_start(nlh, na_act_index_cur++);
3826 assert(na_act_index);
3827 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3828 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3830 mnl_attr_put(nlh, TCA_GACT_PARMS,
3831 sizeof(struct tc_gact),
3833 .action = TC_ACT_GOTO_CHAIN |
3836 mnl_attr_nest_end(nlh, na_act);
3837 mnl_attr_nest_end(nlh, na_act_index);
3839 case RTE_FLOW_ACTION_TYPE_DROP:
3841 mnl_attr_nest_start(nlh, na_act_index_cur++);
3842 assert(na_act_index);
3843 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3844 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3846 mnl_attr_put(nlh, TCA_GACT_PARMS,
3847 sizeof(struct tc_gact),
3849 .action = TC_ACT_SHOT,
3851 mnl_attr_nest_end(nlh, na_act);
3852 mnl_attr_nest_end(nlh, na_act_index);
3854 case RTE_FLOW_ACTION_TYPE_COUNT:
3856 * Driver adds the count action implicitly for
3857 * each rule it creates.
3859 ret = flow_tcf_translate_action_count(dev,
3864 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3865 conf.of_push_vlan = NULL;
3866 vlan_act = TCA_VLAN_ACT_POP;
3867 goto action_of_vlan;
3868 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3869 conf.of_push_vlan = actions->conf;
3870 vlan_act = TCA_VLAN_ACT_PUSH;
3871 goto action_of_vlan;
3872 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3873 conf.of_set_vlan_vid = actions->conf;
3875 goto override_na_vlan_id;
3876 vlan_act = TCA_VLAN_ACT_MODIFY;
3877 goto action_of_vlan;
3878 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3879 conf.of_set_vlan_pcp = actions->conf;
3880 if (na_vlan_priority)
3881 goto override_na_vlan_priority;
3882 vlan_act = TCA_VLAN_ACT_MODIFY;
3883 goto action_of_vlan;
3886 mnl_attr_nest_start(nlh, na_act_index_cur++);
3887 assert(na_act_index);
3888 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3889 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3891 mnl_attr_put(nlh, TCA_VLAN_PARMS,
3892 sizeof(struct tc_vlan),
3894 .action = TC_ACT_PIPE,
3895 .v_action = vlan_act,
3897 if (vlan_act == TCA_VLAN_ACT_POP) {
3898 mnl_attr_nest_end(nlh, na_act);
3899 mnl_attr_nest_end(nlh, na_act_index);
3902 if (vlan_act == TCA_VLAN_ACT_PUSH)
3903 mnl_attr_put_u16(nlh,
3904 TCA_VLAN_PUSH_VLAN_PROTOCOL,
3905 conf.of_push_vlan->ethertype);
3906 na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3907 mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3908 na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3909 mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3910 mnl_attr_nest_end(nlh, na_act);
3911 mnl_attr_nest_end(nlh, na_act_index);
3912 if (actions->type ==
3913 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3914 override_na_vlan_id:
3915 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3916 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3918 (conf.of_set_vlan_vid->vlan_vid);
3919 } else if (actions->type ==
3920 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3921 override_na_vlan_priority:
3922 na_vlan_priority->nla_type =
3923 TCA_VLAN_PUSH_VLAN_PRIORITY;
3924 *(uint8_t *)mnl_attr_get_payload
3925 (na_vlan_priority) =
3926 conf.of_set_vlan_pcp->vlan_pcp;
3929 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3930 assert(decap.vxlan);
3931 assert(dev_flow->tcf.tunnel);
3932 dev_flow->tcf.tunnel->ifindex_ptr =
3933 (unsigned int *)&tcm->tcm_ifindex;
3935 mnl_attr_nest_start(nlh, na_act_index_cur++);
3936 assert(na_act_index);
3937 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3938 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3940 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3941 sizeof(struct tc_tunnel_key),
3942 &(struct tc_tunnel_key){
3943 .action = TC_ACT_PIPE,
3944 .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3946 mnl_attr_nest_end(nlh, na_act);
3947 mnl_attr_nest_end(nlh, na_act_index);
3948 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3950 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3951 assert(encap.vxlan);
3952 flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3954 mnl_attr_nest_start(nlh, na_act_index_cur++);
3955 assert(na_act_index);
3956 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3957 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3959 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3960 sizeof(struct tc_tunnel_key),
3961 &(struct tc_tunnel_key){
3962 .action = TC_ACT_PIPE,
3963 .t_action = TCA_TUNNEL_KEY_ACT_SET,
3965 if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3966 mnl_attr_put_u16(nlh,
3967 TCA_TUNNEL_KEY_ENC_DST_PORT,
3968 encap.vxlan->udp.dst);
3969 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3970 mnl_attr_put_u32(nlh,
3971 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3972 encap.vxlan->ipv4.src);
3973 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3974 mnl_attr_put_u32(nlh,
3975 TCA_TUNNEL_KEY_ENC_IPV4_DST,
3976 encap.vxlan->ipv4.dst);
3977 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3979 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3980 sizeof(encap.vxlan->ipv6.src),
3981 &encap.vxlan->ipv6.src);
3982 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3984 TCA_TUNNEL_KEY_ENC_IPV6_DST,
3985 sizeof(encap.vxlan->ipv6.dst),
3986 &encap.vxlan->ipv6.dst);
3987 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3988 mnl_attr_put_u8(nlh,
3989 TCA_TUNNEL_KEY_ENC_TTL,
3990 encap.vxlan->ip_ttl_hop);
3991 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3992 mnl_attr_put_u8(nlh,
3993 TCA_TUNNEL_KEY_ENC_TOS,
3994 encap.vxlan->ip_tos);
3995 if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3996 mnl_attr_put_u32(nlh,
3997 TCA_TUNNEL_KEY_ENC_KEY_ID,
3999 (encap.vxlan->vxlan.vni));
4000 mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
4001 mnl_attr_nest_end(nlh, na_act);
4002 mnl_attr_nest_end(nlh, na_act_index);
4003 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4005 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
4006 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
4007 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
4008 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
4009 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
4010 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
4011 case RTE_FLOW_ACTION_TYPE_SET_TTL:
4012 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
4013 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
4014 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
4016 mnl_attr_nest_start(nlh, na_act_index_cur++);
4017 flow_tcf_create_pedit_mnl_msg(nlh,
4018 &actions, item_flags);
4019 mnl_attr_nest_end(nlh, na_act_index);
4022 return rte_flow_error_set(error, ENOTSUP,
4023 RTE_FLOW_ERROR_TYPE_ACTION,
4025 "action not supported");
4029 assert(na_flower_act);
4030 mnl_attr_nest_end(nlh, na_flower_act);
4031 dev_flow->tcf.ptc_flags = mnl_attr_get_payload
4032 (mnl_nlmsg_get_payload_tail(nlh));
4033 mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
4034 0 : TCA_CLS_FLAGS_SKIP_SW);
4035 mnl_attr_nest_end(nlh, na_flower);
4036 if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4037 dev_flow->tcf.tunnel->ifindex_org =
4038 *dev_flow->tcf.tunnel->ifindex_ptr;
4039 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4044 * Send Netlink message with acknowledgment.
4047 * Flow context to use.
4049 * Message to send. This function always raises the NLM_F_ACK flag before
4052 * Callback handler for received message.
4054 * Context pointer for callback handler.
4057 * 0 on success, a negative errno value otherwise and rte_errno is set.
4060 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4061 struct nlmsghdr *nlh,
4062 mnl_cb_t cb, void *arg)
4064 unsigned int portid = mnl_socket_get_portid(tcf->nl);
4065 uint32_t seq = tcf->seq++;
4071 /* seq 0 is reserved for kernel event-driven notifications. */
4074 nlh->nlmsg_seq = seq;
4075 nlh->nlmsg_flags |= NLM_F_ACK;
4076 ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4078 /* A message send error occurred. */
4082 nlh = (struct nlmsghdr *)(tcf->buf);
4084 * The following loop postpones non-fatal errors until multipart
4085 * messages are complete.
4088 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4092 * In case of overflow we keep receiving until the
4093 * end of the multipart message. We may lose part
4094 * of the reply messages, but mark and return an error.
4096 if (err != ENOSPC ||
4097 !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4098 nlh->nlmsg_type == NLMSG_DONE)
4101 ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4104 * libmnl returns 0 if DONE or
4105 * success ACK message found.
4111 * ACK message with error found
4112 * or some error occurred.
4117 /* We should continue receiving. */
4126 #define MNL_BUF_EXTRA_SPACE 16
4127 #define MNL_REQUEST_SIZE_MIN 256
4128 #define MNL_REQUEST_SIZE_MAX 2048
4129 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4130 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
4132 /* Data structures used by flow_tcf_xxx_cb() routines. */
4133 struct tcf_nlcb_buf {
4134 LIST_ENTRY(tcf_nlcb_buf) next;
4136 alignas(struct nlmsghdr)
4137 uint8_t msg[]; /**< Netlink message data. */
4140 struct tcf_nlcb_context {
4141 unsigned int ifindex; /**< Base interface index. */
4143 LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4147 * Allocate space for netlink command in buffer list
4149 * @param[in, out] ctx
4150 * Pointer to callback context with command buffers list.
4152 * Required size of data buffer to be allocated.
4155 * Pointer to allocated memory, aligned as message header.
4156 * NULL if some error occurred.
4158 static struct nlmsghdr *
4159 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4161 struct tcf_nlcb_buf *buf;
4162 struct nlmsghdr *nlh;
4164 size = NLMSG_ALIGN(size);
4165 buf = LIST_FIRST(&ctx->nlbuf);
4166 if (buf && (buf->size + size) <= ctx->bufsize) {
4167 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4171 if (size > ctx->bufsize) {
4172 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4175 buf = rte_malloc(__func__,
4176 ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4177 alignof(struct tcf_nlcb_buf));
4179 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4182 LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4184 nlh = (struct nlmsghdr *)&buf->msg[0];
4189 * Send the buffers with prepared netlink commands. Scans the list and
4190 * sends all found buffers. Buffers are sent and freed anyway in order
4191 * to prevent memory leakage even if sending some of the messages fails.
4194 * Context object initialized by mlx5_flow_tcf_context_create().
4195 * @param[in, out] ctx
4196 * Pointer to callback context with command buffers list.
4199 * Zero value on success, negative errno value otherwise
4200 * and rte_errno is set.
4203 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4204 struct tcf_nlcb_context *ctx)
4206 struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4210 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4211 struct nlmsghdr *nlh;
4215 while (msg < bc->size) {
4217 * Send Netlink commands from buffer in one by one
4218 * fashion. If we send multiple rule deletion commands
4219 * in one Netlink message and some error occurs it may
4220 * cause multiple ACK error messages and break sequence
4221 * numbers of Netlink communication, because we expect
4222 * the only one ACK reply.
4224 assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4225 nlh = (struct nlmsghdr *)&bc->msg[msg];
4226 assert((bc->size - msg) >= nlh->nlmsg_len);
4227 msg += nlh->nlmsg_len;
4228 rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4231 "netlink: cleanup error %d", rc);
4239 LIST_INIT(&ctx->nlbuf);
4244 * Collect local IP address rules with scope link attribute on specified
4245 * network device. This is callback routine called by libmnl mnl_cb_run()
4246 * in loop for every message in received packet.
4249 * Pointer to reply header.
4250 * @param[in, out] arg
4251 * Opaque data pointer for this callback.
4254 * A positive, nonzero value on success, negative errno value otherwise
4255 * and rte_errno is set.
4258 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4260 struct tcf_nlcb_context *ctx = arg;
4261 struct nlmsghdr *cmd;
4262 struct ifaddrmsg *ifa;
4264 struct nlattr *na_local = NULL;
4265 struct nlattr *na_peer = NULL;
4266 unsigned char family;
4269 if (nlh->nlmsg_type != RTM_NEWADDR) {
4273 ifa = mnl_nlmsg_get_payload(nlh);
4274 family = ifa->ifa_family;
4275 if (ifa->ifa_index != ctx->ifindex ||
4276 ifa->ifa_scope != RT_SCOPE_LINK ||
4277 !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4278 (family != AF_INET && family != AF_INET6))
4280 mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4281 switch (mnl_attr_get_type(na)) {
4289 if (na_local && na_peer)
4292 if (!na_local || !na_peer)
4294 /* Local rule found with scope link, permanent and assigned peer. */
4295 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4296 MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4297 (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4298 : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4299 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4304 cmd = mnl_nlmsg_put_header(cmd);
4305 cmd->nlmsg_type = RTM_DELADDR;
4306 cmd->nlmsg_flags = NLM_F_REQUEST;
4307 ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4308 ifa->ifa_flags = IFA_F_PERMANENT;
4309 ifa->ifa_scope = RT_SCOPE_LINK;
4310 ifa->ifa_index = ctx->ifindex;
4311 if (family == AF_INET) {
4312 ifa->ifa_family = AF_INET;
4313 ifa->ifa_prefixlen = 32;
4314 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4315 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4317 ifa->ifa_family = AF_INET6;
4318 ifa->ifa_prefixlen = 128;
4319 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4320 mnl_attr_get_payload(na_local));
4321 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4322 mnl_attr_get_payload(na_peer));
4324 assert(size == cmd->nlmsg_len);
4329 * Cleanup the local IP addresses on outer interface.
4332 * Context object initialized by mlx5_flow_tcf_context_create().
4333 * @param[in] ifindex
4334 * Network interface index to perform cleanup.
4337 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4338 unsigned int ifindex)
4340 struct nlmsghdr *nlh;
4341 struct ifaddrmsg *ifa;
4342 struct tcf_nlcb_context ctx = {
4344 .bufsize = MNL_REQUEST_SIZE,
4345 .nlbuf = LIST_HEAD_INITIALIZER(),
4351 * Seek and destroy leftovers of local IP addresses with
4352 * matching properties "scope link".
4354 nlh = mnl_nlmsg_put_header(tcf->buf);
4355 nlh->nlmsg_type = RTM_GETADDR;
4356 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4357 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4358 ifa->ifa_family = AF_UNSPEC;
4359 ifa->ifa_index = ifindex;
4360 ifa->ifa_scope = RT_SCOPE_LINK;
4361 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4363 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4364 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4366 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4370 * Collect neigh permanent rules on specified network device.
4371 * This is callback routine called by libmnl mnl_cb_run() in loop for
4372 * every message in received packet.
4375 * Pointer to reply header.
4376 * @param[in, out] arg
4377 * Opaque data pointer for this callback.
4380 * A positive, nonzero value on success, negative errno value otherwise
4381 * and rte_errno is set.
4384 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4386 struct tcf_nlcb_context *ctx = arg;
4387 struct nlmsghdr *cmd;
4390 struct nlattr *na_ip = NULL;
4391 struct nlattr *na_mac = NULL;
4392 unsigned char family;
4395 if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4399 ndm = mnl_nlmsg_get_payload(nlh);
4400 family = ndm->ndm_family;
4401 if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4402 !(ndm->ndm_state & NUD_PERMANENT) ||
4403 (family != AF_INET && family != AF_INET6))
4405 mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4406 switch (mnl_attr_get_type(na)) {
4414 if (na_mac && na_ip)
4417 if (!na_mac || !na_ip)
4419 /* Neigh rule with permanent attribute found. */
4420 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4421 MNL_ALIGN(sizeof(struct ndmsg)) +
4422 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4423 (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4424 : SZ_NLATTR_TYPE_OF(uint32_t));
4425 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4430 cmd = mnl_nlmsg_put_header(cmd);
4431 cmd->nlmsg_type = RTM_DELNEIGH;
4432 cmd->nlmsg_flags = NLM_F_REQUEST;
4433 ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4434 ndm->ndm_ifindex = ctx->ifindex;
4435 ndm->ndm_state = NUD_PERMANENT;
4438 if (family == AF_INET) {
4439 ndm->ndm_family = AF_INET;
4440 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4442 ndm->ndm_family = AF_INET6;
4443 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4444 mnl_attr_get_payload(na_ip));
4446 mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4447 mnl_attr_get_payload(na_mac));
4448 assert(size == cmd->nlmsg_len);
4453 * Cleanup the neigh rules on outer interface.
4456 * Context object initialized by mlx5_flow_tcf_context_create().
4457 * @param[in] ifindex
4458 * Network interface index to perform cleanup.
4461 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4462 unsigned int ifindex)
4464 struct nlmsghdr *nlh;
4466 struct tcf_nlcb_context ctx = {
4468 .bufsize = MNL_REQUEST_SIZE,
4469 .nlbuf = LIST_HEAD_INITIALIZER(),
4474 /* Seek and destroy leftovers of neigh rules. */
4475 nlh = mnl_nlmsg_put_header(tcf->buf);
4476 nlh->nlmsg_type = RTM_GETNEIGH;
4477 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4478 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4479 ndm->ndm_family = AF_UNSPEC;
4480 ndm->ndm_ifindex = ifindex;
4481 ndm->ndm_state = NUD_PERMANENT;
4482 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4484 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4485 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4487 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4491 * Collect indices of VXLAN encap/decap interfaces associated with device.
4492 * This is callback routine called by libmnl mnl_cb_run() in loop for
4493 * every message in received packet.
4496 * Pointer to reply header.
4497 * @param[in, out] arg
4498 * Opaque data pointer for this callback.
4501 * A positive, nonzero value on success, negative errno value otherwise
4502 * and rte_errno is set.
4505 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4507 struct tcf_nlcb_context *ctx = arg;
4508 struct nlmsghdr *cmd;
4509 struct ifinfomsg *ifm;
4511 struct nlattr *na_info = NULL;
4512 struct nlattr *na_vxlan = NULL;
4514 unsigned int vxindex;
4517 if (nlh->nlmsg_type != RTM_NEWLINK) {
4521 ifm = mnl_nlmsg_get_payload(nlh);
4522 if (!ifm->ifi_index) {
4526 mnl_attr_for_each(na, nlh, sizeof(*ifm))
4527 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4533 mnl_attr_for_each_nested(na, na_info) {
4534 switch (mnl_attr_get_type(na)) {
4535 case IFLA_INFO_KIND:
4536 if (!strncmp("vxlan", mnl_attr_get_str(na),
4537 mnl_attr_get_len(na)))
4540 case IFLA_INFO_DATA:
4544 if (found && na_vxlan)
4547 if (!found || !na_vxlan)
4550 mnl_attr_for_each_nested(na, na_vxlan) {
4551 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4552 mnl_attr_get_u32(na) == ctx->ifindex) {
4559 /* Attached VXLAN device found, store the command to delete. */
4560 vxindex = ifm->ifi_index;
4561 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4562 MNL_ALIGN(sizeof(struct ifinfomsg));
4563 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4568 cmd = mnl_nlmsg_put_header(cmd);
4569 cmd->nlmsg_type = RTM_DELLINK;
4570 cmd->nlmsg_flags = NLM_F_REQUEST;
4571 ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4572 ifm->ifi_family = AF_UNSPEC;
4573 ifm->ifi_index = vxindex;
4574 assert(size == cmd->nlmsg_len);
4579 * Cleanup the outer interface. Removes all found vxlan devices
4580 * attached to specified index, flushes the neigh and local IP
4584 * Context object initialized by mlx5_flow_tcf_context_create().
4585 * @param[in] ifindex
4586 * Network interface index to perform cleanup.
4589 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4590 unsigned int ifindex)
4592 struct nlmsghdr *nlh;
4593 struct ifinfomsg *ifm;
4594 struct tcf_nlcb_context ctx = {
4596 .bufsize = MNL_REQUEST_SIZE,
4597 .nlbuf = LIST_HEAD_INITIALIZER(),
4603 * Seek and destroy leftover VXLAN encap/decap interfaces with
4604 * matching properties.
4606 nlh = mnl_nlmsg_put_header(tcf->buf);
4607 nlh->nlmsg_type = RTM_GETLINK;
4608 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4609 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4610 ifm->ifi_family = AF_UNSPEC;
4611 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4613 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4614 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4616 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4620 * Emit Netlink message to add/remove local address to the outer device.
4621 * The address being added is visible within the link only (scope link).
4623 * Note that an implicit route is maintained by the kernel due to the
4624 * presence of a peer address (IFA_ADDRESS).
4626 * These rules are used for encapsulation only and allow assigning
4627 * the outer tunnel source IP address.
4630 * Libmnl socket context object.
4632 * Encapsulation properties (source address and its peer).
4633 * @param[in] ifindex
4634 * Network interface to apply rule.
4636 * Toggle between add and remove.
4638 * Perform verbose error reporting if not NULL.
4641 * 0 on success, a negative errno value otherwise and rte_errno is set.
4644 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4645 const struct flow_tcf_vxlan_encap *encap,
4646 unsigned int ifindex,
4648 struct rte_flow_error *error)
4650 struct nlmsghdr *nlh;
4651 struct ifaddrmsg *ifa;
4652 alignas(struct nlmsghdr)
4653 uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4655 nlh = mnl_nlmsg_put_header(buf);
4656 nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4658 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4660 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4661 ifa->ifa_flags = IFA_F_PERMANENT;
4662 ifa->ifa_scope = RT_SCOPE_LINK;
4663 ifa->ifa_index = ifindex;
4664 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4665 ifa->ifa_family = AF_INET;
4666 ifa->ifa_prefixlen = 32;
4667 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4668 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4669 mnl_attr_put_u32(nlh, IFA_ADDRESS,
4672 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4673 ifa->ifa_family = AF_INET6;
4674 ifa->ifa_prefixlen = 128;
4675 mnl_attr_put(nlh, IFA_LOCAL,
4676 sizeof(encap->ipv6.src),
4678 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4679 mnl_attr_put(nlh, IFA_ADDRESS,
4680 sizeof(encap->ipv6.dst),
4683 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4685 return rte_flow_error_set(error, rte_errno,
4686 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4687 "netlink: cannot complete IFA request"
4692 * Emit Netlink message to add/remove neighbor.
4695 * Libmnl socket context object.
4697 * Encapsulation properties (destination address).
4698 * @param[in] ifindex
4699 * Network interface.
4701 * Toggle between add and remove.
4703 * Perform verbose error reporting if not NULL.
4706 * 0 on success, a negative errno value otherwise and rte_errno is set.
4709 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4710 const struct flow_tcf_vxlan_encap *encap,
4711 unsigned int ifindex,
4713 struct rte_flow_error *error)
4715 struct nlmsghdr *nlh;
4717 alignas(struct nlmsghdr)
4718 uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4720 nlh = mnl_nlmsg_put_header(buf);
4721 nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4723 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4725 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4726 ndm->ndm_ifindex = ifindex;
4727 ndm->ndm_state = NUD_PERMANENT;
4730 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4731 ndm->ndm_family = AF_INET;
4732 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4734 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4735 ndm->ndm_family = AF_INET6;
4736 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4739 if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4741 "outer ethernet source address cannot be "
4742 "forced for VXLAN encapsulation");
4743 if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4744 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4746 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4748 return rte_flow_error_set(error, rte_errno,
4749 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4750 "netlink: cannot complete ND request"
4755 * Manage the local IP addresses and their peer IP addresses on the
4756 * outer interface for encapsulation purposes. The kernel searches the
4757 * appropriate device for tunnel egress traffic using the outer source
4758 * IP, this IP should be assigned to the outer network device, otherwise
4759 * kernel rejects the rule.
4761 * Adds or removes the addresses using the Netlink command like this:
4762 * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4764 * The addresses are local to the netdev ("scope link"), this reduces
4765 * the risk of conflicts. Note that an implicit route is maintained by
4766 * the kernel due to the presence of a peer address (IFA_ADDRESS).
4769 * Libmnl socket context object.
4771 * Object, contains rule database and ifouter index.
4772 * @param[in] dev_flow
4773 * Flow object, contains the tunnel parameters (for encap only).
4775 * Toggle between add and remove.
4777 * Perform verbose error reporting if not NULL.
4780 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Add or remove the local address rule (outer source IP with the outer
 * destination IP as peer) for a VXLAN encap flow on the outer interface.
 *
 * Rules are kept in iface->local and matched by address pair: IPv4 when
 * FLOW_TCF_ENCAP_IPV4_SRC is set in the encap mask, IPv6 otherwise.
 * When the last reference to a rule is dropped the rule is unlinked, its
 * memory is released and flow_tcf_rule_local() is asked to delete the
 * address from the interface.
 */
4783 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4784 struct tcf_irule *iface,
4785 struct mlx5_flow *dev_flow,
4787 struct rte_flow_error *error)
4789 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4790 struct tcf_local_rule *rule = NULL;
4794 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
/* Search for an existing rule with the same IPv4 address pair. */
4795 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4796 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4797 LIST_FOREACH(rule, &iface->local, next) {
4798 if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4799 encap->ipv4.src == rule->ipv4.src &&
4800 encap->ipv4.dst == rule->ipv4.dst) {
/* Otherwise the flow must carry an IPv6 address pair. */
4805 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4806 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4807 LIST_FOREACH(rule, &iface->local, next) {
4808 if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4809 !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4810 sizeof(encap->ipv6.src)) &&
4811 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4812 sizeof(encap->ipv6.dst))) {
/* Last reference gone: unlink the rule and delete the address. */
4822 if (!rule->refcnt || !--rule->refcnt) {
4823 LIST_REMOVE(rule, next);
/*
 * The rule is no longer reachable once it is unlinked, so
 * release it here; the delete request below only uses encap
 * and the interface index. Without this the rule was leaked.
 */
rte_free(rule);
4824 return flow_tcf_rule_local(tcf, encap,
4825 iface->ifouter, false, error);
4830 DRV_LOG(WARNING, "disabling not existing local rule");
4831 rte_flow_error_set(error, ENOENT,
4832 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4833 "disabling not existing local rule");
/* Allocate the database entry for a new local rule. */
4836 rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4837 alignof(struct tcf_local_rule));
4839 rte_flow_error_set(error, ENOMEM,
4840 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4841 "unable to allocate memory for local rule");
4844 *rule = (struct tcf_local_rule){.refcnt = 0,
/* Record the address pair the rule was created for. */
4847 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4848 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4849 | FLOW_TCF_ENCAP_IPV4_DST;
4850 rule->ipv4.src = encap->ipv4.src;
4851 rule->ipv4.dst = encap->ipv4.dst;
4853 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4854 | FLOW_TCF_ENCAP_IPV6_DST;
4855 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4856 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
/* Apply the address on the outer interface via Netlink. */
4858 ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4864 LIST_INSERT_HEAD(&iface->local, rule, next);
4869 * Manage the destination MAC/IP addresses neigh database, kernel uses
4870 * this one to determine the destination MAC address within encapsulation
4871 * header. Adds or removes the entries using the Netlink command like this:
4872 * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4875 * Libmnl socket context object.
4877 * Object, contains rule database and ifouter index.
4878 * @param[in] dev_flow
4879 * Flow object, contains the tunnel parameters (for encap only).
4881 * Toggle between add and remove.
4883 * Perform verbose error reporting if not NULL.
4886 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Add or remove the permanent neighbour entry (outer destination IP ->
 * outer destination MAC) for a VXLAN encap flow on the outer interface.
 *
 * Entries are kept in iface->neigh and matched by destination IP: IPv4
 * when FLOW_TCF_ENCAP_IPV4_DST is set in the encap mask, IPv6 otherwise.
 * An existing entry with a different MAC address is reported as EEXIST.
 * When the last reference to an entry is dropped the entry is unlinked,
 * its memory is released and flow_tcf_rule_neigh() is asked to delete it.
 */
4889 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4890 struct tcf_irule *iface,
4891 struct mlx5_flow *dev_flow,
4893 struct rte_flow_error *error)
4895 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4896 struct tcf_neigh_rule *rule = NULL;
4900 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
/* Search for an existing entry with the same IPv4 destination. */
4901 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4902 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4903 LIST_FOREACH(rule, &iface->neigh, next) {
4904 if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4905 encap->ipv4.dst == rule->ipv4.dst) {
/* Otherwise the flow must carry an IPv6 destination. */
4910 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4911 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4912 LIST_FOREACH(rule, &iface->neigh, next) {
4913 if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4914 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4915 sizeof(encap->ipv6.dst))) {
/* The same destination IP must not map to two MAC addresses. */
4921 if (memcmp(&encap->eth.dst, &rule->eth,
4922 sizeof(encap->eth.dst))) {
4923 DRV_LOG(WARNING, "Destination MAC differs"
4925 rte_flow_error_set(error, EEXIST,
4926 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4927 NULL, "Different MAC address"
4928 " neigh rule for the same"
/* Last reference gone: unlink the entry and delete it. */
4936 if (!rule->refcnt || !--rule->refcnt) {
4937 LIST_REMOVE(rule, next);
/*
 * The entry is no longer reachable once it is unlinked, so
 * release it here; the delete request below does not use it.
 * Without this the entry was leaked.
 */
rte_free(rule);
4938 return flow_tcf_rule_neigh(tcf, encap,
4945 DRV_LOG(WARNING, "Disabling not existing neigh rule");
/* ENOENT: report the missing rule, not an allocation failure. */
4946 rte_flow_error_set(error, ENOENT,
4947 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4948 "disabling not existing neigh rule");
/* Allocate the database entry for a new neigh rule. */
4951 rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4952 alignof(struct tcf_neigh_rule));
4954 rte_flow_error_set(error, ENOMEM,
4955 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4956 "unable to allocate memory for neigh rule");
4959 *rule = (struct tcf_neigh_rule){.refcnt = 0,
/* Record the destination IP and MAC the entry was created for. */
4962 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4963 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4964 rule->ipv4.dst = encap->ipv4.dst;
4966 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4967 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4969 memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
/* Install the neighbour entry on the outer interface via Netlink. */
4970 ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4976 LIST_INSERT_HEAD(&iface->neigh, rule, next);
4980 /* VXLAN encap rule database for outer interfaces. */
/* Searched and extended by flow_tcf_encap_irule_acquire(). */
4981 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4983 /* VTEP device list is shared between PMD port instances. */
4984 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
/* Taken by flow_tcf_vtep_acquire() and flow_tcf_vtep_release(). */
4985 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4988 * Acquire the VXLAN encap rules container for specified interface.
4989 * First looks for the container in the existing ones list, creates
4990 * and initializes the new container if existing not found.
4993 * Context object initialized by mlx5_flow_tcf_context_create().
4994 * @param[in] ifouter
4995 * Network interface index to create VXLAN encap rules on.
4997 * Perform verbose error reporting if not NULL.
4999 * Rule container pointer on success,
5000 * NULL otherwise and rte_errno is set.
/*
 * Return the encap rule container for outer interface `ifouter`.
 * iface_list_vxlan is searched by interface index first; when no
 * container matches, a new one is allocated, initialized with empty
 * local/neigh lists and linked at the head of iface_list_vxlan.
 */
5002 static struct tcf_irule*
5003 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
5004 unsigned int ifouter,
5005 struct rte_flow_error *error)
5007 struct tcf_irule *iface;
5009 /* Look whether the container for encap rules is created. */
5011 LIST_FOREACH(iface, &iface_list_vxlan, next) {
5012 if (iface->ifouter == ifouter)
5016 /* Container already exists, just increment the reference. */
5020 /* Not found, we should create the new container. */
5021 iface = rte_zmalloc(__func__, sizeof(*iface),
5022 alignof(struct tcf_irule));
/* Allocation failure is reported through `error` as ENOMEM. */
5024 rte_flow_error_set(error, ENOMEM,
5025 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5026 "unable to allocate memory for container");
5029 *iface = (struct tcf_irule){
5030 .local = LIST_HEAD_INITIALIZER(),
5031 .neigh = LIST_HEAD_INITIALIZER(),
5035 /* Interface cleanup for new container created. */
/* NOTE(review): presumably flushes stale rules left on ifouter - confirm. */
5036 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5037 flow_tcf_encap_local_cleanup(tcf, ifouter);
5038 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5039 LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5044 * Releases VXLAN encap rules container by pointer. Decrements the
5045 * reference counter and deletes the container if counter is zero.
5048 * VXLAN rule container pointer to release.
/*
 * Drop one reference to an encap rule container. When the counter
 * reaches zero the container must hold no local or neigh rules (asserted)
 * and is unlinked from its list.
 */
5051 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5053 assert(iface->refcnt);
5054 if (--iface->refcnt == 0) {
5055 /* Reference counter is zero, delete the container. */
5056 assert(LIST_EMPTY(&iface->local));
5057 assert(LIST_EMPTY(&iface->neigh));
5058 LIST_REMOVE(iface, next);
5064 * Deletes VTEP network device.
5067 * Context object initialized by mlx5_flow_tcf_context_create().
5069 * Object representing the network device to delete. Memory
5070 * allocated for this object is freed by routine.
/*
 * Delete a VTEP device that has no remaining references (asserted).
 * An RTM_DELLINK request is sent only for devices flagged as created
 * by this driver that have a non-zero interface index.
 */
5073 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5074 struct tcf_vtep *vtep)
5076 struct nlmsghdr *nlh;
5077 struct ifinfomsg *ifm;
/* Stack buffer sized for the Netlink header plus one ifinfomsg. */
5078 alignas(struct nlmsghdr)
5079 uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5080 MNL_BUF_EXTRA_SPACE];
5083 assert(!vtep->refcnt);
5084 /* Delete only the interfaces we actually created. */
5085 if (vtep->created && vtep->ifindex) {
5086 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5087 nlh = mnl_nlmsg_put_header(buf);
5088 nlh->nlmsg_type = RTM_DELLINK;
5089 nlh->nlmsg_flags = NLM_F_REQUEST;
5090 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5091 ifm->ifi_family = AF_UNSPEC;
5092 ifm->ifi_index = vtep->ifindex;
5093 assert(sizeof(buf) >= nlh->nlmsg_len);
5094 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
/* A failed delete is only logged. */
5096 DRV_LOG(WARNING, "netlink: error deleting vxlan"
5097 " encap/decap ifindex %u",
5104 * Creates VTEP network device.
5107 * Context object initialized by mlx5_flow_tcf_context_create().
5109 * UDP port of created VTEP device.
5111 * Perform verbose error reporting if not NULL.
5114 * Pointer to created device structure on success,
5115 * NULL otherwise and rte_errno is set.
/*
 * Create (or adopt, when it already exists) the VXLAN VTEP network device
 * for UDP port `port`, bring its link up and initialize TC on it.
 *
 * Steps: allocate the descriptor, send RTM_NEWLINK with the "vxlan" link
 * info, resolve the interface index by name, send a second RTM_NEWLINK to
 * set IFF_UP, then call mlx5_flow_tcf_init() for the new interface index.
 * On a late failure the descriptor is passed to flow_tcf_vtep_delete().
 */
5117 static struct tcf_vtep*
5118 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5119 uint16_t port, struct rte_flow_error *error)
5121 struct tcf_vtep *vtep;
5122 struct nlmsghdr *nlh;
5123 struct ifinfomsg *ifm;
5124 char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
/*
 * Request buffer: header + ifinfomsg, device name, two nests,
 * the "vxlan" kind string, one u32 (VNI), one u16 (UDP port)
 * and up to three u8 attributes.
 */
5125 alignas(struct nlmsghdr)
5126 uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5127 SZ_NLATTR_DATA_OF(sizeof(name)) +
5128 SZ_NLATTR_NEST * 2 +
5129 SZ_NLATTR_STRZ_OF("vxlan") +
5130 SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5131 SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5132 SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5133 MNL_BUF_EXTRA_SPACE];
5134 struct nlattr *na_info;
5135 struct nlattr *na_vxlan;
/* The UDP port attribute is sent in network byte order. */
5136 rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5139 vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5141 rte_flow_error_set(error, ENOMEM,
5142 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5143 "unable to allocate memory for VTEP");
5146 *vtep = (struct tcf_vtep){
/* Build the exclusive-create request for the VXLAN device. */
5149 memset(buf, 0, sizeof(buf));
5150 nlh = mnl_nlmsg_put_header(buf);
5151 nlh->nlmsg_type = RTM_NEWLINK;
5152 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5153 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5154 ifm->ifi_family = AF_UNSPEC;
5157 ifm->ifi_flags = IFF_UP;
5158 ifm->ifi_change = 0xffffffff;
/* Device name is the driver prefix followed by the UDP port. */
5159 snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5160 mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5161 na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5163 mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5164 na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5166 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5168 * RH 7.2 does not support metadata for tunnel device.
5169 * It does not matter because we are going to use the
5170 * hardware offload by mlx5 driver.
5172 mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5174 mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5175 mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5176 mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5177 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5179 * We must specify VNI explicitly if metadata not supported.
5180 * Note, VNI is transferred with native endianness format.
/*
 * IFLA_VXLAN_ID is a 32-bit attribute (the buffer above reserves a
 * uint32_t for it), so it must be written with the u32 helper; a
 * 16-bit payload is too short for the kernel attribute policy.
 */
5182 mnl_attr_put_u32(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5184 mnl_attr_nest_end(nlh, na_vxlan);
5185 mnl_attr_nest_end(nlh, na_info);
5186 assert(sizeof(buf) >= nlh->nlmsg_len);
5187 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5190 "netlink: VTEP %s create failure (%d)",
/* EEXIST is tolerated: the device may already be present. */
5192 if (rte_errno != EEXIST)
5194 * Some unhandled error occurred or device is
5195 * for encapsulation and cannot be shared.
5200 * Mark device we actually created.
5201 * We should explicitly delete
5202 * when we do not need it anymore.
5206 /* Try to get ifindex of created or pre-existing device. */
5207 ret = if_nametoindex(name);
5210 "VTEP %s failed to get index (%d)", name, errno);
5213 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5214 "netlink: failed to retrieve VTEP ifindex");
5217 vtep->ifindex = ret;
/* Second request: force the link up on the resolved interface. */
5218 memset(buf, 0, sizeof(buf));
5219 nlh = mnl_nlmsg_put_header(buf);
5220 nlh->nlmsg_type = RTM_NEWLINK;
5221 nlh->nlmsg_flags = NLM_F_REQUEST;
5222 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5223 ifm->ifi_family = AF_UNSPEC;
5225 ifm->ifi_index = vtep->ifindex;
5226 ifm->ifi_flags = IFF_UP;
5227 ifm->ifi_change = IFF_UP;
5228 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
/*
 * Report the positive rte_errno left by flow_tcf_nl_ack(), as the
 * other Netlink failure paths in this file do; "-errno" stored a
 * negative and possibly stale value in rte_errno.
 */
5230 rte_flow_error_set(error, rte_errno,
5231 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5232 "netlink: failed to set VTEP link up");
5233 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
/* Prepare TC on the VTEP interface. */
5237 ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5239 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5242 DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
/* Failure cleanup: the descriptor is handed to the delete routine. */
5246 flow_tcf_vtep_delete(tcf, vtep);
5254 * Acquire target interface index for VXLAN tunneling decapsulation.
5255 * In order to share the UDP port within the other interfaces the
5256 * VXLAN device created as not attached to any interface (if created).
5259 * Context object initialized by mlx5_flow_tcf_context_create().
5260 * @param[in] dev_flow
5261 * Flow tcf object with tunnel structure pointer set.
5263 * Perform verbose error reporting if not NULL.
5265 * Interface descriptor pointer on success,
5266 * NULL otherwise and rte_errno is set.
/*
 * Return the VTEP device used for VXLAN decapsulation on the UDP port
 * taken from the flow's decap descriptor. vtep_list_vxlan is searched by
 * port; a new device is created and linked when no entry matches.
 */
5268 static struct tcf_vtep*
5269 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5270 struct mlx5_flow *dev_flow,
5271 struct rte_flow_error *error)
5273 struct tcf_vtep *vtep;
5274 uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
/* Look for an existing VTEP bound to the same UDP port. */
5276 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5277 if (vtep->port == port)
5281 /* Device exists, just increment the reference counter. */
5283 assert(vtep->ifindex);
5286 /* No decapsulation device exists, try to create the new one. */
5287 vtep = flow_tcf_vtep_create(tcf, port, error);
5289 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5294 * Acquire target interface index for VXLAN tunneling encapsulation.
5297 * Context object initialized by mlx5_flow_tcf_context_create().
5298 * @param[in] ifouter
5299 * Network interface index to attach VXLAN encap device to.
5300 * @param[in] dev_flow
5301 * Flow tcf object with tunnel structure pointer set.
5303 * Perform verbose error reporting if not NULL.
5305 * Interface descriptor pointer on success,
5306 * NULL otherwise and rte_errno is set.
/*
 * Return the VTEP device used for VXLAN encapsulation and set up the
 * ancillary rules on the outer interface `ifouter`.
 *
 * The VTEP is looked up in vtep_list_vxlan by the UDP destination port of
 * the encap descriptor and created/linked when missing. The rule container
 * for `ifouter` is then acquired and the local address and neigh rules are
 * installed. On failure the steps already done are rolled back; a VTEP
 * whose reference counter drops to zero is unlinked from vtep_list_vxlan
 * before it is deleted, as flow_tcf_vtep_release() does.
 */
5308 static struct tcf_vtep*
5309 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5310 unsigned int ifouter,
5311 struct mlx5_flow *dev_flow,
5312 struct rte_flow_error *error)
5314 static uint16_t port;
5315 struct tcf_vtep *vtep;
5316 struct tcf_irule *iface;
5320 /* Look whether the VTEP for specified port is created. */
5321 port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5322 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5323 if (vtep->port == port)
5327 /* VTEP already exists, just increment the reference. */
5330 /* Not found, we should create the new VTEP. */
5331 vtep = flow_tcf_vtep_create(tcf, port, error);
5334 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5336 assert(vtep->ifindex);
5337 iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
/*
 * Container acquisition failed: drop our VTEP reference. The VTEP
 * is linked in vtep_list_vxlan at this point, so it has to be
 * unlinked before deletion to avoid a dangling list entry.
 */
5339 if (--vtep->refcnt == 0) {
LIST_REMOVE(vtep, next);
5340 flow_tcf_vtep_delete(tcf, vtep);
}
5343 dev_flow->tcf.vxlan_encap->iface = iface;
5344 /* Create local ipaddr with peer to specify the outer IPs. */
5345 ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5347 /* Create neigh rule to specify outer destination MAC. */
5348 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
/* Neigh rule failed: undo the local address rule. */
5350 flow_tcf_encap_local(tcf, iface,
5351 dev_flow, false, error);
/* Rollback: detach and release the container, then the VTEP. */
5354 dev_flow->tcf.vxlan_encap->iface = NULL;
5355 flow_tcf_encap_irule_release(iface);
5356 if (--vtep->refcnt == 0) {
/* Unlink before deletion, see above. */
LIST_REMOVE(vtep, next);
5357 flow_tcf_vtep_delete(tcf, vtep);
}
5364 * Acquires target interface index for tunneling of any type.
5365 * Creates the new VTEP if needed.
5368 * Context object initialized by mlx5_flow_tcf_context_create().
5369 * @param[in] ifouter
5370 * Network interface index to create VXLAN encap rules on.
5371 * @param[in] dev_flow
5372 * Flow tcf object with tunnel structure pointer set.
5374 * Perform verbose error reporting if not NULL.
5376 * Interface descriptor pointer on success,
5377 * NULL otherwise and rte_errno is set.
/*
 * Acquire the VTEP device for the flow's tunnel action, dispatching on
 * the tunnel type to the encap or decap variant. The whole operation runs
 * with vtep_list_mutex held.
 */
5379 static struct tcf_vtep*
5380 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5381 unsigned int ifouter,
5382 struct mlx5_flow *dev_flow,
5383 struct rte_flow_error *error)
5385 struct tcf_vtep *vtep = NULL;
5387 assert(dev_flow->tcf.tunnel);
5388 pthread_mutex_lock(&vtep_list_mutex);
5389 switch (dev_flow->tcf.tunnel->type) {
5390 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5391 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5394 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5395 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
/* Any other tunnel type is rejected with ENOTSUP. */
5398 rte_flow_error_set(error, ENOTSUP,
5399 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5400 "unsupported tunnel type");
5403 pthread_mutex_unlock(&vtep_list_mutex);
5408 * Release tunneling interface by ifindex. Decrements reference
5409 * counter and actually removes the device if counter is zero.
5412 * Context object initialized by mlx5_flow_tcf_context_create().
5414 * VTEP device descriptor structure.
5415 * @param[in] dev_flow
5416 * Flow tcf object with tunnel structure pointer set.
/*
 * Release the VTEP device held by a tunnel flow. For encap flows the
 * neigh and local rules and the rule container are released first. The
 * VTEP is unlinked and deleted when its reference counter reaches zero.
 * The whole operation runs with vtep_list_mutex held.
 */
5419 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5420 struct tcf_vtep *vtep,
5421 struct mlx5_flow *dev_flow)
5423 assert(dev_flow->tcf.tunnel);
5424 pthread_mutex_lock(&vtep_list_mutex);
5425 switch (dev_flow->tcf.tunnel->type) {
5426 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5428 case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5429 struct tcf_irule *iface;
5431 /* Remove the encap ancillary rules first. */
5432 iface = dev_flow->tcf.vxlan_encap->iface;
/* Errors are not reported here: NULL is passed as the error object. */
5434 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5435 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5436 flow_tcf_encap_irule_release(iface);
5437 dev_flow->tcf.vxlan_encap->iface = NULL;
5442 DRV_LOG(WARNING, "Unsupported tunnel type");
/* Drop the VTEP reference; the last user unlinks and deletes it. */
5445 assert(vtep->refcnt);
5446 if (--vtep->refcnt == 0) {
5447 LIST_REMOVE(vtep, next);
5448 flow_tcf_vtep_delete(tcf, vtep);
5450 pthread_mutex_unlock(&vtep_list_mutex);
/* Context passed to flow_tcf_collect_query_cb() for a filter query. */
5453 struct tcf_nlcb_query {
/* Set to 1 by the callback once TCA_FLOWER_FLAGS has been seen. */
5456 uint32_t flags_valid:1;
5460 * Collect queried rule attributes. This is callback routine called by
5461 * libmnl mnl_cb_run() in loop for every message in received packet.
5462 * Current implementation collects the flower flags only.
5465 * Pointer to reply header.
5466 * @param[in, out] arg
5467 * Context pointer for this callback.
5470 * A positive, nonzero value on success (required by libmnl
5471 * to continue messages processing).
/*
 * libmnl callback for filter query replies. Only RTM_NEWTFILTER messages
 * whose tcm_handle equals the queried handle are examined; the nested
 * flower options are scanned for TCA_FLOWER_FLAGS, which is read as u32
 * and marked valid in the query context.
 */
5474 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5476 struct tcf_nlcb_query *query = arg;
5477 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5478 struct nlattr *na, *na_opt;
5479 bool flower = false;
/* Skip replies for other message types or other filter handles. */
5481 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5482 tcm->tcm_handle != query->handle)
/* Walk the top-level attributes that follow the tcmsg header. */
5484 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5485 switch (mnl_attr_get_type(na)) {
5487 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5488 /* Not flower filter, drop entire message. */
5495 /* Not flower options, drop entire message. */
5498 /* Check nested flower options. */
5499 mnl_attr_for_each_nested(na_opt, na) {
5500 switch (mnl_attr_get_type(na_opt)) {
5501 case TCA_FLOWER_FLAGS:
5502 query->flags_valid = 1;
5504 mnl_attr_get_u32(na_opt);
5515 * Query a TC flower rule flags via netlink.
5518 * Context object initialized by mlx5_flow_tcf_context_create().
5519 * @param[in] dev_flow
5520 * Pointer to the flow.
5521 * @param[out] pflags
5522 * pointer to the data retrieved by the query.
5525 * 0 on success, a negative errno value otherwise.
/*
 * Query the flower flags of an installed rule. An RTM_GETTFILTER request
 * is built in the context buffer from the flow's tcmsg, and the reply is
 * parsed by flow_tcf_collect_query_cb(); the flags are stored in *pflags
 * only when the callback reported them as valid.
 */
5528 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5529 struct mlx5_flow *dev_flow,
5532 struct nlmsghdr *nlh;
5534 struct tcf_nlcb_query query = {
5535 .handle = dev_flow->tcf.tcm->tcm_handle,
5538 nlh = mnl_nlmsg_put_header(tcf->buf);
5539 nlh->nlmsg_type = RTM_GETTFILTER;
5540 nlh->nlmsg_flags = NLM_F_REQUEST;
5541 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
/* Reuse the tcmsg header of the rule to address the same filter. */
5542 memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5544 * Ignore Netlink error for filter query operations.
5545 * The reply length is sent by kernel as errno.
5546 * Just check we got the flags option.
5548 flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5549 if (!query.flags_valid) {
5553 *pflags = query.tc_flags;
5558 * Query and check the in_hw set for specified rule.
5561 * Context object initialized by mlx5_flow_tcf_context_create().
5562 * @param[in] dev_flow
5563 * Pointer to the flow to check.
5566 * 0 on success, a negative errno value otherwise.
/*
 * Query the rule flags and check TCA_CLS_FLAGS_IN_HW: returns 0 when the
 * bit is set and -ENOENT when it is not.
 */
5569 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5570 struct mlx5_flow *dev_flow)
5575 ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5578 return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5582 * Remove flow from E-Switch by sending Netlink message.
5585 * Pointer to Ethernet device.
5586 * @param[in, out] flow
5587 * Pointer to the sub flow.
/*
 * Remove an applied flow: the stored Netlink message is resent as
 * RTM_DELTFILTER, the VTEP held by a tunnel flow is released, and the
 * applied flag is cleared.
 */
5590 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5592 struct priv *priv = dev->data->dev_private;
5593 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5594 struct mlx5_flow *dev_flow;
5595 struct nlmsghdr *nlh;
5599 dev_flow = LIST_FIRST(&flow->dev_flows);
5602 /* E-Switch flow can't be expanded. */
5603 assert(!LIST_NEXT(dev_flow, next));
5604 if (dev_flow->tcf.applied) {
/* Reuse the message built for the rule, changing only type and flags. */
5605 nlh = dev_flow->tcf.nlh;
5606 nlh->nlmsg_type = RTM_DELTFILTER;
5607 nlh->nlmsg_flags = NLM_F_REQUEST;
/* The result of the delete request is not checked. */
5608 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5609 if (dev_flow->tcf.tunnel) {
5610 assert(dev_flow->tcf.tunnel->vtep);
5611 flow_tcf_vtep_release(ctx,
5612 dev_flow->tcf.tunnel->vtep,
5614 dev_flow->tcf.tunnel->vtep = NULL;
5616 dev_flow->tcf.applied = 0;
5621 * Apply flow to E-Switch by sending Netlink message.
5624 * Pointer to Ethernet device.
5625 * @param[in, out] flow
5626 * Pointer to the sub flow.
5628 * Pointer to the error structure.
5631 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Apply a flow by sending its stored Netlink message as an exclusive
 * RTM_NEWTFILTER request.
 *
 * For tunnel flows the VTEP device is acquired first and its interface
 * index is written through ifindex_ptr into the message. After a
 * successful request without TCA_CLS_FLAGS_SKIP_SW the in_hw flag is
 * checked and the rule is removed again when it is not set. When the
 * request itself fails, the VTEP acquired for a tunnel flow is released.
 */
5634 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5635 struct rte_flow_error *error)
5637 struct priv *priv = dev->data->dev_private;
5638 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5639 struct mlx5_flow *dev_flow;
5640 struct nlmsghdr *nlh;
5642 dev_flow = LIST_FIRST(&flow->dev_flows);
5643 /* E-Switch flow can't be expanded. */
5644 assert(!LIST_NEXT(dev_flow, next));
5645 if (dev_flow->tcf.applied)
5647 nlh = dev_flow->tcf.nlh;
5648 nlh->nlmsg_type = RTM_NEWTFILTER;
5649 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5650 if (dev_flow->tcf.tunnel) {
5652 * Replace the interface index, target for
5653 * encapsulation, source for decapsulation.
5655 assert(!dev_flow->tcf.tunnel->vtep);
5656 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5657 /* Acquire actual VTEP device when rule is being applied. */
5658 dev_flow->tcf.tunnel->vtep =
5659 flow_tcf_vtep_acquire(ctx,
5660 dev_flow->tcf.tunnel->ifindex_org,
5662 if (!dev_flow->tcf.tunnel->vtep)
/* Log as "original -> VTEP", matching the assignment below. */
5664 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5665 dev_flow->tcf.tunnel->ifindex_org,
5666 dev_flow->tcf.tunnel->vtep->ifindex);
5667 *dev_flow->tcf.tunnel->ifindex_ptr =
5668 dev_flow->tcf.tunnel->vtep->ifindex;
5670 if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5671 dev_flow->tcf.applied = 1;
5672 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5675 * Rule was applied without skip_sw flag set.
5676 * We should check whether the rule was actually
5677 * accepted by hardware (have look at in_hw flag).
5679 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5680 flow_tcf_remove(dev, flow);
5681 return rte_flow_error_set
5683 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5684 "netlink: rule has no in_hw flag set");
5688 if (dev_flow->tcf.tunnel) {
5689 /* Rollback the VTEP configuration if rule apply failed. */
5690 assert(dev_flow->tcf.tunnel->vtep);
5691 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5693 dev_flow->tcf.tunnel->vtep = NULL;
5695 return rte_flow_error_set(error, rte_errno,
5696 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5697 "netlink: failed to create TC flow rule");
5701 * Remove flow from E-Switch and release resources of the device flow.
5704 * Pointer to Ethernet device.
5705 * @param[in, out] flow
5706 * Pointer to the sub flow.
/*
 * Destroy a flow: remove it from the E-Switch, drop its counter
 * reference (freeing the counter on the last reference) and unlink the
 * single device flow from the flow's list.
 */
5709 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5711 struct mlx5_flow *dev_flow;
5715 flow_tcf_remove(dev, flow);
5716 if (flow->counter) {
/* The counter is reference counted; free it with the last user. */
5717 if (--flow->counter->ref_cnt == 0) {
5718 rte_free(flow->counter);
5719 flow->counter = NULL;
5722 dev_flow = LIST_FIRST(&flow->dev_flows);
5725 /* E-Switch flow can't be expanded. */
5726 assert(!LIST_NEXT(dev_flow, next));
5727 LIST_REMOVE(dev_flow, next);
5732 * Helper routine for figuring the space size required for a parse buffer.
5735 * array of values to use.
5737 * Current location in array.
5739 * Value to compare with.
5742 * The maximum between the given value and the array value on index.
/*
 * Return `value` when idx is negative, otherwise the larger of
 * array[idx] and `value`. The parse routines below use the result to
 * size their attribute tables.
 */
5745 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5747 return idx < 0 ? (value) : RTE_MAX((array)[idx], value);
5751 * Parse rtnetlink message attributes filling the attribute table with the info
5755 * Attribute table to be filled.
5757 * Maximum entry in the attribute table.
5759 * The attributes section in the message to be parsed.
5761 * The length of the attributes section in the message.
/*
 * Walk a run of rtattr records of total length `len`, after clearing the
 * (max + 1)-entry table `tb`. Attribute types above `max`, and types
 * whose table slot is already taken, are not considered.
 */
5764 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5765 struct rtattr *rta, int len)
5767 unsigned short type;
/* Start from an empty table: max + 1 slots, indexed by type. */
5768 memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5769 while (RTA_OK(rta, len)) {
5770 type = rta->rta_type;
5771 if (type <= max && !tb[type])
5773 rta = RTA_NEXT(rta, len);
5778 * Extract flow counters from flower action.
5781 * flower action stats properties in the Netlink message received.
5783 * The backward sequence of rta_types, as written in the attribute table,
5784 * we need to traverse in order to get to the requested object.
5786 * Current location in rta_type table.
5788 * data holding the count statistics of the rte_flow retrieved from
5792 * 0 if data was found and retrieved, -1 otherwise.
/*
 * Parse the statistics attributes nested in `rta` and, when
 * rta_type[idx] is TCA_STATS_BASIC and that attribute is present, copy
 * its payload into `data`.
 */
5795 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5796 uint16_t rta_type[], int idx,
5797 struct gnet_stats_basic *data)
/* Table size covers both the requested type and the default bound. */
5799 int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5801 struct rtattr *tbs[tca_stats_max + 1];
5803 if (rta == NULL || idx < 0)
5805 flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5806 RTA_DATA(rta), RTA_PAYLOAD(rta));
5807 switch (rta_type[idx]) {
5808 case TCA_STATS_BASIC:
5809 if (tbs[TCA_STATS_BASIC]) {
/* The copy length is bounded with RTE_MIN on the payload size. */
5810 memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5811 RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5823 * Parse flower single action retrieving the requested action attribute,
5827 * flower action properties in the Netlink message received.
5829 * The backward sequence of rta_types, as written in the attribute table,
5830 * we need to traverse in order to get to the requested object.
5832 * Current location in rta_type table.
5834 * Count statistics retrieved from the message query.
5837 * 0 if data was found and retrieved, -1 otherwise.
/*
 * Parse the attributes of one flower action. The action must carry
 * TCA_ACT_KIND; when TCA_ACT_STATS is present, its contents are handed
 * to flow_tcf_nl_action_stats_parse_and_get() with `data` treated as a
 * struct gnet_stats_basic.
 */
5840 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5841 uint16_t rta_type[], int idx, void *data)
/* Table size covers both the requested type and TCA_ACT_STATS. */
5843 int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5844 struct rtattr *tb[tca_act_max + 1];
5846 if (arg == NULL || idx < 0)
5848 flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5849 RTA_DATA(arg), RTA_PAYLOAD(arg));
5850 if (tb[TCA_ACT_KIND] == NULL)
5852 switch (rta_type[idx]) {
5854 if (tb[TCA_ACT_STATS])
5855 return flow_tcf_nl_action_stats_parse_and_get
5858 (struct gnet_stats_basic *)data);
5867 * Parse flower action section in the message retrieving the requested
5868 * attribute from the first action that provides it.
5871 * flower section in the Netlink message received.
5873 * The backward sequence of rta_types, as written in the attribute table,
5874 * we need to traverse in order to get to the requested object.
5876 * Current location in rta_type table.
5878 * data retrieved from the message query.
5881 * 0 if data was found and retrieved, -1 otherwise.
/*
 * Parse the flower action list nested in `arg` into a table of up to
 * TCA_ACT_MAX_PRIO + 1 entries and try the entries in index order with
 * flow_tcf_nl_parse_one_action_and_get().
 */
5884 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5885 uint16_t rta_type[], int idx, void *data)
5887 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5890 if (arg == NULL || idx < 0)
5892 flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5893 RTA_DATA(arg), RTA_PAYLOAD(arg));
5894 switch (rta_type[idx]) {
5896 * flow counters are stored in the actions defined by the flow
5897 * and not in the flow itself, therefore we need to traverse the
5898 * flower chain of actions in search for them.
5900 * Note that the index is not decremented here.
/* Scan every action slot of the table. */
5903 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5905 !flow_tcf_nl_parse_one_action_and_get(tb[i],
5918 * Parse flower classifier options in the message, retrieving the requested
5919 * attribute if found.
5922 * flower section in the Netlink message received.
5924 * The backward sequence of rta_types, as written in the attribute table,
5925 * we need to traverse in order to get to the requested object.
5927 * Current location in rta_type table.
5929 * data retrieved from the message query.
5932 * 0 if data was found and retrieved, -1 otherwise.
/*
 * Parse the flower classifier options nested in `opt`. When
 * rta_type[idx] is TCA_FLOWER_ACT and the attribute is present, parsing
 * continues in flow_tcf_nl_action_parse_and_get() with idx decremented.
 */
5935 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5936 uint16_t rta_type[], int idx, void *data)
/* Table size covers both the requested type and the default bound. */
5938 int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5940 struct rtattr *tb[tca_flower_max + 1];
5942 if (!opt || idx < 0)
5944 flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5945 RTA_DATA(opt), RTA_PAYLOAD(opt));
5946 switch (rta_type[idx]) {
5947 case TCA_FLOWER_ACT:
5948 if (tb[TCA_FLOWER_ACT])
/* Step to the previous entry of rta_type for the next level. */
5949 return flow_tcf_nl_action_parse_and_get
5950 (tb[TCA_FLOWER_ACT],
5951 rta_type, --idx, data);
5960 * Parse Netlink reply on filter query, retrieving the flow counters.
5963 * Message received from Netlink.
5965 * The backward sequence of rta_types, as written in the attribute table,
5966 * we need to traverse in order to get to the requested object.
5968 * Current location in rta_type table.
5970 * data retrieved from the message query.
5973 * 0 if data was found and retrieved, -1 otherwise.
/*
 * Parse a TC filter Netlink message. Only RTM_NEWTFILTER, RTM_GETTFILTER
 * and RTM_DELTFILTER messages of kind "flower" are considered; when
 * TCA_OPTIONS is present, parsing continues in
 * flow_tcf_nl_opts_parse_and_get().
 */
5976 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
5977 uint16_t rta_type[], int idx, void *data)
5979 struct nlmsghdr *nlh = cnlh;
5980 struct tcmsg *t = NLMSG_DATA(nlh);
5981 int len = nlh->nlmsg_len;
/* Table size covers both the requested type and TCA_OPTIONS. */
5982 int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
5983 struct rtattr *tb[tca_max + 1];
5987 if (nlh->nlmsg_type != RTM_NEWTFILTER &&
5988 nlh->nlmsg_type != RTM_GETTFILTER &&
5989 nlh->nlmsg_type != RTM_DELTFILTER)
/* Attribute length is the message length minus header and tcmsg. */
5991 len -= NLMSG_LENGTH(sizeof(*t));
5994 flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
5995 /* Not a TC flower flow - bail out */
5996 if (!tb[TCA_KIND] ||
5997 strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
5999 switch (rta_type[idx]) {
6001 if (tb[TCA_OPTIONS])
6002 return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
6013 * A callback to parse Netlink reply on TC flower query.
6016 * Message received from Netlink.
6018 * Pointer to data area to be filled by the parsing routine.
6019 * assumed to be a pointer to struct flow_tcf_stats_basic.
/*
 * libmnl callback that extracts the basic statistics of a flower rule
 * from a query reply. `data` is used as a struct flow_tcf_stats_basic:
 * its counters are filled by the parser and its valid flag is set when
 * the parser returns 0.
 */
6025 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6028 * The backward sequence of rta_types to pass in order to get
/* Listed innermost first; parsing starts from the last entry. */
6031 uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6032 TCA_FLOWER_ACT, TCA_OPTIONS };
6033 struct flow_tcf_stats_basic *sb_data = data;
/* The union yields a non-const view of nlh for the parser. */
6035 const struct nlmsghdr *c;
6036 struct nlmsghdr *nc;
6037 } tnlh = { .c = nlh };
6039 if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6040 RTE_DIM(rta_type) - 1,
6041 (void *)&sb_data->counters))
6042 sb_data->valid = true;
6047 * Query a TC flower rule for its statistics via netlink.
6050 * Pointer to Ethernet device.
6052 * Pointer to the sub flow.
6054 * data retrieved by the query.
6056 * Perform verbose error reporting if not NULL.
6059 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Query the packet/byte counters of a flow. The rule's stored Netlink
 * message is resent as RTM_GETTFILTER with NLM_F_ECHO, the reply is
 * parsed by flow_tcf_nl_message_get_stats_basic(), and the difference to
 * the values saved in flow->counter is written to the
 * rte_flow_query_count result. Flows without a counter are rejected.
 */
6062 flow_tcf_query_count(struct rte_eth_dev *dev,
6063 struct rte_flow *flow,
6065 struct rte_flow_error *error)
6067 struct flow_tcf_stats_basic sb_data;
6068 struct rte_flow_query_count *qc = data;
6069 struct priv *priv = dev->data->dev_private;
6070 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6071 struct mnl_socket *nl = ctx->nl;
6072 struct mlx5_flow *dev_flow;
6073 struct nlmsghdr *nlh;
/* Take a fresh sequence number for this request. */
6074 uint32_t seq = priv->tcf_context->seq++;
6078 memset(&sb_data, 0, sizeof(sb_data));
6079 dev_flow = LIST_FIRST(&flow->dev_flows);
6080 /* E-Switch flow can't be expanded. */
6081 assert(!LIST_NEXT(dev_flow, next));
6082 if (!dev_flow->flow->counter)
/* Reuse the message built for the rule as the query request. */
6084 nlh = dev_flow->tcf.nlh;
6085 nlh->nlmsg_type = RTM_GETTFILTER;
6086 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6087 nlh->nlmsg_seq = seq;
6088 if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
/* Receive into the context buffer and run the stats callback. */
6091 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6094 ret = mnl_cb_run(ctx->buf, ret, seq,
6095 mnl_socket_get_portid(nl),
6096 flow_tcf_nl_message_get_stats_basic,
6099 /* The callback marks sb_data valid when counters were found. */
6100 if (sb_data.valid) {
6101 /* Return the delta from last reset. */
6104 qc->hits = sb_data.counters.packets - flow->counter->hits;
6105 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
/* Save the current totals as the new baseline. */
6107 flow->counter->hits = sb_data.counters.packets;
6108 flow->counter->bytes = sb_data.counters.bytes;
6112 return rte_flow_error_set(error, EINVAL,
6113 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6115 "flow does not have counter");
6117 return rte_flow_error_set
6118 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6119 NULL, "netlink: failed to read flow rule counters");
6121 return rte_flow_error_set
6122 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6123 NULL, "counters are not available.");
/*
 * Query entry point of the TC flower driver. It walks the action array
 * up to RTE_FLOW_ACTION_TYPE_END and forwards COUNT actions to
 * flow_tcf_query_count(); the ENOTSUP return reports
 * "action not supported".
 * NOTE(review): this listing has lines elided (braces, the data
 * parameter, the ret declaration, the body of the VOID case, what
 * presumably is a default label before the ENOTSUP return, and the final
 * return); confirm the exact control flow in the complete file.
 */
6129 * @see rte_flow_query()
6133 flow_tcf_query(struct rte_eth_dev *dev,
6134 struct rte_flow *flow,
6135 const struct rte_flow_action *actions,
6137 struct rte_flow_error *error)
/* The caller's actions pointer is advanced directly; no loop index is used. */
6141 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6142 switch (actions->type) {
6143 case RTE_FLOW_ACTION_TYPE_VOID:
6145 case RTE_FLOW_ACTION_TYPE_COUNT:
/* ret holds the result of the most recent COUNT query. */
6146 ret = flow_tcf_query_count(dev, flow, data, error);
6149 return rte_flow_error_set(error, ENOTSUP,
6150 RTE_FLOW_ERROR_TYPE_ACTION,
6152 "action not supported");
/*
 * Operations table of the TC flower flow driver: each member of
 * struct mlx5_flow_driver_ops is bound to the matching flow_tcf_*
 * handler of this file. The object is const and has external linkage.
 * NOTE(review): the closing of the initializer is elided in this listing.
 */
6158 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6159 .validate = flow_tcf_validate,
6160 .prepare = flow_tcf_prepare,
6161 .translate = flow_tcf_translate,
6162 .apply = flow_tcf_apply,
6163 .remove = flow_tcf_remove,
6164 .destroy = flow_tcf_destroy,
6165 .query = flow_tcf_query,
/*
 * Opens a NETLINK_ROUTE libmnl socket, sets the NETLINK_CAP_ACK socket
 * option on it and binds it with MNL_SOCKET_AUTOPID. The socket is
 * closed again on what is presumably the failure path.
 * NOTE(review): this listing has lines elided (braces, NULL checks, the
 * size argument of the setsockopt call and the return statements);
 * check the complete file before relying on details not shown here.
 */
6169 * Create and configure a libmnl socket for Netlink flow rules.
6172 * A valid libmnl socket object pointer on success, NULL otherwise and
6175 static struct mnl_socket *
6176 flow_tcf_mnl_socket_create(void)
6178 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
/*
 * The option value 1 is supplied through a compound literal; the result
 * of this call is not used.
 */
6181 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
/*
 * A zero return from mnl_socket_bind() selects the branch whose body is
 * elided here (presumably the success return - confirm).
 */
6183 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6188 mnl_socket_close(nl);
/*
 * Releases a libmnl socket by calling mnl_socket_close() on it.
 * NOTE(review): this listing has lines elided (return type, braces and
 * any guard around the close call); confirm in the complete file whether
 * a NULL socket is tolerated.
 */
6193 * Destroy a libmnl socket.
6196 * Libmnl socket of the @p NETLINK_ROUTE kind.
6199 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6202 mnl_socket_close(nl);
/*
 * Resets the ingress qdisc of the interface identified by ifindex: an
 * RTM_DELQDISC request is sent first (EINVAL and ENOENT failures are
 * tolerated), followed by an RTM_NEWQDISC request of kind "ingress".
 * Both requests are built in the same stack buffer and sent through
 * flow_tcf_nl_ack(); a failure is reported with rte_flow_error_set()
 * using the current rte_errno.
 * NOTE(review): this listing has lines elided (return type, braces, the
 * declaration of tcm, the tails of the two error strings and the final
 * return); check the complete file before relying on details not shown.
 */
6206 * Initialize ingress qdisc of a given network interface.
6209 * Pointer to tc-flower context to use.
6211 * Index of network interface to initialize.
6213 * Perform verbose error reporting if not NULL.
6216 * 0 on success, a negative errno value otherwise and rte_errno is set.
6219 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6220 unsigned int ifindex, struct rte_flow_error *error)
6222 struct nlmsghdr *nlh;
/*
 * Message buffer aligned for struct nlmsghdr and sized for one tcmsg
 * header, the "ingress" string attribute and MNL_BUF_EXTRA_SPACE.
 */
6224 alignas(struct nlmsghdr)
6225 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6226 SZ_NLATTR_STRZ_OF("ingress") +
6227 MNL_BUF_EXTRA_SPACE];
6229 /* Destroy existing ingress qdisc and everything attached to it. */
6230 nlh = mnl_nlmsg_put_header(buf);
6231 nlh->nlmsg_type = RTM_DELQDISC;
6232 nlh->nlmsg_flags = NLM_F_REQUEST;
6233 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6234 tcm->tcm_family = AF_UNSPEC;
6235 tcm->tcm_ifindex = ifindex;
6236 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6237 tcm->tcm_parent = TC_H_INGRESS;
/* Sanity check that the message built above fits in buf. */
6238 assert(sizeof(buf) >= nlh->nlmsg_len);
6239 /* Ignore errors when qdisc is already absent. */
6240 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6241 rte_errno != EINVAL && rte_errno != ENOENT)
6242 return rte_flow_error_set(error, rte_errno,
6243 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6244 "netlink: failed to remove ingress"
6246 /* Create fresh ingress qdisc. */
/* The same buffer is reused: a new header is written at its start. */
6247 nlh = mnl_nlmsg_put_header(buf);
6248 nlh->nlmsg_type = RTM_NEWQDISC;
6249 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6250 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6251 tcm->tcm_family = AF_UNSPEC;
6252 tcm->tcm_ifindex = ifindex;
6253 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6254 tcm->tcm_parent = TC_H_INGRESS;
/*
 * Size-bounded append of the TCA_KIND attribute; the call's result is
 * not used, the assert below checks the resulting message length.
 */
6255 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6256 assert(sizeof(buf) >= nlh->nlmsg_len);
/* Unlike the removal above, any failure of the creation is reported. */
6257 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6258 return rte_flow_error_set(error, rte_errno,
6259 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6260 "netlink: failed to create ingress"
6266 * Create libmnl context for Netlink flow rules.
6269 * A valid tc-flower context object pointer on success, NULL otherwise and
/*
 * Builds a tc-flower context: allocates the context with rte_zmalloc(),
 * creates its Netlink socket with flow_tcf_mnl_socket_create(),
 * allocates a message buffer of MNL_SOCKET_BUFFER_SIZE bytes and takes
 * the initial sequence number from random().
 * mlx5_flow_tcf_context_destroy() is called on what is presumably the
 * error path.
 * NOTE(review): this listing has lines elided (braces, the remaining
 * arguments of the first rte_zmalloc() call, NULL checks, goto/label
 * lines and the return statements); check the complete file before
 * relying on details not shown here.
 */
6272 struct mlx5_flow_tcf_context *
6273 mlx5_flow_tcf_context_create(void)
6275 struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6280 ctx->nl = flow_tcf_mnl_socket_create();
/* Message buffer of ctx->buf_size bytes, allocated with sizeof(uint32_t) alignment. */
6283 ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6284 ctx->buf = rte_zmalloc(__func__,
6285 ctx->buf_size, sizeof(uint32_t));
/* Starting value for the netlink sequence numbers used with this context. */
6288 ctx->seq = random();
6291 mlx5_flow_tcf_context_destroy(ctx);
6296 * Destroy a libmnl context.
6299 * Pointer to the tc-flower context (holding the @p NETLINK_ROUTE libmnl socket) to destroy.
6302 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6306 flow_tcf_mnl_socket_destroy(ctx->nl);