1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
23 #include <sys/socket.h>
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
36 #ifdef HAVE_TC_ACT_VLAN
38 #include <linux/tc_act/tc_vlan.h>
40 #else /* HAVE_TC_ACT_VLAN */
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
56 #endif /* HAVE_TC_ACT_VLAN */
/*
 * Use the kernel's pedit action header when the build detected it;
 * otherwise fall back to the local copies of its definitions below.
 */
58 #ifdef HAVE_TC_ACT_PEDIT
60 #include <linux/tc_act/tc_pedit.h>
62 #else /* HAVE_TC_ACT_PEDIT */
76 TCA_PEDIT_KEY_EX_HTYPE = 1,
77 TCA_PEDIT_KEY_EX_CMD = 2,
78 __TCA_PEDIT_KEY_EX_MAX
/* Header the pedit key offset is relative to. */
81 enum pedit_header_type {
82 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83 TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84 TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85 TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86 TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87 TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
92 TCA_PEDIT_KEY_EX_CMD_SET = 0,
93 TCA_PEDIT_KEY_EX_CMD_ADD = 1,
100 __u32 off; /* offset */
107 struct tc_pedit_sel {
111 struct tc_pedit_key keys[0];
114 #endif /* HAVE_TC_ACT_PEDIT */
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
118 #include <linux/tc_act/tc_tunnel_key.h>
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
152 struct tc_tunnel_key {
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
159 /* Normally found in linux/netlink.h. */
160 #ifndef NETLINK_CAP_ACK
161 #define NETLINK_CAP_ACK 10
164 /* Normally found in linux/pkt_sched.h. */
165 #ifndef TC_H_MIN_INGRESS
166 #define TC_H_MIN_INGRESS 0xfff2u
169 /* Normally found in linux/pkt_cls.h. */
170 #ifndef TCA_CLS_FLAGS_SKIP_SW
171 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
173 #ifndef TCA_CLS_FLAGS_IN_HW
174 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
176 #ifndef HAVE_TCA_CHAIN
179 #ifndef HAVE_TCA_FLOWER_ACT
180 #define TCA_FLOWER_ACT 3
182 #ifndef HAVE_TCA_FLOWER_FLAGS
183 #define TCA_FLOWER_FLAGS 22
185 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
186 #define TCA_FLOWER_KEY_ETH_TYPE 8
188 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
189 #define TCA_FLOWER_KEY_ETH_DST 4
191 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
192 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
194 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
195 #define TCA_FLOWER_KEY_ETH_SRC 6
197 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
198 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
200 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
201 #define TCA_FLOWER_KEY_IP_PROTO 9
203 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
204 #define TCA_FLOWER_KEY_IPV4_SRC 10
206 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
207 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
209 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
210 #define TCA_FLOWER_KEY_IPV4_DST 12
212 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
213 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
215 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
216 #define TCA_FLOWER_KEY_IPV6_SRC 14
218 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
219 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
221 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
222 #define TCA_FLOWER_KEY_IPV6_DST 16
224 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
225 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
227 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
228 #define TCA_FLOWER_KEY_TCP_SRC 18
230 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
231 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
233 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
234 #define TCA_FLOWER_KEY_TCP_DST 19
236 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
237 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
239 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
240 #define TCA_FLOWER_KEY_UDP_SRC 20
242 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
243 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
245 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
246 #define TCA_FLOWER_KEY_UDP_DST 21
248 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
249 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
251 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
252 #define TCA_FLOWER_KEY_VLAN_ID 23
254 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
255 #define TCA_FLOWER_KEY_VLAN_PRIO 24
257 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
258 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
260 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
261 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
263 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
264 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
266 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
267 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
269 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
270 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
272 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
273 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
275 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
276 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
278 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
279 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
281 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
282 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
284 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
285 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
287 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
288 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
290 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
291 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
293 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
294 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
296 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
297 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
299 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
300 #define TCA_FLOWER_KEY_TCP_FLAGS 71
302 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
303 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
305 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
306 #define TCA_FLOWER_KEY_IP_TOS 73
308 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
309 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
311 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
312 #define TCA_FLOWER_KEY_IP_TTL 75
314 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
315 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
317 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
318 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
320 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
321 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
323 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
324 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
326 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
327 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
330 #ifndef HAVE_TC_ACT_GOTO_CHAIN
331 #define TC_ACT_GOTO_CHAIN 0x20000000
334 #ifndef IPV6_ADDR_LEN
335 #define IPV6_ADDR_LEN 16
338 #ifndef IPV4_ADDR_LEN
339 #define IPV4_ADDR_LEN 4
343 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
350 #ifndef TCA_ACT_MAX_PRIO
351 #define TCA_ACT_MAX_PRIO 32
354 /** Parameters of VXLAN devices created by driver. */
355 #define MLX5_VXLAN_DEFAULT_VNI 1
356 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
358 /** Tunnel action type, used for @p type in header structure. */
359 enum flow_tcf_tunact_type {
360 FLOW_TCF_TUNACT_VXLAN_DECAP,
361 FLOW_TCF_TUNACT_VXLAN_ENCAP,
364 /** Flags used for @p mask in tunnel action encap descriptors. */
365 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
366 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
367 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
368 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
369 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
370 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
371 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
372 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
373 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
374 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
375 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
378 * Structure for holding netlink context.
379 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
380 * Using this (8KB) buffer size ensures that netlink messages will never be
383 struct mlx5_flow_tcf_context {
384 struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
385 uint32_t seq; /* Message sequence number. */
386 uint32_t buf_size; /* Message buffer size. */
387 uint8_t *buf; /* Message buffer. */
391 * Neigh rule structure. The neigh rule is applied via Netlink to
392 * outer tunnel iface in order to provide destination MAC address
393 * for the VXLAN encapsulation. The neigh rule is implicitly related
394 * to the Flow itself and can be shared by multiple Flows.
396 struct tcf_neigh_rule {
397 LIST_ENTRY(tcf_neigh_rule) next;
399 struct ether_addr eth;
406 uint8_t dst[IPV6_ADDR_LEN];
412 * Local rule structure. The local rule is applied via Netlink to
413 * outer tunnel iface in order to provide local and peer IP addresses
414 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
415 * related to the Flow itself and can be shared by multiple Flows.
417 struct tcf_local_rule {
418 LIST_ENTRY(tcf_local_rule) next;
427 uint8_t dst[IPV6_ADDR_LEN];
428 uint8_t src[IPV6_ADDR_LEN];
433 /** Outer interface VXLAN encapsulation rules container. */
435 LIST_ENTRY(tcf_irule) next;
436 LIST_HEAD(, tcf_neigh_rule) neigh;
437 LIST_HEAD(, tcf_local_rule) local;
439 unsigned int ifouter; /**< Own interface index. */
442 /** VXLAN virtual netdev. */
444 LIST_ENTRY(tcf_vtep) next;
446 unsigned int ifindex; /**< Own interface index. */
451 /** Tunnel descriptor header, common for all tunnel types. */
452 struct flow_tcf_tunnel_hdr {
453 uint32_t type; /**< Tunnel action type. */
454 struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
455 unsigned int ifindex_org; /**< Original dst/src interface */
456 unsigned int *ifindex_ptr; /**< Interface ptr in message. */
459 struct flow_tcf_vxlan_decap {
460 struct flow_tcf_tunnel_hdr hdr;
464 struct flow_tcf_vxlan_encap {
465 struct flow_tcf_tunnel_hdr hdr;
466 struct tcf_irule *iface;
471 struct ether_addr dst;
472 struct ether_addr src;
480 uint8_t dst[IPV6_ADDR_LEN];
481 uint8_t src[IPV6_ADDR_LEN];
493 /** Structure used when extracting the values of flow counters
494 * from a netlink message.
496 struct flow_tcf_stats_basic {
498 struct gnet_stats_basic counters;
501 /** Empty masks for known item types. */
503 struct rte_flow_item_port_id port_id;
504 struct rte_flow_item_eth eth;
505 struct rte_flow_item_vlan vlan;
506 struct rte_flow_item_ipv4 ipv4;
507 struct rte_flow_item_ipv6 ipv6;
508 struct rte_flow_item_tcp tcp;
509 struct rte_flow_item_udp udp;
510 struct rte_flow_item_vxlan vxlan;
511 } flow_tcf_mask_empty = {
515 /** Supported masks for known item types. */
516 static const struct {
517 struct rte_flow_item_port_id port_id;
518 struct rte_flow_item_eth eth;
519 struct rte_flow_item_vlan vlan;
520 struct rte_flow_item_ipv4 ipv4;
521 struct rte_flow_item_ipv6 ipv6;
522 struct rte_flow_item_tcp tcp;
523 struct rte_flow_item_udp udp;
524 struct rte_flow_item_vxlan vxlan;
525 } flow_tcf_mask_supported = {
530 .type = RTE_BE16(0xffff),
531 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
532 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
535 /* PCP and VID only, no DEI. */
536 .tci = RTE_BE16(0xefff),
537 .inner_type = RTE_BE16(0xffff),
540 .next_proto_id = 0xff,
541 .time_to_live = 0xff,
542 .type_of_service = 0xff,
543 .src_addr = RTE_BE32(0xffffffff),
544 .dst_addr = RTE_BE32(0xffffffff),
548 .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_FL_SHIFT),
551 "\xff\xff\xff\xff\xff\xff\xff\xff"
552 "\xff\xff\xff\xff\xff\xff\xff\xff",
554 "\xff\xff\xff\xff\xff\xff\xff\xff"
555 "\xff\xff\xff\xff\xff\xff\xff\xff",
558 .src_port = RTE_BE16(0xffff),
559 .dst_port = RTE_BE16(0xffff),
563 .src_port = RTE_BE16(0xffff),
564 .dst_port = RTE_BE16(0xffff),
567 .vni = "\xff\xff\xff",
/* Size helpers used when estimating the netlink message length. */
/* Aligned size of a bare netlink attribute header. */
571 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
/* A nest attribute contributes only its own header. */
572 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
/* Aligned size of an attribute carrying len bytes of payload. */
573 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
/* Aligned size of an attribute whose payload is one object of type typ. */
574 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
/* Aligned size of an attribute holding str plus its NUL terminator. */
575 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
/*
 * Upper bound on the number of ptoi[] entries for dev: the value returned by
 * mlx5_dev_to_port_id(..., NULL, 0) plus 2.
 * NOTE(review): presumably the two extra slots cover the own-port fallback
 * and the zero-ifindex terminator used by flow_tcf_build_ptoi_table() - confirm.
 */
577 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
579 /** DPDK port to network interface index (ifindex) conversion. */
580 struct flow_tcf_ptoi {
581 uint16_t port_id; /**< DPDK port ID. */
582 unsigned int ifindex; /**< Network interface index. */
585 /* Due to a limitation on driver/FW. */
586 #define MLX5_TCF_GROUP_ID_MAX 3
589 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
590 * Priority in the rte_flow attribute starts from 0 and is incremented by 1
591 * during translation. This is subject to change: the max priority may be
592 * determined by trial-and-error, like the Verbs driver, once the restriction is lifted or
593 * the range is extended.
595 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
597 #define MLX5_TCF_FATE_ACTIONS \
598 (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
599 MLX5_FLOW_ACTION_JUMP)
601 #define MLX5_TCF_VLAN_ACTIONS \
602 (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
603 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
605 #define MLX5_TCF_VXLAN_ACTIONS \
606 (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
608 #define MLX5_TCF_PEDIT_ACTIONS \
609 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
610 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
611 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
612 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
613 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
615 #define MLX5_TCF_CONFIG_ACTIONS \
616 (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
617 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
618 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
619 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
/* Capacity of the keys[] and keys_ex[] arrays in struct pedit_parser. */
621 #define MAX_PEDIT_KEYS 128
/* Number of value bytes carried by one pedit key. */
622 #define SZ_PEDIT_KEY_VAL 4
/*
 * Number of pedit keys needed to cover sz bytes: sz divided by
 * SZ_PEDIT_KEY_VAL, rounded up.
 */
624 #define NUM_OF_PEDIT_KEYS(sz) \
625 (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
627 struct pedit_key_ex {
628 enum pedit_header_type htype;
632 struct pedit_parser {
633 struct tc_pedit_sel sel;
634 struct tc_pedit_key keys[MAX_PEDIT_KEYS];
635 struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
639 * Create space for using the implicitly created TC flow counter.
642 * Pointer to the Ethernet device structure.
645 * A pointer to the counter data structure, NULL otherwise and
/*
 * Allocate a flow counter object for the TC-created counter.
 * Takes no parameters.
 * NOTE(review): this listing skips several embedded line numbers inside the
 * function (template initialiser, NULL check, return) - consult the full
 * source before relying on details.
 */
648 static struct mlx5_flow_counter *
649 flow_tcf_counter_new(void)
651 struct mlx5_flow_counter *cnt;
654 * eswitch counter cannot be shared and its id is unknown.
655 * currently returning all with id 0.
656 * in the future maybe better to switch to unique numbers.
658 struct mlx5_flow_counter tmpl = {
/* One counter-sized allocation, tagged with this function's name. */
661 cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
667 /* Implicit counter, do not add to list. */
672 * Set pedit key of MAC address
675 * pointer to action specification
676 * @param[in,out] p_parser
677 * pointer to pedit_parser
/*
 * Fill the pedit keys that rewrite a 6-byte Ethernet address. The address is
 * copied in two pieces (4 bytes, then the remaining 2) because one key value
 * holds SZ_PEDIT_KEY_VAL bytes.
 * NOTE(review): this listing skips embedded line 696 between the two pieces;
 * presumably idx is advanced there so the second piece uses the next key
 * slot - confirm against the full source.
 */
680 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
681 struct pedit_parser *p_parser)
/* Continue after the keys already stored in the parser. */
683 int idx = p_parser->sel.nkeys;
/* SET_MAC_SRC targets s_addr; any other action type targets d_addr. */
684 uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
685 offsetof(struct ether_hdr, s_addr) :
686 offsetof(struct ether_hdr, d_addr);
687 const struct rte_flow_action_set_mac *conf =
688 (const struct rte_flow_action_set_mac *)actions->conf;
/* First piece: address bytes 0..3. */
690 p_parser->keys[idx].off = off;
/*
 * ~UINT32_MAX evaluates to 0.
 * NOTE(review): presumably pedit keeps the packet bits set in mask, so 0
 * means "replace all 32 bits" - confirm.
 */
691 p_parser->keys[idx].mask = ~UINT32_MAX;
692 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
693 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
694 memcpy(&p_parser->keys[idx].val,
695 conf->mac_addr, SZ_PEDIT_KEY_VAL);
/* Second piece: the remaining ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL bytes. */
697 p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
/* NOTE(review): presumably preserves the 2 bytes that follow the address - confirm. */
698 p_parser->keys[idx].mask = 0xFFFF0000;
699 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
700 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
701 memcpy(&p_parser->keys[idx].val,
702 conf->mac_addr + SZ_PEDIT_KEY_VAL,
703 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
/* Publish the new key count back to the selector. */
704 p_parser->sel.nkeys = (++idx);
708 * Set pedit key of decrease/set ttl
711 * pointer to action specification
712 * @param[in,out] p_parser
713 * pointer to pedit_parser
714 * @param[in] item_flags
715 * flags of all items presented
/*
 * Fill one pedit key that either sets or decrements the TTL / hop limit.
 * The header type and field offset are picked from item_flags; the command
 * and value are picked from the action type.
 */
718 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
719 struct pedit_parser *p_parser,
722 int idx = p_parser->sel.nkeys;
/* NOTE(review): presumably restricts the edit to a single byte - confirm. */
724 p_parser->keys[idx].mask = 0xFFFFFF00;
/* Outer IPv4 present: edit ipv4_hdr.time_to_live. */
725 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
726 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
727 p_parser->keys[idx].off =
728 offsetof(struct ipv4_hdr, time_to_live);
/* Outer IPv6 present: edit ipv6_hdr.hop_limits. */
730 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
731 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
732 p_parser->keys[idx].off =
733 offsetof(struct ipv6_hdr, hop_limits);
/*
 * DEC_TTL is expressed as an ADD of 0xFF.
 * NOTE(review): presumably relies on 8-bit wrap-around so that adding 0xFF
 * subtracts one - confirm.
 */
735 if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
736 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
737 p_parser->keys[idx].val = 0x000000FF;
/* Otherwise write the ttl_value taken from the action configuration. */
739 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
740 p_parser->keys[idx].val =
741 (__u32)((const struct rte_flow_action_set_ttl *)
742 actions->conf)->ttl_value;
/* Exactly one key was consumed. */
744 p_parser->sel.nkeys = (++idx);
748 * Set pedit key of transport (TCP/UDP) port value
751 * pointer to action specification
752 * @param[in,out] p_parser
753 * pointer to pedit_parser
754 * @param[in] item_flags
755 * flags of all items presented
/*
 * Fill one pedit key that sets the transport-layer source or destination
 * port to the value in the action configuration.
 */
758 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
759 struct pedit_parser *p_parser,
762 int idx = p_parser->sel.nkeys;
/*
 * Header type follows the outer L4 item; the TCP check runs last, so it
 * wins if both flags are set.
 */
764 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
765 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
766 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
767 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
768 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
769 /* offset of src/dst port is same for TCP and UDP */
770 p_parser->keys[idx].off =
771 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
772 offsetof(struct tcp_hdr, src_port) :
773 offsetof(struct tcp_hdr, dst_port);
/* NOTE(review): presumably limits the edit to the 2-byte port - confirm. */
774 p_parser->keys[idx].mask = 0xFFFF0000;
/* The port is stored as given in the configuration, widened to 32 bits. */
775 p_parser->keys[idx].val =
776 (__u32)((const struct rte_flow_action_set_tp *)
777 actions->conf)->port;
/* Exactly one key was consumed. */
778 p_parser->sel.nkeys = (++idx);
782 * Set pedit key of ipv6 address
785 * pointer to action specification
786 * @param[in,out] p_parser
787 * pointer to pedit_parser
/*
 * Fill the pedit keys that overwrite an IPv6 source or destination address,
 * one key per SZ_PEDIT_KEY_VAL-byte piece of the address.
 * NOTE(review): this listing skips embedded lines 795 and 809-810 (the
 * off_base declaration and the end of the memcpy call).
 */
790 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
791 struct pedit_parser *p_parser)
793 int idx = p_parser->sel.nkeys;
/* Keys needed to cover IPV6_ADDR_LEN bytes. */
794 int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
/* SET_IPV6_SRC targets src_addr; any other action type targets dst_addr. */
796 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
797 offsetof(struct ipv6_hdr, src_addr) :
798 offsetof(struct ipv6_hdr, dst_addr);
799 const struct rte_flow_action_set_ipv6 *conf =
800 (const struct rte_flow_action_set_ipv6 *)actions->conf;
/* Key i covers address bytes starting at i * SZ_PEDIT_KEY_VAL. */
802 for (int i = 0; i < keys; i++, idx++) {
803 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
804 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
805 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
/* ~UINT32_MAX evaluates to 0. */
806 p_parser->keys[idx].mask = ~UINT32_MAX;
807 memcpy(&p_parser->keys[idx].val,
808 conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
/* Account for all keys written by the loop. */
811 p_parser->sel.nkeys += keys;
815 * Set pedit key of ipv4 address
818 * pointer to action specification
819 * @param[in,out] p_parser
820 * pointer to pedit_parser
/*
 * Fill one pedit key that overwrites the IPv4 source or destination address
 * with the address from the action configuration.
 */
823 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
824 struct pedit_parser *p_parser)
/* Continue after the keys already stored in the parser. */
826 int idx = p_parser->sel.nkeys;
828 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
829 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
/* SET_IPV4_SRC targets src_addr; any other action type targets dst_addr. */
830 p_parser->keys[idx].off =
831 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
832 offsetof(struct ipv4_hdr, src_addr) :
833 offsetof(struct ipv4_hdr, dst_addr);
/* ~UINT32_MAX evaluates to 0. */
834 p_parser->keys[idx].mask = ~UINT32_MAX;
/* The address is copied as stored in the configuration. */
835 p_parser->keys[idx].val =
836 ((const struct rte_flow_action_set_ipv4 *)
837 actions->conf)->ipv4_addr;
/* Exactly one key was consumed. */
838 p_parser->sel.nkeys = (++idx);
842 * Create the pedit netlink attributes (nlattr) in a netlink message
843 * built on a pre-allocated message buffer.
846 * pointer to pre-allocated netlink message buffer
847 * @param[in,out] actions
848 * pointer to pointer of actions specification.
849 * @param[in,out] action_flags
850 * pointer to actions flags
851 * @param[in] item_flags
852 * flags of all item presented
/*
 * Translate a run of consecutive header-rewrite actions into a single tc
 * "pedit" action appended to the netlink message nl. The caller's *actions
 * cursor is advanced by the loop below.
 * NOTE(review): this listing skips several embedded line numbers (remaining
 * parameters, break statements, the exit label and what follows it).
 */
855 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
856 const struct rte_flow_action **actions,
859 struct pedit_parser p_parser;
860 struct nlattr *na_act_options;
861 struct nlattr *na_pedit_keys;
/* Start from an all-zero parser so that sel.nkeys counts up from 0. */
863 memset(&p_parser, 0, sizeof(p_parser));
864 mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
865 na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
866 /* all modify header actions should be in one tc-pedit action */
867 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
/* Each supported action type appends its key(s) to p_parser. */
868 switch ((*actions)->type) {
869 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
870 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
871 flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
873 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
874 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
875 flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
877 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
878 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
879 flow_tcf_pedit_key_set_tp_port(*actions,
880 &p_parser, item_flags);
882 case RTE_FLOW_ACTION_TYPE_SET_TTL:
883 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
884 flow_tcf_pedit_key_set_dec_ttl(*actions,
885 &p_parser, item_flags);
887 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
888 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
889 flow_tcf_pedit_key_set_mac(*actions, &p_parser);
/* Any other action type ends the run of pedit actions. */
892 goto pedit_mnl_msg_done;
/* NOTE(review): presumably TC_ACT_PIPE lets the following tc actions run - confirm. */
896 p_parser.sel.action = TC_ACT_PIPE;
/* Payload length: the selector followed by sel.nkeys key entries. */
897 mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
898 sizeof(p_parser.sel) +
899 p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
902 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
/* One nested TCA_PEDIT_KEY_EX per key, carrying its header type and command. */
903 for (int i = 0; i < p_parser.sel.nkeys; i++) {
904 struct nlattr *na_pedit_key =
905 mnl_attr_nest_start(nl,
906 TCA_PEDIT_KEY_EX | NLA_F_NESTED);
907 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
908 p_parser.keys_ex[i].htype);
909 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
910 p_parser.keys_ex[i].cmd);
911 mnl_attr_nest_end(nl, na_pedit_key);
/* Close the nests in the reverse order they were opened. */
913 mnl_attr_nest_end(nl, na_pedit_keys);
914 mnl_attr_nest_end(nl, na_act_options);
919 * Calculate the maximum memory size of one TC-pedit action.
920 * One TC-pedit action can contain set of keys each defining
921 * a rewrite element (rte_flow action)
923 * @param[in,out] actions
924 * actions specification.
925 * @param[in,out] action_flags
927 * @param[in,out] size
930 * Max memory size of one TC-pedit action
/*
 * Estimate the netlink space needed by one tc "pedit" action covering a run
 * of consecutive header-rewrite actions. For every recognised action the
 * number of pedit keys is accumulated and the matching MLX5_FLOW_ACTION_*
 * bit is collected; the collected bits are OR-ed into *action_flags. The
 * caller's *actions cursor is advanced by the loop below.
 * NOTE(review): this listing skips several embedded line numbers (local
 * declarations, break statements, parts of the size expressions, return).
 */
933 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
934 uint64_t *action_flags)
/* Fixed part: action index nest, kind string and options nest. */
940 pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
941 SZ_NLATTR_STRZ_OF("pedit") +
942 SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
943 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
944 switch ((*actions)->type) {
945 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
946 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
947 flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
949 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
950 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
951 flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
953 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
954 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
955 flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
957 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
958 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
959 flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
961 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
962 /* TCP and UDP ports have the same length. */
963 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
964 flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
966 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
967 /* TCP and UDP ports have the same length. */
968 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
969 flags |= MLX5_FLOW_ACTION_SET_TP_DST;
971 case RTE_FLOW_ACTION_TYPE_SET_TTL:
972 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
973 flags |= MLX5_FLOW_ACTION_SET_TTL;
975 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
976 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
977 flags |= MLX5_FLOW_ACTION_DEC_TTL;
979 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
980 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
981 flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
983 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
984 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
985 flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
/* Any other action type ends the run of pedit actions. */
988 goto get_pedit_action_size_done;
991 get_pedit_action_size_done:
992 /* TCA_PEDIT_PARAMS_EX */
/* Selector structure followed by one tc_pedit_key per accumulated key. */
994 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
995 keys * sizeof(struct tc_pedit_key));
996 pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
998 /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
999 (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
1000 SZ_NLATTR_DATA_OF(2));
/* Report every action kind seen in this run to the caller. */
1001 (*action_flags) |= flags;
1007 * Retrieve mask for pattern item.
1009 * This function does basic sanity checks on a pattern item in order to
1010 * return the most appropriate mask for it.
1013 * Item specification.
1014 * @param[in] mask_default
1015 * Default mask for pattern item as specified by the flow API.
1016 * @param[in] mask_supported
1017 * Mask fields supported by the implementation.
1018 * @param[in] mask_empty
1019 * Empty mask to return when there is no specification.
1021 * Perform verbose error reporting if not NULL.
1024 * Either @p item->mask or one of the mask parameters on success, NULL
1025 * otherwise and rte_errno is set.
/*
 * Select and sanity-check the mask to apply to a pattern item.
 * NOTE(review): this listing skips several embedded line numbers (return
 * statements, the loop's inner conditions); comments describe only what is
 * visible.
 */
1028 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1029 const void *mask_supported, const void *mask_empty,
1030 size_t mask_size, struct rte_flow_error *error)
1032 const uint8_t *mask;
1035 /* item->last and item->mask cannot exist without item->spec. */
1036 if (!item->spec && (item->mask || item->last)) {
1037 rte_flow_error_set(error, EINVAL,
1038 RTE_FLOW_ERROR_TYPE_ITEM, item,
1039 "\"mask\" or \"last\" field provided without"
1040 " a corresponding \"spec\"");
1043 /* No spec, no mask, no problem. */
/* Prefer the mask supplied with the item; otherwise use the API default. */
1046 mask = item->mask ? item->mask : mask_default;
1049 * Single-pass check to make sure that:
1050 * - Mask is supported, no bits are set outside mask_supported.
1051 * - Both item->spec and item->last are included in mask.
/* The masks and the item data are compared byte by byte. */
1053 for (i = 0; i != mask_size; ++i) {
/* OR-ing in the mask must not add bits to the supported mask. */
1056 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1057 ((const uint8_t *)mask_supported)[i]) {
1058 rte_flow_error_set(error, ENOTSUP,
1059 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1060 "unsupported field found"
/* Under the mask, spec and last must be identical. */
1065 (((const uint8_t *)item->spec)[i] & mask[i]) !=
1066 (((const uint8_t *)item->last)[i] & mask[i])) {
1067 rte_flow_error_set(error, EINVAL,
1068 RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1070 "range between \"spec\" and \"last\""
1071 " not comprised in \"mask\"");
1079 * Build a conversion table between port ID and ifindex.
1082 * Pointer to Ethernet device.
1084 * Pointer to ptoi table.
1086 * Size of ptoi table provided.
1089 * Size of ptoi table filled.
/*
 * Fill ptoi[] with (DPDK port ID, ifindex) pairs for the ports related to
 * dev, with the current device first and a zero-ifindex entry last.
 * NOTE(review): this listing skips several embedded line numbers (remaining
 * parameter, conditions, the return); comments describe only what is visible.
 */
1092 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
/* NOTE(review): presumably a NULL/0 call returns the number of ports - confirm. */
1095 unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
/* Variable-length array sized from the value above, plus one spare slot. */
1096 uint16_t port_id[n + 1];
1098 unsigned int own = 0;
1100 /* At least one port is needed when no switch domain is present. */
1103 port_id[0] = dev->data->port_id;
/* Fetch the port IDs, never trusting a count larger than the array. */
1105 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1109 for (i = 0; i != n; ++i) {
1110 struct rte_eth_dev_info dev_info;
1112 rte_eth_dev_info_get(port_id[i], &dev_info);
/* Detect the entry that belongs to the current device. */
1113 if (port_id[i] == dev->data->port_id)
1115 ptoi[i].port_id = port_id[i];
1116 ptoi[i].ifindex = dev_info.if_index;
1118 /* Ensure first entry of ptoi[] is the current device. */
/*
 * NOTE(review): embedded line 1120 is not shown; presumably ptoi[n] was used
 * to stash the old ptoi[0] so these two assignments complete a swap - confirm.
 */
1121 ptoi[0] = ptoi[own];
1122 ptoi[own] = ptoi[n];
1124 /* An entry with zero ifindex terminates ptoi[]. */
1125 ptoi[n].port_id = 0;
1126 ptoi[n].ifindex = 0;
1131 * Verify the @p attr will be correctly understood by the E-switch.
1134 * Pointer to flow attributes
1136 * Pointer to error structure.
1139 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Check the flow attributes against the limits in this file: group at most
 * MLX5_TCF_GROUP_ID_MAX, priority at most MLX5_TCF_GROUP_PRIORITY_MAX, and
 * ingress direction only.
 * NOTE(review): this listing skips the conditions guarding the last two
 * error returns (embedded lines 1163 and 1167).
 */
1142 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1143 struct rte_flow_error *error)
1146 * Supported attributes: groups, some priorities and ingress only.
1147 * group is supported only if kernel supports chain. Don't care about
1148 * transfer as it is the caller's problem.
/* RTE_STR() embeds the numeric limit in the error message at build time. */
1150 if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1151 return rte_flow_error_set(error, ENOTSUP,
1152 RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1153 "group ID larger than "
1154 RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1155 " isn't supported");
1156 else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1157 return rte_flow_error_set(error, ENOTSUP,
1158 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1160 "priority more than "
1161 RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1162 " is not supported");
/* Reported with EINVAL, unlike the other checks here which use ENOTSUP. */
1164 return rte_flow_error_set(error, EINVAL,
1165 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1166 attr, "only ingress is supported");
1168 return rte_flow_error_set(error, ENOTSUP,
1169 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1170 attr, "egress is not supported");
1175 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1176 * The routine checks the L2 fields to be used in encapsulation header.
1179 * Pointer to the item structure.
1181 * Pointer to the error structure.
1184 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Validate the Ethernet item of a VXLAN encapsulation definition: each of
 * the dst, src and type masks must be either empty or complete.
 * NOTE(review): this listing skips several embedded line numbers (NULL
 * checks, error codes, closing braces, the call that emits the final
 * message, the return).
 */
1187 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1188 struct rte_flow_error *error)
1190 const struct rte_flow_item_eth *spec = item->spec;
1191 const struct rte_flow_item_eth *mask = item->mask;
1195 * Specification for L2 addresses can be empty
1196 * because these ones are optional and not
1197 * required directly by tc rule. Kernel tries
1198 * to resolve these ones on its own
1203 /* If mask is not specified use the default one. */
1204 mask = &rte_flow_item_eth_mask;
/*
 * Destination MAC: a non-empty mask must equal the default mask's dst,
 * otherwise it is rejected as a partial mask.
 */
1206 if (memcmp(&mask->dst,
1207 &flow_tcf_mask_empty.eth.dst,
1208 sizeof(flow_tcf_mask_empty.eth.dst))) {
1209 if (memcmp(&mask->dst,
1210 &rte_flow_item_eth_mask.dst,
1211 sizeof(rte_flow_item_eth_mask.dst)))
1212 return rte_flow_error_set
1214 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1215 "no support for partial mask on"
1216 " \"eth.dst\" field");
/* Source MAC: same rule as for the destination. */
1218 if (memcmp(&mask->src,
1219 &flow_tcf_mask_empty.eth.src,
1220 sizeof(flow_tcf_mask_empty.eth.src))) {
1221 if (memcmp(&mask->src,
1222 &rte_flow_item_eth_mask.src,
1223 sizeof(rte_flow_item_eth_mask.src)))
1224 return rte_flow_error_set
1226 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1227 "no support for partial mask on"
1228 " \"eth.src\" field");
/* EtherType: a non-zero mask other than 0xffff is rejected as partial. */
1230 if (mask->type != RTE_BE16(0x0000)) {
1231 if (mask->type != RTE_BE16(0xffff))
1232 return rte_flow_error_set
1234 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1235 "no support for partial mask on"
1236 " \"eth.type\" field");
/*
 * NOTE(review): the call emitting the message below is not shown; per its
 * text a fully masked type is presumably accepted but ignored - confirm.
 */
1238 "outer ethernet type field"
1239 " cannot be forced for vxlan"
1240 " encapsulation, parameter ignored");
1246 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1247 * The routine checks the IPv4 fields to be used in encapsulation header.
1250 * Pointer to the item structure.
1252 * Pointer to the error structure.
1255 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Check the IPv4 item supplied in a VXLAN_ENCAP definition.
 *
 * item:  flow item; its spec and mask pointers are read below.
 * error: handed to rte_flow_error_set() on each rejection.
 *
 * Rejections visible in this routine:
 *  - EINVAL with the "NULL outer ipv4 address specification" message
 *    (presumably taken when item->spec is NULL - confirm);
 *  - a dst_addr or src_addr mask that is neither 0x00000000 nor
 *    0xffffffff (partial mask);
 *  - EINVAL when the destination or the source address is reported as
 *    not specified;
 *  - ENOTSUP for a type_of_service or time_to_live mask that is neither
 *    0 nor 0xff.
 */
1258 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1259 struct rte_flow_error *error)
1261 const struct rte_flow_item_ipv4 *spec = item->spec;
1262 const struct rte_flow_item_ipv4 *mask = item->mask;
1266 * Specification for IP addresses cannot be empty
1267 * because it is required by tunnel_key parameter.
1269 return rte_flow_error_set(error, EINVAL,
1270 RTE_FLOW_ERROR_TYPE_ITEM, item,
1271 "NULL outer ipv4 address"
1272 " specification for vxlan"
/* Fall back to the default IPv4 mask (presumably only when item->mask is NULL - confirm). */
1276 mask = &rte_flow_item_ipv4_mask;
/* Destination address: once the mask is non-zero it has to be all-ones. */
1277 if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1278 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1279 return rte_flow_error_set
1281 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1282 "no support for partial mask on"
1283 " \"ipv4.hdr.dst_addr\" field"
1284 " for vxlan encapsulation");
1285 /* More IPv4 address validations can be put here. */
1288 * Kernel uses the destination IP address to determine
1289 * the routing path and obtain the MAC destination
1290 * address, so IP destination address must be
1291 * specified in the tc rule.
1293 return rte_flow_error_set(error, EINVAL,
1294 RTE_FLOW_ERROR_TYPE_ITEM, item,
1295 "outer ipv4 destination address"
1296 " must be specified for"
1297 " vxlan encapsulation");
/* Source address: same all-or-nothing mask rule as the destination. */
1299 if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1300 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1301 return rte_flow_error_set
1303 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1304 "no support for partial mask on"
1305 " \"ipv4.hdr.src_addr\" field"
1306 " for vxlan encapsulation");
1307 /* More IPv4 address validations can be put here. */
1310 * Kernel uses the source IP address to select the
1311 * interface for egress encapsulated traffic, so
1312 * it must be specified in the tc rule.
1314 return rte_flow_error_set(error, EINVAL,
1315 RTE_FLOW_ERROR_TYPE_ITEM, item,
1316 "outer ipv4 source address"
1317 " must be specified for"
1318 " vxlan encapsulation");
/* TOS and TTL are optional: a zero mask passes, a partial mask does not. */
1320 if (mask->hdr.type_of_service &&
1321 mask->hdr.type_of_service != 0xff)
1322 return rte_flow_error_set(error, ENOTSUP,
1323 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1324 "no support for partial mask on"
1325 " \"ipv4.hdr.type_of_service\" field"
1326 " for vxlan encapsulation");
1327 if (mask->hdr.time_to_live &&
1328 mask->hdr.time_to_live != 0xff)
1329 return rte_flow_error_set(error, ENOTSUP,
1330 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1331 "no support for partial mask on"
1332 " \"ipv4.hdr.time_to_live\" field"
1333 " for vxlan encapsulation");
1338 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1339 * The routine checks the IPv6 fields to be used in encapsulation header.
1342 * Pointer to the item structure.
1344 * Pointer to the error structure.
1347 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Check the IPv6 item supplied in a VXLAN_ENCAP definition.
 *
 * item:  flow item; its spec and mask pointers are read below.
 * error: handed to rte_flow_error_set() on each rejection.
 *
 * Rejections visible in this routine:
 *  - EINVAL with the "NULL outer ipv6 address specification" message
 *    (presumably taken when item->spec is NULL - confirm);
 *  - a dst_addr or src_addr mask that differs from both
 *    flow_tcf_mask_empty.ipv6 and rte_flow_item_ipv6_mask for that
 *    field (partial mask);
 *  - EINVAL when the destination or the source address is reported as
 *    not specified;
 *  - ENOTSUP for a traffic-class (taken from vtc_flow) or hop_limits
 *    mask that is neither 0 nor 0xff.
 */
1350 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1351 struct rte_flow_error *error)
1353 const struct rte_flow_item_ipv6 *spec = item->spec;
1354 const struct rte_flow_item_ipv6 *mask = item->mask;
1359 * Specification for IP addresses cannot be empty
1360 * because it is required by tunnel_key parameter.
1362 return rte_flow_error_set(error, EINVAL,
1363 RTE_FLOW_ERROR_TYPE_ITEM, item,
1364 "NULL outer ipv6 address"
1365 " specification for"
1366 " vxlan encapsulation");
/* Fall back to the default IPv6 mask (presumably only when item->mask is NULL - confirm). */
1369 mask = &rte_flow_item_ipv6_mask;
/* Destination address mask: compared against the empty mask first, then the default one. */
1370 if (memcmp(&mask->hdr.dst_addr,
1371 &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1373 if (memcmp(&mask->hdr.dst_addr,
1374 &rte_flow_item_ipv6_mask.hdr.dst_addr,
1376 return rte_flow_error_set
1378 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1379 "no support for partial mask on"
1380 " \"ipv6.hdr.dst_addr\" field"
1381 " for vxlan encapsulation");
1382 /* More IPv6 address validations can be put here. */
1385 * Kernel uses the destination IP address to determine
1386 * the routing path and obtain the MAC destination
1387 * address (heigh or gate), so IP destination address
1388 * must be specified within the tc rule.
1390 return rte_flow_error_set(error, EINVAL,
1391 RTE_FLOW_ERROR_TYPE_ITEM, item,
1392 "outer ipv6 destination address"
1393 " must be specified for"
1394 " vxlan encapsulation");
/* Source address mask: same two-step comparison as for the destination. */
1396 if (memcmp(&mask->hdr.src_addr,
1397 &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1399 if (memcmp(&mask->hdr.src_addr,
1400 &rte_flow_item_ipv6_mask.hdr.src_addr,
1402 return rte_flow_error_set
1404 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1405 "no support for partial mask on"
1406 " \"ipv6.hdr.src_addr\" field"
1407 " for vxlan encapsulation");
1408 /* More L3 address validation can be put here. */
1411 * Kernel uses the source IP address to select the
1412 * interface for egress encapsulated traffic, so
1413 * it must be specified in the tc rule.
1415 return rte_flow_error_set(error, EINVAL,
1416 RTE_FLOW_ERROR_TYPE_ITEM, item,
1417 "outer L3 source address"
1418 " must be specified for"
1419 " vxlan encapsulation");
/*
 * Extract 8 bits of the vtc_flow mask: convert from big endian, shift
 * right by IPV6_HDR_TC_SHIFT and keep the low byte. The error message
 * below refers to this byte as "ipv6.hdr.vtc_flow.tos".
 */
1421 msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1422 IPV6_HDR_TC_SHIFT) & 0xff;
1423 if (msk6 && msk6 != 0xff)
1424 return rte_flow_error_set(error, ENOTSUP,
1425 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1426 "no support for partial mask on"
1427 " \"ipv6.hdr.vtc_flow.tos\" field"
1428 " for vxlan encapsulation");
/* Hop limit is optional as well: zero or full mask only. */
1429 if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1430 return rte_flow_error_set(error, ENOTSUP,
1431 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1432 "no support for partial mask on"
1433 " \"ipv6.hdr.hop_limits\" field"
1434 " for vxlan encapsulation");
1439 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1440 * The routine checks the UDP fields to be used in encapsulation header.
1443 * Pointer to the item structure.
1445 * Pointer to the error structure.
1448 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Check the UDP item supplied in a VXLAN_ENCAP definition.
 *
 * item:  flow item; its spec and mask pointers are read below.
 * error: handed to rte_flow_error_set() on each rejection.
 *
 * Destination port: a non-zero mask must be 0xffff and the port value
 * in spec must be non-zero; the "must be specified" EINVAL error covers
 * the case where no destination port is given.
 * Source port: a non-zero mask must be 0xffff; per the message text the
 * value itself is then ignored rather than rejected.
 */
1451 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1452 struct rte_flow_error *error)
1454 const struct rte_flow_item_udp *spec = item->spec;
1455 const struct rte_flow_item_udp *mask = item->mask;
1459 * Specification for UDP ports cannot be empty
1460 * because it is required by tunnel_key parameter.
1462 return rte_flow_error_set(error, EINVAL,
1463 RTE_FLOW_ERROR_TYPE_ITEM, item,
1464 "NULL UDP port specification "
1465 " for vxlan encapsulation");
/* Fall back to the default UDP mask (presumably only when item->mask is NULL - confirm). */
1468 mask = &rte_flow_item_udp_mask;
/* Destination (remote) port: full mask and a non-zero value are required. */
1469 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1470 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1471 return rte_flow_error_set
1473 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1474 "no support for partial mask on"
1475 " \"udp.hdr.dst_port\" field"
1476 " for vxlan encapsulation");
1477 if (!spec->hdr.dst_port)
1478 return rte_flow_error_set
1480 RTE_FLOW_ERROR_TYPE_ITEM, item,
1481 "outer UDP remote port cannot be"
1482 " 0 for vxlan encapsulation");
1484 return rte_flow_error_set(error, EINVAL,
1485 RTE_FLOW_ERROR_TYPE_ITEM, item,
1486 "outer UDP remote port"
1487 " must be specified for"
1488 " vxlan encapsulation");
/* Source port: only the mask shape is checked here. */
1490 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1491 if (mask->hdr.src_port != RTE_BE16(0xffff))
1492 return rte_flow_error_set
1494 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1495 "no support for partial mask on"
1496 " \"udp.hdr.src_port\" field"
1497 " for vxlan encapsulation");
/* Message text: a requested source port is ignored (presumably logged - confirm). */
1499 "outer UDP source port cannot be"
1500 " forced for vxlan encapsulation,"
1501 " parameter ignored");
1507 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1508 * The routine checks the VNI fields to be used in encapsulation header.
1511 * Pointer to the item structure.
1513 * Pointer to the error structure.
1516 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Check the VXLAN item supplied in a VXLAN_ENCAP definition.
 *
 * item:  flow item; its spec and mask pointers are read below.
 * error: handed to rte_flow_error_set() on each rejection.
 *
 * The three VNI mask bytes must all be 0xff: an all-zero mask yields
 * EINVAL ("must be specified"), any other non-full mask yields ENOTSUP
 * (partial mask). A VNI value whose three bytes are all zero in spec is
 * rejected with EINVAL.
 */
1519 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1520 struct rte_flow_error *error)
1522 const struct rte_flow_item_vxlan *spec = item->spec;
1523 const struct rte_flow_item_vxlan *mask = item->mask;
1526 /* Outer VNI is required by tunnel_key parameter. */
1527 return rte_flow_error_set(error, EINVAL,
1528 RTE_FLOW_ERROR_TYPE_ITEM, item,
1529 "NULL VNI specification"
1530 " for vxlan encapsulation");
/* Fall back to the default VXLAN mask (presumably only when item->mask is NULL - confirm). */
1533 mask = &rte_flow_item_vxlan_mask;
/* An entirely empty VNI mask means the VNI was not requested at all. */
1534 if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1535 return rte_flow_error_set(error, EINVAL,
1536 RTE_FLOW_ERROR_TYPE_ITEM, item,
1537 "outer VNI must be specified "
1538 "for vxlan encapsulation");
/* Anything short of a full 24-bit mask is a partial mask. */
1539 if (mask->vni[0] != 0xff ||
1540 mask->vni[1] != 0xff ||
1541 mask->vni[2] != 0xff)
1542 return rte_flow_error_set(error, ENOTSUP,
1543 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1544 "no support for partial mask on"
1545 " \"vxlan.vni\" field");
/* VNI value 0 is refused. */
1547 if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1548 return rte_flow_error_set(error, EINVAL,
1549 RTE_FLOW_ERROR_TYPE_ITEM, item,
1550 "vxlan vni cannot be 0");
1555 * Validate VXLAN_ENCAP action item list for E-Switch.
1556 * The routine checks items to be used in encapsulation header.
1559 * Pointer to the VXLAN_ENCAP action structure.
1561 * Pointer to the error structure.
1564 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Validate the item list carried by a VXLAN_ENCAP action.
 *
 * action: action whose conf is read as struct rte_flow_action_vxlan_encap;
 *         its definition member is the item list that is walked.
 * error:  handed to rte_flow_error_set() on each rejection.
 *
 * Every ETH, IPV4, IPV6, UDP and VXLAN item is passed to the generic
 * mlx5_flow_validate_item_*() routine and then to the matching
 * flow_tcf_validate_vxlan_encap_*() routine; VOID items are accepted
 * and any other type is rejected. After the walk the definition must
 * have contained an outer IP item, an outer UDP item and a VXLAN item.
 */
1567 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1568 struct rte_flow_error *error)
1570 const struct rte_flow_item *items;
/*
 * Layers seen so far in the definition.
 * NOTE(review): this is uint32_t while flow_tcf_validate() keeps the
 * same MLX5_FLOW_LAYER_* bits in a uint64_t - confirm every bit used
 * here fits in 32 bits.
 */
1572 uint32_t item_flags = 0;
/* Per the message text: the action came without a configuration. */
1575 return rte_flow_error_set(error, EINVAL,
1576 RTE_FLOW_ERROR_TYPE_ACTION, action,
1577 "Missing vxlan tunnel"
1578 " action configuration");
/* The encapsulation header is described by the item list in conf->definition. */
1579 items = ((const struct rte_flow_action_vxlan_encap *)
1580 action->conf)->definition;
/* Per the message text: the configuration has no item list. */
1582 return rte_flow_error_set(error, EINVAL,
1583 RTE_FLOW_ERROR_TYPE_ACTION, action,
1584 "Missing vxlan tunnel"
1585 " encapsulation parameters");
/* Walk the definition up to the END item, recording each layer in item_flags. */
1586 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1587 switch (items->type) {
1588 case RTE_FLOW_ITEM_TYPE_VOID:
1590 case RTE_FLOW_ITEM_TYPE_ETH:
1591 ret = mlx5_flow_validate_item_eth(items, item_flags,
1595 ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1598 item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1601 case RTE_FLOW_ITEM_TYPE_IPV4:
1602 ret = mlx5_flow_validate_item_ipv4
1604 &flow_tcf_mask_supported.ipv4, error);
1607 ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1610 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1612 case RTE_FLOW_ITEM_TYPE_IPV6:
1613 ret = mlx5_flow_validate_item_ipv6
1615 &flow_tcf_mask_supported.ipv6, error);
1618 ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1621 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1623 case RTE_FLOW_ITEM_TYPE_UDP:
1624 ret = mlx5_flow_validate_item_udp(items, item_flags,
1628 ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1631 item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1633 case RTE_FLOW_ITEM_TYPE_VXLAN:
1634 ret = mlx5_flow_validate_item_vxlan(items,
1638 ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1641 item_flags |= MLX5_FLOW_LAYER_VXLAN;
/* Any item type not handled above cannot be part of the encap header. */
1644 return rte_flow_error_set
1646 RTE_FLOW_ERROR_TYPE_ITEM, items,
1647 "vxlan encap item not supported");
/* Mandatory layers: outer IP, outer UDP and the VXLAN item itself. */
1650 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1651 return rte_flow_error_set(error, EINVAL,
1652 RTE_FLOW_ERROR_TYPE_ACTION, action,
1653 "no outer IP layer found"
1654 " for vxlan encapsulation");
1655 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1656 return rte_flow_error_set(error, EINVAL,
1657 RTE_FLOW_ERROR_TYPE_ACTION, action,
1658 "no outer UDP layer found"
1659 " for vxlan encapsulation");
1660 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1661 return rte_flow_error_set(error, EINVAL,
1662 RTE_FLOW_ERROR_TYPE_ACTION, action,
1663 "no VXLAN VNI found"
1664 " for vxlan encapsulation");
1669 * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1670 * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1673 * Outer UDP layer item (if any, NULL otherwise).
1675 * Pointer to the error structure.
1678 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Extra check of the outer UDP pattern item that precedes a VXLAN item.
 *
 * udp:   outer UDP item; its spec and mask pointers are read below.
 *        NOTE(review): udp is dereferenced unconditionally in the
 *        initialisers, so the caller has to pass a non-NULL item.
 * error: handed to rte_flow_error_set() on each rejection.
 *
 * Destination port: a non-zero mask must be 0xffff and the port value
 * in spec must be non-zero; the "must be specified" EINVAL error covers
 * the case where no destination port is given.
 * Source port: a non-zero mask must be 0xffff; per the message text the
 * value itself is then ignored rather than rejected.
 */
1681 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1682 struct rte_flow_error *error)
1684 const struct rte_flow_item_udp *spec = udp->spec;
1685 const struct rte_flow_item_udp *mask = udp->mask;
1689 * Specification for UDP ports cannot be empty
1690 * because it is required as decap parameter.
1692 return rte_flow_error_set(error, EINVAL,
1693 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1694 "NULL UDP port specification"
1695 " for VXLAN decapsulation");
/* Fall back to the default UDP mask (presumably only when udp->mask is NULL - confirm). */
1697 mask = &rte_flow_item_udp_mask;
/* Destination (local) port: full mask and a non-zero value are required. */
1698 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1699 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1700 return rte_flow_error_set
1702 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1703 "no support for partial mask on"
1704 " \"udp.hdr.dst_port\" field");
1705 if (!spec->hdr.dst_port)
1706 return rte_flow_error_set
1708 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1709 "zero decap local UDP port");
1711 return rte_flow_error_set(error, EINVAL,
1712 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1713 "outer UDP destination port must be "
1714 "specified for vxlan decapsulation");
/* Source port: only the mask shape is checked here. */
1716 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1717 if (mask->hdr.src_port != RTE_BE16(0xffff))
1718 return rte_flow_error_set
1720 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1721 "no support for partial mask on"
1722 " \"udp.hdr.src_port\" field");
/*
 * NOTE(review): the text below says "encapsulation" although this
 * routine validates the decapsulation path - confirm the wording.
 */
1724 "outer UDP local port cannot be "
1725 "forced for VXLAN encapsulation, "
1726 "parameter ignored");
1732 * Validate flow for E-Switch.
1735 * Pointer to the Ethernet device structure.
1737 * Pointer to the flow attributes.
1739 * Pointer to the list of items.
1740 * @param[in] actions
1741 * Pointer to the list of actions.
1743 * Pointer to the error structure.
1746 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Validate a flow rule (attributes, actions and pattern) for E-Switch.
 *
 * The work is done in three stages:
 *  1. the action list is walked up to RTE_FLOW_ACTION_TYPE_END; most
 *     action types set one bit in current_action_flag, the ordering and
 *     combination rules are checked, and the bit is merged into
 *     action_flags;
 *  2. the item list is walked up to RTE_FLOW_ITEM_TYPE_END; item_flags
 *     records the layers seen, while inner_etype, outer_etype and
 *     vlan_etype track the ether type implied by the items so far;
 *  3. cross checks between action_flags and item_flags.
 *
 * dev:     used to size and fill the port ID to ifindex table (ptoi).
 * attr:    checked by flow_tcf_validate_attributes(); attr->group is
 *          also compared with the JUMP target group.
 * items:   pattern, terminated by RTE_FLOW_ITEM_TYPE_END.
 * actions: action list, terminated by RTE_FLOW_ACTION_TYPE_END.
 * error:   handed to rte_flow_error_set() on each rejection.
 */
1749 flow_tcf_validate(struct rte_eth_dev *dev,
1750 const struct rte_flow_attr *attr,
1751 const struct rte_flow_item items[],
1752 const struct rte_flow_action actions[],
1753 struct rte_flow_error *error)
/* Typed item views; accessed below as mask.<name> and spec.<name>. */
1756 const struct rte_flow_item_port_id *port_id;
1757 const struct rte_flow_item_eth *eth;
1758 const struct rte_flow_item_vlan *vlan;
1759 const struct rte_flow_item_ipv4 *ipv4;
1760 const struct rte_flow_item_ipv6 *ipv6;
1761 const struct rte_flow_item_tcp *tcp;
1762 const struct rte_flow_item_udp *udp;
1763 const struct rte_flow_item_vxlan *vxlan;
/* Typed views of actions->conf; accessed below as conf.<name>. */
1766 const struct rte_flow_action_port_id *port_id;
1767 const struct rte_flow_action_jump *jump;
1768 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1769 const struct rte_flow_action_of_set_vlan_vid *
1771 const struct rte_flow_action_of_set_vlan_pcp *
1773 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1774 const struct rte_flow_action_set_ipv4 *set_ipv4;
1775 const struct rte_flow_action_set_ipv6 *set_ipv6;
1777 const struct rte_flow_item *outer_udp = NULL;
/*
 * Ether type expected at each level. ETH_P_ALL is the "nothing required
 * yet" value: the conflict checks below are skipped while a tracker
 * still holds it.
 */
1778 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1779 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1780 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1781 uint64_t item_flags = 0;
1782 uint64_t action_flags = 0;
/* 0xff is the initial value and is restored after a VXLAN item. */
1783 uint8_t next_protocol = 0xff;
1784 unsigned int tcm_ifindex = 0;
/* Set once a non-"set" action follows one or more "set" (pedit) actions. */
1785 uint8_t pedit_validated = 0;
1786 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
/* Device named by the PORT_ID action; stays NULL when there is none. */
1787 struct rte_eth_dev *port_id_dev = NULL;
1788 bool in_port_id_set;
/* Fill ptoi; an entry with ifindex 0 terminates the lookups below. */
1791 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1792 PTOI_TABLE_SZ_MAX(dev)));
1793 ret = flow_tcf_validate_attributes(attr, error);
/* Stage 1: actions. */
1796 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1798 uint64_t current_action_flag = 0;
1800 switch (actions->type) {
1801 case RTE_FLOW_ACTION_TYPE_VOID:
1803 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1804 current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1807 conf.port_id = actions->conf;
1808 if (conf.port_id->original)
/* Look the requested port ID up in ptoi; hitting the terminator means it is unknown. */
1811 for (i = 0; ptoi[i].ifindex; ++i)
1812 if (ptoi[i].port_id == conf.port_id->id)
1814 if (!ptoi[i].ifindex)
1815 return rte_flow_error_set
1817 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1819 "missing data to convert port ID to"
/* Remembered for the push-VLAN check after both loops. */
1821 port_id_dev = &rte_eth_devices[conf.port_id->id];
1823 case RTE_FLOW_ACTION_TYPE_JUMP:
1824 current_action_flag = MLX5_FLOW_ACTION_JUMP;
1827 conf.jump = actions->conf;
/* The target group has to be strictly greater than the rule's own group. */
1828 if (attr->group >= conf.jump->group)
1829 return rte_flow_error_set
1831 RTE_FLOW_ERROR_TYPE_ACTION,
1833 "can jump only to a group forward");
1835 case RTE_FLOW_ACTION_TYPE_DROP:
1836 current_action_flag = MLX5_FLOW_ACTION_DROP;
1838 case RTE_FLOW_ACTION_TYPE_COUNT:
1840 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1841 current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1843 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1844 rte_be16_t ethertype;
1846 current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1849 conf.of_push_vlan = actions->conf;
1850 ethertype = conf.of_push_vlan->ethertype;
/* Only the 802.1Q and 802.1AD TPIDs are accepted for a pushed tag. */
1851 if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1852 ethertype != RTE_BE16(ETH_P_8021AD))
1853 return rte_flow_error_set
1855 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1856 "vlan push TPID must be "
1857 "802.1Q or 802.1AD");
1860 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
/* VID/PCP may only be set when a push-VLAN action appeared earlier in the list. */
1861 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1862 return rte_flow_error_set
1864 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1865 "vlan modify is not supported,"
1866 " set action must follow push action");
1867 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1869 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1870 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1871 return rte_flow_error_set
1873 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1874 "vlan modify is not supported,"
1875 " set action must follow push action");
1876 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1878 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1879 current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1881 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
/* The encap header definition has its own validator. */
1882 ret = flow_tcf_validate_vxlan_encap(actions, error);
1885 current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1887 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1888 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1890 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1891 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1893 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1894 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1896 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1897 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1899 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1900 current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1902 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1903 current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1905 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1906 current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1908 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1909 current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1911 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1912 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1914 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1915 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1918 return rte_flow_error_set(error, ENOTSUP,
1919 RTE_FLOW_ERROR_TYPE_ACTION,
1921 "action not supported");
/* Checks shared by all action types, driven by the bit chosen in the switch. */
1923 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1925 return rte_flow_error_set
1927 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1929 "action configuration not set");
/* Per the message text: "set" (pedit) actions must form one contiguous run. */
1931 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1933 return rte_flow_error_set(error, ENOTSUP,
1934 RTE_FLOW_ERROR_TYPE_ACTION,
1936 "set actions should be "
1937 "listed successively");
/* A non-pedit action after earlier pedit actions closes the pedit run. */
1938 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1939 (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1940 pedit_validated = 1;
/* At most one fate action and at most one VXLAN action per rule. */
1941 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1942 (action_flags & MLX5_TCF_FATE_ACTIONS))
1943 return rte_flow_error_set(error, EINVAL,
1944 RTE_FLOW_ERROR_TYPE_ACTION,
1946 "can't have multiple fate"
1948 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1949 (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1950 return rte_flow_error_set(error, EINVAL,
1951 RTE_FLOW_ERROR_TYPE_ACTION,
1953 "can't have multiple vxlan"
/* A VXLAN action listed after a VLAN action is rejected. */
1955 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1956 (action_flags & MLX5_TCF_VLAN_ACTIONS))
1957 return rte_flow_error_set(error, ENOTSUP,
1958 RTE_FLOW_ERROR_TYPE_ACTION,
1960 "can't have vxlan and vlan"
1961 " actions in the same rule");
1962 action_flags |= current_action_flag;
/* Stage 2: pattern items. */
1964 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1967 switch (items->type) {
1968 case RTE_FLOW_ITEM_TYPE_VOID:
1970 case RTE_FLOW_ITEM_TYPE_PORT_ID:
/* A port ID item is not accepted once a tunnel item has been seen. */
1971 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1972 return rte_flow_error_set
1974 RTE_FLOW_ERROR_TYPE_ITEM, items,
1975 "inner tunnel port id"
1976 " item is not supported");
1977 mask.port_id = flow_tcf_item_mask
1978 (items, &rte_flow_item_port_id_mask,
1979 &flow_tcf_mask_supported.port_id,
1980 &flow_tcf_mask_empty.port_id,
1981 sizeof(flow_tcf_mask_supported.port_id),
1985 if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1989 spec.port_id = items->spec;
/* The ID mask must be zero or all-ones. */
1990 if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1991 return rte_flow_error_set
1993 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1995 "no support for partial mask on"
1997 if (!mask.port_id->id)
/* Same ptoi lookup as for the PORT_ID action, keyed by the item's port ID. */
2000 for (i = 0; ptoi[i].ifindex; ++i)
2001 if (ptoi[i].port_id == spec.port_id->id)
2003 if (!ptoi[i].ifindex)
2004 return rte_flow_error_set
2006 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2008 "missing data to convert port ID to"
/* All port ID items of one rule must resolve to the same ifindex. */
2010 if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2011 return rte_flow_error_set
2013 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2015 "cannot match traffic for"
2016 " several port IDs through"
2017 " a single flow rule");
2018 tcm_ifindex = ptoi[i].ifindex;
2021 case RTE_FLOW_ITEM_TYPE_ETH:
2022 ret = mlx5_flow_validate_item_eth(items, item_flags,
/* Items after a tunnel item count as inner layers, the others as outer layers. */
2026 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2027 MLX5_FLOW_LAYER_INNER_L2 :
2028 MLX5_FLOW_LAYER_OUTER_L2;
2030 * Redundant check due to different supported mask.
2031 * Same for the rest of items.
2033 mask.eth = flow_tcf_item_mask
2034 (items, &rte_flow_item_eth_mask,
2035 &flow_tcf_mask_supported.eth,
2036 &flow_tcf_mask_empty.eth,
2037 sizeof(flow_tcf_mask_supported.eth),
2041 if (mask.eth->type && mask.eth->type !=
2043 return rte_flow_error_set
2045 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2047 "no support for partial mask on"
2049 assert(items->spec);
2050 spec.eth = items->spec;
/*
 * A masked ether type must agree with whatever an earlier item already
 * required at the same level (inner after a tunnel item, outer before).
 */
2051 if (mask.eth->type &&
2052 (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2053 inner_etype != RTE_BE16(ETH_P_ALL) &&
2054 inner_etype != spec.eth->type)
2055 return rte_flow_error_set
2057 RTE_FLOW_ERROR_TYPE_ITEM,
2059 "inner eth_type conflict");
2060 if (mask.eth->type &&
2061 !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2062 outer_etype != RTE_BE16(ETH_P_ALL) &&
2063 outer_etype != spec.eth->type)
2064 return rte_flow_error_set
2066 RTE_FLOW_ERROR_TYPE_ITEM,
2068 "outer eth_type conflict");
/* Record the requested ether type for the checks done by later items. */
2069 if (mask.eth->type) {
2070 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2071 inner_etype = spec.eth->type;
2073 outer_etype = spec.eth->type;
2076 case RTE_FLOW_ITEM_TYPE_VLAN:
/* A VLAN item is rejected once a tunnel item has been seen. */
2077 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2078 return rte_flow_error_set
2080 RTE_FLOW_ERROR_TYPE_ITEM, items,
2082 " is not supported");
2083 ret = mlx5_flow_validate_item_vlan(items, item_flags,
2087 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2088 mask.vlan = flow_tcf_item_mask
2089 (items, &rte_flow_item_vlan_mask,
2090 &flow_tcf_mask_supported.vlan,
2091 &flow_tcf_mask_empty.vlan,
2092 sizeof(flow_tcf_mask_supported.vlan),
/*
 * The 0xe000 part (PCP) and the 0x0fff part (VID) of the TCI mask, and
 * the inner_type mask, must each be either empty or full.
 */
2096 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2097 (mask.vlan->tci & RTE_BE16(0xe000)) !=
2098 RTE_BE16(0xe000)) ||
2099 (mask.vlan->tci & RTE_BE16(0x0fff) &&
2100 (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2101 RTE_BE16(0x0fff)) ||
2102 (mask.vlan->inner_type &&
2103 mask.vlan->inner_type != RTE_BE16(0xffff)))
2104 return rte_flow_error_set
2106 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2108 "no support for partial masks on"
2109 " \"tci\" (PCP and VID parts) and"
2110 " \"inner_type\" fields");
/* A VLAN item forces the outer ether type to 802.1Q. */
2111 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2112 outer_etype != RTE_BE16(ETH_P_8021Q))
2113 return rte_flow_error_set
2115 RTE_FLOW_ERROR_TYPE_ITEM,
2117 "outer eth_type conflict,"
2119 outer_etype = RTE_BE16(ETH_P_8021Q);
2120 assert(items->spec);
2121 spec.vlan = items->spec;
/* The encapsulated ether type is tracked separately in vlan_etype. */
2122 if (mask.vlan->inner_type &&
2123 vlan_etype != RTE_BE16(ETH_P_ALL) &&
2124 vlan_etype != spec.vlan->inner_type)
2125 return rte_flow_error_set
2127 RTE_FLOW_ERROR_TYPE_ITEM,
2129 "vlan eth_type conflict");
2130 if (mask.vlan->inner_type)
2131 vlan_etype = spec.vlan->inner_type;
2133 case RTE_FLOW_ITEM_TYPE_IPV4:
2134 ret = mlx5_flow_validate_item_ipv4
2136 &flow_tcf_mask_supported.ipv4, error);
2139 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2140 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2141 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2142 mask.ipv4 = flow_tcf_item_mask
2143 (items, &rte_flow_item_ipv4_mask,
2144 &flow_tcf_mask_supported.ipv4,
2145 &flow_tcf_mask_empty.ipv4,
2146 sizeof(flow_tcf_mask_supported.ipv4),
/* The protocol mask is all-or-nothing; a full mask makes the spec value significant. */
2150 if (mask.ipv4->hdr.next_proto_id &&
2151 mask.ipv4->hdr.next_proto_id != 0xff)
2152 return rte_flow_error_set
2154 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2156 "no support for partial mask on"
2157 " \"hdr.next_proto_id\" field");
2158 else if (mask.ipv4->hdr.next_proto_id)
2160 ((const struct rte_flow_item_ipv4 *)
2161 (items->spec))->hdr.next_proto_id;
/*
 * An IPv4 item requires ETH_P_IP at the level it sits on: inner after a
 * tunnel item, else the VLAN payload if a VLAN item was seen, else the
 * outer ether type.
 */
2162 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2163 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2164 inner_etype != RTE_BE16(ETH_P_IP))
2165 return rte_flow_error_set
2167 RTE_FLOW_ERROR_TYPE_ITEM,
2169 "inner eth_type conflict,"
2170 " IPv4 is required");
2171 inner_etype = RTE_BE16(ETH_P_IP);
2172 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2173 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2174 vlan_etype != RTE_BE16(ETH_P_IP))
2175 return rte_flow_error_set
2177 RTE_FLOW_ERROR_TYPE_ITEM,
2179 "vlan eth_type conflict,"
2180 " IPv4 is required");
2181 vlan_etype = RTE_BE16(ETH_P_IP);
2183 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2184 outer_etype != RTE_BE16(ETH_P_IP))
2185 return rte_flow_error_set
2187 RTE_FLOW_ERROR_TYPE_ITEM,
2189 "eth_type conflict,"
2190 " IPv4 is required");
2191 outer_etype = RTE_BE16(ETH_P_IP);
2194 case RTE_FLOW_ITEM_TYPE_IPV6:
2195 ret = mlx5_flow_validate_item_ipv6
2197 &flow_tcf_mask_supported.ipv6, error);
2200 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2201 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2202 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2203 mask.ipv6 = flow_tcf_item_mask
2204 (items, &rte_flow_item_ipv6_mask,
2205 &flow_tcf_mask_supported.ipv6,
2206 &flow_tcf_mask_empty.ipv6,
2207 sizeof(flow_tcf_mask_supported.ipv6),
/* Same protocol-mask rule as for IPv4, applied to hdr.proto. */
2211 if (mask.ipv6->hdr.proto &&
2212 mask.ipv6->hdr.proto != 0xff)
2213 return rte_flow_error_set
2215 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2217 "no support for partial mask on"
2218 " \"hdr.proto\" field");
2219 else if (mask.ipv6->hdr.proto)
2221 ((const struct rte_flow_item_ipv6 *)
2222 (items->spec))->hdr.proto;
/* Same level selection as for IPv4, with ETH_P_IPV6 as the required type. */
2223 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2224 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2225 inner_etype != RTE_BE16(ETH_P_IPV6))
2226 return rte_flow_error_set
2228 RTE_FLOW_ERROR_TYPE_ITEM,
2230 "inner eth_type conflict,"
2231 " IPv6 is required");
2232 inner_etype = RTE_BE16(ETH_P_IPV6);
2233 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2234 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2235 vlan_etype != RTE_BE16(ETH_P_IPV6))
2236 return rte_flow_error_set
2238 RTE_FLOW_ERROR_TYPE_ITEM,
2240 "vlan eth_type conflict,"
2241 " IPv6 is required");
2242 vlan_etype = RTE_BE16(ETH_P_IPV6);
2244 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2245 outer_etype != RTE_BE16(ETH_P_IPV6))
2246 return rte_flow_error_set
2248 RTE_FLOW_ERROR_TYPE_ITEM,
2250 "eth_type conflict,"
2251 " IPv6 is required");
2252 outer_etype = RTE_BE16(ETH_P_IPV6);
2255 case RTE_FLOW_ITEM_TYPE_UDP:
2256 ret = mlx5_flow_validate_item_udp(items, item_flags,
2257 next_protocol, error);
2260 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2261 MLX5_FLOW_LAYER_INNER_L4_UDP :
2262 MLX5_FLOW_LAYER_OUTER_L4_UDP;
2263 mask.udp = flow_tcf_item_mask
2264 (items, &rte_flow_item_udp_mask,
2265 &flow_tcf_mask_supported.udp,
2266 &flow_tcf_mask_empty.udp,
2267 sizeof(flow_tcf_mask_supported.udp),
2272 * Save the presumed outer UDP item for extra check
2273 * if the tunnel item will be found later in the list.
2275 if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2278 case RTE_FLOW_ITEM_TYPE_TCP:
2279 ret = mlx5_flow_validate_item_tcp
2282 &flow_tcf_mask_supported.tcp,
2286 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2287 MLX5_FLOW_LAYER_INNER_L4_TCP :
2288 MLX5_FLOW_LAYER_OUTER_L4_TCP;
2289 mask.tcp = flow_tcf_item_mask
2290 (items, &rte_flow_item_tcp_mask,
2291 &flow_tcf_mask_supported.tcp,
2292 &flow_tcf_mask_empty.tcp,
2293 sizeof(flow_tcf_mask_supported.tcp),
2298 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* A VXLAN item is rejected when an outer VLAN item precedes it. */
2299 if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2300 return rte_flow_error_set
2302 RTE_FLOW_ERROR_TYPE_ITEM, items,
2303 "vxlan tunnel over vlan"
2304 " is not supported");
2305 ret = mlx5_flow_validate_item_vxlan(items,
2309 item_flags |= MLX5_FLOW_LAYER_VXLAN;
2310 mask.vxlan = flow_tcf_item_mask
2311 (items, &rte_flow_item_vxlan_mask,
2312 &flow_tcf_mask_supported.vxlan,
2313 &flow_tcf_mask_empty.vxlan,
2314 sizeof(flow_tcf_mask_supported.vxlan), error);
/* Only a full 24-bit VNI mask is accepted. */
2317 if (mask.vxlan->vni[0] != 0xff ||
2318 mask.vxlan->vni[1] != 0xff ||
2319 mask.vxlan->vni[2] != 0xff)
2320 return rte_flow_error_set
2322 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2324 "no support for partial or "
2325 "empty mask on \"vxlan.vni\" field");
2327 * The VNI item assumes the VXLAN tunnel, it requires
2328 * at least the outer destination UDP port must be
2329 * specified without wildcards to allow kernel select
2330 * the virtual VXLAN device by port. Also outer IPv4
2331 * or IPv6 item must be specified (wilcards or even
2332 * zero mask are allowed) to let driver know the tunnel
2333 * IP version and process UDP traffic correctly.
/*
 * NOTE(review): the two errors below concern missing pattern items but
 * are reported with RTE_FLOW_ERROR_TYPE_ACTION - confirm this is intended.
 */
2336 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2337 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2338 return rte_flow_error_set
2340 RTE_FLOW_ERROR_TYPE_ACTION,
2342 "no outer IP pattern found"
2343 " for vxlan tunnel");
2344 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2345 return rte_flow_error_set
2347 RTE_FLOW_ERROR_TYPE_ACTION,
2349 "no outer UDP pattern found"
2350 " for vxlan tunnel");
2352 * All items preceding the tunnel item become outer
2353 * ones and we should do extra validation for them
2354 * due to tc limitations for tunnel outer parameters.
2355 * Currently only outer UDP item requres extra check,
2356 * use the saved pointer instead of item list rescan.
2359 ret = flow_tcf_validate_vxlan_decap_udp
2363 /* Reset L4 protocol for inner parameters. */
2364 next_protocol = 0xff;
2367 return rte_flow_error_set(error, ENOTSUP,
2368 RTE_FLOW_ERROR_TYPE_ITEM,
2369 items, "item not supported");
/* Stage 3: cross checks between the collected action and item flags. */
2372 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2373 (action_flags & MLX5_FLOW_ACTION_DROP))
2374 return rte_flow_error_set(error, ENOTSUP,
2375 RTE_FLOW_ERROR_TYPE_ACTION,
2377 "set action is not compatible with "
2379 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2380 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2381 return rte_flow_error_set(error, ENOTSUP,
2382 RTE_FLOW_ERROR_TYPE_ACTION,
2384 "set action must be followed by "
/* Header-rewrite actions need the matching outer header in the pattern. */
2387 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2388 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2389 return rte_flow_error_set(error, EINVAL,
2390 RTE_FLOW_ERROR_TYPE_ACTION,
2392 "no ipv4 item found in"
2396 (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2397 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2398 return rte_flow_error_set(error, EINVAL,
2399 RTE_FLOW_ERROR_TYPE_ACTION,
2401 "no ipv6 item found in"
2405 (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2407 (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2408 MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2409 return rte_flow_error_set(error, EINVAL,
2410 RTE_FLOW_ERROR_TYPE_ACTION,
2412 "no TCP/UDP item found in"
2416 * FW syndrome (0xA9C090):
2417 * set_flow_table_entry: push vlan action fte in fdb can ONLY be
2418 * forward to the uplink.
/*
 * port_id_dev is assigned in the PORT_ID action case, which is also
 * where MLX5_FLOW_ACTION_PORT_ID gets set, so the flag test guards the
 * dereference.
 */
2420 if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2421 (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2422 ((struct priv *)port_id_dev->data->dev_private)->representor)
2423 return rte_flow_error_set(error, ENOTSUP,
2424 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2425 "vlan push can only be applied"
2426 " when forwarding to uplink port");
2428 * FW syndrome (0x294609):
2429 * set_flow_table_entry: modify/pop/push actions in fdb flow table
2430 * are supported only while forwarding to vport.
2432 if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2433 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2434 return rte_flow_error_set(error, ENOTSUP,
2435 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2436 "vlan actions are supported"
2437 " only with port_id action");
2438 if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2439 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2440 return rte_flow_error_set(error, ENOTSUP,
2441 RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2442 "vxlan actions are supported"
2443 " only with port_id action");
/* Every rule must contain a fate action. */
2444 if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2445 return rte_flow_error_set(error, EINVAL,
2446 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2447 "no fate action is found");
2449 (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2451 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2452 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2453 return rte_flow_error_set(error, EINVAL,
2454 RTE_FLOW_ERROR_TYPE_ACTION,
2456 "no IP found in pattern");
/*
 * NOTE(review): this check reports ENOTSUP whereas the sibling
 * "no ... found in pattern" checks report EINVAL - confirm.
 */
2459 (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2460 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2461 return rte_flow_error_set(error, ENOTSUP,
2462 RTE_FLOW_ERROR_TYPE_ACTION,
2464 "no ethernet found in"
/* Decap needs a VXLAN item in the pattern; encap refuses tunnelled patterns. */
2467 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2468 !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2469 return rte_flow_error_set(error, EINVAL,
2470 RTE_FLOW_ERROR_TYPE_ACTION,
2472 "no VNI pattern found"
2473 " for vxlan decap action");
2474 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2475 (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2476 return rte_flow_error_set(error, EINVAL,
2477 RTE_FLOW_ERROR_TYPE_ACTION,
2479 "vxlan encap not supported"
2480 " for tunneled traffic");
2485 * Calculate maximum size of memory for flow items of Linux TC flower.
2488 * Pointer to the flow attributes.
2490 * Pointer to the list of items.
2491 * @param[out] action_flags
2492 * Pointer to the detected actions.
2495 * Maximum size of memory for items.
/*
 * Sizing pass over the flow pattern: each item type adds the Netlink
 * attribute space its flower keys may need to the running total `size`.
 * No Netlink message is written in the visible code. A VXLAN item also
 * raises MLX5_FLOW_ACTION_VXLAN_DECAP in *action_flags.
 * NOTE(review): the declaration of `size` and the final return are not
 * visible in this excerpt.
 */
2498 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2499 const struct rte_flow_item items[],
2500 uint64_t *action_flags)
/* Fixed part that is added regardless of the pattern contents. */
2504 size += SZ_NLATTR_STRZ_OF("flower") +
2505 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2506 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2507 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
/* The chain attribute is only accounted for a non-zero group. */
2508 if (attr->group > 0)
2509 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
/* Walk the pattern until the END item. */
2510 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2511 switch (items->type) {
/* No size is added for VOID and PORT_ID in the visible lines. */
2512 case RTE_FLOW_ITEM_TYPE_VOID:
2514 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2516 case RTE_FLOW_ITEM_TYPE_ETH:
2517 size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2518 /* dst/src MAC addr and mask. */
2520 case RTE_FLOW_ITEM_TYPE_VLAN:
2521 size += SZ_NLATTR_TYPE_OF(uint16_t) +
2522 /* VLAN Ether type. */
2523 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2524 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2526 case RTE_FLOW_ITEM_TYPE_IPV4: {
2527 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2529 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2530 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2531 /* dst/src IP addr and mask. */
/* TTL and TOS value/mask pairs are counted only when the item mask selects them. */
2532 if (ipv4 && ipv4->hdr.time_to_live)
2533 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2534 if (ipv4 && ipv4->hdr.type_of_service)
2535 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2538 case RTE_FLOW_ITEM_TYPE_IPV6: {
2539 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2541 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2542 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2543 /* dst/src IP addr and mask. */
/*
 * Hop limit and the 8 bits of vtc_flow at IPV6_HDR_TC_SHIFT each add a
 * value/mask pair, again only when the item mask selects them.
 */
2544 if (ipv6 && ipv6->hdr.hop_limits)
2545 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2546 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2547 (0xfful << IPV6_HDR_TC_SHIFT)))
2548 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2551 case RTE_FLOW_ITEM_TYPE_UDP:
2552 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2553 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2554 /* dst/src port and mask. */
2556 case RTE_FLOW_ITEM_TYPE_TCP:
2557 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2558 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2559 /* dst/src port and mask. */
/* One 32-bit attribute is reserved for a VXLAN item. */
2561 case RTE_FLOW_ITEM_TYPE_VXLAN:
2562 size += SZ_NLATTR_TYPE_OF(uint32_t);
2564 * There might be no VXLAN decap action in the action
2565 * list, nonetheless the VXLAN tunnel flow requires
2566 * the decap structure to be correctly applied to
2567 * VXLAN device, set the flag to create the structure.
2568 * Translation routine will not put the decap action
2569 * in tne Netlink message if there is no actual action
2572 *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2576 "unsupported item %p type %d,"
2577 " items must be validated before flow creation",
2578 (const void *)items, items->type);
2586 * Calculate size of memory to store the VXLAN encapsulation
2587 * related items in the Netlink message buffer. Items list
2588 * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2589 * The item list should be validated.
2592 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2593 * List of pattern items to scan data from.
2596 * The size the part of Netlink message buffer to store the
2597 * VXLAN encapsulation item attributes.
/*
 * Sizing pass over the item list embedded in a VXLAN_ENCAP action
 * (the `definition` member of the action configuration). Adds to `size`
 * the attribute space for the fields each item contributes.
 * NOTE(review): the declaration of `size` and the return statement are not
 * visible in this excerpt.
 */
2600 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2602 const struct rte_flow_item *items;
/* Debug-build sanity checks: right action type and a non-NULL configuration. */
2605 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2606 assert(action->conf);
2608 items = ((const struct rte_flow_action_vxlan_encap *)
2609 action->conf)->definition;
2611 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2612 switch (items->type) {
2613 case RTE_FLOW_ITEM_TYPE_VOID:
2615 case RTE_FLOW_ITEM_TYPE_ETH:
2616 /* This item does not require message buffer. */
2618 case RTE_FLOW_ITEM_TYPE_IPV4: {
2619 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
/* Two IPv4 address attributes are always counted; TTL/TOS only when masked in. */
2621 size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2622 if (ipv4 && ipv4->hdr.time_to_live)
2623 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2624 if (ipv4 && ipv4->hdr.type_of_service)
2625 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2628 case RTE_FLOW_ITEM_TYPE_IPV6: {
2629 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
/* Two IPv6 address attributes are always counted; hop limit/TC only when masked in. */
2631 size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2632 if (ipv6 && ipv6->hdr.hop_limits)
2633 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2634 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2635 (0xfful << IPV6_HDR_TC_SHIFT)))
2636 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2639 case RTE_FLOW_ITEM_TYPE_UDP: {
2640 const struct rte_flow_item_udp *udp = items->mask;
/*
 * One 16-bit attribute is always counted; a second one is added when
 * there is no mask or the mask selects the source port.
 */
2642 size += SZ_NLATTR_TYPE_OF(uint16_t);
2643 if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2644 size += SZ_NLATTR_TYPE_OF(uint16_t);
2647 case RTE_FLOW_ITEM_TYPE_VXLAN:
2648 size += SZ_NLATTR_TYPE_OF(uint32_t);
2653 "unsupported item %p type %d,"
2654 " items must be validated"
2655 " before flow creation",
2656 (const void *)items, items->type);
2664 * Calculate maximum size of memory for flow actions of Linux TC flower and
2665 * extract specified actions.
2667 * @param[in] actions
2668 * Pointer to the list of actions.
2669 * @param[out] action_flags
2670 * Pointer to the detected actions.
2673 * Maximum size of memory for actions.
/*
 * Sizing pass over the action list: each action adds the Netlink attribute
 * space it may need to `size` and records its MLX5_FLOW_ACTION_* bit in the
 * local `flags`, which is stored to *action_flags at the end.
 * NOTE(review): the declarations/initialisers of `size` and `flags` are not
 * visible in this excerpt. If `flags` does not start from the incoming
 * *action_flags, bits set there by an earlier pass (flow_tcf_get_items_size()
 * sets MLX5_FLOW_ACTION_VXLAN_DECAP for a VXLAN item) are overwritten by the
 * final store - confirm.
 */
2676 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2677 uint64_t *action_flags)
2682 size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2683 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2684 switch (actions->type) {
2685 case RTE_FLOW_ACTION_TYPE_VOID:
/* PORT_ID is sized as a "mirred" action carrying a struct tc_mirred. */
2687 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2688 size += SZ_NLATTR_NEST + /* na_act_index. */
2689 SZ_NLATTR_STRZ_OF("mirred") +
2690 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2691 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2692 flags |= MLX5_FLOW_ACTION_PORT_ID;
/* JUMP and DROP are both sized as a "gact" action carrying a struct tc_gact. */
2694 case RTE_FLOW_ACTION_TYPE_JUMP:
2695 size += SZ_NLATTR_NEST + /* na_act_index. */
2696 SZ_NLATTR_STRZ_OF("gact") +
2697 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2698 SZ_NLATTR_TYPE_OF(struct tc_gact);
2699 flags |= MLX5_FLOW_ACTION_JUMP;
2701 case RTE_FLOW_ACTION_TYPE_DROP:
2702 size += SZ_NLATTR_NEST + /* na_act_index. */
2703 SZ_NLATTR_STRZ_OF("gact") +
2704 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2705 SZ_NLATTR_TYPE_OF(struct tc_gact);
2706 flags |= MLX5_FLOW_ACTION_DROP;
/* No size or flag is added for COUNT in the visible lines. */
2708 case RTE_FLOW_ACTION_TYPE_COUNT:
/*
 * The four OpenFlow VLAN actions record their own flag and then jump to the
 * action_of_vlan label (the label line itself is not visible; presumably it
 * precedes the "vlan" sizing below).
 */
2710 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2711 flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2712 goto action_of_vlan;
2713 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2714 flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2715 goto action_of_vlan;
2716 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2717 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2718 goto action_of_vlan;
2719 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2720 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2721 goto action_of_vlan;
2723 size += SZ_NLATTR_NEST + /* na_act_index. */
2724 SZ_NLATTR_STRZ_OF("vlan") +
2725 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2726 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2727 SZ_NLATTR_TYPE_OF(uint16_t) +
2728 /* VLAN protocol. */
2729 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2730 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
/*
 * VXLAN encap: "tunnel_key" action attributes, plus the attributes for the
 * encapsulation items, plus room for the driver-side
 * struct flow_tcf_vxlan_encap kept in the same allocation.
 */
2732 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2733 size += SZ_NLATTR_NEST + /* na_act_index. */
2734 SZ_NLATTR_STRZ_OF("tunnel_key") +
2735 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2736 SZ_NLATTR_TYPE_OF(uint8_t);
2737 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2738 size += flow_tcf_vxlan_encap_size(actions) +
2739 RTE_ALIGN_CEIL /* preceding encap params. */
2740 (sizeof(struct flow_tcf_vxlan_encap),
2742 flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
/* VXLAN decap: same "tunnel_key" attributes plus room for struct flow_tcf_vxlan_decap. */
2744 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2745 size += SZ_NLATTR_NEST + /* na_act_index. */
2746 SZ_NLATTR_STRZ_OF("tunnel_key") +
2747 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2748 SZ_NLATTR_TYPE_OF(uint8_t);
2749 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2750 size += RTE_ALIGN_CEIL /* preceding decap params. */
2751 (sizeof(struct flow_tcf_vxlan_decap),
2753 flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
/*
 * All header-rewrite ("set"/"dec") actions are sized by
 * flow_tcf_get_pedit_actions_size(), which receives &actions and so may
 * advance the iterator itself (its remaining arguments are not visible here).
 */
2755 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2756 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2757 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2758 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2759 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2760 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2761 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2762 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2763 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2764 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2765 size += flow_tcf_get_pedit_actions_size(&actions,
2770 "unsupported action %p type %d,"
2771 " items must be validated before flow creation",
2772 (const void *)actions, actions->type);
/* Publish the collected action bits to the caller. */
2776 *action_flags = flags;
2781 * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2782 * memory required, allocates the memory, initializes Netlink message headers
2783 * and set unique TC message handle.
2786 * Pointer to the flow attributes.
2788 * Pointer to the list of items.
2789 * @param[in] actions
2790 * Pointer to the list of actions.
2792 * Pointer to the error structure.
2795 * Pointer to mlx5_flow object on success,
2796 * otherwise NULL and rte_errno is set.
/*
 * Allocate one zeroed buffer holding, in order: the struct mlx5_flow, an
 * optional VXLAN encap/decap parameter structure, and the Netlink message
 * area (nlmsghdr + tcmsg + attributes), then initialise the message headers
 * and the tcf bookkeeping fields of the flow.
 */
2798 static struct mlx5_flow *
2799 flow_tcf_prepare(const struct rte_flow_attr *attr,
2800 const struct rte_flow_item items[],
2801 const struct rte_flow_action actions[],
2802 struct rte_flow_error *error)
/* Base size: the flow object (padded to tunnel header alignment) plus both message headers. */
2804 size_t size = RTE_ALIGN_CEIL
2805 (sizeof(struct mlx5_flow),
2806 alignof(struct flow_tcf_tunnel_hdr)) +
2807 MNL_ALIGN(sizeof(struct nlmsghdr)) +
2808 MNL_ALIGN(sizeof(struct tcmsg));
2809 struct mlx5_flow *dev_flow;
2810 uint64_t action_flags = 0;
2811 struct nlmsghdr *nlh;
2813 uint8_t *sp, *tun = NULL;
/* Add the space computed for pattern items and for actions; both calls receive &action_flags. */
2815 size += flow_tcf_get_items_size(attr, items, &action_flags);
2816 size += flow_tcf_get_actions_and_size(actions, &action_flags);
2817 dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2819 rte_flow_error_set(error, ENOMEM,
2820 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2821 "not enough memory to create E-Switch flow");
/* `sp` is the first byte after the flow object. */
2824 sp = (uint8_t *)(dev_flow + 1);
/*
 * When a tunnel action is present, `sp` is advanced past the encap/decap
 * parameter structure and `size` is reduced by the same amount, so that
 * `size` again describes only the flow object plus the Netlink area.
 */
2825 if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2827 (sp, alignof(struct flow_tcf_tunnel_hdr));
2829 sp += RTE_ALIGN_CEIL
2830 (sizeof(struct flow_tcf_vxlan_encap),
2833 size -= RTE_ALIGN_CEIL
2834 (sizeof(struct flow_tcf_vxlan_encap),
2837 } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2839 (sp, alignof(struct flow_tcf_tunnel_hdr));
2841 sp += RTE_ALIGN_CEIL
2842 (sizeof(struct flow_tcf_vxlan_decap),
2845 size -= RTE_ALIGN_CEIL
2846 (sizeof(struct flow_tcf_vxlan_decap),
2850 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
/* Lay down the Netlink header and the tcmsg extra header at `sp`. */
2852 nlh = mnl_nlmsg_put_header(sp);
2853 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
/* nlsize is the buffer size left for the Netlink message, i.e. `size` minus the flow object. */
2854 *dev_flow = (struct mlx5_flow){
2855 .tcf = (struct mlx5_flow_tcf){
2857 .nlsize = size - RTE_ALIGN_CEIL
2858 (sizeof(struct mlx5_flow),
2859 alignof(struct flow_tcf_tunnel_hdr)),
2861 .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
/* Tag the tunnel parameter structure with its kind; decap is tested first here. */
2866 if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2867 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2868 else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2869 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2874 * Make adjustments for supporting count actions.
2877 * Pointer to the Ethernet device structure.
2878 * @param[in] dev_flow
2879 * Pointer to mlx5_flow.
2881 * Pointer to error structure.
2884 * 0 On success else a negative errno value is returned and rte_errno is set.
/*
 * Make sure the parent rte_flow of `dev_flow` has a counter: one is
 * requested from flow_tcf_counter_new() only when none is attached yet.
 * A failure is reported through rte_flow_error_set() with rte_errno.
 * `dev` is unused.
 */
2887 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2888 struct mlx5_flow *dev_flow,
2889 struct rte_flow_error *error)
2891 struct rte_flow *flow = dev_flow->flow;
/* Create the counter lazily; an existing counter is kept as is. */
2893 if (!flow->counter) {
2894 flow->counter = flow_tcf_counter_new();
2896 return rte_flow_error_set(error, rte_errno,
2897 RTE_FLOW_ERROR_TYPE_ACTION,
2899 "cannot get counter"
2906 * Convert VXLAN VNI to 32-bit integer.
2909 * VXLAN VNI in 24-bit wire format.
2912 * VXLAN VNI as a 32-bit integer value in network endian.
/*
 * Widen a 3-byte VNI to four bytes by prepending a zero byte.
 * NOTE(review): the enclosing object of `.vni` and the return statement are
 * not visible in this excerpt; presumably the 4-byte array aliases the
 * returned rte_be32_t - confirm.
 */
2914 static inline rte_be32_t
2915 vxlan_vni_as_be32(const uint8_t vni[3])
/* Zero first, then the three VNI bytes in their original order. */
2921 .vni = { 0, vni[0], vni[1], vni[2] },
2927 * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2928 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2929 * in the encapsulation parameters structure. The item must be prevalidated,
2930 * no validation checks are performed by this function.
2933 * RTE_FLOW_ITEM_TYPE_ETH entry specification.
2935 * RTE_FLOW_ITEM_TYPE_ETH entry mask.
2937 * Structure to fill the gathered MAC address data.
/*
 * Copy the MAC addresses of an ETH encapsulation item into `encap` and mark
 * them as present in encap->mask. Each address is taken only when `mask` is
 * NULL or the corresponding mask field equals rte_flow_item_eth_mask (memcmp
 * returns 0); otherwise that address is left untouched.
 */
2940 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2941 const struct rte_flow_item_eth *mask,
2942 struct flow_tcf_vxlan_encap *encap)
2944 /* Item must be validated before. No redundant checks. */
/* Destination MAC. */
2946 if (!mask || !memcmp(&mask->dst,
2947 &rte_flow_item_eth_mask.dst,
2948 sizeof(rte_flow_item_eth_mask.dst))) {
2950 * Ethernet addresses are not supported by
2951 * tc as tunnel_key parameters. Destination
2952 * address is needed to form encap packet
2953 * header and retrieved by kernel from
2954 * implicit sources (ARP table, etc),
2955 * address masks are not supported at all.
2957 encap->eth.dst = spec->dst;
2958 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2960 if (!mask || !memcmp(&mask->src,
2961 &rte_flow_item_eth_mask.src,
2962 sizeof(rte_flow_item_eth_mask.src))) {
2964 * Ethernet addresses are not supported by
2965 * tc as tunnel_key parameters. Source ethernet
2966 * address is ignored anyway.
2968 encap->eth.src = spec->src;
2969 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2974 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2975 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2976 * in the encapsulation parameters structure. The item must be prevalidated,
2977 * no validation checks are performed by this function.
2980 * RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2982 * RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
2984 * Structure to fill the gathered IPV4 address data.
/*
 * Copy the IPv4 fields of an encapsulation item into `encap`. Both addresses
 * are always taken from `spec`; TOS and TTL are taken only when `mask` is
 * non-NULL and selects the field. encap->mask records which fields were set.
 */
2987 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2988 const struct rte_flow_item_ipv4 *mask,
2989 struct flow_tcf_vxlan_encap *encap)
2991 /* Item must be validated before. No redundant checks. */
/* Addresses are copied unconditionally, as stored in the item. */
2993 encap->ipv4.dst = spec->hdr.dst_addr;
2994 encap->ipv4.src = spec->hdr.src_addr;
2995 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
2996 FLOW_TCF_ENCAP_IPV4_DST;
/* Optional fields: any non-zero mask value enables the whole field. */
2997 if (mask && mask->hdr.type_of_service) {
2998 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
2999 encap->ip_tos = spec->hdr.type_of_service;
3001 if (mask && mask->hdr.time_to_live) {
3002 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3003 encap->ip_ttl_hop = spec->hdr.time_to_live;
3008 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3009 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3010 * in the encapsulation parameters structure. The item must be prevalidated,
3011 * no validation checks are performed by this function.
3014 * RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3016 * RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3018 * Structure to fill the gathered IPV6 address data.
/*
 * Copy the IPv6 fields of an encapsulation item into `encap`. Both addresses
 * are always copied from `spec`; the 8 bits of vtc_flow at IPV6_HDR_TC_SHIFT
 * (stored in ip_tos) and the hop limit (stored in ip_ttl_hop) are taken only
 * when the mask selects them. encap->mask records which fields were set.
 * NOTE(review): `mask` is dereferenced below; a NULL guard is not visible in
 * this excerpt - confirm one exists.
 */
3021 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3022 const struct rte_flow_item_ipv6 *mask,
3023 struct flow_tcf_vxlan_encap *encap)
3025 /* Item must be validated before. No redundant checks. */
3027 memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3028 memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3029 encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3030 FLOW_TCF_ENCAP_IPV6_DST;
/* vtc_flow is converted from big endian before the 8-bit field is extracted. */
3032 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3033 IPV6_HDR_TC_SHIFT) & 0xff) {
3034 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3035 encap->ip_tos = (rte_be_to_cpu_32
3036 (spec->hdr.vtc_flow) >>
3037 IPV6_HDR_TC_SHIFT) & 0xff;
3039 if (mask->hdr.hop_limits) {
3040 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3041 encap->ip_ttl_hop = spec->hdr.hop_limits;
3047 * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3048 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3049 * in the encapsulation parameters structure. The item must be prevalidated,
3050 * no validation checks are performed by this function.
3053 * RTE_FLOW_ITEM_TYPE_UDP entry specification.
3055 * RTE_FLOW_ITEM_TYPE_UDP entry mask.
3057 * Structure to fill the gathered UDP port data.
/*
 * Copy the UDP ports of an encapsulation item into `encap`.
 *   spec  - UDP item specification; read unconditionally.
 *   mask  - UDP item mask, may be NULL.
 *   encap - receives udp.dst / udp.src and the matching presence bits.
 * The destination port is always taken. The source port is taken when there
 * is no mask or the mask selects the source port.
 */
3060 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3061 const struct rte_flow_item_udp *mask,
3062 struct flow_tcf_vxlan_encap *encap)
3065 encap->udp.dst = spec->hdr.dst_port;
3066 encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3067 if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3068 encap->udp.src = spec->hdr.src_port;
/*
 * Flag the UDP source port. The IPv4 source-address bit must not be set
 * here: this helper stores no IPv4 address, and the item may belong to an
 * IPv6 encapsulation.
 */
3069 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
3074 * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3075 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3076 * in the encapsulation parameters structure. The item must be prevalidated,
3077 * no validation checks are performed by this function.
3080 * RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3082 * Structure to fill the gathered VNI address data.
/*
 * Copy the VNI bytes of a VXLAN encapsulation item into `encap` and mark the
 * VNI as present in encap->mask. The item mask is not consulted.
 */
3085 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3086 struct flow_tcf_vxlan_encap *encap)
3088 /* Item must be validated before. No redundant checks. */
/* The copy length is the size of the destination field. */
3090 memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3091 encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3095 * Populate consolidated encapsulation object from list of pattern items.
3097 * Helper function to process configuration of action such as
3098 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3099 * validated, there is no way to return a meaningful error.
3102 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3103 * List of pattern items to gather data from.
3105 * Structure to fill gathered data.
/*
 * Walk the item list attached to a VXLAN_ENCAP action (the `definition`
 * member of its configuration) and hand each item's spec/mask to the
 * matching flow_tcf_parse_vxlan_encap_*() helper, which fills `encap`.
 * NOTE(review): the enclosing declarations of `spec` and `mask` are only
 * partly visible here; the member lists below suggest typed views of
 * items->spec / items->mask.
 */
3108 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3109 struct flow_tcf_vxlan_encap *encap)
3112 const struct rte_flow_item_eth *eth;
3113 const struct rte_flow_item_ipv4 *ipv4;
3114 const struct rte_flow_item_ipv6 *ipv6;
3115 const struct rte_flow_item_udp *udp;
3116 const struct rte_flow_item_vxlan *vxlan;
3118 const struct rte_flow_item *items;
/* Debug-build sanity checks: right action type and a non-NULL configuration. */
3120 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3121 assert(action->conf);
3123 items = ((const struct rte_flow_action_vxlan_encap *)
3124 action->conf)->definition;
/* Dispatch on the item type until the END item. */
3126 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3127 switch (items->type) {
3128 case RTE_FLOW_ITEM_TYPE_VOID:
3130 case RTE_FLOW_ITEM_TYPE_ETH:
3131 mask.eth = items->mask;
3132 spec.eth = items->spec;
3133 flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3136 case RTE_FLOW_ITEM_TYPE_IPV4:
3137 spec.ipv4 = items->spec;
3138 mask.ipv4 = items->mask;
3139 flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3142 case RTE_FLOW_ITEM_TYPE_IPV6:
3143 spec.ipv6 = items->spec;
3144 mask.ipv6 = items->mask;
3145 flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3148 case RTE_FLOW_ITEM_TYPE_UDP:
3149 mask.udp = items->mask;
3150 spec.udp = items->spec;
3151 flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
/* The VXLAN item contributes only its spec; no mask is passed on. */
3154 case RTE_FLOW_ITEM_TYPE_VXLAN:
3155 spec.vxlan = items->spec;
3156 flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3161 "unsupported item %p type %d,"
3162 " items must be validated"
3163 " before flow creation",
3164 (const void *)items, items->type);
3172 * Translate flow for Linux TC flower and construct Netlink message.
3175 * Pointer to the priv structure.
3176 * @param[in, out] flow
3177 * Pointer to the sub flow.
3179 * Pointer to the flow attributes.
3181 * Pointer to the list of items.
3182 * @param[in] actions
3183 * Pointer to the list of actions.
3185 * Pointer to the error structure.
3188 * 0 on success, a negative errno value otherwise and rte_errno is set.
3191 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3192 const struct rte_flow_attr *attr,
3193 const struct rte_flow_item items[],
3194 const struct rte_flow_action actions[],
3195 struct rte_flow_error *error)
3198 const struct rte_flow_item_port_id *port_id;
3199 const struct rte_flow_item_eth *eth;
3200 const struct rte_flow_item_vlan *vlan;
3201 const struct rte_flow_item_ipv4 *ipv4;
3202 const struct rte_flow_item_ipv6 *ipv6;
3203 const struct rte_flow_item_tcp *tcp;
3204 const struct rte_flow_item_udp *udp;
3205 const struct rte_flow_item_vxlan *vxlan;
3208 const struct rte_flow_action_port_id *port_id;
3209 const struct rte_flow_action_jump *jump;
3210 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3211 const struct rte_flow_action_of_set_vlan_vid *
3213 const struct rte_flow_action_of_set_vlan_pcp *
3217 struct flow_tcf_tunnel_hdr *hdr;
3218 struct flow_tcf_vxlan_decap *vxlan;
3223 struct flow_tcf_tunnel_hdr *hdr;
3224 struct flow_tcf_vxlan_encap *vxlan;
3228 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3229 struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3230 struct tcmsg *tcm = dev_flow->tcf.tcm;
3231 uint32_t na_act_index_cur;
3232 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3233 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3234 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3235 bool ip_proto_set = 0;
3236 bool tunnel_outer = 0;
3237 struct nlattr *na_flower;
3238 struct nlattr *na_flower_act;
3239 struct nlattr *na_vlan_id = NULL;
3240 struct nlattr *na_vlan_priority = NULL;
3241 uint64_t item_flags = 0;
3244 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3245 PTOI_TABLE_SZ_MAX(dev)));
3246 if (dev_flow->tcf.tunnel) {
3247 switch (dev_flow->tcf.tunnel->type) {
3248 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3249 decap.vxlan = dev_flow->tcf.vxlan_decap;
3252 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3253 encap.vxlan = dev_flow->tcf.vxlan_encap;
3255 /* New tunnel actions can be added here. */
3261 nlh = dev_flow->tcf.nlh;
3262 tcm = dev_flow->tcf.tcm;
3263 /* Prepare API must have been called beforehand. */
3264 assert(nlh != NULL && tcm != NULL);
3265 tcm->tcm_family = AF_UNSPEC;
3266 tcm->tcm_ifindex = ptoi[0].ifindex;
3267 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3269 * Priority cannot be zero to prevent the kernel from picking one
3272 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3273 if (attr->group > 0)
3274 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3275 mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3276 na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3277 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3280 switch (items->type) {
3281 case RTE_FLOW_ITEM_TYPE_VOID:
3283 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3284 mask.port_id = flow_tcf_item_mask
3285 (items, &rte_flow_item_port_id_mask,
3286 &flow_tcf_mask_supported.port_id,
3287 &flow_tcf_mask_empty.port_id,
3288 sizeof(flow_tcf_mask_supported.port_id),
3290 assert(mask.port_id);
3291 if (mask.port_id == &flow_tcf_mask_empty.port_id)
3293 spec.port_id = items->spec;
3294 if (!mask.port_id->id)
3297 for (i = 0; ptoi[i].ifindex; ++i)
3298 if (ptoi[i].port_id == spec.port_id->id)
3300 assert(ptoi[i].ifindex);
3301 tcm->tcm_ifindex = ptoi[i].ifindex;
3303 case RTE_FLOW_ITEM_TYPE_ETH:
3304 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3305 MLX5_FLOW_LAYER_INNER_L2 :
3306 MLX5_FLOW_LAYER_OUTER_L2;
3307 mask.eth = flow_tcf_item_mask
3308 (items, &rte_flow_item_eth_mask,
3309 &flow_tcf_mask_supported.eth,
3310 &flow_tcf_mask_empty.eth,
3311 sizeof(flow_tcf_mask_supported.eth),
3314 if (mask.eth == &flow_tcf_mask_empty.eth)
3316 spec.eth = items->spec;
3317 if (mask.eth->type) {
3318 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3319 inner_etype = spec.eth->type;
3321 outer_etype = spec.eth->type;
3325 "outer L2 addresses cannot be"
3326 " forced is outer ones for tunnel,"
3327 " parameter is ignored");
3330 if (!is_zero_ether_addr(&mask.eth->dst)) {
3331 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3333 spec.eth->dst.addr_bytes);
3334 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3336 mask.eth->dst.addr_bytes);
3338 if (!is_zero_ether_addr(&mask.eth->src)) {
3339 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3341 spec.eth->src.addr_bytes);
3342 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3344 mask.eth->src.addr_bytes);
3346 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3348 case RTE_FLOW_ITEM_TYPE_VLAN:
3351 assert(!tunnel_outer);
3352 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3353 mask.vlan = flow_tcf_item_mask
3354 (items, &rte_flow_item_vlan_mask,
3355 &flow_tcf_mask_supported.vlan,
3356 &flow_tcf_mask_empty.vlan,
3357 sizeof(flow_tcf_mask_supported.vlan),
3360 if (mask.vlan == &flow_tcf_mask_empty.vlan)
3362 spec.vlan = items->spec;
3363 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3364 outer_etype == RTE_BE16(ETH_P_8021Q));
3365 outer_etype = RTE_BE16(ETH_P_8021Q);
3366 if (mask.vlan->inner_type)
3367 vlan_etype = spec.vlan->inner_type;
3368 if (mask.vlan->tci & RTE_BE16(0xe000))
3369 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3371 (spec.vlan->tci) >> 13) & 0x7);
3372 if (mask.vlan->tci & RTE_BE16(0x0fff))
3373 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3377 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3379 case RTE_FLOW_ITEM_TYPE_IPV4:
3380 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3381 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3382 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3383 mask.ipv4 = flow_tcf_item_mask
3384 (items, &rte_flow_item_ipv4_mask,
3385 &flow_tcf_mask_supported.ipv4,
3386 &flow_tcf_mask_empty.ipv4,
3387 sizeof(flow_tcf_mask_supported.ipv4),
3390 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3391 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3392 inner_etype == RTE_BE16(ETH_P_IP));
3393 inner_etype = RTE_BE16(ETH_P_IP);
3394 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3395 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3396 vlan_etype == RTE_BE16(ETH_P_IP));
3397 vlan_etype = RTE_BE16(ETH_P_IP);
3399 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3400 outer_etype == RTE_BE16(ETH_P_IP));
3401 outer_etype = RTE_BE16(ETH_P_IP);
3403 spec.ipv4 = items->spec;
3404 if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3406 * No way to set IP protocol for outer tunnel
3407 * layers. Usually it is fixed, for example,
3408 * to UDP for VXLAN/GPE.
3410 assert(spec.ipv4); /* Mask is not empty. */
3411 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3412 spec.ipv4->hdr.next_proto_id);
3415 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3416 (!mask.ipv4->hdr.src_addr &&
3417 !mask.ipv4->hdr.dst_addr)) {
3421 * For tunnel outer we must set outer IP key
3422 * anyway, even if the specification/mask is
3423 * empty. There is no other way to tell
3424 * kernel about the outer layer protocol.
3427 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3428 mask.ipv4->hdr.src_addr);
3430 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3431 mask.ipv4->hdr.src_addr);
3432 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3435 if (mask.ipv4->hdr.src_addr) {
3437 (nlh, tunnel_outer ?
3438 TCA_FLOWER_KEY_ENC_IPV4_SRC :
3439 TCA_FLOWER_KEY_IPV4_SRC,
3440 spec.ipv4->hdr.src_addr);
3442 (nlh, tunnel_outer ?
3443 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3444 TCA_FLOWER_KEY_IPV4_SRC_MASK,
3445 mask.ipv4->hdr.src_addr);
3447 if (mask.ipv4->hdr.dst_addr) {
3449 (nlh, tunnel_outer ?
3450 TCA_FLOWER_KEY_ENC_IPV4_DST :
3451 TCA_FLOWER_KEY_IPV4_DST,
3452 spec.ipv4->hdr.dst_addr);
3454 (nlh, tunnel_outer ?
3455 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3456 TCA_FLOWER_KEY_IPV4_DST_MASK,
3457 mask.ipv4->hdr.dst_addr);
3459 if (mask.ipv4->hdr.time_to_live) {
3461 (nlh, tunnel_outer ?
3462 TCA_FLOWER_KEY_ENC_IP_TTL :
3463 TCA_FLOWER_KEY_IP_TTL,
3464 spec.ipv4->hdr.time_to_live);
3466 (nlh, tunnel_outer ?
3467 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3468 TCA_FLOWER_KEY_IP_TTL_MASK,
3469 mask.ipv4->hdr.time_to_live);
3471 if (mask.ipv4->hdr.type_of_service) {
3473 (nlh, tunnel_outer ?
3474 TCA_FLOWER_KEY_ENC_IP_TOS :
3475 TCA_FLOWER_KEY_IP_TOS,
3476 spec.ipv4->hdr.type_of_service);
3478 (nlh, tunnel_outer ?
3479 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3480 TCA_FLOWER_KEY_IP_TOS_MASK,
3481 mask.ipv4->hdr.type_of_service);
3483 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3485 case RTE_FLOW_ITEM_TYPE_IPV6: {
3486 bool ipv6_src, ipv6_dst;
3489 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3490 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3491 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3492 mask.ipv6 = flow_tcf_item_mask
3493 (items, &rte_flow_item_ipv6_mask,
3494 &flow_tcf_mask_supported.ipv6,
3495 &flow_tcf_mask_empty.ipv6,
3496 sizeof(flow_tcf_mask_supported.ipv6),
3499 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3500 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3501 inner_etype == RTE_BE16(ETH_P_IPV6));
3502 inner_etype = RTE_BE16(ETH_P_IPV6);
3503 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3504 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3505 vlan_etype == RTE_BE16(ETH_P_IPV6));
3506 vlan_etype = RTE_BE16(ETH_P_IPV6);
3508 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3509 outer_etype == RTE_BE16(ETH_P_IPV6));
3510 outer_etype = RTE_BE16(ETH_P_IPV6);
3512 spec.ipv6 = items->spec;
3513 if (!tunnel_outer && mask.ipv6->hdr.proto) {
3515 * No way to set IP protocol for outer tunnel
3516 * layers. Usually it is fixed, for example,
3517 * to UDP for VXLAN/GPE.
3519 assert(spec.ipv6); /* Mask is not empty. */
3520 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3521 spec.ipv6->hdr.proto);
3524 ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3525 (mask.ipv6->hdr.dst_addr);
3526 ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3527 (mask.ipv6->hdr.src_addr);
3528 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3529 (!ipv6_dst && !ipv6_src)) {
3533 * For tunnel outer we must set outer IP key
3534 * anyway, even if the specification/mask is
3535 * empty. There is no other way to tell
3536 * kernel about the outer layer protocol.
3539 TCA_FLOWER_KEY_ENC_IPV6_SRC,
3541 mask.ipv6->hdr.src_addr);
3543 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3545 mask.ipv6->hdr.src_addr);
3546 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3550 mnl_attr_put(nlh, tunnel_outer ?
3551 TCA_FLOWER_KEY_ENC_IPV6_SRC :
3552 TCA_FLOWER_KEY_IPV6_SRC,
3554 spec.ipv6->hdr.src_addr);
3555 mnl_attr_put(nlh, tunnel_outer ?
3556 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3557 TCA_FLOWER_KEY_IPV6_SRC_MASK,
3559 mask.ipv6->hdr.src_addr);
3562 mnl_attr_put(nlh, tunnel_outer ?
3563 TCA_FLOWER_KEY_ENC_IPV6_DST :
3564 TCA_FLOWER_KEY_IPV6_DST,
3566 spec.ipv6->hdr.dst_addr);
3567 mnl_attr_put(nlh, tunnel_outer ?
3568 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3569 TCA_FLOWER_KEY_IPV6_DST_MASK,
3571 mask.ipv6->hdr.dst_addr);
3573 if (mask.ipv6->hdr.hop_limits) {
3575 (nlh, tunnel_outer ?
3576 TCA_FLOWER_KEY_ENC_IP_TTL :
3577 TCA_FLOWER_KEY_IP_TTL,
3578 spec.ipv6->hdr.hop_limits);
3580 (nlh, tunnel_outer ?
3581 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3582 TCA_FLOWER_KEY_IP_TTL_MASK,
3583 mask.ipv6->hdr.hop_limits);
3585 msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3586 IPV6_HDR_TC_SHIFT) & 0xff;
3588 tos6 = (rte_be_to_cpu_32
3589 (spec.ipv6->hdr.vtc_flow) >>
3590 IPV6_HDR_TC_SHIFT) & 0xff;
3592 (nlh, tunnel_outer ?
3593 TCA_FLOWER_KEY_ENC_IP_TOS :
3594 TCA_FLOWER_KEY_IP_TOS, tos6);
3596 (nlh, tunnel_outer ?
3597 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3598 TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3600 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3603 case RTE_FLOW_ITEM_TYPE_UDP:
3604 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3605 MLX5_FLOW_LAYER_INNER_L4_UDP :
3606 MLX5_FLOW_LAYER_OUTER_L4_UDP;
3607 mask.udp = flow_tcf_item_mask
3608 (items, &rte_flow_item_udp_mask,
3609 &flow_tcf_mask_supported.udp,
3610 &flow_tcf_mask_empty.udp,
3611 sizeof(flow_tcf_mask_supported.udp),
3614 spec.udp = items->spec;
3615 if (!tunnel_outer) {
3618 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3620 if (mask.udp == &flow_tcf_mask_empty.udp)
3623 assert(mask.udp != &flow_tcf_mask_empty.udp);
3624 decap.vxlan->udp_port =
3626 (spec.udp->hdr.dst_port);
3628 if (mask.udp->hdr.src_port) {
3630 (nlh, tunnel_outer ?
3631 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3632 TCA_FLOWER_KEY_UDP_SRC,
3633 spec.udp->hdr.src_port);
3635 (nlh, tunnel_outer ?
3636 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3637 TCA_FLOWER_KEY_UDP_SRC_MASK,
3638 mask.udp->hdr.src_port);
3640 if (mask.udp->hdr.dst_port) {
3642 (nlh, tunnel_outer ?
3643 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3644 TCA_FLOWER_KEY_UDP_DST,
3645 spec.udp->hdr.dst_port);
3647 (nlh, tunnel_outer ?
3648 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3649 TCA_FLOWER_KEY_UDP_DST_MASK,
3650 mask.udp->hdr.dst_port);
3652 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3654 case RTE_FLOW_ITEM_TYPE_TCP:
3655 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3656 MLX5_FLOW_LAYER_INNER_L4_TCP :
3657 MLX5_FLOW_LAYER_OUTER_L4_TCP;
3658 mask.tcp = flow_tcf_item_mask
3659 (items, &rte_flow_item_tcp_mask,
3660 &flow_tcf_mask_supported.tcp,
3661 &flow_tcf_mask_empty.tcp,
3662 sizeof(flow_tcf_mask_supported.tcp),
3666 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3668 if (mask.tcp == &flow_tcf_mask_empty.tcp)
3670 spec.tcp = items->spec;
3671 if (mask.tcp->hdr.src_port) {
3672 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3673 spec.tcp->hdr.src_port);
3674 mnl_attr_put_u16(nlh,
3675 TCA_FLOWER_KEY_TCP_SRC_MASK,
3676 mask.tcp->hdr.src_port);
3678 if (mask.tcp->hdr.dst_port) {
3679 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3680 spec.tcp->hdr.dst_port);
3681 mnl_attr_put_u16(nlh,
3682 TCA_FLOWER_KEY_TCP_DST_MASK,
3683 mask.tcp->hdr.dst_port);
3685 if (mask.tcp->hdr.tcp_flags) {
3688 TCA_FLOWER_KEY_TCP_FLAGS,
3690 (spec.tcp->hdr.tcp_flags));
3693 TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3695 (mask.tcp->hdr.tcp_flags));
3697 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3699 case RTE_FLOW_ITEM_TYPE_VXLAN:
3700 assert(decap.vxlan);
3702 item_flags |= MLX5_FLOW_LAYER_VXLAN;
3703 spec.vxlan = items->spec;
3704 mnl_attr_put_u32(nlh,
3705 TCA_FLOWER_KEY_ENC_KEY_ID,
3706 vxlan_vni_as_be32(spec.vxlan->vni));
3707 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3710 return rte_flow_error_set(error, ENOTSUP,
3711 RTE_FLOW_ERROR_TYPE_ITEM,
3712 NULL, "item not supported");
3716 * Set the ether_type flower key and tc rule protocol:
3717 * - if there is neither VLAN nor VXLAN the key is taken from
3718 * eth item directly or deduced from L3 items.
3719 * - if there is vlan item then key is fixed to 802.1q.
3720 * - if there is vxlan item then key is set to inner tunnel type.
3721 * - simultaneous vlan and vxlan items are prohibited.
3723 if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3724 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3726 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3727 if (inner_etype != RTE_BE16(ETH_P_ALL))
3728 mnl_attr_put_u16(nlh,
3729 TCA_FLOWER_KEY_ETH_TYPE,
3732 mnl_attr_put_u16(nlh,
3733 TCA_FLOWER_KEY_ETH_TYPE,
3735 if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3736 vlan_etype != RTE_BE16(ETH_P_ALL))
3737 mnl_attr_put_u16(nlh,
3738 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3741 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3743 na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3744 na_act_index_cur = 1;
3745 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3746 struct nlattr *na_act_index;
3747 struct nlattr *na_act;
3748 unsigned int vlan_act;
3751 switch (actions->type) {
3752 case RTE_FLOW_ACTION_TYPE_VOID:
3754 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3755 conf.port_id = actions->conf;
3756 if (conf.port_id->original)
3759 for (i = 0; ptoi[i].ifindex; ++i)
3760 if (ptoi[i].port_id == conf.port_id->id)
3762 assert(ptoi[i].ifindex);
3764 mnl_attr_nest_start(nlh, na_act_index_cur++);
3765 assert(na_act_index);
3766 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3767 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3770 assert(dev_flow->tcf.tunnel);
3771 dev_flow->tcf.tunnel->ifindex_ptr =
3772 &((struct tc_mirred *)
3773 mnl_attr_get_payload
3774 (mnl_nlmsg_get_payload_tail
3777 mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3778 sizeof(struct tc_mirred),
3779 &(struct tc_mirred){
3780 .action = TC_ACT_STOLEN,
3781 .eaction = TCA_EGRESS_REDIR,
3782 .ifindex = ptoi[i].ifindex,
3784 mnl_attr_nest_end(nlh, na_act);
3785 mnl_attr_nest_end(nlh, na_act_index);
3787 case RTE_FLOW_ACTION_TYPE_JUMP:
3788 conf.jump = actions->conf;
3790 mnl_attr_nest_start(nlh, na_act_index_cur++);
3791 assert(na_act_index);
3792 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3793 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3795 mnl_attr_put(nlh, TCA_GACT_PARMS,
3796 sizeof(struct tc_gact),
3798 .action = TC_ACT_GOTO_CHAIN |
3801 mnl_attr_nest_end(nlh, na_act);
3802 mnl_attr_nest_end(nlh, na_act_index);
3804 case RTE_FLOW_ACTION_TYPE_DROP:
3806 mnl_attr_nest_start(nlh, na_act_index_cur++);
3807 assert(na_act_index);
3808 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3809 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3811 mnl_attr_put(nlh, TCA_GACT_PARMS,
3812 sizeof(struct tc_gact),
3814 .action = TC_ACT_SHOT,
3816 mnl_attr_nest_end(nlh, na_act);
3817 mnl_attr_nest_end(nlh, na_act_index);
3819 case RTE_FLOW_ACTION_TYPE_COUNT:
3821 * Driver adds the count action implicitly for
3822 * each rule it creates.
3824 ret = flow_tcf_translate_action_count(dev,
3829 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3830 conf.of_push_vlan = NULL;
3831 vlan_act = TCA_VLAN_ACT_POP;
3832 goto action_of_vlan;
3833 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3834 conf.of_push_vlan = actions->conf;
3835 vlan_act = TCA_VLAN_ACT_PUSH;
3836 goto action_of_vlan;
3837 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3838 conf.of_set_vlan_vid = actions->conf;
3840 goto override_na_vlan_id;
3841 vlan_act = TCA_VLAN_ACT_MODIFY;
3842 goto action_of_vlan;
3843 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3844 conf.of_set_vlan_pcp = actions->conf;
3845 if (na_vlan_priority)
3846 goto override_na_vlan_priority;
3847 vlan_act = TCA_VLAN_ACT_MODIFY;
3848 goto action_of_vlan;
3851 mnl_attr_nest_start(nlh, na_act_index_cur++);
3852 assert(na_act_index);
3853 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3854 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3856 mnl_attr_put(nlh, TCA_VLAN_PARMS,
3857 sizeof(struct tc_vlan),
3859 .action = TC_ACT_PIPE,
3860 .v_action = vlan_act,
3862 if (vlan_act == TCA_VLAN_ACT_POP) {
3863 mnl_attr_nest_end(nlh, na_act);
3864 mnl_attr_nest_end(nlh, na_act_index);
3867 if (vlan_act == TCA_VLAN_ACT_PUSH)
3868 mnl_attr_put_u16(nlh,
3869 TCA_VLAN_PUSH_VLAN_PROTOCOL,
3870 conf.of_push_vlan->ethertype);
3871 na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3872 mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3873 na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3874 mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3875 mnl_attr_nest_end(nlh, na_act);
3876 mnl_attr_nest_end(nlh, na_act_index);
3877 if (actions->type ==
3878 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3879 override_na_vlan_id:
3880 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3881 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3883 (conf.of_set_vlan_vid->vlan_vid);
3884 } else if (actions->type ==
3885 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3886 override_na_vlan_priority:
3887 na_vlan_priority->nla_type =
3888 TCA_VLAN_PUSH_VLAN_PRIORITY;
3889 *(uint8_t *)mnl_attr_get_payload
3890 (na_vlan_priority) =
3891 conf.of_set_vlan_pcp->vlan_pcp;
3894 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3895 assert(decap.vxlan);
3896 assert(dev_flow->tcf.tunnel);
3897 dev_flow->tcf.tunnel->ifindex_ptr =
3898 (unsigned int *)&tcm->tcm_ifindex;
3900 mnl_attr_nest_start(nlh, na_act_index_cur++);
3901 assert(na_act_index);
3902 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3903 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3905 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3906 sizeof(struct tc_tunnel_key),
3907 &(struct tc_tunnel_key){
3908 .action = TC_ACT_PIPE,
3909 .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3911 mnl_attr_nest_end(nlh, na_act);
3912 mnl_attr_nest_end(nlh, na_act_index);
3913 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3915 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3916 assert(encap.vxlan);
3917 flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3919 mnl_attr_nest_start(nlh, na_act_index_cur++);
3920 assert(na_act_index);
3921 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3922 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3924 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3925 sizeof(struct tc_tunnel_key),
3926 &(struct tc_tunnel_key){
3927 .action = TC_ACT_PIPE,
3928 .t_action = TCA_TUNNEL_KEY_ACT_SET,
3930 if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3931 mnl_attr_put_u16(nlh,
3932 TCA_TUNNEL_KEY_ENC_DST_PORT,
3933 encap.vxlan->udp.dst);
3934 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3935 mnl_attr_put_u32(nlh,
3936 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3937 encap.vxlan->ipv4.src);
3938 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3939 mnl_attr_put_u32(nlh,
3940 TCA_TUNNEL_KEY_ENC_IPV4_DST,
3941 encap.vxlan->ipv4.dst);
3942 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3944 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3945 sizeof(encap.vxlan->ipv6.src),
3946 &encap.vxlan->ipv6.src);
3947 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3949 TCA_TUNNEL_KEY_ENC_IPV6_DST,
3950 sizeof(encap.vxlan->ipv6.dst),
3951 &encap.vxlan->ipv6.dst);
3952 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3953 mnl_attr_put_u8(nlh,
3954 TCA_TUNNEL_KEY_ENC_TTL,
3955 encap.vxlan->ip_ttl_hop);
3956 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3957 mnl_attr_put_u8(nlh,
3958 TCA_TUNNEL_KEY_ENC_TOS,
3959 encap.vxlan->ip_tos);
3960 if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3961 mnl_attr_put_u32(nlh,
3962 TCA_TUNNEL_KEY_ENC_KEY_ID,
3964 (encap.vxlan->vxlan.vni));
3965 mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3966 mnl_attr_nest_end(nlh, na_act);
3967 mnl_attr_nest_end(nlh, na_act_index);
3968 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3970 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3971 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3972 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3973 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3974 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3975 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3976 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3977 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3978 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3979 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3981 mnl_attr_nest_start(nlh, na_act_index_cur++);
3982 flow_tcf_create_pedit_mnl_msg(nlh,
3983 &actions, item_flags);
3984 mnl_attr_nest_end(nlh, na_act_index);
3987 return rte_flow_error_set(error, ENOTSUP,
3988 RTE_FLOW_ERROR_TYPE_ACTION,
3990 "action not supported");
3994 assert(na_flower_act);
3995 mnl_attr_nest_end(nlh, na_flower_act);
3996 dev_flow->tcf.ptc_flags = mnl_attr_get_payload
3997 (mnl_nlmsg_get_payload_tail(nlh));
3998 mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
3999 0 : TCA_CLS_FLAGS_SKIP_SW);
4000 mnl_attr_nest_end(nlh, na_flower);
4001 if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4002 dev_flow->tcf.tunnel->ifindex_org =
4003 *dev_flow->tcf.tunnel->ifindex_ptr;
4004 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4009 * Send Netlink message with acknowledgment.
4012 * Flow context to use.
4014 * Message to send. This function always raises the NLM_F_ACK flag before
4017 * Callback handler for received message.
4019 * Context pointer for callback handler.
4022 * 0 on success, a negative errno value otherwise and rte_errno is set.
4025 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4026 struct nlmsghdr *nlh,
4027 mnl_cb_t cb, void *arg)
4029 unsigned int portid = mnl_socket_get_portid(tcf->nl);
4030 uint32_t seq = tcf->seq++;
4036 /* seq 0 is reserved for kernel event-driven notifications. */
4039 nlh->nlmsg_seq = seq;
4040 nlh->nlmsg_flags |= NLM_F_ACK;
4041 ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4043 /* Message send error occurred. */
4047 nlh = (struct nlmsghdr *)(tcf->buf);
4049 * The following loop postpones non-fatal errors until multipart
4050 * messages are complete.
4053 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4057 * In case of overflow, keep receiving until the
4058 * end of the multipart message. We may lose part
4059 * of the reply messages, but mark and return an error.
4061 if (err != ENOSPC ||
4062 !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4063 nlh->nlmsg_type == NLMSG_DONE)
4066 ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4069 * libmnl returns 0 if DONE or
4070 * success ACK message found.
4076 * ACK message with error found
4077 * or some error occurred.
4082 /* We should continue receiving. */
4091 #define MNL_BUF_EXTRA_SPACE 16
4092 #define MNL_REQUEST_SIZE_MIN 256
4093 #define MNL_REQUEST_SIZE_MAX 2048
4094 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4095 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
4097 /* Data structures used by flow_tcf_xxx_cb() routines. */
4098 struct tcf_nlcb_buf {
4099 LIST_ENTRY(tcf_nlcb_buf) next;
4101 alignas(struct nlmsghdr)
4102 uint8_t msg[]; /**< Netlink message data. */
4105 struct tcf_nlcb_context {
4106 unsigned int ifindex; /**< Base interface index. */
4108 LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4112 * Allocate space for netlink command in buffer list
4114 * @param[in, out] ctx
4115 * Pointer to callback context with command buffers list.
4117 * Required size of data buffer to be allocated.
4120 * Pointer to allocated memory, aligned as message header.
4121 * NULL if some error occurred.
4123 static struct nlmsghdr *
4124 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4126 struct tcf_nlcb_buf *buf;
4127 struct nlmsghdr *nlh;
4129 size = NLMSG_ALIGN(size);
4130 buf = LIST_FIRST(&ctx->nlbuf);
4131 if (buf && (buf->size + size) <= ctx->bufsize) {
4132 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4136 if (size > ctx->bufsize) {
4137 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4140 buf = rte_malloc(__func__,
4141 ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4142 alignof(struct tcf_nlcb_buf));
4144 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4147 LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4149 nlh = (struct nlmsghdr *)&buf->msg[0];
4154 * Send the buffers with prepared netlink commands. Scans the list and
4155 * sends all found buffers. Buffers are sent and freed anyway in order
4156 * to prevent memory leakage if some error occurs.
4159 * Context object initialized by mlx5_flow_tcf_context_create().
4160 * @param[in, out] ctx
4161 * Pointer to callback context with command buffers list.
4164 * Zero value on success, negative errno value otherwise
4165 * and rte_errno is set.
4168 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4169 struct tcf_nlcb_context *ctx)
4171 struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4175 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4176 struct nlmsghdr *nlh;
4180 while (msg < bc->size) {
4182 * Send Netlink commands from buffer in one by one
4183 * fashion. If we send multiple rule deletion commands
4184 * in one Netlink message and some error occurs it may
4185 * cause multiple ACK error messages and break sequence
4186 * numbers of Netlink communication, because we expect
4187 * only one ACK reply.
4189 assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4190 nlh = (struct nlmsghdr *)&bc->msg[msg];
4191 assert((bc->size - msg) >= nlh->nlmsg_len);
4192 msg += nlh->nlmsg_len;
4193 rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4196 "netlink: cleanup error %d", rc);
4204 LIST_INIT(&ctx->nlbuf);
4209 * Collect local IP address rules with scope link attribute on specified
4210 * network device. This is callback routine called by libmnl mnl_cb_run()
4211 * in loop for every message in received packet.
4214 * Pointer to reply header.
4215 * @param[in, out] arg
4216 * Opaque data pointer for this callback.
4219 * A positive, nonzero value on success, negative errno value otherwise
4220 * and rte_errno is set.
4223 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4225 struct tcf_nlcb_context *ctx = arg;
4226 struct nlmsghdr *cmd;
4227 struct ifaddrmsg *ifa;
4229 struct nlattr *na_local = NULL;
4230 struct nlattr *na_peer = NULL;
4231 unsigned char family;
4234 if (nlh->nlmsg_type != RTM_NEWADDR) {
4238 ifa = mnl_nlmsg_get_payload(nlh);
4239 family = ifa->ifa_family;
4240 if (ifa->ifa_index != ctx->ifindex ||
4241 ifa->ifa_scope != RT_SCOPE_LINK ||
4242 !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4243 (family != AF_INET && family != AF_INET6))
4245 mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4246 switch (mnl_attr_get_type(na)) {
4254 if (na_local && na_peer)
4257 if (!na_local || !na_peer)
4259 /* Local rule found with scope link, permanent and assigned peer. */
4260 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4261 MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4262 (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4263 : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4264 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4269 cmd = mnl_nlmsg_put_header(cmd);
4270 cmd->nlmsg_type = RTM_DELADDR;
4271 cmd->nlmsg_flags = NLM_F_REQUEST;
4272 ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4273 ifa->ifa_flags = IFA_F_PERMANENT;
4274 ifa->ifa_scope = RT_SCOPE_LINK;
4275 ifa->ifa_index = ctx->ifindex;
4276 if (family == AF_INET) {
4277 ifa->ifa_family = AF_INET;
4278 ifa->ifa_prefixlen = 32;
4279 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4280 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4282 ifa->ifa_family = AF_INET6;
4283 ifa->ifa_prefixlen = 128;
4284 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4285 mnl_attr_get_payload(na_local));
4286 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4287 mnl_attr_get_payload(na_peer));
4289 assert(size == cmd->nlmsg_len);
4294 * Cleanup the local IP addresses on outer interface.
4297 * Context object initialized by mlx5_flow_tcf_context_create().
4298 * @param[in] ifindex
4299 * Network interface index to perform cleanup.
4302 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4303 unsigned int ifindex)
4305 struct nlmsghdr *nlh;
4306 struct ifaddrmsg *ifa;
4307 struct tcf_nlcb_context ctx = {
4309 .bufsize = MNL_REQUEST_SIZE,
4310 .nlbuf = LIST_HEAD_INITIALIZER(),
4316 * Seek and destroy leftovers of local IP addresses with
4317 * matching properties "scope link".
4319 nlh = mnl_nlmsg_put_header(tcf->buf);
4320 nlh->nlmsg_type = RTM_GETADDR;
4321 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4322 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4323 ifa->ifa_family = AF_UNSPEC;
4324 ifa->ifa_index = ifindex;
4325 ifa->ifa_scope = RT_SCOPE_LINK;
4326 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4328 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4329 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4331 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4335 * Collect permanent neigh rules on specified network device.
4336 * This is callback routine called by libmnl mnl_cb_run() in loop for
4337 * every message in received packet.
4340 * Pointer to reply header.
4341 * @param[in, out] arg
4342 * Opaque data pointer for this callback.
4345 * A positive, nonzero value on success, negative errno value otherwise
4346 * and rte_errno is set.
4349 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4351 struct tcf_nlcb_context *ctx = arg;
4352 struct nlmsghdr *cmd;
4355 struct nlattr *na_ip = NULL;
4356 struct nlattr *na_mac = NULL;
4357 unsigned char family;
4360 if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4364 ndm = mnl_nlmsg_get_payload(nlh);
4365 family = ndm->ndm_family;
4366 if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4367 !(ndm->ndm_state & NUD_PERMANENT) ||
4368 (family != AF_INET && family != AF_INET6))
4370 mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4371 switch (mnl_attr_get_type(na)) {
4379 if (na_mac && na_ip)
4382 if (!na_mac || !na_ip)
4384 /* Neigh rule with permanent attribute found. */
4385 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4386 MNL_ALIGN(sizeof(struct ndmsg)) +
4387 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4388 (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4389 : SZ_NLATTR_TYPE_OF(uint32_t));
4390 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4395 cmd = mnl_nlmsg_put_header(cmd);
4396 cmd->nlmsg_type = RTM_DELNEIGH;
4397 cmd->nlmsg_flags = NLM_F_REQUEST;
4398 ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4399 ndm->ndm_ifindex = ctx->ifindex;
4400 ndm->ndm_state = NUD_PERMANENT;
4403 if (family == AF_INET) {
4404 ndm->ndm_family = AF_INET;
4405 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4407 ndm->ndm_family = AF_INET6;
4408 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4409 mnl_attr_get_payload(na_ip));
4411 mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4412 mnl_attr_get_payload(na_mac));
4413 assert(size == cmd->nlmsg_len);
4418 * Cleanup the neigh rules on outer interface.
4421 * Context object initialized by mlx5_flow_tcf_context_create().
4422 * @param[in] ifindex
4423 * Network interface index to perform cleanup.
4426 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4427 unsigned int ifindex)
4429 struct nlmsghdr *nlh;
4431 struct tcf_nlcb_context ctx = {
4433 .bufsize = MNL_REQUEST_SIZE,
4434 .nlbuf = LIST_HEAD_INITIALIZER(),
4439 /* Seek and destroy leftovers of neigh rules. */
4440 nlh = mnl_nlmsg_put_header(tcf->buf);
4441 nlh->nlmsg_type = RTM_GETNEIGH;
4442 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4443 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4444 ndm->ndm_family = AF_UNSPEC;
4445 ndm->ndm_ifindex = ifindex;
4446 ndm->ndm_state = NUD_PERMANENT;
4447 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4449 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4450 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4452 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4456 * Collect indices of VXLAN encap/decap interfaces associated with device.
4457 * This is callback routine called by libmnl mnl_cb_run() in loop for
4458 * every message in received packet.
4461 * Pointer to reply header.
4462 * @param[in, out] arg
4463 * Opaque data pointer for this callback.
4466 * A positive, nonzero value on success, negative errno value otherwise
4467 * and rte_errno is set.
4470 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4472 struct tcf_nlcb_context *ctx = arg;
4473 struct nlmsghdr *cmd;
4474 struct ifinfomsg *ifm;
4476 struct nlattr *na_info = NULL;
4477 struct nlattr *na_vxlan = NULL;
4479 unsigned int vxindex;
4482 if (nlh->nlmsg_type != RTM_NEWLINK) {
4486 ifm = mnl_nlmsg_get_payload(nlh);
4487 if (!ifm->ifi_index) {
4491 mnl_attr_for_each(na, nlh, sizeof(*ifm))
4492 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4498 mnl_attr_for_each_nested(na, na_info) {
4499 switch (mnl_attr_get_type(na)) {
4500 case IFLA_INFO_KIND:
4501 if (!strncmp("vxlan", mnl_attr_get_str(na),
4502 mnl_attr_get_len(na)))
4505 case IFLA_INFO_DATA:
4509 if (found && na_vxlan)
4512 if (!found || !na_vxlan)
4515 mnl_attr_for_each_nested(na, na_vxlan) {
4516 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4517 mnl_attr_get_u32(na) == ctx->ifindex) {
4524 /* Attached VXLAN device found, store the command to delete. */
4525 vxindex = ifm->ifi_index;
4526 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4527 MNL_ALIGN(sizeof(struct ifinfomsg));
4528 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4533 cmd = mnl_nlmsg_put_header(cmd);
4534 cmd->nlmsg_type = RTM_DELLINK;
4535 cmd->nlmsg_flags = NLM_F_REQUEST;
4536 ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4537 ifm->ifi_family = AF_UNSPEC;
4538 ifm->ifi_index = vxindex;
4539 assert(size == cmd->nlmsg_len);
4544 * Cleanup the outer interface. Removes all found vxlan devices
4545 * attached to specified index, flushes the neigh and local IP
4549 * Context object initialized by mlx5_flow_tcf_context_create().
4550 * @param[in] ifindex
4551 * Network interface index to perform cleanup.
4554 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4555 unsigned int ifindex)
4557 struct nlmsghdr *nlh;
4558 struct ifinfomsg *ifm;
4559 struct tcf_nlcb_context ctx = {
4561 .bufsize = MNL_REQUEST_SIZE,
4562 .nlbuf = LIST_HEAD_INITIALIZER(),
4568 * Seek and destroy leftover VXLAN encap/decap interfaces with
4569 * matching properties.
4571 nlh = mnl_nlmsg_put_header(tcf->buf);
4572 nlh->nlmsg_type = RTM_GETLINK;
4573 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4574 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4575 ifm->ifi_family = AF_UNSPEC;
4576 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4578 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4579 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4581 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4585 * Emit Netlink message to add/remove local address to the outer device.
4586 * The address being added is visible within the link only (scope link).
4588 * Note that an implicit route is maintained by the kernel due to the
4589 * presence of a peer address (IFA_ADDRESS).
4591 * These rules are used for encapsulation only and allow assigning
4592 * the outer tunnel source IP address.
4595 * Libmnl socket context object.
4597 * Encapsulation properties (source address and its peer).
4598 * @param[in] ifindex
4599 * Network interface to apply rule.
4601 * Toggle between add and remove.
4603 * Perform verbose error reporting if not NULL.
4606 * 0 on success, a negative errno value otherwise and rte_errno is set.
4609 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4610 const struct flow_tcf_vxlan_encap *encap,
4611 unsigned int ifindex,
4613 struct rte_flow_error *error)
4615 struct nlmsghdr *nlh;
4616 struct ifaddrmsg *ifa;
4617 alignas(struct nlmsghdr)
4618 uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4620 nlh = mnl_nlmsg_put_header(buf);
4621 nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4623 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4625 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4626 ifa->ifa_flags = IFA_F_PERMANENT;
4627 ifa->ifa_scope = RT_SCOPE_LINK;
4628 ifa->ifa_index = ifindex;
4629 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4630 ifa->ifa_family = AF_INET;
4631 ifa->ifa_prefixlen = 32;
4632 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4633 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4634 mnl_attr_put_u32(nlh, IFA_ADDRESS,
4637 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4638 ifa->ifa_family = AF_INET6;
4639 ifa->ifa_prefixlen = 128;
4640 mnl_attr_put(nlh, IFA_LOCAL,
4641 sizeof(encap->ipv6.src),
4643 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4644 mnl_attr_put(nlh, IFA_ADDRESS,
4645 sizeof(encap->ipv6.dst),
4648 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4650 return rte_flow_error_set(error, rte_errno,
4651 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4652 "netlink: cannot complete IFA request"
4657 * Emit Netlink message to add/remove neighbor.
4660 * Libmnl socket context object.
4662 * Encapsulation properties (destination address).
4663 * @param[in] ifindex
4664 * Network interface.
4666 * Toggle between add and remove.
4668 * Perform verbose error reporting if not NULL.
4671 * 0 on success, a negative errno value otherwise and rte_errno is set.
4674 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4675 const struct flow_tcf_vxlan_encap *encap,
4676 unsigned int ifindex,
4678 struct rte_flow_error *error)
4680 struct nlmsghdr *nlh;
4682 alignas(struct nlmsghdr)
4683 uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4685 nlh = mnl_nlmsg_put_header(buf);
4686 nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4688 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4690 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4691 ndm->ndm_ifindex = ifindex;
4692 ndm->ndm_state = NUD_PERMANENT;
4695 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4696 ndm->ndm_family = AF_INET;
4697 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4699 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4700 ndm->ndm_family = AF_INET6;
4701 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4704 if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4706 "outer ethernet source address cannot be "
4707 "forced for VXLAN encapsulation");
4708 if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4709 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4711 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4713 return rte_flow_error_set(error, rte_errno,
4714 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4715 "netlink: cannot complete ND request"
4720 * Manage the local IP addresses and their peer IP addresses on the
4721 * outer interface for encapsulation purposes. The kernel searches the
4722 * appropriate device for tunnel egress traffic using the outer source
4723 * IP, this IP should be assigned to the outer network device, otherwise
4724 * kernel rejects the rule.
4726 * Adds or removes the addresses using the Netlink command like this:
4727 * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4729 * The addresses are local to the netdev ("scope link"), this reduces
4730 * the risk of conflicts. Note that an implicit route is maintained by
4731 * the kernel due to the presence of a peer address (IFA_ADDRESS).
4734 * Libmnl socket context object.
4736 * Object, contains rule database and ifouter index.
4737 * @param[in] dev_flow
4738 * Flow object, contains the tunnel parameters (for encap only).
4740 * Toggle between add and remove.
4742 * Perform verbose error reporting if not NULL.
4745 * 0 on success, a negative errno value otherwise and rte_errno is set.
4748 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4749 struct tcf_irule *iface,
4750 struct mlx5_flow *dev_flow,
4752 struct rte_flow_error *error)
4754 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4755 struct tcf_local_rule *rule = NULL;
4759 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4760 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4761 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4762 LIST_FOREACH(rule, &iface->local, next) {
4763 if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4764 encap->ipv4.src == rule->ipv4.src &&
4765 encap->ipv4.dst == rule->ipv4.dst) {
4770 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4771 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4772 LIST_FOREACH(rule, &iface->local, next) {
4773 if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4774 !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4775 sizeof(encap->ipv6.src)) &&
4776 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4777 sizeof(encap->ipv6.dst))) {
4787 if (!rule->refcnt || !--rule->refcnt) {
4788 LIST_REMOVE(rule, next);
4789 return flow_tcf_rule_local(tcf, encap,
4790 iface->ifouter, false, error);
4795 DRV_LOG(WARNING, "disabling not existing local rule");
4796 rte_flow_error_set(error, ENOENT,
4797 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4798 "disabling not existing local rule");
4801 rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4802 alignof(struct tcf_local_rule));
4804 rte_flow_error_set(error, ENOMEM,
4805 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4806 "unable to allocate memory for local rule");
4809 *rule = (struct tcf_local_rule){.refcnt = 0,
4812 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4813 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4814 | FLOW_TCF_ENCAP_IPV4_DST;
4815 rule->ipv4.src = encap->ipv4.src;
4816 rule->ipv4.dst = encap->ipv4.dst;
4818 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4819 | FLOW_TCF_ENCAP_IPV6_DST;
4820 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4821 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4823 ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4829 LIST_INSERT_HEAD(&iface->local, rule, next);
4834 * Manage the destination MAC/IP addresses neigh database, kernel uses
4835 * this one to determine the destination MAC address within encapsulation
4836 * header. Adds or removes the entries using the Netlink command like this:
4837 * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4840 * Libmnl socket context object.
4842 * Object, contains rule database and ifouter index.
4843 * @param[in] dev_flow
4844 * Flow object, contains the tunnel parameters (for encap only).
4846 * Toggle between add and remove.
4848 * Perform verbose error reporting if not NULL.
4851 * 0 on success, a negative errno value otherwise and rte_errno is set.
4854 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4855 struct tcf_irule *iface,
4856 struct mlx5_flow *dev_flow,
4858 struct rte_flow_error *error)
4860 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4861 struct tcf_neigh_rule *rule = NULL;
4865 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4866 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4867 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4868 LIST_FOREACH(rule, &iface->neigh, next) {
4869 if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4870 encap->ipv4.dst == rule->ipv4.dst) {
4875 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4876 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4877 LIST_FOREACH(rule, &iface->neigh, next) {
4878 if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4879 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4880 sizeof(encap->ipv6.dst))) {
4886 if (memcmp(&encap->eth.dst, &rule->eth,
4887 sizeof(encap->eth.dst))) {
4888 DRV_LOG(WARNING, "Destination MAC differs"
4890 rte_flow_error_set(error, EEXIST,
4891 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4892 NULL, "Different MAC address"
4893 " neigh rule for the same"
4901 if (!rule->refcnt || !--rule->refcnt) {
4902 LIST_REMOVE(rule, next);
4903 return flow_tcf_rule_neigh(tcf, encap,
4910 DRV_LOG(WARNING, "Disabling not existing neigh rule");
4911 rte_flow_error_set(error, ENOENT,
4912 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4913 "unable to allocate memory for neigh rule");
4916 rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4917 alignof(struct tcf_neigh_rule));
4919 rte_flow_error_set(error, ENOMEM,
4920 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4921 "unable to allocate memory for neigh rule");
4924 *rule = (struct tcf_neigh_rule){.refcnt = 0,
4927 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4928 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4929 rule->ipv4.dst = encap->ipv4.dst;
4931 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4932 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4934 memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4935 ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4941 LIST_INSERT_HEAD(&iface->neigh, rule, next);
4945 /* VXLAN encap rule database for outer interfaces. */
4946 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4948 /* VTEP device list is shared between PMD port instances. */
4949 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4950 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4953 * Acquire the VXLAN encap rules container for specified interface.
4954 * First looks for the container in the list of existing ones, then creates
4955 * and initializes a new container if no existing one is found.
4958 * Context object initialized by mlx5_flow_tcf_context_create().
4959 * @param[in] ifouter
4960 * Network interface index to create VXLAN encap rules on.
4962 * Perform verbose error reporting if not NULL.
4964 * Rule container pointer on success,
4965 * NULL otherwise and rte_errno is set.
4967 static struct tcf_irule*
4968 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4969 unsigned int ifouter,
4970 struct rte_flow_error *error)
4972 struct tcf_irule *iface;
4974 /* Look whether the container for encap rules is created. */
4976 LIST_FOREACH(iface, &iface_list_vxlan, next) {
4977 if (iface->ifouter == ifouter)
4981 /* Container already exists, just increment the reference. */
4985 /* Not found, we should create the new container. */
4986 iface = rte_zmalloc(__func__, sizeof(*iface),
4987 alignof(struct tcf_irule));
4989 rte_flow_error_set(error, ENOMEM,
4990 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4991 "unable to allocate memory for container");
4994 *iface = (struct tcf_irule){
4995 .local = LIST_HEAD_INITIALIZER(),
4996 .neigh = LIST_HEAD_INITIALIZER(),
5000 /* Interface cleanup for new container created. */
5001 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5002 flow_tcf_encap_local_cleanup(tcf, ifouter);
5003 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5004 LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5009 * Releases VXLAN encap rules container by pointer. Decrements the
5010 * reference counter and deletes the container if counter is zero.
5013 * VXLAN rule container pointer to release.
5016 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5018 assert(iface->refcnt);
5019 if (--iface->refcnt == 0) {
5020 /* Reference counter is zero, delete the container. */
5021 assert(LIST_EMPTY(&iface->local));
5022 assert(LIST_EMPTY(&iface->neigh));
5023 LIST_REMOVE(iface, next);
5029 * Deletes VTEP network device.
5032 * Context object initialized by mlx5_flow_tcf_context_create().
5034 * Object representing the network device to delete. Memory
5035 * allocated for this object is freed by this routine.
5038 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5039 struct tcf_vtep *vtep)
5041 struct nlmsghdr *nlh;
5042 struct ifinfomsg *ifm;
5043 alignas(struct nlmsghdr)
5044 uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5045 MNL_BUF_EXTRA_SPACE];
5048 assert(!vtep->refcnt);
5049 /* Delete only those interfaces that we actually created. */
5050 if (vtep->created && vtep->ifindex) {
5051 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5052 nlh = mnl_nlmsg_put_header(buf);
5053 nlh->nlmsg_type = RTM_DELLINK;
5054 nlh->nlmsg_flags = NLM_F_REQUEST;
5055 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5056 ifm->ifi_family = AF_UNSPEC;
5057 ifm->ifi_index = vtep->ifindex;
5058 assert(sizeof(buf) >= nlh->nlmsg_len);
5059 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5061 DRV_LOG(WARNING, "netlink: error deleting vxlan"
5062 " encap/decap ifindex %u",
5069 * Creates VTEP network device.
5072 * Context object initialized by mlx5_flow_tcf_context_create().
5074 * UDP port of created VTEP device.
5076 * Perform verbose error reporting if not NULL.
5079 * Pointer to created device structure on success,
5080 * NULL otherwise and rte_errno is set.
5082 static struct tcf_vtep*
5083 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5084 uint16_t port, struct rte_flow_error *error)
5086 struct tcf_vtep *vtep;
5087 struct nlmsghdr *nlh;
5088 struct ifinfomsg *ifm;
5089 char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5090 alignas(struct nlmsghdr)
5091 uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5092 SZ_NLATTR_DATA_OF(sizeof(name)) +
5093 SZ_NLATTR_NEST * 2 +
5094 SZ_NLATTR_STRZ_OF("vxlan") +
5095 SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5096 SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5097 SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5098 MNL_BUF_EXTRA_SPACE];
5099 struct nlattr *na_info;
5100 struct nlattr *na_vxlan;
5101 rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5104 vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5106 rte_flow_error_set(error, ENOMEM,
5107 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5108 "unable to allocate memory for VTEP");
5111 *vtep = (struct tcf_vtep){
5114 memset(buf, 0, sizeof(buf));
5115 nlh = mnl_nlmsg_put_header(buf);
5116 nlh->nlmsg_type = RTM_NEWLINK;
5117 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5118 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5119 ifm->ifi_family = AF_UNSPEC;
5122 ifm->ifi_flags = IFF_UP;
5123 ifm->ifi_change = 0xffffffff;
5124 snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5125 mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5126 na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5128 mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5129 na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5131 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5133 * RH 7.2 does not support metadata for tunnel device.
5134 * It does not matter because we are going to use the
5135 * hardware offload by mlx5 driver.
5137 mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5139 mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5140 mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5141 mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5142 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5144 * We must specify VNI explicitly if metadata is not supported.
5145 * Note, VNI is transferred with native endianness format.
5147 mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5149 mnl_attr_nest_end(nlh, na_vxlan);
5150 mnl_attr_nest_end(nlh, na_info);
5151 assert(sizeof(buf) >= nlh->nlmsg_len);
5152 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5155 "netlink: VTEP %s create failure (%d)",
5157 if (rte_errno != EEXIST)
5159 * Some unhandled error occurred or device is
5160 * for encapsulation and cannot be shared.
5165 * Mark device we actually created.
5166 * We should explicitly delete
5167 * when we do not need it anymore.
5171 /* Try to get ifindex of created or pre-existing device. */
5172 ret = if_nametoindex(name);
5175 "VTEP %s failed to get index (%d)", name, errno);
5178 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5179 "netlink: failed to retrieve VTEP ifindex");
5182 vtep->ifindex = ret;
5183 memset(buf, 0, sizeof(buf));
5184 nlh = mnl_nlmsg_put_header(buf);
5185 nlh->nlmsg_type = RTM_NEWLINK;
5186 nlh->nlmsg_flags = NLM_F_REQUEST;
5187 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5188 ifm->ifi_family = AF_UNSPEC;
5190 ifm->ifi_index = vtep->ifindex;
5191 ifm->ifi_flags = IFF_UP;
5192 ifm->ifi_change = IFF_UP;
5193 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5195 rte_flow_error_set(error, -errno,
5196 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5197 "netlink: failed to set VTEP link up");
5198 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5202 ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5204 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5207 DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5211 flow_tcf_vtep_delete(tcf, vtep);
5219 * Acquire target interface index for VXLAN tunneling decapsulation.
5220 * In order to share the UDP port with the other interfaces, the
5221 * VXLAN device is created unattached to any interface (if created).
5224 * Context object initialized by mlx5_flow_tcf_context_create().
5225 * @param[in] dev_flow
5226 * Flow tcf object with tunnel structure pointer set.
5228 * Perform verbose error reporting if not NULL.
5230 * Interface descriptor pointer on success,
5231 * NULL otherwise and rte_errno is set.
5233 static struct tcf_vtep*
5234 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5235 struct mlx5_flow *dev_flow,
5236 struct rte_flow_error *error)
5238 struct tcf_vtep *vtep;
5239 uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5241 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5242 if (vtep->port == port)
5246 /* Device exists, just increment the reference counter. */
5248 assert(vtep->ifindex);
5251 /* No decapsulation device exists, try to create the new one. */
5252 vtep = flow_tcf_vtep_create(tcf, port, error);
5254 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5259 * Acquire target interface index for VXLAN tunneling encapsulation.
5262 * Context object initialized by mlx5_flow_tcf_context_create().
5263 * @param[in] ifouter
5264 * Network interface index to attach VXLAN encap device to.
5265 * @param[in] dev_flow
5266 * Flow tcf object with tunnel structure pointer set.
5268 * Perform verbose error reporting if not NULL.
5270 * Interface descriptor pointer on success,
5271 * NULL otherwise and rte_errno is set.
5273 static struct tcf_vtep*
5274 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5275 unsigned int ifouter,
5276 struct mlx5_flow *dev_flow,
5277 struct rte_flow_error *error)
5279 static uint16_t port;
5280 struct tcf_vtep *vtep;
5281 struct tcf_irule *iface;
5285 /* Look whether the VTEP for specified port is created. */
5286 port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5287 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5288 if (vtep->port == port)
5292 /* VTEP already exists, just increment the reference. */
5295 /* Not found, we should create the new VTEP. */
5296 vtep = flow_tcf_vtep_create(tcf, port, error);
5299 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5301 assert(vtep->ifindex);
5302 iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5304 if (--vtep->refcnt == 0)
5305 flow_tcf_vtep_delete(tcf, vtep);
5308 dev_flow->tcf.vxlan_encap->iface = iface;
5309 /* Create local ipaddr with peer to specify the outer IPs. */
5310 ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5312 /* Create neigh rule to specify outer destination MAC. */
5313 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5315 flow_tcf_encap_local(tcf, iface,
5316 dev_flow, false, error);
5319 dev_flow->tcf.vxlan_encap->iface = NULL;
5320 flow_tcf_encap_irule_release(iface);
5321 if (--vtep->refcnt == 0)
5322 flow_tcf_vtep_delete(tcf, vtep);
5329 * Acquires target interface index for tunneling of any type.
5330 * Creates the new VTEP if needed.
5333 * Context object initialized by mlx5_flow_tcf_context_create().
5334 * @param[in] ifouter
5335 * Network interface index to create VXLAN encap rules on.
5336 * @param[in] dev_flow
5337 * Flow tcf object with tunnel structure pointer set.
5339 * Perform verbose error reporting if not NULL.
5341 * Interface descriptor pointer on success,
5342 * NULL otherwise and rte_errno is set.
5344 static struct tcf_vtep*
5345 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5346 unsigned int ifouter,
5347 struct mlx5_flow *dev_flow,
5348 struct rte_flow_error *error)
5350 struct tcf_vtep *vtep = NULL;
5352 assert(dev_flow->tcf.tunnel);
5353 pthread_mutex_lock(&vtep_list_mutex);
5354 switch (dev_flow->tcf.tunnel->type) {
5355 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5356 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5359 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5360 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5363 rte_flow_error_set(error, ENOTSUP,
5364 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5365 "unsupported tunnel type");
5368 pthread_mutex_unlock(&vtep_list_mutex);
5373 * Release tunneling interface by ifindex. Decrements reference
5374 * counter and actually removes the device if counter is zero.
5377 * Context object initialized by mlx5_flow_tcf_context_create().
5379 * VTEP device descriptor structure.
5380 * @param[in] dev_flow
5381 * Flow tcf object with tunnel structure pointer set.
5384 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5385 struct tcf_vtep *vtep,
5386 struct mlx5_flow *dev_flow)
5388 assert(dev_flow->tcf.tunnel);
5389 pthread_mutex_lock(&vtep_list_mutex);
5390 switch (dev_flow->tcf.tunnel->type) {
5391 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5393 case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5394 struct tcf_irule *iface;
5396 /* Remove the encap ancillary rules first. */
5397 iface = dev_flow->tcf.vxlan_encap->iface;
5399 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5400 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5401 flow_tcf_encap_irule_release(iface);
5402 dev_flow->tcf.vxlan_encap->iface = NULL;
5407 DRV_LOG(WARNING, "Unsupported tunnel type");
5410 assert(vtep->refcnt);
5411 if (--vtep->refcnt == 0) {
5412 LIST_REMOVE(vtep, next);
5413 flow_tcf_vtep_delete(tcf, vtep);
5415 pthread_mutex_unlock(&vtep_list_mutex);
5418 struct tcf_nlcb_query {
5421 uint32_t flags_valid:1;
5425 * Collect queried rule attributes. This is callback routine called by
5426 * libmnl mnl_cb_run() in loop for every message in received packet.
5427 * Current implementation collects the flower flags only.
5430 * Pointer to reply header.
5431 * @param[in, out] arg
5432 * Context pointer for this callback.
5435 * A positive, nonzero value on success (required by libmnl
5436 * to continue messages processing).
5439 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5441 struct tcf_nlcb_query *query = arg;
5442 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5443 struct nlattr *na, *na_opt;
5444 bool flower = false;
5446 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5447 tcm->tcm_handle != query->handle)
5449 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5450 switch (mnl_attr_get_type(na)) {
5452 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5453 /* Not flower filter, drop entire message. */
5460 /* Not flower options, drop entire message. */
5463 /* Check nested flower options. */
5464 mnl_attr_for_each_nested(na_opt, na) {
5465 switch (mnl_attr_get_type(na_opt)) {
5466 case TCA_FLOWER_FLAGS:
5467 query->flags_valid = 1;
5469 mnl_attr_get_u32(na_opt);
5480 * Query a TC flower rule flags via netlink.
5483 * Context object initialized by mlx5_flow_tcf_context_create().
5484 * @param[in] dev_flow
5485 * Pointer to the flow.
5486 * @param[out] pflags
5487 * pointer to the data retrieved by the query.
5490 * 0 on success, a negative errno value otherwise.
5493 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5494 struct mlx5_flow *dev_flow,
5497 struct nlmsghdr *nlh;
5499 struct tcf_nlcb_query query = {
5500 .handle = dev_flow->tcf.tcm->tcm_handle,
5503 nlh = mnl_nlmsg_put_header(tcf->buf);
5504 nlh->nlmsg_type = RTM_GETTFILTER;
5505 nlh->nlmsg_flags = NLM_F_REQUEST;
5506 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5507 memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5509 * Ignore Netlink error for filter query operations.
5510 * The reply length is sent by kernel as errno.
5511 * Just check we got the flags option.
5513 flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5514 if (!query.flags_valid) {
5518 *pflags = query.tc_flags;
5523 * Query and check the in_hw set for specified rule.
5526 * Context object initialized by mlx5_flow_tcf_context_create().
5527 * @param[in] dev_flow
5528 * Pointer to the flow to check.
5531 * 0 on success, a negative errno value otherwise.
5534 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5535 struct mlx5_flow *dev_flow)
5540 ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5543 return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5547 * Remove flow from E-Switch by sending Netlink message.
5550 * Pointer to Ethernet device.
5551 * @param[in, out] flow
5552 * Pointer to the sub flow.
5555 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5557 struct priv *priv = dev->data->dev_private;
5558 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5559 struct mlx5_flow *dev_flow;
5560 struct nlmsghdr *nlh;
5565 dev_flow = LIST_FIRST(&flow->dev_flows);
5568 /* E-Switch flow can't be expanded. */
5569 assert(!LIST_NEXT(dev_flow, next));
5570 if (dev_flow->tcf.applied) {
5571 nlh = dev_flow->tcf.nlh;
5572 nlh->nlmsg_type = RTM_DELTFILTER;
5573 nlh->nlmsg_flags = NLM_F_REQUEST;
5574 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5575 if (dev_flow->tcf.tunnel) {
5576 assert(dev_flow->tcf.tunnel->vtep);
5577 flow_tcf_vtep_release(ctx,
5578 dev_flow->tcf.tunnel->vtep,
5580 dev_flow->tcf.tunnel->vtep = NULL;
5582 /* Cleanup the rule handle value. */
5583 tcm = mnl_nlmsg_get_payload(nlh);
5584 tcm->tcm_handle = 0;
5585 dev_flow->tcf.applied = 0;
5590 * Fetch the applied rule handle. This is callback routine called by
5591 * libmnl mnl_cb_run() in loop for every message in received packet.
5592 * When the NLM_F_ECHO flag is specified the kernel sends the created
5593 * rule descriptor back to the application and we can retrieve the
5594 * actual rule handle from updated descriptor.
5597 * Pointer to reply header.
5598 * @param[in, out] arg
5599 * Context pointer for this callback.
5602 * A positive, nonzero value on success (required by libmnl
5603 * to continue messages processing).
5606 flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg)
5608 struct nlmsghdr *nlhrq = arg;
5609 struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq);
5610 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5613 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5614 nlh->nlmsg_seq != nlhrq->nlmsg_seq)
5616 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5617 switch (mnl_attr_get_type(na)) {
5619 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5620 /* Not flower filter, drop entire message. */
5623 tcmrq->tcm_handle = tcm->tcm_handle;
5630 * Apply flow to E-Switch by sending Netlink message.
5633 * Pointer to Ethernet device.
5634 * @param[in, out] flow
5635 * Pointer to the sub flow.
5637 * Pointer to the error structure.
5640 * 0 on success, a negative errno value otherwise and rte_errno is set.
5643 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5644 struct rte_flow_error *error)
5646 struct priv *priv = dev->data->dev_private;
5647 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5648 struct mlx5_flow *dev_flow;
5649 struct nlmsghdr *nlh;
5653 dev_flow = LIST_FIRST(&flow->dev_flows);
5654 /* E-Switch flow can't be expanded. */
5655 assert(!LIST_NEXT(dev_flow, next));
5656 if (dev_flow->tcf.applied)
5658 nlh = dev_flow->tcf.nlh;
5659 nlh->nlmsg_type = RTM_NEWTFILTER;
5660 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
5661 NLM_F_EXCL | NLM_F_ECHO;
5662 tcm = mnl_nlmsg_get_payload(nlh);
5663 /* Allow kernel to assign handle on its own. */
5664 tcm->tcm_handle = 0;
5665 if (dev_flow->tcf.tunnel) {
5667 * Replace the interface index, target for
5668 * encapsulation, source for decapsulation.
5670 assert(!dev_flow->tcf.tunnel->vtep);
5671 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5672 /* Acquire actual VTEP device when rule is being applied. */
5673 dev_flow->tcf.tunnel->vtep =
5674 flow_tcf_vtep_acquire(ctx,
5675 dev_flow->tcf.tunnel->ifindex_org,
5677 if (!dev_flow->tcf.tunnel->vtep)
5679 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5680 dev_flow->tcf.tunnel->vtep->ifindex,
5681 dev_flow->tcf.tunnel->ifindex_org);
5682 *dev_flow->tcf.tunnel->ifindex_ptr =
5683 dev_flow->tcf.tunnel->vtep->ifindex;
5685 ret = flow_tcf_nl_ack(ctx, nlh, flow_tcf_collect_apply_cb, nlh);
5687 if (!tcm->tcm_handle) {
5688 flow_tcf_remove(dev, flow);
5689 return rte_flow_error_set
5691 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5692 "netlink: rule zero handle returned");
5694 dev_flow->tcf.applied = 1;
5695 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5698 * Rule was applied without skip_sw flag set.
5699 * We should check whether the rule was actually
5700 * accepted by hardware (have a look at the in_hw flag).
5702 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5703 flow_tcf_remove(dev, flow);
5704 return rte_flow_error_set
5706 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5707 "netlink: rule has no in_hw flag set");
5711 if (dev_flow->tcf.tunnel) {
5712 /* Rollback the VTEP configuration if rule apply failed. */
5713 assert(dev_flow->tcf.tunnel->vtep);
5714 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5716 dev_flow->tcf.tunnel->vtep = NULL;
5718 return rte_flow_error_set(error, rte_errno,
5719 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5720 "netlink: failed to create TC flow rule");
5724 * Remove flow from E-Switch and release resources of the device flow.
5727 * Pointer to Ethernet device.
5728 * @param[in, out] flow
5729 * Pointer to the sub flow.
5732 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5734 struct mlx5_flow *dev_flow;
5738 flow_tcf_remove(dev, flow);
5739 if (flow->counter) {
5740 if (--flow->counter->ref_cnt == 0) {
5741 rte_free(flow->counter);
5742 flow->counter = NULL;
5745 dev_flow = LIST_FIRST(&flow->dev_flows);
5748 /* E-Switch flow can't be expanded. */
5749 assert(!LIST_NEXT(dev_flow, next));
5750 LIST_REMOVE(dev_flow, next);
5755 * Helper routine for figuring the space size required for a parse buffer.
5758 * array of values to use.
5760 * Current location in array.
5762 * Value to compare with.
5765 * The maximum between the given value and the array value on index.
5768 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5770 return idx < 0 ? (value) : RTE_MAX((array)[idx], value);
5774 * Parse rtnetlink message attributes filling the attribute table with the info
5778 * Attribute table to be filled.
5780 * Maximum entry in the attribute table.
5782 * The attributes section in the message to be parsed.
5784 * The length of the attributes section in the message.
5787 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5788 struct rtattr *rta, int len)
5790 unsigned short type;
5791 memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5792 while (RTA_OK(rta, len)) {
5793 type = rta->rta_type;
5794 if (type <= max && !tb[type])
5796 rta = RTA_NEXT(rta, len);
5801 * Extract flow counters from flower action.
5804 * flower action stats properties in the Netlink message received.
5806 * The backward sequence of rta_types, as written in the attribute table,
5807 * we need to traverse in order to get to the requested object.
5809 * Current location in rta_type table.
5811 * data holding the count statistics of the rte_flow retrieved from
5815 * 0 if data was found and retrieved, -1 otherwise.
5818 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5819 uint16_t rta_type[], int idx,
5820 struct gnet_stats_basic *data)
5822 int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5824 struct rtattr *tbs[tca_stats_max + 1];
5826 if (rta == NULL || idx < 0)
5828 flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5829 RTA_DATA(rta), RTA_PAYLOAD(rta));
5830 switch (rta_type[idx]) {
5831 case TCA_STATS_BASIC:
5832 if (tbs[TCA_STATS_BASIC]) {
5833 memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5834 RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5846 * Parse flower single action retrieving the requested action attribute,
5850 * flower action properties in the Netlink message received.
5852 * The backward sequence of rta_types, as written in the attribute table,
5853 * we need to traverse in order to get to the requested object.
5855 * Current location in rta_type table.
5857 * Count statistics retrieved from the message query.
5860 * 0 if data was found and retrieved, -1 otherwise.
5863 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5864 uint16_t rta_type[], int idx, void *data)
5866 int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5867 struct rtattr *tb[tca_act_max + 1];
5869 if (arg == NULL || idx < 0)
5871 flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5872 RTA_DATA(arg), RTA_PAYLOAD(arg));
5873 if (tb[TCA_ACT_KIND] == NULL)
5875 switch (rta_type[idx]) {
5877 if (tb[TCA_ACT_STATS])
5878 return flow_tcf_nl_action_stats_parse_and_get
5881 (struct gnet_stats_basic *)data);
5890 * Parse flower action section in the message retrieving the requested
5891 * attribute from the first action that provides it.
5894 * flower section in the Netlink message received.
5896 * The backward sequence of rta_types, as written in the attribute table,
5897 * we need to traverse in order to get to the requested object.
5899 * Current location in rta_type table.
5901 * data retrieved from the message query.
5904 * 0 if data was found and retrieved, -1 otherwise.
5907 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5908 uint16_t rta_type[], int idx, void *data)
5910 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5913 if (arg == NULL || idx < 0)
5915 flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5916 RTA_DATA(arg), RTA_PAYLOAD(arg));
5917 switch (rta_type[idx]) {
5919 * flow counters are stored in the actions defined by the flow
5920 * and not in the flow itself, therefore we need to traverse the
5921 * flower chain of actions in search for them.
5923 * Note that the index is not decremented here.
5926 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5928 !flow_tcf_nl_parse_one_action_and_get(tb[i],
5941 * Parse flower classifier options in the message, retrieving the requested
5942 * attribute if found.
5945 * flower section in the Netlink message received.
5947 * The backward sequence of rta_types, as written in the attribute table,
5948 * we need to traverse in order to get to the requested object.
5950 * Current location in rta_type table.
5952 * data retrieved from the message query.
5955 * 0 if data was found and retrieved, -1 otherwise.
5958 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5959 uint16_t rta_type[], int idx, void *data)
5961 int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5963 struct rtattr *tb[tca_flower_max + 1];
5965 if (!opt || idx < 0)
5967 flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5968 RTA_DATA(opt), RTA_PAYLOAD(opt));
5969 switch (rta_type[idx]) {
5970 case TCA_FLOWER_ACT:
5971 if (tb[TCA_FLOWER_ACT])
5972 return flow_tcf_nl_action_parse_and_get
5973 (tb[TCA_FLOWER_ACT],
5974 rta_type, --idx, data);
5983 * Parse Netlink reply on filter query, retrieving the flow counters.
5986 * Message received from Netlink.
5988 * The backward sequence of rta_types, as written in the attribute table,
5989 * we need to traverse in order to get to the requested object.
5991 * Current location in rta_type table.
5993 * data retrieved from the message query.
5996 * 0 if data was found and retrieved, -1 otherwise.
5999 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
6000 uint16_t rta_type[], int idx, void *data)
6002 struct nlmsghdr *nlh = cnlh;
6003 struct tcmsg *t = NLMSG_DATA(nlh);
6004 int len = nlh->nlmsg_len;
6005 int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
6006 struct rtattr *tb[tca_max + 1];
6010 if (nlh->nlmsg_type != RTM_NEWTFILTER &&
6011 nlh->nlmsg_type != RTM_GETTFILTER &&
6012 nlh->nlmsg_type != RTM_DELTFILTER)
6014 len -= NLMSG_LENGTH(sizeof(*t));
6017 flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
6018 /* Not a TC flower flow - bail out */
6019 if (!tb[TCA_KIND] ||
6020 strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
6022 switch (rta_type[idx]) {
6024 if (tb[TCA_OPTIONS])
6025 return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
6036 * A callback to parse Netlink reply on TC flower query.
6039 * Message received from Netlink.
6041 * Pointer to data area to be filled by the parsing routine.
6042 * assumed to be a pointer to struct flow_tcf_stats_basic.
6048 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6051 * The backward sequence of rta_types to pass in order to get
6054 uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6055 TCA_FLOWER_ACT, TCA_OPTIONS };
6056 struct flow_tcf_stats_basic *sb_data = data;
6058 const struct nlmsghdr *c;
6059 struct nlmsghdr *nc;
6060 } tnlh = { .c = nlh };
6062 if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6063 RTE_DIM(rta_type) - 1,
6064 (void *)&sb_data->counters))
6065 sb_data->valid = true;
6070 * Query a TC flower rule for its statistics via netlink.
6073 * Pointer to Ethernet device.
6075 * Pointer to the sub flow.
6077 * data retrieved by the query.
6079 * Perform verbose error reporting if not NULL.
6082 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Query the counters of a TC flower rule over the context's netlink socket.
 *
 * The netlink message stored in the flow's first device flow
 * (dev_flow->tcf.nlh) is rewritten in place into an RTM_GETTFILTER request,
 * sent on ctx->nl, and the reply read into ctx->buf is passed through
 * flow_tcf_nl_message_get_stats_basic(). When that callback marked the
 * statistics as valid, the hit and byte counts are written to the
 * struct rte_flow_query_count pointed to by data, relative to the values
 * kept in flow->counter.
 *
 * dev:   Ethernet device; its private data supplies the tc-flower context.
 * flow:  flow whose first device flow carries the netlink message.
 * data:  used as a struct rte_flow_query_count to receive the result.
 * error: passed to rte_flow_error_set() on the failure paths.
 */
6085 flow_tcf_query_count(struct rte_eth_dev *dev,
6086 struct rte_flow *flow,
6088 struct rte_flow_error *error)
6090 struct flow_tcf_stats_basic sb_data;
6091 struct rte_flow_query_count *qc = data;
6092 struct priv *priv = dev->data->dev_private;
6093 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6094 struct mnl_socket *nl = ctx->nl;
6095 struct mlx5_flow *dev_flow;
6096 struct nlmsghdr *nlh;
/* Take the current context sequence number and post-increment it. */
6097 uint32_t seq = priv->tcf_context->seq++;
/* Start from zeroed statistics so that sb_data.valid is initially false. */
6101 memset(&sb_data, 0, sizeof(sb_data));
6102 dev_flow = LIST_FIRST(&flow->dev_flows);
6103 /* E-Switch flow can't be expanded. */
6104 assert(!LIST_NEXT(dev_flow, next));
/*
 * A flow without a counter cannot be queried.
 * NOTE(review): the statement executed by this branch is not visible in
 * this excerpt.
 */
6105 if (!dev_flow->flow->counter)
/* Reuse the flow's stored message: only type, flags and seq are changed. */
6107 nlh = dev_flow->tcf.nlh;
6108 nlh->nlmsg_type = RTM_GETTFILTER;
6109 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6110 nlh->nlmsg_seq = seq;
6111 if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
/*
 * Read a reply into the context buffer and run it through the statistics
 * callback, matching on seq and the socket's port id.
 * NOTE(review): any loop or error check around these two calls is not
 * visible in this excerpt.
 */
6114 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6117 ret = mnl_cb_run(ctx->buf, ret, seq,
6118 mnl_socket_get_portid(nl),
6119 flow_tcf_nl_message_get_stats_basic,
6122 /* Report counters only when the callback flagged the reply as valid. */
6123 if (sb_data.valid) {
6124 /* Return the delta from last reset. */
6127 qc->hits = sb_data.counters.packets - flow->counter->hits;
6128 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
/*
 * Store the raw values as the new baseline for later queries.
 * NOTE(review): any condition guarding this update is not visible in this
 * excerpt.
 */
6130 flow->counter->hits = sb_data.counters.packets;
6131 flow->counter->bytes = sb_data.counters.bytes;
/*
 * Failure replies: EINVAL, the current errno, and ENOTSUP respectively.
 * NOTE(review): the labels/conditions selecting each return are not
 * visible in this excerpt.
 */
6135 return rte_flow_error_set(error, EINVAL,
6136 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6138 "flow does not have counter");
6140 return rte_flow_error_set
6141 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6142 NULL, "netlink: failed to read flow rule counters");
6144 return rte_flow_error_set
6145 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6146 NULL, "counters are not available.");
6152 * @see rte_flow_query()
/*
 * Flow query entry point of this driver (installed as .query in
 * mlx5_flow_tcf_drv_ops).
 * Walks the actions array up to RTE_FLOW_ACTION_TYPE_END and dispatches on
 * each action type: COUNT is forwarded to flow_tcf_query_count() with the
 * same dev, flow, data and error arguments; the visible fallback reports
 * ENOTSUP through rte_flow_error_set().
 */
6156 flow_tcf_query(struct rte_eth_dev *dev,
6157 struct rte_flow *flow,
6158 const struct rte_flow_action *actions,
6160 struct rte_flow_error *error)
6164 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6165 switch (actions->type) {
6166 case RTE_FLOW_ACTION_TYPE_VOID:
6168 case RTE_FLOW_ACTION_TYPE_COUNT:
6169 ret = flow_tcf_query_count(dev, flow, data, error);
/*
 * NOTE(review): the case label owning this return is not visible in this
 * excerpt; presumably the default case - confirm.
 */
6172 return rte_flow_error_set(error, ENOTSUP,
6173 RTE_FLOW_ERROR_TYPE_ACTION,
6175 "action not supported");
/*
 * Flow driver operations table binding each mlx5_flow_driver_ops callback
 * to the corresponding flow_tcf_* routine of this file.
 */
6181 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6182 .validate = flow_tcf_validate,
6183 .prepare = flow_tcf_prepare,
6184 .translate = flow_tcf_translate,
6185 .apply = flow_tcf_apply,
6186 .remove = flow_tcf_remove,
6187 .destroy = flow_tcf_destroy,
6188 .query = flow_tcf_query,
6192 * Create and configure a libmnl socket for Netlink flow rules.
6195 * A valid libmnl socket object pointer on success, NULL otherwise and
/*
 * Open a NETLINK_ROUTE libmnl socket, set the NETLINK_CAP_ACK option on it
 * and bind it with an automatically assigned port id. The socket is closed
 * again on the failure path.
 */
6198 static struct mnl_socket *
6199 flow_tcf_mnl_socket_create(void)
6201 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
/*
 * The option value is passed as a pointer to an anonymous int set to 1.
 * NOTE(review): the call's result does not appear to be checked - confirm.
 */
6204 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
/* mnl_socket_bind() returns 0 on success, so this tests for success. */
6206 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
/* Failure path: release the socket. */
6211 mnl_socket_close(nl);
6216 * Destroy a libmnl socket.
6219 * Libmnl socket of the @p NETLINK_ROUTE kind.
/*
 * Release a libmnl socket by closing it with mnl_socket_close().
 * NOTE(review): any NULL check on nl is not visible in this excerpt.
 */
6222 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6225 mnl_socket_close(nl);
6229 * Initialize ingress qdisc of a given network interface.
6232 * Pointer to tc-flower context to use.
6234 * Index of network interface to initialize.
6236 * Perform verbose error reporting if not NULL.
6239 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Reset the ingress qdisc of interface ifindex in two acknowledged netlink
 * requests sent through flow_tcf_nl_ack(): an RTM_DELQDISC for the existing
 * ingress qdisc, then an RTM_NEWQDISC creating one of kind "ingress".
 * Both messages are built in the same local buffer. Failures are reported
 * through rte_flow_error_set() using rte_errno.
 */
6242 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6243 unsigned int ifindex, struct rte_flow_error *error)
6245 struct nlmsghdr *nlh;
/*
 * Message buffer aligned for struct nlmsghdr; its size adds up the netlink
 * header plus *tcm, the "ingress" string attribute and MNL_BUF_EXTRA_SPACE.
 */
6247 alignas(struct nlmsghdr)
6248 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6249 SZ_NLATTR_STRZ_OF("ingress") +
6250 MNL_BUF_EXTRA_SPACE];
6252 /* Destroy existing ingress qdisc and everything attached to it. */
6253 nlh = mnl_nlmsg_put_header(buf);
6254 nlh->nlmsg_type = RTM_DELQDISC;
6255 nlh->nlmsg_flags = NLM_F_REQUEST;
6256 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6257 tcm->tcm_family = AF_UNSPEC;
6258 tcm->tcm_ifindex = ifindex;
6259 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6260 tcm->tcm_parent = TC_H_INGRESS;
6261 assert(sizeof(buf) >= nlh->nlmsg_len);
6262 /* Ignore errors when qdisc is already absent. */
6263 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6264 rte_errno != EINVAL && rte_errno != ENOENT)
6265 return rte_flow_error_set(error, rte_errno,
6266 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6267 "netlink: failed to remove ingress"
6269 /* Create fresh ingress qdisc. */
/* buf is reused: a new message header is written at its start. */
6270 nlh = mnl_nlmsg_put_header(buf);
6271 nlh->nlmsg_type = RTM_NEWQDISC;
6272 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6273 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6274 tcm->tcm_family = AF_UNSPEC;
6275 tcm->tcm_ifindex = ifindex;
6276 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6277 tcm->tcm_parent = TC_H_INGRESS;
/* Bounded append of the qdisc kind attribute; limit is the buffer size. */
6278 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6279 assert(sizeof(buf) >= nlh->nlmsg_len);
/* Unlike the removal above, any failure here is reported to the caller. */
6280 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6281 return rte_flow_error_set(error, rte_errno,
6282 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6283 "netlink: failed to create ingress"
6289 * Create libmnl context for Netlink flow rules.
6292 * A valid tc-flower context object pointer on success, NULL otherwise and
/*
 * Allocate a zeroed struct mlx5_flow_tcf_context and populate it: a netlink
 * socket from flow_tcf_mnl_socket_create(), a zeroed receive buffer of
 * MNL_SOCKET_BUFFER_SIZE bytes and an initial sequence number. The visible
 * cleanup path hands the context to mlx5_flow_tcf_context_destroy().
 */
6295 struct mlx5_flow_tcf_context *
6296 mlx5_flow_tcf_context_create(void)
6298 struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6303 ctx->nl = flow_tcf_mnl_socket_create();
6306 ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
/* The last rte_zmalloc() argument requests sizeof(uint32_t) alignment. */
6307 ctx->buf = rte_zmalloc(__func__,
6308 ctx->buf_size, sizeof(uint32_t));
/* The first netlink sequence number is taken from random(). */
6311 ctx->seq = random();
/*
 * NOTE(review): the conditions leading to this cleanup call are not
 * visible in this excerpt.
 */
6314 mlx5_flow_tcf_context_destroy(ctx);
6319 * Destroy a libmnl context.
6322 * Pointer to the tc-flower context to destroy.
6325 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6329 flow_tcf_mnl_socket_destroy(ctx->nl);