1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
23 #include <sys/socket.h>
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31 #include <rte_cycles.h>
34 #include "mlx5_flow.h"
35 #include "mlx5_autoconf.h"
37 #ifdef HAVE_TC_ACT_VLAN
39 #include <linux/tc_act/tc_vlan.h>
41 #else /* HAVE_TC_ACT_VLAN */
43 #define TCA_VLAN_ACT_POP 1
44 #define TCA_VLAN_ACT_PUSH 2
45 #define TCA_VLAN_ACT_MODIFY 3
46 #define TCA_VLAN_PARMS 2
47 #define TCA_VLAN_PUSH_VLAN_ID 3
48 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
49 #define TCA_VLAN_PAD 5
50 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
57 #endif /* HAVE_TC_ACT_VLAN */
59 #ifdef HAVE_TC_ACT_PEDIT
61 #include <linux/tc_act/tc_pedit.h>
63 #else /* HAVE_TC_ACT_PEDIT */
77 TCA_PEDIT_KEY_EX_HTYPE = 1,
78 TCA_PEDIT_KEY_EX_CMD = 2,
79 __TCA_PEDIT_KEY_EX_MAX
82 enum pedit_header_type {
83 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
84 TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
85 TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
86 TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
87 TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
88 TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
93 TCA_PEDIT_KEY_EX_CMD_SET = 0,
94 TCA_PEDIT_KEY_EX_CMD_ADD = 1,
101 __u32 off; /*offset */
108 struct tc_pedit_sel {
112 struct tc_pedit_key keys[0];
115 #endif /* HAVE_TC_ACT_PEDIT */
117 #ifdef HAVE_TC_ACT_TUNNEL_KEY
119 #include <linux/tc_act/tc_tunnel_key.h>
121 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
122 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
125 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
126 #define TCA_TUNNEL_KEY_NO_CSUM 10
129 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
130 #define TCA_TUNNEL_KEY_ENC_TOS 12
133 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
134 #define TCA_TUNNEL_KEY_ENC_TTL 13
137 #else /* HAVE_TC_ACT_TUNNEL_KEY */
139 #define TCA_ACT_TUNNEL_KEY 17
140 #define TCA_TUNNEL_KEY_ACT_SET 1
141 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
142 #define TCA_TUNNEL_KEY_PARMS 2
143 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
144 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
145 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
146 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
147 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
148 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
149 #define TCA_TUNNEL_KEY_NO_CSUM 10
150 #define TCA_TUNNEL_KEY_ENC_TOS 12
151 #define TCA_TUNNEL_KEY_ENC_TTL 13
153 struct tc_tunnel_key {
158 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
160 /* Normally found in linux/netlink.h. */
161 #ifndef NETLINK_CAP_ACK
162 #define NETLINK_CAP_ACK 10
165 /* Normally found in linux/pkt_sched.h. */
166 #ifndef TC_H_MIN_INGRESS
167 #define TC_H_MIN_INGRESS 0xfff2u
170 /* Normally found in linux/pkt_cls.h. */
171 #ifndef TCA_CLS_FLAGS_SKIP_SW
172 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
174 #ifndef TCA_CLS_FLAGS_IN_HW
175 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
177 #ifndef HAVE_TCA_CHAIN
180 #ifndef HAVE_TCA_FLOWER_ACT
181 #define TCA_FLOWER_ACT 3
183 #ifndef HAVE_TCA_FLOWER_FLAGS
184 #define TCA_FLOWER_FLAGS 22
186 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
187 #define TCA_FLOWER_KEY_ETH_TYPE 8
189 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
190 #define TCA_FLOWER_KEY_ETH_DST 4
192 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
193 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
195 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
196 #define TCA_FLOWER_KEY_ETH_SRC 6
198 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
199 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
201 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
202 #define TCA_FLOWER_KEY_IP_PROTO 9
204 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
205 #define TCA_FLOWER_KEY_IPV4_SRC 10
207 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
208 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
210 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
211 #define TCA_FLOWER_KEY_IPV4_DST 12
213 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
214 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
216 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
217 #define TCA_FLOWER_KEY_IPV6_SRC 14
219 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
220 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
222 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
223 #define TCA_FLOWER_KEY_IPV6_DST 16
225 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
226 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
228 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
229 #define TCA_FLOWER_KEY_TCP_SRC 18
231 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
232 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
234 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
235 #define TCA_FLOWER_KEY_TCP_DST 19
237 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
238 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
240 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
241 #define TCA_FLOWER_KEY_UDP_SRC 20
243 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
244 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
246 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
247 #define TCA_FLOWER_KEY_UDP_DST 21
249 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
250 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
252 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
253 #define TCA_FLOWER_KEY_VLAN_ID 23
255 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
256 #define TCA_FLOWER_KEY_VLAN_PRIO 24
258 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
259 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
261 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
262 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
264 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
265 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
267 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
268 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
270 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
271 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
273 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
274 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
276 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
277 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
279 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
280 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
282 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
283 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
285 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
286 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
288 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
289 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
291 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
292 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
294 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
295 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
297 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
298 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
300 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
301 #define TCA_FLOWER_KEY_TCP_FLAGS 71
303 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
304 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
306 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
307 #define TCA_FLOWER_KEY_IP_TOS 73
309 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
310 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
312 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
313 #define TCA_FLOWER_KEY_IP_TTL 75
315 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
316 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
318 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
319 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
321 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
322 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
324 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
325 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
327 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
328 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
331 #ifndef HAVE_TC_ACT_GOTO_CHAIN
332 #define TC_ACT_GOTO_CHAIN 0x20000000
335 #ifndef IPV6_ADDR_LEN
336 #define IPV6_ADDR_LEN 16
339 #ifndef IPV4_ADDR_LEN
340 #define IPV4_ADDR_LEN 4
344 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
351 #ifndef TCA_ACT_MAX_PRIO
352 #define TCA_ACT_MAX_PRIO 32
355 /** Parameters of VXLAN devices created by driver. */
356 #define MLX5_VXLAN_DEFAULT_VNI 1
357 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
359 * Timeout in milliseconds to wait for VXLAN UDP offloaded port
360 * registration to complete within the mlx5 driver.
362 #define MLX5_VXLAN_WAIT_PORT_REG_MS 250
364 /** Tunnel action type, used for @p type in header structure. */
365 enum flow_tcf_tunact_type {
366 FLOW_TCF_TUNACT_VXLAN_DECAP,
367 FLOW_TCF_TUNACT_VXLAN_ENCAP,
370 /** Flags used for @p mask in tunnel action encap descriptors. */
371 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
372 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
373 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
374 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
375 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
376 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
377 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
378 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
379 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
380 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
381 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
384 * Structure for holding netlink context.
385 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
386 * Using this (8KB) buffer size ensures that netlink messages will never be
389 struct mlx5_flow_tcf_context {
390 struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
391 uint32_t seq; /* Message sequence number. */
392 uint32_t buf_size; /* Message buffer size. */
393 uint8_t *buf; /* Message buffer. */
397 * Neigh rule structure. The neigh rule is applied via Netlink to
398 * outer tunnel iface in order to provide destination MAC address
399 * for the VXLAN encapsulation. The neigh rule is implicitly related
400 * to the Flow itself and can be shared by multiple Flows.
402 struct tcf_neigh_rule {
403 LIST_ENTRY(tcf_neigh_rule) next;
405 struct rte_ether_addr eth;
412 uint8_t dst[IPV6_ADDR_LEN];
418 * Local rule structure. The local rule is applied via Netlink to
419 * outer tunnel iface in order to provide local and peer IP addresses
420 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
421 * related to the Flow itself and can be shared by multiple Flows.
423 struct tcf_local_rule {
424 LIST_ENTRY(tcf_local_rule) next;
433 uint8_t dst[IPV6_ADDR_LEN];
434 uint8_t src[IPV6_ADDR_LEN];
439 /** Outer interface VXLAN encapsulation rules container. */
441 LIST_ENTRY(tcf_irule) next;
442 LIST_HEAD(, tcf_neigh_rule) neigh;
443 LIST_HEAD(, tcf_local_rule) local;
445 unsigned int ifouter; /**< Own interface index. */
448 /** VXLAN virtual netdev. */
450 LIST_ENTRY(tcf_vtep) next;
452 unsigned int ifindex; /**< Own interface index. */
454 uint32_t created:1; /**< Actually created by PMD. */
455 uint32_t waitreg:1; /**< Wait for VXLAN UDP port registration. */
458 /** Tunnel descriptor header, common for all tunnel types. */
459 struct flow_tcf_tunnel_hdr {
460 uint32_t type; /**< Tunnel action type. */
461 struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
462 unsigned int ifindex_org; /**< Original dst/src interface */
463 unsigned int *ifindex_ptr; /**< Interface ptr in message. */
466 struct flow_tcf_vxlan_decap {
467 struct flow_tcf_tunnel_hdr hdr;
471 struct flow_tcf_vxlan_encap {
472 struct flow_tcf_tunnel_hdr hdr;
473 struct tcf_irule *iface;
478 struct rte_ether_addr dst;
479 struct rte_ether_addr src;
487 uint8_t dst[IPV6_ADDR_LEN];
488 uint8_t src[IPV6_ADDR_LEN];
500 /** Structure used when extracting the values of flow counters
501 * from a netlink message.
503 struct flow_tcf_stats_basic {
505 struct gnet_stats_basic counters;
508 /** Empty masks for known item types. */
510 struct rte_flow_item_port_id port_id;
511 struct rte_flow_item_eth eth;
512 struct rte_flow_item_vlan vlan;
513 struct rte_flow_item_ipv4 ipv4;
514 struct rte_flow_item_ipv6 ipv6;
515 struct rte_flow_item_tcp tcp;
516 struct rte_flow_item_udp udp;
517 struct rte_flow_item_vxlan vxlan;
518 } flow_tcf_mask_empty = {
522 /** Supported masks for known item types. */
523 static const struct {
524 struct rte_flow_item_port_id port_id;
525 struct rte_flow_item_eth eth;
526 struct rte_flow_item_vlan vlan;
527 struct rte_flow_item_ipv4 ipv4;
528 struct rte_flow_item_ipv6 ipv6;
529 struct rte_flow_item_tcp tcp;
530 struct rte_flow_item_udp udp;
531 struct rte_flow_item_vxlan vxlan;
532 } flow_tcf_mask_supported = {
537 .type = RTE_BE16(0xffff),
538 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
539 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
542 /* PCP and VID only, no DEI. */
543 .tci = RTE_BE16(0xefff),
544 .inner_type = RTE_BE16(0xffff),
547 .next_proto_id = 0xff,
548 .time_to_live = 0xff,
549 .type_of_service = 0xff,
550 .src_addr = RTE_BE32(0xffffffff),
551 .dst_addr = RTE_BE32(0xffffffff),
555 .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_FL_SHIFT),
558 "\xff\xff\xff\xff\xff\xff\xff\xff"
559 "\xff\xff\xff\xff\xff\xff\xff\xff",
561 "\xff\xff\xff\xff\xff\xff\xff\xff"
562 "\xff\xff\xff\xff\xff\xff\xff\xff",
565 .src_port = RTE_BE16(0xffff),
566 .dst_port = RTE_BE16(0xffff),
570 .src_port = RTE_BE16(0xffff),
571 .dst_port = RTE_BE16(0xffff),
574 .vni = "\xff\xff\xff",
578 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
579 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
580 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
581 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
582 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
584 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
586 /** DPDK port to network interface index (ifindex) conversion. */
587 struct flow_tcf_ptoi {
588 uint16_t port_id; /**< DPDK port ID. */
589 unsigned int ifindex; /**< Network interface index. */
592 /* Due to a limitation on driver/FW. */
593 #define MLX5_TCF_GROUP_ID_MAX 3
596 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
597 * Priority in rte_flow attribute starts from 0 and is incremented by 1 in
598 * translation. This is subject to be changed to determine the max priority
599 * based on trial-and-error like Verbs driver once the restriction is lifted or
600 * the range is extended.
602 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
604 #define MLX5_TCF_FATE_ACTIONS \
605 (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
606 MLX5_FLOW_ACTION_JUMP)
608 #define MLX5_TCF_VLAN_ACTIONS \
609 (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
610 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
612 #define MLX5_TCF_VXLAN_ACTIONS \
613 (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
615 #define MLX5_TCF_PEDIT_ACTIONS \
616 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
617 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
618 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
619 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
620 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
622 #define MLX5_TCF_CONFIG_ACTIONS \
623 (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
624 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
625 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
626 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
628 #define MAX_PEDIT_KEYS 128
629 #define SZ_PEDIT_KEY_VAL 4
631 #define NUM_OF_PEDIT_KEYS(sz) \
632 (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
634 struct pedit_key_ex {
635 enum pedit_header_type htype;
639 struct pedit_parser {
640 struct tc_pedit_sel sel;
641 struct tc_pedit_key keys[MAX_PEDIT_KEYS];
642 struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
646 * Create space for using the implicitly created TC flow counter.
649 * Pointer to the Ethernet device structure.
652 * A pointer to the counter data structure, NULL otherwise and
655 static struct mlx5_flow_counter *
656 flow_tcf_counter_new(void)
658 struct mlx5_flow_counter *cnt;
661 * eswitch counter cannot be shared and its id is unknown.
662 * currently returning all with id 0.
663 * in the future maybe better to switch to unique numbers.
665 struct mlx5_flow_counter tmpl = {
668 cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
674 /* Implicit counter, do not add to list. */
679 * Set pedit key of MAC address
682 * pointer to action specification
683 * @param[in,out] p_parser
684 * pointer to pedit_parser
687 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
688 struct pedit_parser *p_parser)
690 int idx = p_parser->sel.nkeys;
691 uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
692 offsetof(struct rte_ether_hdr, s_addr) :
693 offsetof(struct rte_ether_hdr, d_addr);
694 const struct rte_flow_action_set_mac *conf =
695 (const struct rte_flow_action_set_mac *)actions->conf;
697 p_parser->keys[idx].off = off;
698 p_parser->keys[idx].mask = ~UINT32_MAX;
699 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
700 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
701 memcpy(&p_parser->keys[idx].val,
702 conf->mac_addr, SZ_PEDIT_KEY_VAL);
704 p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
705 p_parser->keys[idx].mask = 0xFFFF0000;
706 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
707 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
708 memcpy(&p_parser->keys[idx].val,
709 conf->mac_addr + SZ_PEDIT_KEY_VAL,
710 RTE_ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
711 p_parser->sel.nkeys = (++idx);
715 * Set pedit key of decrease/set ttl
718 * pointer to action specification
719 * @param[in,out] p_parser
720 * pointer to pedit_parser
721 * @param[in] item_flags
722 * flags of all items presented
725 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
726 struct pedit_parser *p_parser,
729 int idx = p_parser->sel.nkeys;
731 p_parser->keys[idx].mask = 0xFFFFFF00;
732 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
733 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
734 p_parser->keys[idx].off =
735 offsetof(struct ipv4_hdr, time_to_live);
737 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
738 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
739 p_parser->keys[idx].off =
740 offsetof(struct ipv6_hdr, hop_limits);
742 if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
743 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
744 p_parser->keys[idx].val = 0x000000FF;
746 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
747 p_parser->keys[idx].val =
748 (__u32)((const struct rte_flow_action_set_ttl *)
749 actions->conf)->ttl_value;
751 p_parser->sel.nkeys = (++idx);
755 * Set pedit key of transport (TCP/UDP) port value
758 * pointer to action specification
759 * @param[in,out] p_parser
760 * pointer to pedit_parser
761 * @param[in] item_flags
762 * flags of all items presented
765 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
766 struct pedit_parser *p_parser,
769 int idx = p_parser->sel.nkeys;
771 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
772 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
773 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
774 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
775 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
776 /* offset of src/dst port is same for TCP and UDP */
777 p_parser->keys[idx].off =
778 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
779 offsetof(struct tcp_hdr, src_port) :
780 offsetof(struct tcp_hdr, dst_port);
781 p_parser->keys[idx].mask = 0xFFFF0000;
782 p_parser->keys[idx].val =
783 (__u32)((const struct rte_flow_action_set_tp *)
784 actions->conf)->port;
785 p_parser->sel.nkeys = (++idx);
789 * Set pedit key of ipv6 address
792 * pointer to action specification
793 * @param[in,out] p_parser
794 * pointer to pedit_parser
797 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
798 struct pedit_parser *p_parser)
800 int idx = p_parser->sel.nkeys;
801 int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
803 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
804 offsetof(struct ipv6_hdr, src_addr) :
805 offsetof(struct ipv6_hdr, dst_addr);
806 const struct rte_flow_action_set_ipv6 *conf =
807 (const struct rte_flow_action_set_ipv6 *)actions->conf;
809 for (int i = 0; i < keys; i++, idx++) {
810 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
811 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
812 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
813 p_parser->keys[idx].mask = ~UINT32_MAX;
814 memcpy(&p_parser->keys[idx].val,
815 conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
818 p_parser->sel.nkeys += keys;
822 * Set pedit key of ipv4 address
825 * pointer to action specification
826 * @param[in,out] p_parser
827 * pointer to pedit_parser
830 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
831 struct pedit_parser *p_parser)
833 int idx = p_parser->sel.nkeys;
835 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
836 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
837 p_parser->keys[idx].off =
838 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
839 offsetof(struct ipv4_hdr, src_addr) :
840 offsetof(struct ipv4_hdr, dst_addr);
841 p_parser->keys[idx].mask = ~UINT32_MAX;
842 p_parser->keys[idx].val =
843 ((const struct rte_flow_action_set_ipv4 *)
844 actions->conf)->ipv4_addr;
845 p_parser->sel.nkeys = (++idx);
849 * Create the pedit's na attribute in netlink message
850 * on pre-allocated message buffer
853 * pointer to pre-allocated netlink message buffer
854 * @param[in,out] actions
855 * pointer to pointer of actions specification.
856 * @param[in,out] action_flags
857 * pointer to actions flags
858 * @param[in] item_flags
859 * flags of all items presented
862 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
863 const struct rte_flow_action **actions,
866 struct pedit_parser p_parser;
867 struct nlattr *na_act_options;
868 struct nlattr *na_pedit_keys;
870 memset(&p_parser, 0, sizeof(p_parser));
871 mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
872 na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
873 /* all modify header actions should be in one tc-pedit action */
874 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
875 switch ((*actions)->type) {
876 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
877 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
878 flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
880 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
881 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
882 flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
884 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
885 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
886 flow_tcf_pedit_key_set_tp_port(*actions,
887 &p_parser, item_flags);
889 case RTE_FLOW_ACTION_TYPE_SET_TTL:
890 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
891 flow_tcf_pedit_key_set_dec_ttl(*actions,
892 &p_parser, item_flags);
894 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
895 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
896 flow_tcf_pedit_key_set_mac(*actions, &p_parser);
899 goto pedit_mnl_msg_done;
903 p_parser.sel.action = TC_ACT_PIPE;
904 mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
905 sizeof(p_parser.sel) +
906 p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
909 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
910 for (int i = 0; i < p_parser.sel.nkeys; i++) {
911 struct nlattr *na_pedit_key =
912 mnl_attr_nest_start(nl,
913 TCA_PEDIT_KEY_EX | NLA_F_NESTED);
914 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
915 p_parser.keys_ex[i].htype);
916 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
917 p_parser.keys_ex[i].cmd);
918 mnl_attr_nest_end(nl, na_pedit_key);
920 mnl_attr_nest_end(nl, na_pedit_keys);
921 mnl_attr_nest_end(nl, na_act_options);
926 * Calculate max memory size of one TC-pedit action.
927 * One TC-pedit action can contain set of keys each defining
928 * a rewrite element (rte_flow action)
930 * @param[in,out] actions
931 * actions specification.
932 * @param[in,out] action_flags
934 * @param[in,out] size
937 * Max memory size of one TC-pedit action
940 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
941 uint64_t *action_flags)
947 pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
948 SZ_NLATTR_STRZ_OF("pedit") +
949 SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
950 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
951 switch ((*actions)->type) {
952 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
953 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
954 flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
956 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
957 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
958 flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
960 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
961 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
962 flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
964 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
965 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
966 flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
968 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
969 /* TCP is the same as UDP */
970 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
971 flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
973 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
974 /* TCP is the same as UDP */
975 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
976 flags |= MLX5_FLOW_ACTION_SET_TP_DST;
978 case RTE_FLOW_ACTION_TYPE_SET_TTL:
979 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
980 flags |= MLX5_FLOW_ACTION_SET_TTL;
982 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
983 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
984 flags |= MLX5_FLOW_ACTION_DEC_TTL;
986 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
987 keys += NUM_OF_PEDIT_KEYS(RTE_ETHER_ADDR_LEN);
988 flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
990 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
991 keys += NUM_OF_PEDIT_KEYS(RTE_ETHER_ADDR_LEN);
992 flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
995 goto get_pedit_action_size_done;
998 get_pedit_action_size_done:
999 /* TCA_PEDIT_PARAMS_EX */
1001 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
1002 keys * sizeof(struct tc_pedit_key));
1003 pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
1004 pedit_size += keys *
1005 /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
1006 (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
1007 SZ_NLATTR_DATA_OF(2));
1008 (*action_flags) |= flags;
1014 * Retrieve mask for pattern item.
1016 * This function does basic sanity checks on a pattern item in order to
1017 * return the most appropriate mask for it.
1020 * Item specification.
1021 * @param[in] mask_default
1022 * Default mask for pattern item as specified by the flow API.
1023 * @param[in] mask_supported
1024 * Mask fields supported by the implementation.
1025 * @param[in] mask_empty
1026 * Empty mask to return when there is no specification.
1028 * Perform verbose error reporting if not NULL.
1031 * Either @p item->mask or one of the mask parameters on success, NULL
1032 * otherwise and rte_errno is set.
1035 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1036 const void *mask_supported, const void *mask_empty,
1037 size_t mask_size, struct rte_flow_error *error)
1039 const uint8_t *mask;
1042 /* item->last and item->mask cannot exist without item->spec. */
1043 if (!item->spec && (item->mask || item->last)) {
1044 rte_flow_error_set(error, EINVAL,
1045 RTE_FLOW_ERROR_TYPE_ITEM, item,
1046 "\"mask\" or \"last\" field provided without"
1047 " a corresponding \"spec\"");
1050 /* No spec, no mask, no problem. */
1053 mask = item->mask ? item->mask : mask_default;
1056 * Single-pass check to make sure that:
1057 * - Mask is supported, no bits are set outside mask_supported.
1058 * - Both item->spec and item->last are included in mask.
1060 for (i = 0; i != mask_size; ++i) {
1063 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1064 ((const uint8_t *)mask_supported)[i]) {
1065 rte_flow_error_set(error, ENOTSUP,
1066 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1067 "unsupported field found"
1072 (((const uint8_t *)item->spec)[i] & mask[i]) !=
1073 (((const uint8_t *)item->last)[i] & mask[i])) {
1074 rte_flow_error_set(error, EINVAL,
1075 RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1077 "range between \"spec\" and \"last\""
1078 " not comprised in \"mask\"");
1086 * Build a conversion table between port ID and ifindex.
1089 * Pointer to Ethernet device.
1091 * Pointer to ptoi table.
1093 * Size of ptoi table provided.
1096 * Size of ptoi table filled.
1099 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1102 unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1103 uint16_t port_id[n + 1];
1105 unsigned int own = 0;
1107 /* At least one port is needed when no switch domain is present. */
1110 port_id[0] = dev->data->port_id;
1112 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1116 for (i = 0; i != n; ++i) {
1117 struct rte_eth_dev_info dev_info;
1119 rte_eth_dev_info_get(port_id[i], &dev_info);
1120 if (port_id[i] == dev->data->port_id)
1122 ptoi[i].port_id = port_id[i];
1123 ptoi[i].ifindex = dev_info.if_index;
1125 /* Ensure first entry of ptoi[] is the current device. */
1128 ptoi[0] = ptoi[own];
1129 ptoi[own] = ptoi[n];
1131 /* An entry with zero ifindex terminates ptoi[]. */
1132 ptoi[n].port_id = 0;
1133 ptoi[n].ifindex = 0;
1138 * Verify the @p attr will be correctly understood by the E-switch.
1141 * Pointer to flow attributes
1143 * Pointer to error structure.
1146 * 0 on success, a negative errno value otherwise and rte_errno is set.
1149 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1150 struct rte_flow_error *error)
1153 * Supported attributes: groups, some priorities and ingress only.
1154 * group is supported only if kernel supports chain. Don't care about
1155 * transfer as it is the caller's problem.
1157 if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1158 return rte_flow_error_set(error, ENOTSUP,
1159 RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1160 "group ID larger than "
1161 RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1162 " isn't supported");
1163 else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1164 return rte_flow_error_set(error, ENOTSUP,
1165 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1167 "priority more than "
1168 RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1169 " is not supported");
1171 return rte_flow_error_set(error, EINVAL,
1172 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1173 attr, "only ingress is supported");
1175 return rte_flow_error_set(error, ENOTSUP,
1176 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1177 attr, "egress is not supported");
1182 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1183 * The routine checks the L2 fields to be used in encapsulation header.
1186 * Pointer to the item structure.
1188 * Pointer to the error structure.
1191 * 0 on success, a negative errno value otherwise and rte_errno is set.
1194 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1195 struct rte_flow_error *error)
1197 const struct rte_flow_item_eth *spec = item->spec;
1198 const struct rte_flow_item_eth *mask = item->mask;
1202 * Specification for L2 addresses can be empty
1203 * because these ones are optional and not
1204 * required directly by tc rule. Kernel tries
1205 * to resolve these ones on its own
1210 /* If mask is not specified use the default one. */
1211 mask = &rte_flow_item_eth_mask;
1213 if (memcmp(&mask->dst,
1214 &flow_tcf_mask_empty.eth.dst,
1215 sizeof(flow_tcf_mask_empty.eth.dst))) {
1216 if (memcmp(&mask->dst,
1217 &rte_flow_item_eth_mask.dst,
1218 sizeof(rte_flow_item_eth_mask.dst)))
1219 return rte_flow_error_set
1221 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1222 "no support for partial mask on"
1223 " \"eth.dst\" field");
1225 if (memcmp(&mask->src,
1226 &flow_tcf_mask_empty.eth.src,
1227 sizeof(flow_tcf_mask_empty.eth.src))) {
1228 if (memcmp(&mask->src,
1229 &rte_flow_item_eth_mask.src,
1230 sizeof(rte_flow_item_eth_mask.src)))
1231 return rte_flow_error_set
1233 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1234 "no support for partial mask on"
1235 " \"eth.src\" field");
1237 if (mask->type != RTE_BE16(0x0000)) {
1238 if (mask->type != RTE_BE16(0xffff))
1239 return rte_flow_error_set
1241 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1242 "no support for partial mask on"
1243 " \"eth.type\" field");
1245 "outer ethernet type field"
1246 " cannot be forced for vxlan"
1247 " encapsulation, parameter ignored");
1253 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1254 * The routine checks the IPv4 fields to be used in encapsulation header.
1257 * Pointer to the item structure.
1259 * Pointer to the error structure.
1262 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Checks the IPv4 item found in a VXLAN_ENCAP definition.
 * Visible behaviour: EINVAL is reported for a missing address
 * specification and for missing destination/source addresses; ENOTSUP is
 * reported for partial masks on dst_addr, src_addr, type_of_service and
 * time_to_live. rte_flow_item_ipv4_mask is used as the fallback mask.
 * NOTE(review): numbering gaps (e.g. 1267, 1270-1272, 1275, 1280-1282,
 * 1287, 1293-1294, 1299, 1305, 1341-1344) hide the guarding conditions,
 * braces/else branches and the final return - confirm against the
 * complete file.
 */
1265 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1266 struct rte_flow_error *error)
1268 const struct rte_flow_item_ipv4 *spec = item->spec;
1269 const struct rte_flow_item_ipv4 *mask = item->mask;
1273 * Specification for IP addresses cannot be empty
1274 * because it is required by tunnel_key parameter.
1276 return rte_flow_error_set(error, EINVAL,
1277 RTE_FLOW_ERROR_TYPE_ITEM, item,
1278 "NULL outer ipv4 address"
1279 " specification for vxlan"
/* Fallback to the default IPv4 mask; the guarding condition is omitted. */
1283 mask = &rte_flow_item_ipv4_mask;
/* Destination address mask: all-ones is required once it is non-zero. */
1284 if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1285 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1286 return rte_flow_error_set
1288 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1289 "no support for partial mask on"
1290 " \"ipv4.hdr.dst_addr\" field"
1291 " for vxlan encapsulation");
1292 /* More IPv4 address validations can be put here. */
1295 * Kernel uses the destination IP address to determine
1296 * the routing path and obtain the MAC destination
1297 * address, so IP destination address must be
1298 * specified in the tc rule.
1300 return rte_flow_error_set(error, EINVAL,
1301 RTE_FLOW_ERROR_TYPE_ITEM, item,
1302 "outer ipv4 destination address"
1303 " must be specified for"
1304 " vxlan encapsulation");
/* Source address mask: same all-or-nothing rule as the destination. */
1306 if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1307 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1308 return rte_flow_error_set
1310 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1311 "no support for partial mask on"
1312 " \"ipv4.hdr.src_addr\" field"
1313 " for vxlan encapsulation");
1314 /* More IPv4 address validations can be put here. */
1317 * Kernel uses the source IP address to select the
1318 * interface for egress encapsulated traffic, so
1319 * it must be specified in the tc rule.
1321 return rte_flow_error_set(error, EINVAL,
1322 RTE_FLOW_ERROR_TYPE_ITEM, item,
1323 "outer ipv4 source address"
1324 " must be specified for"
1325 " vxlan encapsulation");
/* TOS and TTL masks are optional but must be 0xff when present. */
1327 if (mask->hdr.type_of_service &&
1328 mask->hdr.type_of_service != 0xff)
1329 return rte_flow_error_set(error, ENOTSUP,
1330 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1331 "no support for partial mask on"
1332 " \"ipv4.hdr.type_of_service\" field"
1333 " for vxlan encapsulation");
1334 if (mask->hdr.time_to_live &&
1335 mask->hdr.time_to_live != 0xff)
1336 return rte_flow_error_set(error, ENOTSUP,
1337 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1338 "no support for partial mask on"
1339 " \"ipv4.hdr.time_to_live\" field"
1340 " for vxlan encapsulation");
1345 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1346 * The routine checks the IPv6 fields to be used in encapsulation header.
1349 * Pointer to the item structure.
1351 * Pointer to the error structure.
1354 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Checks the IPv6 item found in a VXLAN_ENCAP definition.
 * Visible behaviour: EINVAL is reported for a missing address
 * specification and for missing destination/source addresses; ENOTSUP is
 * reported for partial masks on dst_addr, src_addr, the traffic-class
 * byte extracted from vtc_flow, and hop_limits. rte_flow_item_ipv6_mask
 * is used as the fallback mask.
 * NOTE(review): numbering gaps (e.g. 1359, 1362-1365, 1368, 1374-1375,
 * 1379, 1382, 1384, 1405, 1408, 1442-1445) hide the declaration of msk6,
 * the size arguments of the memcmp() calls, the guarding conditions and
 * the final return - confirm against the complete file.
 */
1357 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1358 struct rte_flow_error *error)
1360 const struct rte_flow_item_ipv6 *spec = item->spec;
1361 const struct rte_flow_item_ipv6 *mask = item->mask;
1366 * Specification for IP addresses cannot be empty
1367 * because it is required by tunnel_key parameter.
1369 return rte_flow_error_set(error, EINVAL,
1370 RTE_FLOW_ERROR_TYPE_ITEM, item,
1371 "NULL outer ipv6 address"
1372 " specification for"
1373 " vxlan encapsulation");
/* Fallback to the default IPv6 mask; the guarding condition is omitted. */
1376 mask = &rte_flow_item_ipv6_mask;
/*
 * Destination address mask is compared first against the empty mask and
 * then against the default mask; a mismatch with both is a partial mask.
 */
1377 if (memcmp(&mask->hdr.dst_addr,
1378 &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1380 if (memcmp(&mask->hdr.dst_addr,
1381 &rte_flow_item_ipv6_mask.hdr.dst_addr,
1383 return rte_flow_error_set
1385 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1386 "no support for partial mask on"
1387 " \"ipv6.hdr.dst_addr\" field"
1388 " for vxlan encapsulation");
1389 /* More IPv6 address validations can be put here. */
1392 * Kernel uses the destination IP address to determine
1393 * the routing path and obtain the MAC destination
1394 * address (heigh or gate), so IP destination address
1395 * must be specified within the tc rule.
1397 return rte_flow_error_set(error, EINVAL,
1398 RTE_FLOW_ERROR_TYPE_ITEM, item,
1399 "outer ipv6 destination address"
1400 " must be specified for"
1401 " vxlan encapsulation");
/* Source address mask: same two-step comparison as the destination. */
1403 if (memcmp(&mask->hdr.src_addr,
1404 &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1406 if (memcmp(&mask->hdr.src_addr,
1407 &rte_flow_item_ipv6_mask.hdr.src_addr,
1409 return rte_flow_error_set
1411 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1412 "no support for partial mask on"
1413 " \"ipv6.hdr.src_addr\" field"
1414 " for vxlan encapsulation");
1415 /* More L3 address validation can be put here. */
1418 * Kernel uses the source IP address to select the
1419 * interface for egress encapsulated traffic, so
1420 * it must be specified in the tc rule.
/*
 * NOTE(review): this message says "outer L3 source address" while the
 * sibling messages name the IP version ("outer ipv6 ...") - confirm
 * whether the wording difference is intended.
 */
1422 return rte_flow_error_set(error, EINVAL,
1423 RTE_FLOW_ERROR_TYPE_ITEM, item,
1424 "outer L3 source address"
1425 " must be specified for"
1426 " vxlan encapsulation");
/*
 * Extract the 8 bits selected by IPV6_HDR_TC_SHIFT from the host-order
 * vtc_flow mask; they must be all-zero or all-one.
 */
1428 msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1429 IPV6_HDR_TC_SHIFT) & 0xff;
1430 if (msk6 && msk6 != 0xff)
1431 return rte_flow_error_set(error, ENOTSUP,
1432 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1433 "no support for partial mask on"
1434 " \"ipv6.hdr.vtc_flow.tos\" field"
1435 " for vxlan encapsulation");
/* Hop limit mask is optional but must be 0xff when present. */
1436 if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1437 return rte_flow_error_set(error, ENOTSUP,
1438 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1439 "no support for partial mask on"
1440 " \"ipv6.hdr.hop_limits\" field"
1441 " for vxlan encapsulation");
1446 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1447 * The routine checks the UDP fields to be used in encapsulation header.
1450 * Pointer to the item structure.
1452 * Pointer to the error structure.
1455 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Checks the UDP item found in a VXLAN_ENCAP definition.
 * Visible behaviour: EINVAL is reported for a missing port specification,
 * for a zero destination port in the spec and for an unspecified
 * destination port; ENOTSUP is reported for partial masks on dst_port and
 * src_port. rte_flow_item_udp_mask is used as the fallback mask.
 * NOTE(review): numbering gaps (e.g. 1460, 1463-1465, 1468, 1473-1474,
 * 1479, 1486, 1490, 1496, 1500, 1505, 1509-1513) hide the guarding
 * conditions, the else branches and the final return - confirm against
 * the complete file.
 */
1458 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1459 struct rte_flow_error *error)
1461 const struct rte_flow_item_udp *spec = item->spec;
1462 const struct rte_flow_item_udp *mask = item->mask;
1466 * Specification for UDP ports cannot be empty
1467 * because it is required by tunnel_key parameter.
/*
 * NOTE(review): the two adjacent literals below concatenate to
 * "specification  for" (two spaces) - cosmetic, confirm before changing.
 */
1469 return rte_flow_error_set(error, EINVAL,
1470 RTE_FLOW_ERROR_TYPE_ITEM, item,
1471 "NULL UDP port specification "
1472 " for vxlan encapsulation");
/* Fallback to the default UDP mask; the guarding condition is omitted. */
1475 mask = &rte_flow_item_udp_mask;
/* Destination port: full mask required and the spec value must be non-zero. */
1476 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1477 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1478 return rte_flow_error_set
1480 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1481 "no support for partial mask on"
1482 " \"udp.hdr.dst_port\" field"
1483 " for vxlan encapsulation");
1484 if (!spec->hdr.dst_port)
1485 return rte_flow_error_set
1487 RTE_FLOW_ERROR_TYPE_ITEM, item,
1488 "outer UDP remote port cannot be"
1489 " 0 for vxlan encapsulation");
1491 return rte_flow_error_set(error, EINVAL,
1492 RTE_FLOW_ERROR_TYPE_ITEM, item,
1493 "outer UDP remote port"
1494 " must be specified for"
1495 " vxlan encapsulation");
/* Source port: only the mask shape is checked here. */
1497 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1498 if (mask->hdr.src_port != RTE_BE16(0xffff))
1499 return rte_flow_error_set
1501 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1502 "no support for partial mask on"
1503 " \"udp.hdr.src_port\" field"
1504 " for vxlan encapsulation");
/*
 * NOTE(review): the statement that emits the text below sits on an
 * omitted line (1505); presumably a log warning, since the text says
 * "parameter ignored" - confirm.
 */
1506 "outer UDP source port cannot be"
1507 " forced for vxlan encapsulation,"
1508 " parameter ignored");
1514 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1515 * The routine checks the VNI fields to be used in encapsulation header.
1518 * Pointer to the item structure.
1520 * Pointer to the error structure.
1523 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Checks the VXLAN item found in a VXLAN_ENCAP definition.
 * Visible behaviour: EINVAL is reported for a missing VNI specification,
 * for an all-zero VNI mask and for an all-zero VNI value; ENOTSUP is
 * reported when any of the three mask bytes is not 0xff.
 * rte_flow_item_vxlan_mask is used as the fallback mask.
 * NOTE(review): numbering gaps (e.g. 1528, 1531-1532, 1538-1539, 1553,
 * 1558-1561) hide the guarding conditions and the final return - confirm
 * against the complete file.
 */
1526 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1527 struct rte_flow_error *error)
1529 const struct rte_flow_item_vxlan *spec = item->spec;
1530 const struct rte_flow_item_vxlan *mask = item->mask;
1533 /* Outer VNI is required by tunnel_key parameter. */
1534 return rte_flow_error_set(error, EINVAL,
1535 RTE_FLOW_ERROR_TYPE_ITEM, item,
1536 "NULL VNI specification"
1537 " for vxlan encapsulation");
/* Fallback to the default VXLAN mask; the guarding condition is omitted. */
1540 mask = &rte_flow_item_vxlan_mask;
/* An all-zero VNI mask means the VNI was not specified at all. */
1541 if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1542 return rte_flow_error_set(error, EINVAL,
1543 RTE_FLOW_ERROR_TYPE_ITEM, item,
1544 "outer VNI must be specified "
1545 "for vxlan encapsulation");
/* Every one of the three VNI mask bytes must be 0xff. */
1546 if (mask->vni[0] != 0xff ||
1547 mask->vni[1] != 0xff ||
1548 mask->vni[2] != 0xff)
1549 return rte_flow_error_set(error, ENOTSUP,
1550 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1551 "no support for partial mask on"
1552 " \"vxlan.vni\" field");
/* A VNI value of zero is rejected. */
1554 if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1555 return rte_flow_error_set(error, EINVAL,
1556 RTE_FLOW_ERROR_TYPE_ITEM, item,
1557 "vxlan vni cannot be 0");
1562 * Validate VXLAN_ENCAP action item list for E-Switch.
1563 * The routine checks items to be used in encapsulation header.
1566 * Pointer to the VXLAN_ENCAP action structure.
1568 * Pointer to the error structure.
1571 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Validates the item list carried by a VXLAN_ENCAP action.
 * Visible behaviour: the list is taken from the `definition` member of
 * action->conf (cast to struct rte_flow_action_vxlan_encap); each item up
 * to RTE_FLOW_ITEM_TYPE_END is passed to a generic mlx5_flow_validate_item_*()
 * helper and then to the matching flow_tcf_validate_vxlan_encap_*()
 * helper, and a layer bit is OR-ed into item_flags. After the loop the
 * outer L3, outer UDP and VXLAN bits must all be set, otherwise EINVAL is
 * reported against the action.
 * NOTE(review): numbering gaps (e.g. 1576, 1578, 1580-1581, 1588, 1596,
 * 1599-1601, 1603-1604, 1606-1607, 1649-1650, 1655-1656, 1672-1675) hide
 * the declaration of ret, the NULL checks preceding the two "Missing ..."
 * errors, the remaining call arguments, the `ret` checks, the break
 * statements, the default label and the final return - confirm against
 * the complete file.
 */
1574 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1575 struct rte_flow_error *error)
1577 const struct rte_flow_item *items;
1579 uint32_t item_flags = 0;
1582 return rte_flow_error_set(error, EINVAL,
1583 RTE_FLOW_ERROR_TYPE_ACTION, action,
1584 "Missing vxlan tunnel"
1585 " action configuration");
1586 items = ((const struct rte_flow_action_vxlan_encap *)
1587 action->conf)->definition;
1589 return rte_flow_error_set(error, EINVAL,
1590 RTE_FLOW_ERROR_TYPE_ACTION, action,
1591 "Missing vxlan tunnel"
1592 " encapsulation parameters");
/* Walk the encapsulation header definition item by item. */
1593 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1594 switch (items->type) {
1595 case RTE_FLOW_ITEM_TYPE_VOID:
1597 case RTE_FLOW_ITEM_TYPE_ETH:
1598 ret = mlx5_flow_validate_item_eth(items, item_flags,
1602 ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1605 item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1608 case RTE_FLOW_ITEM_TYPE_IPV4:
1609 ret = mlx5_flow_validate_item_ipv4
1611 &flow_tcf_mask_supported.ipv4, error);
1614 ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1617 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1619 case RTE_FLOW_ITEM_TYPE_IPV6:
1620 ret = mlx5_flow_validate_item_ipv6
1622 &flow_tcf_mask_supported.ipv6, error);
1625 ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1628 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1630 case RTE_FLOW_ITEM_TYPE_UDP:
1631 ret = mlx5_flow_validate_item_udp(items, item_flags,
1635 ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1638 item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1640 case RTE_FLOW_ITEM_TYPE_VXLAN:
1641 ret = mlx5_flow_validate_item_vxlan(items,
1645 ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1648 item_flags |= MLX5_FLOW_LAYER_VXLAN;
1651 return rte_flow_error_set
1653 RTE_FLOW_ERROR_TYPE_ITEM, items,
1654 "vxlan encap item not supported");
/* The definition must contain an outer IP layer, a UDP layer and a VNI. */
1657 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1658 return rte_flow_error_set(error, EINVAL,
1659 RTE_FLOW_ERROR_TYPE_ACTION, action,
1660 "no outer IP layer found"
1661 " for vxlan encapsulation");
1662 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1663 return rte_flow_error_set(error, EINVAL,
1664 RTE_FLOW_ERROR_TYPE_ACTION, action,
1665 "no outer UDP layer found"
1666 " for vxlan encapsulation");
1667 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1668 return rte_flow_error_set(error, EINVAL,
1669 RTE_FLOW_ERROR_TYPE_ACTION, action,
1670 "no VXLAN VNI found"
1671 " for vxlan encapsulation");
1676 * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1677 * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1680 * Outer UDP layer item (if any, NULL otherwise).
1682 * Pointer to the error structure.
1685 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Checks the outer UDP pattern item of a rule that also contains a VXLAN
 * tunnel item.
 * Visible behaviour: EINVAL is reported for a missing port specification,
 * for a zero destination port in the spec and for an unspecified
 * destination port; ENOTSUP is reported for partial masks on dst_port and
 * src_port. rte_flow_item_udp_mask is used as the fallback mask.
 * NOTE(review): numbering gaps (e.g. 1690, 1693-1695, 1698, 1703, 1708,
 * 1714, 1717, 1722, 1726, 1730, 1734-1738) hide the guarding conditions,
 * the else branches and the final return - confirm against the complete
 * file.
 */
1688 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1689 struct rte_flow_error *error)
1691 const struct rte_flow_item_udp *spec = udp->spec;
1692 const struct rte_flow_item_udp *mask = udp->mask;
1696 * Specification for UDP ports cannot be empty
1697 * because it is required as decap parameter.
1699 return rte_flow_error_set(error, EINVAL,
1700 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1701 "NULL UDP port specification"
1702 " for VXLAN decapsulation");
/* Fallback to the default UDP mask; the guarding condition is omitted. */
1704 mask = &rte_flow_item_udp_mask;
/* Destination port: full mask required and the spec value must be non-zero. */
1705 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1706 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1707 return rte_flow_error_set
1709 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1710 "no support for partial mask on"
1711 " \"udp.hdr.dst_port\" field");
1712 if (!spec->hdr.dst_port)
1713 return rte_flow_error_set
1715 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1716 "zero decap local UDP port");
1718 return rte_flow_error_set(error, EINVAL,
1719 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1720 "outer UDP destination port must be "
1721 "specified for vxlan decapsulation");
/* Source port: only the mask shape is checked here. */
1723 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1724 if (mask->hdr.src_port != RTE_BE16(0xffff))
1725 return rte_flow_error_set
1727 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1728 "no support for partial mask on"
1729 " \"udp.hdr.src_port\" field");
/*
 * NOTE(review): the statement that emits the text below sits on an
 * omitted line (1730). The text says "encapsulation" although this
 * routine validates the decapsulation side - looks like a copy/paste
 * from the encap validator; confirm and adjust the wording if so.
 */
1731 "outer UDP local port cannot be "
1732 "forced for VXLAN encapsulation, "
1733 "parameter ignored");
1739 * Validate flow for E-Switch.
1742 * Pointer to the priv structure.
1744 * Pointer to the flow attributes.
1746 * Pointer to the list of items.
1747 * @param[in] actions
1748 * Pointer to the list of actions.
1750 * Pointer to the error structure.
1753 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Validates a flow rule (attributes, actions and pattern items) for
 * E-Switch.
 * Visible structure:
 *   1. build the port-ID/ifindex table with flow_tcf_build_ptoi_table()
 *      and validate the attributes with flow_tcf_validate_attributes();
 *   2. walk the action list, classifying each action into
 *      current_action_flag, running per-action and ordering checks and
 *      accumulating action_flags;
 *   3. walk the item list, running generic and TC-specific mask checks,
 *      accumulating item_flags and tracking the ether types implied by
 *      the items;
 *   4. cross-check action_flags against item_flags.
 * NOTE(review): this listing has many numbering gaps. Omitted lines
 * include the return type, braces, break/return statements, the
 * declarations wrapping the member lists below, the declaration of `ret`
 * and `i`, and several call arguments and conditions. Control flow
 * between visible statements must be confirmed against the complete
 * file.
 */
1756 flow_tcf_validate(struct rte_eth_dev *dev,
1757 const struct rte_flow_attr *attr,
1758 const struct rte_flow_item items[],
1759 const struct rte_flow_action actions[],
1760 struct rte_flow_error *error)
/*
 * The members below are accessed as spec.<name> and mask.<name> in the
 * item loop; the enclosing declaration lines are omitted from this
 * listing.
 */
1763 const struct rte_flow_item_port_id *port_id;
1764 const struct rte_flow_item_eth *eth;
1765 const struct rte_flow_item_vlan *vlan;
1766 const struct rte_flow_item_ipv4 *ipv4;
1767 const struct rte_flow_item_ipv6 *ipv6;
1768 const struct rte_flow_item_tcp *tcp;
1769 const struct rte_flow_item_udp *udp;
1770 const struct rte_flow_item_vxlan *vxlan;
/*
 * The members below are accessed as conf.<name> in the action loop; the
 * enclosing declaration lines are omitted from this listing.
 */
1773 const struct rte_flow_action_port_id *port_id;
1774 const struct rte_flow_action_jump *jump;
1775 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1776 const struct rte_flow_action_of_set_vlan_vid *
1778 const struct rte_flow_action_of_set_vlan_pcp *
1780 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1781 const struct rte_flow_action_set_ipv4 *set_ipv4;
1782 const struct rte_flow_action_set_ipv6 *set_ipv6;
1784 const struct rte_flow_item *outer_udp = NULL;
/*
 * ETH_P_ALL is the "not constrained yet" value of the three tracked
 * ether types: the conflict checks below only fire once a value differs
 * from ETH_P_ALL.
 */
1785 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1786 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1787 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1788 uint64_t item_flags = 0;
1789 uint64_t action_flags = 0;
1790 uint8_t next_protocol = 0xff;
1791 unsigned int tcm_ifindex = 0;
1792 uint8_t pedit_validated = 0;
1793 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1794 struct rte_eth_dev *port_id_dev = NULL;
/*
 * NOTE(review): no initializer for in_port_id_set is visible in this
 * listing, yet it is read in the PORT_ID item case (line 2017) - confirm
 * it is assigned on every path before that read.
 */
1795 bool in_port_id_set;
1798 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1799 PTOI_TABLE_SZ_MAX(dev)));
1800 ret = flow_tcf_validate_attributes(attr, error);
/* Pass over the actions: classify, validate, accumulate action_flags. */
1803 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1805 uint64_t current_action_flag = 0;
1807 switch (actions->type) {
1808 case RTE_FLOW_ACTION_TYPE_VOID:
1810 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1811 current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1814 conf.port_id = actions->conf;
1815 if (conf.port_id->original)
/*
 * Search ptoi for the requested port; the scan stops at the first entry
 * whose ifindex is zero, and landing on such an entry means "not found".
 */
1818 for (i = 0; ptoi[i].ifindex; ++i)
1819 if (ptoi[i].port_id == conf.port_id->id)
1821 if (!ptoi[i].ifindex)
1822 return rte_flow_error_set
1824 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1826 "missing data to convert port ID to"
/* Remember the destination device for the push-VLAN check at the end. */
1828 port_id_dev = &rte_eth_devices[conf.port_id->id];
1830 case RTE_FLOW_ACTION_TYPE_JUMP:
1831 current_action_flag = MLX5_FLOW_ACTION_JUMP;
1834 conf.jump = actions->conf;
/* The jump target group must be strictly greater than the rule's group. */
1835 if (attr->group >= conf.jump->group)
1836 return rte_flow_error_set
1838 RTE_FLOW_ERROR_TYPE_ACTION,
1840 "can jump only to a group forward");
1842 case RTE_FLOW_ACTION_TYPE_DROP:
1843 current_action_flag = MLX5_FLOW_ACTION_DROP;
1845 case RTE_FLOW_ACTION_TYPE_COUNT:
1847 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1848 current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1850 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1851 rte_be16_t ethertype;
1853 current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1856 conf.of_push_vlan = actions->conf;
1857 ethertype = conf.of_push_vlan->ethertype;
/* Only the 802.1Q and 802.1AD TPIDs are accepted for a VLAN push. */
1858 if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1859 ethertype != RTE_BE16(ETH_P_8021AD))
1860 return rte_flow_error_set
1862 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1863 "vlan push TPID must be "
1864 "802.1Q or 802.1AD");
1867 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
/* Setting VID/PCP is accepted only after a push-VLAN action was seen. */
1868 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1869 return rte_flow_error_set
1871 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1872 "vlan modify is not supported,"
1873 " set action must follow push action");
1874 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1876 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1877 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1878 return rte_flow_error_set
1880 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1881 "vlan modify is not supported,"
1882 " set action must follow push action");
1883 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1885 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1886 current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1888 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1889 ret = flow_tcf_validate_vxlan_encap(actions, error);
1892 current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1894 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1895 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1897 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1898 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1900 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1901 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1903 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1904 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1906 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1907 current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1909 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1910 current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1912 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1913 current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1915 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1916 current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1918 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1919 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1921 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1922 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1925 return rte_flow_error_set(error, ENOTSUP,
1926 RTE_FLOW_ERROR_TYPE_ACTION,
1928 "action not supported");
/*
 * Checks applied to every action after classification. The inner
 * condition of the first check is on an omitted line (1931); from the
 * message it presumably tests for a missing actions->conf - confirm.
 */
1930 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1932 return rte_flow_error_set
1934 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1936 "action configuration not set");
/*
 * The second operand of the condition below is on an omitted line
 * (1939); presumably it tests pedit_validated - confirm.
 */
1938 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1940 return rte_flow_error_set(error, ENOTSUP,
1941 RTE_FLOW_ERROR_TYPE_ACTION,
1943 "set actions should be "
1944 "listed successively");
/*
 * A non-pedit action arriving after at least one pedit action sets
 * pedit_validated.
 */
1945 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1946 (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1947 pedit_validated = 1;
/* At most one fate action and at most one VXLAN action per rule. */
1948 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1949 (action_flags & MLX5_TCF_FATE_ACTIONS))
1950 return rte_flow_error_set(error, EINVAL,
1951 RTE_FLOW_ERROR_TYPE_ACTION,
1953 "can't have multiple fate"
1955 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1956 (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1957 return rte_flow_error_set(error, EINVAL,
1958 RTE_FLOW_ERROR_TYPE_ACTION,
1960 "can't have multiple vxlan"
/* A VXLAN action is rejected when a VLAN action was already recorded. */
1962 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1963 (action_flags & MLX5_TCF_VLAN_ACTIONS))
1964 return rte_flow_error_set(error, ENOTSUP,
1965 RTE_FLOW_ERROR_TYPE_ACTION,
1967 "can't have vxlan and vlan"
1968 " actions in the same rule");
1969 action_flags |= current_action_flag;
/* Pass over the pattern items: validate masks, accumulate item_flags. */
1971 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1974 switch (items->type) {
1975 case RTE_FLOW_ITEM_TYPE_VOID:
1977 case RTE_FLOW_ITEM_TYPE_PORT_ID:
/* PORT_ID is rejected once a tunnel layer has been recorded. */
1978 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1979 return rte_flow_error_set
1981 RTE_FLOW_ERROR_TYPE_ITEM, items,
1982 "inner tunnel port id"
1983 " item is not supported");
1984 mask.port_id = flow_tcf_item_mask
1985 (items, &rte_flow_item_port_id_mask,
1986 &flow_tcf_mask_supported.port_id,
1987 &flow_tcf_mask_empty.port_id,
1988 sizeof(flow_tcf_mask_supported.port_id),
1992 if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1996 spec.port_id = items->spec;
1997 if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1998 return rte_flow_error_set
2000 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2002 "no support for partial mask on"
2004 if (!mask.port_id->id)
/* Same ptoi lookup as in the PORT_ID action case. */
2007 for (i = 0; ptoi[i].ifindex; ++i)
2008 if (ptoi[i].port_id == spec.port_id->id)
2010 if (!ptoi[i].ifindex)
2011 return rte_flow_error_set
2013 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2015 "missing data to convert port ID to"
/* Every PORT_ID item in one rule must resolve to the same ifindex. */
2017 if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2018 return rte_flow_error_set
2020 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2022 "cannot match traffic for"
2023 " several port IDs through"
2024 " a single flow rule");
2025 tcm_ifindex = ptoi[i].ifindex;
2028 case RTE_FLOW_ITEM_TYPE_ETH:
2029 ret = mlx5_flow_validate_item_eth(items, item_flags,
/* Items seen after a tunnel item are recorded as inner layers. */
2033 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2034 MLX5_FLOW_LAYER_INNER_L2 :
2035 MLX5_FLOW_LAYER_OUTER_L2;
2037 * Redundant check due to different supported mask.
2038 * Same for the rest of items.
2040 mask.eth = flow_tcf_item_mask
2041 (items, &rte_flow_item_eth_mask,
2042 &flow_tcf_mask_supported.eth,
2043 &flow_tcf_mask_empty.eth,
2044 sizeof(flow_tcf_mask_supported.eth),
2048 if (mask.eth->type && mask.eth->type !=
2050 return rte_flow_error_set
2052 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2054 "no support for partial mask on"
2056 assert(items->spec);
2057 spec.eth = items->spec;
/*
 * When the ether type is masked, it must agree with any type already
 * recorded for the same nesting level (inner or outer).
 */
2058 if (mask.eth->type &&
2059 (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2060 inner_etype != RTE_BE16(ETH_P_ALL) &&
2061 inner_etype != spec.eth->type)
2062 return rte_flow_error_set
2064 RTE_FLOW_ERROR_TYPE_ITEM,
2066 "inner eth_type conflict");
2067 if (mask.eth->type &&
2068 !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2069 outer_etype != RTE_BE16(ETH_P_ALL) &&
2070 outer_etype != spec.eth->type)
2071 return rte_flow_error_set
2073 RTE_FLOW_ERROR_TYPE_ITEM,
2075 "outer eth_type conflict");
/* Record the masked ether type for later items to be checked against. */
2076 if (mask.eth->type) {
2077 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2078 inner_etype = spec.eth->type;
2080 outer_etype = spec.eth->type;
2083 case RTE_FLOW_ITEM_TYPE_VLAN:
/* VLAN is rejected once a tunnel layer has been recorded. */
2084 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2085 return rte_flow_error_set
2087 RTE_FLOW_ERROR_TYPE_ITEM, items,
2089 " is not supported");
2090 ret = mlx5_flow_validate_item_vlan(items, item_flags,
2094 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2095 mask.vlan = flow_tcf_item_mask
2096 (items, &rte_flow_item_vlan_mask,
2097 &flow_tcf_mask_supported.vlan,
2098 &flow_tcf_mask_empty.vlan,
2099 sizeof(flow_tcf_mask_supported.vlan),
/*
 * The 0xe000 and 0x0fff parts of the TCI mask (named PCP and VID in the
 * error text) and the inner_type mask must each be either fully set or
 * fully clear.
 */
2103 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2104 (mask.vlan->tci & RTE_BE16(0xe000)) !=
2105 RTE_BE16(0xe000)) ||
2106 (mask.vlan->tci & RTE_BE16(0x0fff) &&
2107 (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2108 RTE_BE16(0x0fff)) ||
2109 (mask.vlan->inner_type &&
2110 mask.vlan->inner_type != RTE_BE16(0xffff)))
2111 return rte_flow_error_set
2113 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2115 "no support for partial masks on"
2116 " \"tci\" (PCP and VID parts) and"
2117 " \"inner_type\" fields");
/* A VLAN item pins the outer ether type to 802.1Q. */
2118 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2119 outer_etype != RTE_BE16(ETH_P_8021Q))
2120 return rte_flow_error_set
2122 RTE_FLOW_ERROR_TYPE_ITEM,
2124 "outer eth_type conflict,"
2126 outer_etype = RTE_BE16(ETH_P_8021Q);
2127 assert(items->spec);
2128 spec.vlan = items->spec;
/* A masked inner_type must agree with, and then becomes, vlan_etype. */
2129 if (mask.vlan->inner_type &&
2130 vlan_etype != RTE_BE16(ETH_P_ALL) &&
2131 vlan_etype != spec.vlan->inner_type)
2132 return rte_flow_error_set
2134 RTE_FLOW_ERROR_TYPE_ITEM,
2136 "vlan eth_type conflict");
2137 if (mask.vlan->inner_type)
2138 vlan_etype = spec.vlan->inner_type;
2140 case RTE_FLOW_ITEM_TYPE_IPV4:
2141 ret = mlx5_flow_validate_item_ipv4
2143 &flow_tcf_mask_supported.ipv4, error);
2146 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2147 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2148 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2149 mask.ipv4 = flow_tcf_item_mask
2150 (items, &rte_flow_item_ipv4_mask,
2151 &flow_tcf_mask_supported.ipv4,
2152 &flow_tcf_mask_empty.ipv4,
2153 sizeof(flow_tcf_mask_supported.ipv4),
/*
 * next_proto_id mask must be 0 or 0xff; when set, the spec value is
 * taken from the item (the assignment target is on an omitted line,
 * presumably next_protocol - confirm).
 */
2157 if (mask.ipv4->hdr.next_proto_id &&
2158 mask.ipv4->hdr.next_proto_id != 0xff)
2159 return rte_flow_error_set
2161 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2163 "no support for partial mask on"
2164 " \"hdr.next_proto_id\" field");
2165 else if (mask.ipv4->hdr.next_proto_id)
2167 ((const struct rte_flow_item_ipv4 *)
2168 (items->spec))->hdr.next_proto_id;
/*
 * An IPv4 item pins the ether type of the enclosing level to ETH_P_IP:
 * inner when inside a tunnel, the VLAN inner type when a VLAN item was
 * seen, otherwise the outer ether type.
 */
2169 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2170 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2171 inner_etype != RTE_BE16(ETH_P_IP))
2172 return rte_flow_error_set
2174 RTE_FLOW_ERROR_TYPE_ITEM,
2176 "inner eth_type conflict,"
2177 " IPv4 is required");
2178 inner_etype = RTE_BE16(ETH_P_IP);
2179 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2180 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2181 vlan_etype != RTE_BE16(ETH_P_IP))
2182 return rte_flow_error_set
2184 RTE_FLOW_ERROR_TYPE_ITEM,
2186 "vlan eth_type conflict,"
2187 " IPv4 is required");
2188 vlan_etype = RTE_BE16(ETH_P_IP);
2190 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2191 outer_etype != RTE_BE16(ETH_P_IP))
2192 return rte_flow_error_set
2194 RTE_FLOW_ERROR_TYPE_ITEM,
2196 "eth_type conflict,"
2197 " IPv4 is required");
2198 outer_etype = RTE_BE16(ETH_P_IP);
2201 case RTE_FLOW_ITEM_TYPE_IPV6:
2202 ret = mlx5_flow_validate_item_ipv6
2204 &flow_tcf_mask_supported.ipv6, error);
2207 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2208 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2209 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2210 mask.ipv6 = flow_tcf_item_mask
2211 (items, &rte_flow_item_ipv6_mask,
2212 &flow_tcf_mask_supported.ipv6,
2213 &flow_tcf_mask_empty.ipv6,
2214 sizeof(flow_tcf_mask_supported.ipv6),
/* Same handling as IPv4, using hdr.proto and ETH_P_IPV6. */
2218 if (mask.ipv6->hdr.proto &&
2219 mask.ipv6->hdr.proto != 0xff)
2220 return rte_flow_error_set
2222 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2224 "no support for partial mask on"
2225 " \"hdr.proto\" field");
2226 else if (mask.ipv6->hdr.proto)
2228 ((const struct rte_flow_item_ipv6 *)
2229 (items->spec))->hdr.proto;
2230 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2231 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2232 inner_etype != RTE_BE16(ETH_P_IPV6))
2233 return rte_flow_error_set
2235 RTE_FLOW_ERROR_TYPE_ITEM,
2237 "inner eth_type conflict,"
2238 " IPv6 is required");
2239 inner_etype = RTE_BE16(ETH_P_IPV6);
2240 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2241 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2242 vlan_etype != RTE_BE16(ETH_P_IPV6))
2243 return rte_flow_error_set
2245 RTE_FLOW_ERROR_TYPE_ITEM,
2247 "vlan eth_type conflict,"
2248 " IPv6 is required");
2249 vlan_etype = RTE_BE16(ETH_P_IPV6);
2251 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2252 outer_etype != RTE_BE16(ETH_P_IPV6))
2253 return rte_flow_error_set
2255 RTE_FLOW_ERROR_TYPE_ITEM,
2257 "eth_type conflict,"
2258 " IPv6 is required");
2259 outer_etype = RTE_BE16(ETH_P_IPV6);
2262 case RTE_FLOW_ITEM_TYPE_UDP:
2263 ret = mlx5_flow_validate_item_udp(items, item_flags,
2264 next_protocol, error);
2267 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2268 MLX5_FLOW_LAYER_INNER_L4_UDP :
2269 MLX5_FLOW_LAYER_OUTER_L4_UDP;
2270 mask.udp = flow_tcf_item_mask
2271 (items, &rte_flow_item_udp_mask,
2272 &flow_tcf_mask_supported.udp,
2273 &flow_tcf_mask_empty.udp,
2274 sizeof(flow_tcf_mask_supported.udp),
2279 * Save the presumed outer UDP item for extra check
2280 * if the tunnel item will be found later in the list.
/*
 * The statement guarded by this condition is on an omitted line (2283);
 * per the comment fragment above it presumably stores the item in
 * outer_udp - confirm.
 */
2282 if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2285 case RTE_FLOW_ITEM_TYPE_TCP:
2286 ret = mlx5_flow_validate_item_tcp
2289 &flow_tcf_mask_supported.tcp,
2293 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2294 MLX5_FLOW_LAYER_INNER_L4_TCP :
2295 MLX5_FLOW_LAYER_OUTER_L4_TCP;
2296 mask.tcp = flow_tcf_item_mask
2297 (items, &rte_flow_item_tcp_mask,
2298 &flow_tcf_mask_supported.tcp,
2299 &flow_tcf_mask_empty.tcp,
2300 sizeof(flow_tcf_mask_supported.tcp),
2305 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* A VXLAN item is rejected when an outer VLAN item was already seen. */
2306 if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2307 return rte_flow_error_set
2309 RTE_FLOW_ERROR_TYPE_ITEM, items,
2310 "vxlan tunnel over vlan"
2311 " is not supported");
2312 ret = mlx5_flow_validate_item_vxlan(items,
2316 item_flags |= MLX5_FLOW_LAYER_VXLAN;
2317 mask.vxlan = flow_tcf_item_mask
2318 (items, &rte_flow_item_vxlan_mask,
2319 &flow_tcf_mask_supported.vxlan,
2320 &flow_tcf_mask_empty.vxlan,
2321 sizeof(flow_tcf_mask_supported.vxlan), error);
/* Every one of the three VNI mask bytes must be 0xff. */
2324 if (mask.vxlan->vni[0] != 0xff ||
2325 mask.vxlan->vni[1] != 0xff ||
2326 mask.vxlan->vni[2] != 0xff)
2327 return rte_flow_error_set
2329 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2331 "no support for partial or "
2332 "empty mask on \"vxlan.vni\" field");
2334 * The VNI item assumes the VXLAN tunnel, it requires
2335 * at least the outer destination UDP port must be
2336 * specified without wildcards to allow kernel select
2337 * the virtual VXLAN device by port. Also outer IPv4
2338 * or IPv6 item must be specified (wilcards or even
2339 * zero mask are allowed) to let driver know the tunnel
2340 * IP version and process UDP traffic correctly.
/*
 * NOTE(review): the two errors below concern missing pattern items but
 * are reported with RTE_FLOW_ERROR_TYPE_ACTION - confirm whether an
 * ITEM error type was intended.
 */
2343 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2344 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2345 return rte_flow_error_set
2347 RTE_FLOW_ERROR_TYPE_ACTION,
2349 "no outer IP pattern found"
2350 " for vxlan tunnel");
2351 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2352 return rte_flow_error_set
2354 RTE_FLOW_ERROR_TYPE_ACTION,
2356 "no outer UDP pattern found"
2357 " for vxlan tunnel");
2359 * All items preceding the tunnel item become outer
2360 * ones and we should do extra validation for them
2361 * due to tc limitations for tunnel outer parameters.
2362 * Currently only outer UDP item requres extra check,
2363 * use the saved pointer instead of item list rescan.
2366 ret = flow_tcf_validate_vxlan_decap_udp
2370 /* Reset L4 protocol for inner parameters. */
2371 next_protocol = 0xff;
2374 return rte_flow_error_set(error, ENOTSUP,
2375 RTE_FLOW_ERROR_TYPE_ITEM,
2376 items, "item not supported");
/* Cross-checks between the accumulated action and item flags. */
2379 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2380 (action_flags & MLX5_FLOW_ACTION_DROP))
2381 return rte_flow_error_set(error, ENOTSUP,
2382 RTE_FLOW_ERROR_TYPE_ACTION,
2384 "set action is not compatible with "
2386 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2387 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2388 return rte_flow_error_set(error, ENOTSUP,
2389 RTE_FLOW_ERROR_TYPE_ACTION,
2391 "set action must be followed by "
/*
 * Header-rewrite actions require the matching outer layer in the
 * pattern. The opening line of each condition below is omitted from the
 * listing; presumably it tests action_flags against the bit set shown -
 * confirm.
 */
2394 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2395 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2396 return rte_flow_error_set(error, EINVAL,
2397 RTE_FLOW_ERROR_TYPE_ACTION,
2399 "no ipv4 item found in"
2403 (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2404 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2405 return rte_flow_error_set(error, EINVAL,
2406 RTE_FLOW_ERROR_TYPE_ACTION,
2408 "no ipv6 item found in"
2412 (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2414 (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2415 MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2416 return rte_flow_error_set(error, EINVAL,
2417 RTE_FLOW_ERROR_TYPE_ACTION,
2419 "no TCP/UDP item found in"
2423 * FW syndrome (0xA9C090):
2424 * set_flow_table_entry: push vlan action fte in fdb can ONLY be
2425 * forward to the uplink.
/*
 * port_id_dev is dereferenced only when the PORT_ID action flag is set;
 * it is assigned in the PORT_ID action case above.
 */
2427 if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2428 (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2429 ((struct mlx5_priv *)port_id_dev->data->dev_private)->representor)
2430 return rte_flow_error_set(error, ENOTSUP,
2431 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2432 "vlan push can only be applied"
2433 " when forwarding to uplink port");
2435 * FW syndrome (0x294609):
2436 * set_flow_table_entry: modify/pop/push actions in fdb flow table
2437 * are supported only while forwarding to vport.
2439 if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2440 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2441 return rte_flow_error_set(error, ENOTSUP,
2442 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2443 "vlan actions are supported"
2444 " only with port_id action");
2445 if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2446 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2447 return rte_flow_error_set(error, ENOTSUP,
2448 RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2449 "vxlan actions are supported"
2450 " only with port_id action");
/* Every rule needs at least one fate action. */
2451 if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2452 return rte_flow_error_set(error, EINVAL,
2453 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2454 "no fate action is found");
2456 (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2458 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2459 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2460 return rte_flow_error_set(error, EINVAL,
2461 RTE_FLOW_ERROR_TYPE_ACTION,
2463 "no IP found in pattern");
/*
 * NOTE(review): this check reports ENOTSUP while the sibling
 * "no ... found in pattern" checks above report EINVAL - confirm whether
 * the difference is intended.
 */
2466 (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2467 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2468 return rte_flow_error_set(error, ENOTSUP,
2469 RTE_FLOW_ERROR_TYPE_ACTION,
2471 "no ethernet found in"
/* VXLAN decap needs a VXLAN item; VXLAN encap forbids any tunnel item. */
2474 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2475 !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2476 return rte_flow_error_set(error, EINVAL,
2477 RTE_FLOW_ERROR_TYPE_ACTION,
2479 "no VNI pattern found"
2480 " for vxlan decap action");
2481 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2482 (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2483 return rte_flow_error_set(error, EINVAL,
2484 RTE_FLOW_ERROR_TYPE_ACTION,
2486 "vxlan encap not supported"
2487 " for tunneled traffic");
2492 * Calculate maximum size of memory for flow items of Linux TC flower.
2495 * Pointer to the flow attributes.
2497 * Pointer to the list of items.
2498 * @param[out] action_flags
2499 * Pointer to the detected actions.
2502 * Maximum size of memory for items.
2505 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2506 const struct rte_flow_item items[],
2507 uint64_t *action_flags)
2511 size += SZ_NLATTR_STRZ_OF("flower") +
2512 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2513 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2514 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2515 if (attr->group > 0)
2516 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2517 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2518 switch (items->type) {
2519 case RTE_FLOW_ITEM_TYPE_VOID:
2521 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2523 case RTE_FLOW_ITEM_TYPE_ETH:
2524 size += SZ_NLATTR_DATA_OF(RTE_ETHER_ADDR_LEN) * 4;
2525 /* dst/src MAC addr and mask. */
2527 case RTE_FLOW_ITEM_TYPE_VLAN:
2528 size += SZ_NLATTR_TYPE_OF(uint16_t) +
2529 /* VLAN Ether type. */
2530 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2531 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2533 case RTE_FLOW_ITEM_TYPE_IPV4: {
2534 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2536 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2537 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2538 /* dst/src IP addr and mask. */
2539 if (ipv4 && ipv4->hdr.time_to_live)
2540 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2541 if (ipv4 && ipv4->hdr.type_of_service)
2542 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2545 case RTE_FLOW_ITEM_TYPE_IPV6: {
2546 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2548 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2549 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2550 /* dst/src IP addr and mask. */
2551 if (ipv6 && ipv6->hdr.hop_limits)
2552 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2553 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2554 (0xfful << IPV6_HDR_TC_SHIFT)))
2555 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2558 case RTE_FLOW_ITEM_TYPE_UDP:
2559 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2560 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2561 /* dst/src port and mask. */
2563 case RTE_FLOW_ITEM_TYPE_TCP:
2564 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2565 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2566 /* dst/src port and mask. */
2568 case RTE_FLOW_ITEM_TYPE_VXLAN:
2569 size += SZ_NLATTR_TYPE_OF(uint32_t);
2571 * There might be no VXLAN decap action in the action
2572 * list, nonetheless the VXLAN tunnel flow requires
2573 * the decap structure to be correctly applied to
2574 * VXLAN device, set the flag to create the structure.
2575 * Translation routine will not put the decap action
2576 * in tne Netlink message if there is no actual action
2579 *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2583 "unsupported item %p type %d,"
2584 " items must be validated before flow creation",
2585 (const void *)items, items->type);
2593 * Calculate size of memory to store the VXLAN encapsultion
2594 * related items in the Netlink message buffer. Items list
2595 * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2596 * The item list should be validated.
2599 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2600 * List of pattern items to scan data from.
2603 * The size the part of Netlink message buffer to store the
2604 * VXLAN encapsulation item attributes.
2607 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2609 const struct rte_flow_item *items;
2612 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2613 assert(action->conf);
2615 items = ((const struct rte_flow_action_vxlan_encap *)
2616 action->conf)->definition;
2618 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2619 switch (items->type) {
2620 case RTE_FLOW_ITEM_TYPE_VOID:
2622 case RTE_FLOW_ITEM_TYPE_ETH:
2623 /* This item does not require message buffer. */
2625 case RTE_FLOW_ITEM_TYPE_IPV4: {
2626 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2628 size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2629 if (ipv4 && ipv4->hdr.time_to_live)
2630 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2631 if (ipv4 && ipv4->hdr.type_of_service)
2632 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2635 case RTE_FLOW_ITEM_TYPE_IPV6: {
2636 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2638 size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2639 if (ipv6 && ipv6->hdr.hop_limits)
2640 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2641 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2642 (0xfful << IPV6_HDR_TC_SHIFT)))
2643 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2646 case RTE_FLOW_ITEM_TYPE_UDP: {
2647 const struct rte_flow_item_udp *udp = items->mask;
2649 size += SZ_NLATTR_TYPE_OF(uint16_t);
2650 if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2651 size += SZ_NLATTR_TYPE_OF(uint16_t);
2654 case RTE_FLOW_ITEM_TYPE_VXLAN:
2655 size += SZ_NLATTR_TYPE_OF(uint32_t);
2660 "unsupported item %p type %d,"
2661 " items must be validated"
2662 " before flow creation",
2663 (const void *)items, items->type);
2671 * Calculate maximum size of memory for flow actions of Linux TC flower and
2672 * extract specified actions.
2674 * @param[in] actions
2675 * Pointer to the list of actions.
2676 * @param[out] action_flags
2677 * Pointer to the detected actions.
2680 * Maximum size of memory for actions.
2683 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2684 uint64_t *action_flags)
2687 uint64_t flags = *action_flags;
2689 size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2690 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2691 switch (actions->type) {
2692 case RTE_FLOW_ACTION_TYPE_VOID:
2694 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2695 size += SZ_NLATTR_NEST + /* na_act_index. */
2696 SZ_NLATTR_STRZ_OF("mirred") +
2697 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2698 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2699 flags |= MLX5_FLOW_ACTION_PORT_ID;
2701 case RTE_FLOW_ACTION_TYPE_JUMP:
2702 size += SZ_NLATTR_NEST + /* na_act_index. */
2703 SZ_NLATTR_STRZ_OF("gact") +
2704 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2705 SZ_NLATTR_TYPE_OF(struct tc_gact);
2706 flags |= MLX5_FLOW_ACTION_JUMP;
2708 case RTE_FLOW_ACTION_TYPE_DROP:
2709 size += SZ_NLATTR_NEST + /* na_act_index. */
2710 SZ_NLATTR_STRZ_OF("gact") +
2711 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2712 SZ_NLATTR_TYPE_OF(struct tc_gact);
2713 flags |= MLX5_FLOW_ACTION_DROP;
2715 case RTE_FLOW_ACTION_TYPE_COUNT:
2717 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2718 flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2719 goto action_of_vlan;
2720 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2721 flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2722 goto action_of_vlan;
2723 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2724 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2725 goto action_of_vlan;
2726 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2727 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2728 goto action_of_vlan;
2730 size += SZ_NLATTR_NEST + /* na_act_index. */
2731 SZ_NLATTR_STRZ_OF("vlan") +
2732 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2733 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2734 SZ_NLATTR_TYPE_OF(uint16_t) +
2735 /* VLAN protocol. */
2736 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2737 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2739 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2740 size += SZ_NLATTR_NEST + /* na_act_index. */
2741 SZ_NLATTR_STRZ_OF("tunnel_key") +
2742 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2743 SZ_NLATTR_TYPE_OF(uint8_t);
2744 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2745 size += flow_tcf_vxlan_encap_size(actions) +
2746 RTE_ALIGN_CEIL /* preceding encap params. */
2747 (sizeof(struct flow_tcf_vxlan_encap),
2749 flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2751 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2752 size += SZ_NLATTR_NEST + /* na_act_index. */
2753 SZ_NLATTR_STRZ_OF("tunnel_key") +
2754 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2755 SZ_NLATTR_TYPE_OF(uint8_t);
2756 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2757 size += RTE_ALIGN_CEIL /* preceding decap params. */
2758 (sizeof(struct flow_tcf_vxlan_decap),
2760 flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2762 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2763 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2764 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2765 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2766 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2767 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2768 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2769 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2770 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2771 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2772 size += flow_tcf_get_pedit_actions_size(&actions,
2777 "unsupported action %p type %d,"
2778 " items must be validated before flow creation",
2779 (const void *)actions, actions->type);
2783 *action_flags = flags;
2788 * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2789 * memory required, allocates the memory, initializes Netlink message headers
2790 * and set unique TC message handle.
2793 * Pointer to the flow attributes.
2795 * Pointer to the list of items.
2796 * @param[in] actions
2797 * Pointer to the list of actions.
2799 * Pointer to the error structure.
2802 * Pointer to mlx5_flow object on success,
2803 * otherwise NULL and rte_errno is set.
2805 static struct mlx5_flow *
2806 flow_tcf_prepare(const struct rte_flow_attr *attr,
2807 const struct rte_flow_item items[],
2808 const struct rte_flow_action actions[],
2809 struct rte_flow_error *error)
2811 size_t size = RTE_ALIGN_CEIL
2812 (sizeof(struct mlx5_flow),
2813 alignof(struct flow_tcf_tunnel_hdr)) +
2814 MNL_ALIGN(sizeof(struct nlmsghdr)) +
2815 MNL_ALIGN(sizeof(struct tcmsg));
2816 struct mlx5_flow *dev_flow;
2817 uint64_t action_flags = 0;
2818 struct nlmsghdr *nlh;
2820 uint8_t *sp, *tun = NULL;
2822 size += flow_tcf_get_items_size(attr, items, &action_flags);
2823 size += flow_tcf_get_actions_and_size(actions, &action_flags);
2824 dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2826 rte_flow_error_set(error, ENOMEM,
2827 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2828 "not enough memory to create E-Switch flow");
2831 sp = (uint8_t *)(dev_flow + 1);
2832 if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2834 (sp, alignof(struct flow_tcf_tunnel_hdr));
2836 sp += RTE_ALIGN_CEIL
2837 (sizeof(struct flow_tcf_vxlan_encap),
2840 size -= RTE_ALIGN_CEIL
2841 (sizeof(struct flow_tcf_vxlan_encap),
2844 } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2846 (sp, alignof(struct flow_tcf_tunnel_hdr));
2848 sp += RTE_ALIGN_CEIL
2849 (sizeof(struct flow_tcf_vxlan_decap),
2852 size -= RTE_ALIGN_CEIL
2853 (sizeof(struct flow_tcf_vxlan_decap),
2857 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2859 nlh = mnl_nlmsg_put_header(sp);
2860 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2861 *dev_flow = (struct mlx5_flow){
2862 .tcf = (struct mlx5_flow_tcf){
2864 .nlsize = size - RTE_ALIGN_CEIL
2865 (sizeof(struct mlx5_flow),
2866 alignof(struct flow_tcf_tunnel_hdr)),
2868 .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2873 if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2874 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2875 else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2876 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2881 * Make adjustments for supporting count actions.
2884 * Pointer to the Ethernet device structure.
2885 * @param[in] dev_flow
2886 * Pointer to mlx5_flow.
2888 * Pointer to error structure.
2891 * 0 On success else a negative errno value is returned and rte_errno is set.
2894 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2895 struct mlx5_flow *dev_flow,
2896 struct rte_flow_error *error)
2898 struct rte_flow *flow = dev_flow->flow;
2900 if (!flow->counter) {
2901 flow->counter = flow_tcf_counter_new();
2903 return rte_flow_error_set(error, rte_errno,
2904 RTE_FLOW_ERROR_TYPE_ACTION,
2906 "cannot get counter"
2913 * Convert VXLAN VNI to 32-bit integer.
2916 * VXLAN VNI in 24-bit wire format.
2919 * VXLAN VNI as a 32-bit integer value in network endianness.
2921 static inline rte_be32_t
2922 vxlan_vni_as_be32(const uint8_t vni[3])
2928 .vni = { 0, vni[0], vni[1], vni[2] },
2934 * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2935 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2936 * in the encapsulation parameters structure. The item must be prevalidated,
2937 * no any validation checks performed by function.
2940 * RTE_FLOW_ITEM_TYPE_ETH entry specification.
2942 * RTE_FLOW_ITEM_TYPE_ETH entry mask.
2944 * Structure to fill the gathered MAC address data.
2947 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2948 const struct rte_flow_item_eth *mask,
2949 struct flow_tcf_vxlan_encap *encap)
2951 /* Item must be validated before. No redundant checks. */
2953 if (!mask || !memcmp(&mask->dst,
2954 &rte_flow_item_eth_mask.dst,
2955 sizeof(rte_flow_item_eth_mask.dst))) {
2957 * Ethernet addresses are not supported by
2958 * tc as tunnel_key parameters. Destination
2959 * address is needed to form encap packet
2960 * header and retrieved by kernel from
2961 * implicit sources (ARP table, etc),
2962 * address masks are not supported at all.
2964 encap->eth.dst = spec->dst;
2965 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2967 if (!mask || !memcmp(&mask->src,
2968 &rte_flow_item_eth_mask.src,
2969 sizeof(rte_flow_item_eth_mask.src))) {
2971 * Ethernet addresses are not supported by
2972 * tc as tunnel_key parameters. Source ethernet
2973 * address is ignored anyway.
2975 encap->eth.src = spec->src;
2976 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2981 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2982 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2983 * in the encapsulation parameters structure. The item must be prevalidated,
2984 * no any validation checks performed by function.
2987 * RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2989 * RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
2991 * Structure to fill the gathered IPV4 address data.
2994 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2995 const struct rte_flow_item_ipv4 *mask,
2996 struct flow_tcf_vxlan_encap *encap)
2998 /* Item must be validated before. No redundant checks. */
3000 encap->ipv4.dst = spec->hdr.dst_addr;
3001 encap->ipv4.src = spec->hdr.src_addr;
3002 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
3003 FLOW_TCF_ENCAP_IPV4_DST;
3004 if (mask && mask->hdr.type_of_service) {
3005 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3006 encap->ip_tos = spec->hdr.type_of_service;
3008 if (mask && mask->hdr.time_to_live) {
3009 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3010 encap->ip_ttl_hop = spec->hdr.time_to_live;
3015 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3016 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3017 * in the encapsulation parameters structure. The item must be prevalidated,
3018 * no any validation checks performed by function.
3021 * RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3023 * RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3025 * Structure to fill the gathered IPV6 address data.
3028 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3029 const struct rte_flow_item_ipv6 *mask,
3030 struct flow_tcf_vxlan_encap *encap)
3032 /* Item must be validated before. No redundant checks. */
3034 memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3035 memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3036 encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3037 FLOW_TCF_ENCAP_IPV6_DST;
3039 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3040 IPV6_HDR_TC_SHIFT) & 0xff) {
3041 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3042 encap->ip_tos = (rte_be_to_cpu_32
3043 (spec->hdr.vtc_flow) >>
3044 IPV6_HDR_TC_SHIFT) & 0xff;
3046 if (mask->hdr.hop_limits) {
3047 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3048 encap->ip_ttl_hop = spec->hdr.hop_limits;
3054 * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3055 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3056 * in the encapsulation parameters structure. The item must be prevalidated,
3057 * no any validation checks performed by function.
3060 * RTE_FLOW_ITEM_TYPE_UDP entry specification.
3062 * RTE_FLOW_ITEM_TYPE_UDP entry mask.
3064 * Structure to fill the gathered UDP port data.
3067 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3068 const struct rte_flow_item_udp *mask,
3069 struct flow_tcf_vxlan_encap *encap)
3072 encap->udp.dst = spec->hdr.dst_port;
3073 encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3074 if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3075 encap->udp.src = spec->hdr.src_port;
3076 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC;
3081 * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3082 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3083 * in the encapsulation parameters structure. The item must be prevalidated,
3084 * no any validation checks performed by function.
3087 * RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3089 * Structure to fill the gathered VNI address data.
3092 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3093 struct flow_tcf_vxlan_encap *encap)
3095 /* Item must be validated before. Do not redundant checks. */
3097 memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3098 encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3102 * Populate consolidated encapsulation object from list of pattern items.
3104 * Helper function to process configuration of action such as
3105 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3106 * validated, there is no way to return an meaningful error.
3109 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3110 * List of pattern items to gather data from.
3112 * Structure to fill gathered data.
3115 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3116 struct flow_tcf_vxlan_encap *encap)
3119 const struct rte_flow_item_eth *eth;
3120 const struct rte_flow_item_ipv4 *ipv4;
3121 const struct rte_flow_item_ipv6 *ipv6;
3122 const struct rte_flow_item_udp *udp;
3123 const struct rte_flow_item_vxlan *vxlan;
3125 const struct rte_flow_item *items;
3127 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3128 assert(action->conf);
3130 items = ((const struct rte_flow_action_vxlan_encap *)
3131 action->conf)->definition;
3133 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3134 switch (items->type) {
3135 case RTE_FLOW_ITEM_TYPE_VOID:
3137 case RTE_FLOW_ITEM_TYPE_ETH:
3138 mask.eth = items->mask;
3139 spec.eth = items->spec;
3140 flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3143 case RTE_FLOW_ITEM_TYPE_IPV4:
3144 spec.ipv4 = items->spec;
3145 mask.ipv4 = items->mask;
3146 flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3149 case RTE_FLOW_ITEM_TYPE_IPV6:
3150 spec.ipv6 = items->spec;
3151 mask.ipv6 = items->mask;
3152 flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3155 case RTE_FLOW_ITEM_TYPE_UDP:
3156 mask.udp = items->mask;
3157 spec.udp = items->spec;
3158 flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3161 case RTE_FLOW_ITEM_TYPE_VXLAN:
3162 spec.vxlan = items->spec;
3163 flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3168 "unsupported item %p type %d,"
3169 " items must be validated"
3170 " before flow creation",
3171 (const void *)items, items->type);
3179 * Translate flow for Linux TC flower and construct Netlink message.
3182 * Pointer to the priv structure.
3183 * @param[in, out] flow
3184 * Pointer to the sub flow.
3186 * Pointer to the flow attributes.
3188 * Pointer to the list of items.
3189 * @param[in] actions
3190 * Pointer to the list of actions.
3192 * Pointer to the error structure.
3195 * 0 on success, a negative errno value otherwise and rte_errno is set.
3198 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3199 const struct rte_flow_attr *attr,
3200 const struct rte_flow_item items[],
3201 const struct rte_flow_action actions[],
3202 struct rte_flow_error *error)
3205 const struct rte_flow_item_port_id *port_id;
3206 const struct rte_flow_item_eth *eth;
3207 const struct rte_flow_item_vlan *vlan;
3208 const struct rte_flow_item_ipv4 *ipv4;
3209 const struct rte_flow_item_ipv6 *ipv6;
3210 const struct rte_flow_item_tcp *tcp;
3211 const struct rte_flow_item_udp *udp;
3212 const struct rte_flow_item_vxlan *vxlan;
3215 const struct rte_flow_action_port_id *port_id;
3216 const struct rte_flow_action_jump *jump;
3217 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3218 const struct rte_flow_action_of_set_vlan_vid *
3220 const struct rte_flow_action_of_set_vlan_pcp *
3224 struct flow_tcf_tunnel_hdr *hdr;
3225 struct flow_tcf_vxlan_decap *vxlan;
3230 struct flow_tcf_tunnel_hdr *hdr;
3231 struct flow_tcf_vxlan_encap *vxlan;
3235 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3236 struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3237 struct tcmsg *tcm = dev_flow->tcf.tcm;
3238 uint32_t na_act_index_cur;
3239 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3240 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3241 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3242 bool ip_proto_set = 0;
3243 bool tunnel_outer = 0;
3244 struct nlattr *na_flower;
3245 struct nlattr *na_flower_act;
3246 struct nlattr *na_vlan_id = NULL;
3247 struct nlattr *na_vlan_priority = NULL;
3248 uint64_t item_flags = 0;
3251 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3252 PTOI_TABLE_SZ_MAX(dev)));
3253 if (dev_flow->tcf.tunnel) {
3254 switch (dev_flow->tcf.tunnel->type) {
3255 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3256 decap.vxlan = dev_flow->tcf.vxlan_decap;
3259 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3260 encap.vxlan = dev_flow->tcf.vxlan_encap;
3262 /* New tunnel actions can be added here. */
3268 nlh = dev_flow->tcf.nlh;
3269 tcm = dev_flow->tcf.tcm;
3270 /* Prepare API must have been called beforehand. */
3271 assert(nlh != NULL && tcm != NULL);
3272 tcm->tcm_family = AF_UNSPEC;
3273 tcm->tcm_ifindex = ptoi[0].ifindex;
3274 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3276 * Priority cannot be zero to prevent the kernel from picking one
3279 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3280 if (attr->group > 0)
3281 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3282 mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3283 na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3284 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3287 switch (items->type) {
3288 case RTE_FLOW_ITEM_TYPE_VOID:
3290 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3291 mask.port_id = flow_tcf_item_mask
3292 (items, &rte_flow_item_port_id_mask,
3293 &flow_tcf_mask_supported.port_id,
3294 &flow_tcf_mask_empty.port_id,
3295 sizeof(flow_tcf_mask_supported.port_id),
3297 assert(mask.port_id);
3298 if (mask.port_id == &flow_tcf_mask_empty.port_id)
3300 spec.port_id = items->spec;
3301 if (!mask.port_id->id)
3304 for (i = 0; ptoi[i].ifindex; ++i)
3305 if (ptoi[i].port_id == spec.port_id->id)
3307 assert(ptoi[i].ifindex);
3308 tcm->tcm_ifindex = ptoi[i].ifindex;
3310 case RTE_FLOW_ITEM_TYPE_ETH:
3311 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3312 MLX5_FLOW_LAYER_INNER_L2 :
3313 MLX5_FLOW_LAYER_OUTER_L2;
3314 mask.eth = flow_tcf_item_mask
3315 (items, &rte_flow_item_eth_mask,
3316 &flow_tcf_mask_supported.eth,
3317 &flow_tcf_mask_empty.eth,
3318 sizeof(flow_tcf_mask_supported.eth),
3321 if (mask.eth == &flow_tcf_mask_empty.eth)
3323 spec.eth = items->spec;
3324 if (mask.eth->type) {
3325 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3326 inner_etype = spec.eth->type;
3328 outer_etype = spec.eth->type;
3332 "outer L2 addresses cannot be"
3333 " forced is outer ones for tunnel,"
3334 " parameter is ignored");
3337 if (!rte_is_zero_ether_addr(&mask.eth->dst)) {
3338 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3340 spec.eth->dst.addr_bytes);
3341 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3343 mask.eth->dst.addr_bytes);
3345 if (!rte_is_zero_ether_addr(&mask.eth->src)) {
3346 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3348 spec.eth->src.addr_bytes);
3349 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3351 mask.eth->src.addr_bytes);
3353 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3355 case RTE_FLOW_ITEM_TYPE_VLAN:
3358 assert(!tunnel_outer);
3359 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3360 mask.vlan = flow_tcf_item_mask
3361 (items, &rte_flow_item_vlan_mask,
3362 &flow_tcf_mask_supported.vlan,
3363 &flow_tcf_mask_empty.vlan,
3364 sizeof(flow_tcf_mask_supported.vlan),
3367 if (mask.vlan == &flow_tcf_mask_empty.vlan)
3369 spec.vlan = items->spec;
3370 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3371 outer_etype == RTE_BE16(ETH_P_8021Q));
3372 outer_etype = RTE_BE16(ETH_P_8021Q);
3373 if (mask.vlan->inner_type)
3374 vlan_etype = spec.vlan->inner_type;
3375 if (mask.vlan->tci & RTE_BE16(0xe000))
3376 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3378 (spec.vlan->tci) >> 13) & 0x7);
3379 if (mask.vlan->tci & RTE_BE16(0x0fff))
3380 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3384 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3386 case RTE_FLOW_ITEM_TYPE_IPV4:
3387 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3388 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3389 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3390 mask.ipv4 = flow_tcf_item_mask
3391 (items, &rte_flow_item_ipv4_mask,
3392 &flow_tcf_mask_supported.ipv4,
3393 &flow_tcf_mask_empty.ipv4,
3394 sizeof(flow_tcf_mask_supported.ipv4),
3397 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3398 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3399 inner_etype == RTE_BE16(ETH_P_IP));
3400 inner_etype = RTE_BE16(ETH_P_IP);
3401 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3402 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3403 vlan_etype == RTE_BE16(ETH_P_IP));
3404 vlan_etype = RTE_BE16(ETH_P_IP);
3406 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3407 outer_etype == RTE_BE16(ETH_P_IP));
3408 outer_etype = RTE_BE16(ETH_P_IP);
3410 spec.ipv4 = items->spec;
3411 if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3413 * No way to set IP protocol for outer tunnel
3414 * layers. Usually it is fixed, for example,
3415 * to UDP for VXLAN/GPE.
3417 assert(spec.ipv4); /* Mask is not empty. */
3418 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3419 spec.ipv4->hdr.next_proto_id);
3422 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3423 (!mask.ipv4->hdr.src_addr &&
3424 !mask.ipv4->hdr.dst_addr)) {
3428 * For tunnel outer we must set outer IP key
3429 * anyway, even if the specification/mask is
3430 * empty. There is no another way to tell
3431 * kernel about the outer layer protocol.
3434 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3435 mask.ipv4->hdr.src_addr);
3437 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3438 mask.ipv4->hdr.src_addr);
3439 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3442 if (mask.ipv4->hdr.src_addr) {
3444 (nlh, tunnel_outer ?
3445 TCA_FLOWER_KEY_ENC_IPV4_SRC :
3446 TCA_FLOWER_KEY_IPV4_SRC,
3447 spec.ipv4->hdr.src_addr);
3449 (nlh, tunnel_outer ?
3450 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3451 TCA_FLOWER_KEY_IPV4_SRC_MASK,
3452 mask.ipv4->hdr.src_addr);
3454 if (mask.ipv4->hdr.dst_addr) {
3456 (nlh, tunnel_outer ?
3457 TCA_FLOWER_KEY_ENC_IPV4_DST :
3458 TCA_FLOWER_KEY_IPV4_DST,
3459 spec.ipv4->hdr.dst_addr);
3461 (nlh, tunnel_outer ?
3462 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3463 TCA_FLOWER_KEY_IPV4_DST_MASK,
3464 mask.ipv4->hdr.dst_addr);
3466 if (mask.ipv4->hdr.time_to_live) {
3468 (nlh, tunnel_outer ?
3469 TCA_FLOWER_KEY_ENC_IP_TTL :
3470 TCA_FLOWER_KEY_IP_TTL,
3471 spec.ipv4->hdr.time_to_live);
3473 (nlh, tunnel_outer ?
3474 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3475 TCA_FLOWER_KEY_IP_TTL_MASK,
3476 mask.ipv4->hdr.time_to_live);
3478 if (mask.ipv4->hdr.type_of_service) {
3480 (nlh, tunnel_outer ?
3481 TCA_FLOWER_KEY_ENC_IP_TOS :
3482 TCA_FLOWER_KEY_IP_TOS,
3483 spec.ipv4->hdr.type_of_service);
3485 (nlh, tunnel_outer ?
3486 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3487 TCA_FLOWER_KEY_IP_TOS_MASK,
3488 mask.ipv4->hdr.type_of_service);
3490 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3492 case RTE_FLOW_ITEM_TYPE_IPV6: {
3493 bool ipv6_src, ipv6_dst;
3496 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3497 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3498 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3499 mask.ipv6 = flow_tcf_item_mask
3500 (items, &rte_flow_item_ipv6_mask,
3501 &flow_tcf_mask_supported.ipv6,
3502 &flow_tcf_mask_empty.ipv6,
3503 sizeof(flow_tcf_mask_supported.ipv6),
3506 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3507 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3508 inner_etype == RTE_BE16(ETH_P_IPV6));
3509 inner_etype = RTE_BE16(ETH_P_IPV6);
3510 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3511 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3512 vlan_etype == RTE_BE16(ETH_P_IPV6));
3513 vlan_etype = RTE_BE16(ETH_P_IPV6);
3515 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3516 outer_etype == RTE_BE16(ETH_P_IPV6));
3517 outer_etype = RTE_BE16(ETH_P_IPV6);
3519 spec.ipv6 = items->spec;
3520 if (!tunnel_outer && mask.ipv6->hdr.proto) {
3522 * No way to set IP protocol for outer tunnel
3523 * layers. Usually it is fixed, for example,
3524 * to UDP for VXLAN/GPE.
3526 assert(spec.ipv6); /* Mask is not empty. */
3527 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3528 spec.ipv6->hdr.proto);
3531 ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3532 (mask.ipv6->hdr.dst_addr);
3533 ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3534 (mask.ipv6->hdr.src_addr);
3535 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3536 (!ipv6_dst && !ipv6_src)) {
3540 * For tunnel outer we must set outer IP key
3541 * anyway, even if the specification/mask is
3542 * empty. There is no another way to tell
3543 * kernel about the outer layer protocol.
3546 TCA_FLOWER_KEY_ENC_IPV6_SRC,
3548 mask.ipv6->hdr.src_addr);
3550 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3552 mask.ipv6->hdr.src_addr);
3553 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3557 mnl_attr_put(nlh, tunnel_outer ?
3558 TCA_FLOWER_KEY_ENC_IPV6_SRC :
3559 TCA_FLOWER_KEY_IPV6_SRC,
3561 spec.ipv6->hdr.src_addr);
3562 mnl_attr_put(nlh, tunnel_outer ?
3563 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3564 TCA_FLOWER_KEY_IPV6_SRC_MASK,
3566 mask.ipv6->hdr.src_addr);
3569 mnl_attr_put(nlh, tunnel_outer ?
3570 TCA_FLOWER_KEY_ENC_IPV6_DST :
3571 TCA_FLOWER_KEY_IPV6_DST,
3573 spec.ipv6->hdr.dst_addr);
3574 mnl_attr_put(nlh, tunnel_outer ?
3575 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3576 TCA_FLOWER_KEY_IPV6_DST_MASK,
3578 mask.ipv6->hdr.dst_addr);
3580 if (mask.ipv6->hdr.hop_limits) {
3582 (nlh, tunnel_outer ?
3583 TCA_FLOWER_KEY_ENC_IP_TTL :
3584 TCA_FLOWER_KEY_IP_TTL,
3585 spec.ipv6->hdr.hop_limits);
3587 (nlh, tunnel_outer ?
3588 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3589 TCA_FLOWER_KEY_IP_TTL_MASK,
3590 mask.ipv6->hdr.hop_limits);
3592 msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3593 IPV6_HDR_TC_SHIFT) & 0xff;
3595 tos6 = (rte_be_to_cpu_32
3596 (spec.ipv6->hdr.vtc_flow) >>
3597 IPV6_HDR_TC_SHIFT) & 0xff;
3599 (nlh, tunnel_outer ?
3600 TCA_FLOWER_KEY_ENC_IP_TOS :
3601 TCA_FLOWER_KEY_IP_TOS, tos6);
3603 (nlh, tunnel_outer ?
3604 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3605 TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3607 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3610 case RTE_FLOW_ITEM_TYPE_UDP:
3611 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3612 MLX5_FLOW_LAYER_INNER_L4_UDP :
3613 MLX5_FLOW_LAYER_OUTER_L4_UDP;
3614 mask.udp = flow_tcf_item_mask
3615 (items, &rte_flow_item_udp_mask,
3616 &flow_tcf_mask_supported.udp,
3617 &flow_tcf_mask_empty.udp,
3618 sizeof(flow_tcf_mask_supported.udp),
3621 spec.udp = items->spec;
3622 if (!tunnel_outer) {
3625 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3627 if (mask.udp == &flow_tcf_mask_empty.udp)
3630 assert(mask.udp != &flow_tcf_mask_empty.udp);
3631 decap.vxlan->udp_port =
3633 (spec.udp->hdr.dst_port);
3635 if (mask.udp->hdr.src_port) {
3637 (nlh, tunnel_outer ?
3638 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3639 TCA_FLOWER_KEY_UDP_SRC,
3640 spec.udp->hdr.src_port);
3642 (nlh, tunnel_outer ?
3643 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3644 TCA_FLOWER_KEY_UDP_SRC_MASK,
3645 mask.udp->hdr.src_port);
3647 if (mask.udp->hdr.dst_port) {
3649 (nlh, tunnel_outer ?
3650 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3651 TCA_FLOWER_KEY_UDP_DST,
3652 spec.udp->hdr.dst_port);
3654 (nlh, tunnel_outer ?
3655 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3656 TCA_FLOWER_KEY_UDP_DST_MASK,
3657 mask.udp->hdr.dst_port);
3659 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3661 case RTE_FLOW_ITEM_TYPE_TCP:
3662 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3663 MLX5_FLOW_LAYER_INNER_L4_TCP :
3664 MLX5_FLOW_LAYER_OUTER_L4_TCP;
3665 mask.tcp = flow_tcf_item_mask
3666 (items, &rte_flow_item_tcp_mask,
3667 &flow_tcf_mask_supported.tcp,
3668 &flow_tcf_mask_empty.tcp,
3669 sizeof(flow_tcf_mask_supported.tcp),
3673 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3675 if (mask.tcp == &flow_tcf_mask_empty.tcp)
3677 spec.tcp = items->spec;
3678 if (mask.tcp->hdr.src_port) {
3679 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3680 spec.tcp->hdr.src_port);
3681 mnl_attr_put_u16(nlh,
3682 TCA_FLOWER_KEY_TCP_SRC_MASK,
3683 mask.tcp->hdr.src_port);
3685 if (mask.tcp->hdr.dst_port) {
3686 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3687 spec.tcp->hdr.dst_port);
3688 mnl_attr_put_u16(nlh,
3689 TCA_FLOWER_KEY_TCP_DST_MASK,
3690 mask.tcp->hdr.dst_port);
3692 if (mask.tcp->hdr.tcp_flags) {
3695 TCA_FLOWER_KEY_TCP_FLAGS,
3697 (spec.tcp->hdr.tcp_flags));
3700 TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3702 (mask.tcp->hdr.tcp_flags));
3704 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3706 case RTE_FLOW_ITEM_TYPE_VXLAN:
3707 assert(decap.vxlan);
3709 item_flags |= MLX5_FLOW_LAYER_VXLAN;
3710 spec.vxlan = items->spec;
3711 mnl_attr_put_u32(nlh,
3712 TCA_FLOWER_KEY_ENC_KEY_ID,
3713 vxlan_vni_as_be32(spec.vxlan->vni));
3714 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3717 return rte_flow_error_set(error, ENOTSUP,
3718 RTE_FLOW_ERROR_TYPE_ITEM,
3719 NULL, "item not supported");
3723 * Set the ether_type flower key and tc rule protocol:
3724 * - if there is neither VLAN nor VXLAN the key is taken from
3725 * the eth item directly or deduced from L3 items.
3726 * - if there is vlan item then key is fixed to 802.1q.
3727 * - if there is vxlan item then key is set to inner tunnel type.
3728 * - simultaneous vlan and vxlan items are prohibited.
3730 if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3731 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3733 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3734 if (inner_etype != RTE_BE16(ETH_P_ALL))
3735 mnl_attr_put_u16(nlh,
3736 TCA_FLOWER_KEY_ETH_TYPE,
3739 mnl_attr_put_u16(nlh,
3740 TCA_FLOWER_KEY_ETH_TYPE,
3742 if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3743 vlan_etype != RTE_BE16(ETH_P_ALL))
3744 mnl_attr_put_u16(nlh,
3745 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3748 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3750 na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3751 na_act_index_cur = 1;
3752 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3753 struct nlattr *na_act_index;
3754 struct nlattr *na_act;
3755 unsigned int vlan_act;
3758 switch (actions->type) {
3759 case RTE_FLOW_ACTION_TYPE_VOID:
3761 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3762 conf.port_id = actions->conf;
3763 if (conf.port_id->original)
3766 for (i = 0; ptoi[i].ifindex; ++i)
3767 if (ptoi[i].port_id == conf.port_id->id)
3769 assert(ptoi[i].ifindex);
3771 mnl_attr_nest_start(nlh, na_act_index_cur++);
3772 assert(na_act_index);
3773 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3774 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3777 assert(dev_flow->tcf.tunnel);
3778 dev_flow->tcf.tunnel->ifindex_ptr =
3779 &((struct tc_mirred *)
3780 mnl_attr_get_payload
3781 (mnl_nlmsg_get_payload_tail
3783 } else if (decap.hdr) {
3784 assert(dev_flow->tcf.tunnel);
3785 dev_flow->tcf.tunnel->ifindex_ptr =
3786 (unsigned int *)&tcm->tcm_ifindex;
3788 mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3789 sizeof(struct tc_mirred),
3790 &(struct tc_mirred){
3791 .action = TC_ACT_STOLEN,
3792 .eaction = TCA_EGRESS_REDIR,
3793 .ifindex = ptoi[i].ifindex,
3795 mnl_attr_nest_end(nlh, na_act);
3796 mnl_attr_nest_end(nlh, na_act_index);
3798 case RTE_FLOW_ACTION_TYPE_JUMP:
3799 conf.jump = actions->conf;
3801 mnl_attr_nest_start(nlh, na_act_index_cur++);
3802 assert(na_act_index);
3803 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3804 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3806 mnl_attr_put(nlh, TCA_GACT_PARMS,
3807 sizeof(struct tc_gact),
3809 .action = TC_ACT_GOTO_CHAIN |
3812 mnl_attr_nest_end(nlh, na_act);
3813 mnl_attr_nest_end(nlh, na_act_index);
3815 case RTE_FLOW_ACTION_TYPE_DROP:
3817 mnl_attr_nest_start(nlh, na_act_index_cur++);
3818 assert(na_act_index);
3819 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3820 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3822 mnl_attr_put(nlh, TCA_GACT_PARMS,
3823 sizeof(struct tc_gact),
3825 .action = TC_ACT_SHOT,
3827 mnl_attr_nest_end(nlh, na_act);
3828 mnl_attr_nest_end(nlh, na_act_index);
3830 case RTE_FLOW_ACTION_TYPE_COUNT:
3832 * Driver adds the count action implicitly for
3833 * each rule it creates.
3835 ret = flow_tcf_translate_action_count(dev,
3840 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3841 conf.of_push_vlan = NULL;
3842 vlan_act = TCA_VLAN_ACT_POP;
3843 goto action_of_vlan;
3844 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3845 conf.of_push_vlan = actions->conf;
3846 vlan_act = TCA_VLAN_ACT_PUSH;
3847 goto action_of_vlan;
3848 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3849 conf.of_set_vlan_vid = actions->conf;
3851 goto override_na_vlan_id;
3852 vlan_act = TCA_VLAN_ACT_MODIFY;
3853 goto action_of_vlan;
3854 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3855 conf.of_set_vlan_pcp = actions->conf;
3856 if (na_vlan_priority)
3857 goto override_na_vlan_priority;
3858 vlan_act = TCA_VLAN_ACT_MODIFY;
3859 goto action_of_vlan;
3862 mnl_attr_nest_start(nlh, na_act_index_cur++);
3863 assert(na_act_index);
3864 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3865 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3867 mnl_attr_put(nlh, TCA_VLAN_PARMS,
3868 sizeof(struct tc_vlan),
3870 .action = TC_ACT_PIPE,
3871 .v_action = vlan_act,
3873 if (vlan_act == TCA_VLAN_ACT_POP) {
3874 mnl_attr_nest_end(nlh, na_act);
3875 mnl_attr_nest_end(nlh, na_act_index);
3878 if (vlan_act == TCA_VLAN_ACT_PUSH)
3879 mnl_attr_put_u16(nlh,
3880 TCA_VLAN_PUSH_VLAN_PROTOCOL,
3881 conf.of_push_vlan->ethertype);
3882 na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3883 mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3884 na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3885 mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3886 mnl_attr_nest_end(nlh, na_act);
3887 mnl_attr_nest_end(nlh, na_act_index);
3888 if (actions->type ==
3889 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3890 override_na_vlan_id:
3891 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3892 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3894 (conf.of_set_vlan_vid->vlan_vid);
3895 } else if (actions->type ==
3896 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3897 override_na_vlan_priority:
3898 na_vlan_priority->nla_type =
3899 TCA_VLAN_PUSH_VLAN_PRIORITY;
3900 *(uint8_t *)mnl_attr_get_payload
3901 (na_vlan_priority) =
3902 conf.of_set_vlan_pcp->vlan_pcp;
3905 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3906 assert(decap.vxlan);
3907 assert(dev_flow->tcf.tunnel);
3908 dev_flow->tcf.tunnel->ifindex_ptr =
3909 (unsigned int *)&tcm->tcm_ifindex;
3911 mnl_attr_nest_start(nlh, na_act_index_cur++);
3912 assert(na_act_index);
3913 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3914 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3916 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3917 sizeof(struct tc_tunnel_key),
3918 &(struct tc_tunnel_key){
3919 .action = TC_ACT_PIPE,
3920 .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3922 mnl_attr_nest_end(nlh, na_act);
3923 mnl_attr_nest_end(nlh, na_act_index);
3924 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3926 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3927 assert(encap.vxlan);
3928 flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3930 mnl_attr_nest_start(nlh, na_act_index_cur++);
3931 assert(na_act_index);
3932 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3933 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3935 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3936 sizeof(struct tc_tunnel_key),
3937 &(struct tc_tunnel_key){
3938 .action = TC_ACT_PIPE,
3939 .t_action = TCA_TUNNEL_KEY_ACT_SET,
3941 if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3942 mnl_attr_put_u16(nlh,
3943 TCA_TUNNEL_KEY_ENC_DST_PORT,
3944 encap.vxlan->udp.dst);
3945 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3946 mnl_attr_put_u32(nlh,
3947 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3948 encap.vxlan->ipv4.src);
3949 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3950 mnl_attr_put_u32(nlh,
3951 TCA_TUNNEL_KEY_ENC_IPV4_DST,
3952 encap.vxlan->ipv4.dst);
3953 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3955 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3956 sizeof(encap.vxlan->ipv6.src),
3957 &encap.vxlan->ipv6.src);
3958 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3960 TCA_TUNNEL_KEY_ENC_IPV6_DST,
3961 sizeof(encap.vxlan->ipv6.dst),
3962 &encap.vxlan->ipv6.dst);
3963 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3964 mnl_attr_put_u8(nlh,
3965 TCA_TUNNEL_KEY_ENC_TTL,
3966 encap.vxlan->ip_ttl_hop);
3967 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3968 mnl_attr_put_u8(nlh,
3969 TCA_TUNNEL_KEY_ENC_TOS,
3970 encap.vxlan->ip_tos);
3971 if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3972 mnl_attr_put_u32(nlh,
3973 TCA_TUNNEL_KEY_ENC_KEY_ID,
3975 (encap.vxlan->vxlan.vni));
3976 mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3977 mnl_attr_nest_end(nlh, na_act);
3978 mnl_attr_nest_end(nlh, na_act_index);
3979 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3981 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3982 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3983 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3984 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3985 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3986 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3987 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3988 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3989 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3990 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3992 mnl_attr_nest_start(nlh, na_act_index_cur++);
3993 flow_tcf_create_pedit_mnl_msg(nlh,
3994 &actions, item_flags);
3995 mnl_attr_nest_end(nlh, na_act_index);
3998 return rte_flow_error_set(error, ENOTSUP,
3999 RTE_FLOW_ERROR_TYPE_ACTION,
4001 "action not supported");
4005 assert(na_flower_act);
4006 mnl_attr_nest_end(nlh, na_flower_act);
4007 dev_flow->tcf.ptc_flags = mnl_attr_get_payload
4008 (mnl_nlmsg_get_payload_tail(nlh));
4009 mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
4010 0 : TCA_CLS_FLAGS_SKIP_SW);
4011 mnl_attr_nest_end(nlh, na_flower);
4012 if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4013 dev_flow->tcf.tunnel->ifindex_org =
4014 *dev_flow->tcf.tunnel->ifindex_ptr;
4015 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4020 * Send Netlink message with acknowledgment.
4023 * Flow context to use.
4025 * Message to send. This function always raises the NLM_F_ACK flag before
4028 * Callback handler for received message.
4030 * Context pointer for callback handler.
4033 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Send nlh on the tcf->nl socket with NLM_F_ACK raised, then read the
 * replies into tcf->buf and hand each of them to mnl_cb_run() together
 * with the caller supplied cb/arg.
 * NOTE(review): the embedded line numbers skip values, so some statements
 * of this function are not shown here; comments cover visible lines only.
 */
4036 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4037 struct nlmsghdr *nlh,
4038 mnl_cb_t cb, void *arg)
/* Both values are passed to mnl_cb_run() below along with each reply. */
4040 unsigned int portid = mnl_socket_get_portid(tcf->nl);
4041 uint32_t seq = tcf->seq++;
4047 /* seq 0 is reserved for kernel event-driven notifications. */
/* Stamp the request with the chosen sequence number and ask for an ACK. */
4050 nlh->nlmsg_seq = seq;
4051 nlh->nlmsg_flags |= NLM_F_ACK;
4052 ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4054 /* Message send error occurred. */
/* From here on nlh refers to the receive buffer, not to the request. */
4058 nlh = (struct nlmsghdr *)(tcf->buf);
4060 * The following loop postpones non-fatal errors until multipart
4061 * messages are complete.
4064 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4068 * In case of overflow Will receive till
4069 * end of multipart message. We may lost part
4070 * of reply messages but mark and return an error.
/*
 * The condition below holds unless the error is ENOSPC on a multipart
 * reply that has not reached NLMSG_DONE yet.
 */
4072 if (err != ENOSPC ||
4073 !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4074 nlh->nlmsg_type == NLMSG_DONE)
/* ret (bytes received) is reused as the length argument of mnl_cb_run(). */
4077 ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4080 * libmnl returns 0 if DONE or
4081 * success ACK message found.
4087 * ACK message with error found
4088 * or some error occurred.
4093 /* We should continue receiving. */
/* Extra buffer space constant; its use is not visible in this part of the file. */
4102 #define MNL_BUF_EXTRA_SPACE 16
/* Lower and upper bounds applied to MNL_REQUEST_SIZE below. */
4103 #define MNL_REQUEST_SIZE_MIN 256
4104 #define MNL_REQUEST_SIZE_MAX 2048
/*
 * Command buffer size: the system page size limited to the range
 * [MNL_REQUEST_SIZE_MIN, MNL_REQUEST_SIZE_MAX] (assuming RTE_MIN/RTE_MAX
 * are plain minimum/maximum helpers). It initializes the .bufsize member
 * of the callback contexts used by the cleanup routines.
 */
4105 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4106 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
4108 /* Data structures used by flow_tcf_xxx_cb() routines. */
/*
 * One element of the list of buffers holding prepared netlink commands.
 * The commands are stored in the trailing flexible array msg[], which is
 * aligned as struct nlmsghdr.
 * NOTE(review): flow_tcf_alloc_nlcmd() and flow_tcf_send_nlcmd() also read
 * a "size" member of this structure; its declaration is on a line that is
 * not shown in this listing.
 */
4109 struct tcf_nlcb_buf {
4110 LIST_ENTRY(tcf_nlcb_buf) next;
4112 alignas(struct nlmsghdr)
4113 uint8_t msg[]; /**< Netlink message data. */
/*
 * Context passed as the opaque argument to the flow_tcf_collect_*_cb()
 * callbacks: the interface index replies are compared against and the
 * list of buffers with the commands collected so far.
 * NOTE(review): users of this structure also set and read a "bufsize"
 * member; its declaration is on a line that is not shown in this listing.
 */
4116 struct tcf_nlcb_context {
4117 unsigned int ifindex; /**< Base interface index. */
4119 LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4123 * Allocate space for netlink command in buffer list
4125 * @param[in, out] ctx
4126 * Pointer to callback context with command buffers list.
4128 * Required size of data buffer to be allocated.
4131 * Pointer to allocated memory, aligned as message header.
4132 * NULL if some error occurred.
/*
 * Reserve room for one netlink command of "size" bytes in the list of
 * command buffers kept in ctx and return a pointer to that room.
 * NOTE(review): the embedded line numbers skip values, so some statements
 * (returns, size bookkeeping) are not shown in this listing.
 */
4134 static struct nlmsghdr *
4135 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4137 struct tcf_nlcb_buf *buf;
4138 struct nlmsghdr *nlh;
/* Round the request up to netlink message alignment before any check. */
4140 size = NLMSG_ALIGN(size);
/* Reuse the buffer at the list head when the aligned request still fits. */
4141 buf = LIST_FIRST(&ctx->nlbuf);
4142 if (buf && (buf->size + size) <= ctx->bufsize) {
4143 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
/* A request larger than a whole buffer cannot be placed in any buffer. */
4147 if (size > ctx->bufsize) {
4148 DRV_LOG(WARNING, "netlink: too long command buffer requested");
/*
 * Allocate a new buffer with ctx->bufsize bytes of message space after
 * the header structure.
 */
4151 buf = rte_malloc(__func__,
4152 ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4153 alignof(struct tcf_nlcb_buf));
4155 DRV_LOG(WARNING, "netlink: no memory for command buffer");
/* The new buffer becomes the list head, so the next request tries it first. */
4158 LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4160 nlh = (struct nlmsghdr *)&buf->msg[0];
4165 * Send the buffers with prepared netlink commands. Scans the list and
4166 * sends all found buffers. Buffers are sent and freed anyway in order
4167 * to prevent memory leakage if some error occurs.
4170 * Context object initialized by mlx5_flow_tcf_context_create().
4171 * @param[in, out] ctx
4172 * Pointer to callback context with command buffers list.
4175 * Zero value on success, negative errno value otherwise
4176 * and rte_errno is set.
/*
 * Walk the buffers collected in ctx->nlbuf and send every command stored
 * in them through flow_tcf_nl_ack(), one command per call.
 * NOTE(review): the embedded line numbers skip values, so the outer loop,
 * the release of buffers and the return statement are not shown here.
 */
4179 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4180 struct tcf_nlcb_context *ctx)
4182 struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
/*
 * The successor is read before the commands of the current buffer are
 * processed (presumably so the current buffer can be released afterwards;
 * the release itself is not visible in this listing).
 */
4186 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4187 struct nlmsghdr *nlh;
/* msg is the byte offset of the next command inside bc->msg[]. */
4191 while (msg < bc->size) {
4193 * Send Netlink commands from buffer in one by one
4194 * fashion. If we send multiple rule deletion commands
4195 * in one Netlink message and some error occurs it may
4196 * cause multiple ACK error messages and break sequence
4197 * numbers of Netlink communication, because we expect
4198 * the only one ACK reply.
4200 assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4201 nlh = (struct nlmsghdr *)&bc->msg[msg];
4202 assert((bc->size - msg) >= nlh->nlmsg_len);
/* Advance by the header's own length to reach the next packed command. */
4203 msg += nlh->nlmsg_len;
/* No callback is supplied: only the acknowledgment result is of interest. */
4204 rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4207 "netlink: cleanup error %d", rc);
/* Leave the caller's context with an empty buffer list. */
4215 LIST_INIT(&ctx->nlbuf);
4220 * Collect local IP address rules with scope link attribute on specified
4221 * network device. This is callback routine called by libmnl mnl_cb_run()
4222 * in loop for every message in received packet.
4225 * Pointer to reply header.
4226 * @param[in, out] arg
4227 * Opaque data pointer for this callback.
4230 * A positive, nonzero value on success, negative errno value otherwise
4231 * and rte_errno is set.
/*
 * mnl_cb_run() callback for an address dump. For a reply that passes the
 * filters below, an RTM_DELADDR command with the same local and peer
 * address is stored in the command buffers of the context given in arg.
 * NOTE(review): the embedded line numbers skip values, so the return
 * statements and the attribute switch cases are not shown in this listing.
 */
4234 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4236 struct tcf_nlcb_context *ctx = arg;
4237 struct nlmsghdr *cmd;
4238 struct ifaddrmsg *ifa;
4240 struct nlattr *na_local = NULL;
4241 struct nlattr *na_peer = NULL;
4242 unsigned char family;
4245 if (nlh->nlmsg_type != RTM_NEWADDR) {
4249 ifa = mnl_nlmsg_get_payload(nlh);
4250 family = ifa->ifa_family;
/*
 * Filter: the condition is true for replies that belong to another
 * interface, have a scope other than link, lack the permanent flag or
 * are neither IPv4 nor IPv6.
 */
4251 if (ifa->ifa_index != ctx->ifindex ||
4252 ifa->ifa_scope != RT_SCOPE_LINK ||
4253 !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4254 (family != AF_INET && family != AF_INET6))
/* Scan the attributes that follow the ifaddrmsg header. */
4256 mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4257 switch (mnl_attr_get_type(na)) {
4265 if (na_local && na_peer)
/* Both the local and the peer attribute are needed to build the command. */
4268 if (!na_local || !na_peer)
4270 /* Local rule found with scope link, permanent and assigned peer. */
/*
 * Exact size of the delete command: message header, ifaddrmsg and two
 * address attributes of the matching family. The assert at the end checks
 * the built message against this value.
 */
4271 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4272 MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4273 (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4274 : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4275 cmd = flow_tcf_alloc_nlcmd(ctx, size);
/* Build the RTM_DELADDR command in the reserved space; ifa now points into it. */
4280 cmd = mnl_nlmsg_put_header(cmd);
4281 cmd->nlmsg_type = RTM_DELADDR;
4282 cmd->nlmsg_flags = NLM_F_REQUEST;
4283 ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4284 ifa->ifa_flags = IFA_F_PERMANENT;
4285 ifa->ifa_scope = RT_SCOPE_LINK;
4286 ifa->ifa_index = ctx->ifindex;
/* Copy local and peer addresses from the reply, using a full-length prefix. */
4287 if (family == AF_INET) {
4288 ifa->ifa_family = AF_INET;
4289 ifa->ifa_prefixlen = 32;
4290 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4291 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4293 ifa->ifa_family = AF_INET6;
4294 ifa->ifa_prefixlen = 128;
4295 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4296 mnl_attr_get_payload(na_local));
4297 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4298 mnl_attr_get_payload(na_peer));
4300 assert(size == cmd->nlmsg_len);
4305 * Cleanup the local IP addresses on outer interface.
4308 * Context object initialized by mlx5_flow_tcf_context_create().
4309 * @param[in] ifindex
4310 * Network interface index to perform cleanup.
/*
 * Dump the addresses known to the kernel, let flow_tcf_collect_local_cb()
 * queue delete commands for the matching ones, then send those commands.
 * Failures are only logged as warnings in the visible code.
 * NOTE(review): the embedded line numbers skip values, so some lines
 * (for example the conditions guarding the log calls) are not shown here.
 */
4313 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4314 unsigned int ifindex)
4316 struct nlmsghdr *nlh;
4317 struct ifaddrmsg *ifa;
4318 struct tcf_nlcb_context ctx = {
4320 .bufsize = MNL_REQUEST_SIZE,
4321 .nlbuf = LIST_HEAD_INITIALIZER(),
4327 * Seek and destroy leftovers of local IP addresses with
4328 * matching properties "scope link".
/* Build an RTM_GETADDR dump request in the context's own buffer. */
4330 nlh = mnl_nlmsg_put_header(tcf->buf);
4331 nlh->nlmsg_type = RTM_GETADDR;
4332 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4333 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4334 ifa->ifa_family = AF_UNSPEC;
4335 ifa->ifa_index = ifindex;
4336 ifa->ifa_scope = RT_SCOPE_LINK;
/* The callback fills ctx with one delete command per matching reply. */
4337 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
/*
 * NOTE(review): both log texts below mention "device" although this
 * request dumps addresses; the wording looks copied from the interface
 * cleanup routine. Runtime strings are left untouched.
 */
4339 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4340 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4342 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4346 * Collect neigh permanent rules on specified network device.
4347 * This is callback routine called by libmnl mnl_cb_run() in loop for
4348 * every message in received packet.
4351 * Pointer to reply header.
4352 * @param[in, out] arg
4353 * Opaque data pointer for this callback.
4356 * A positive, nonzero value on success, negative errno value otherwise
4357 * and rte_errno is set.
/*
 * mnl_cb_run() callback for a neighbour dump. For a reply that passes the
 * filters below, an RTM_DELNEIGH command with the same IP and link-layer
 * address is stored in the command buffers of the context given in arg.
 * NOTE(review): the embedded line numbers skip values, so the return
 * statements, some declarations and the attribute switch cases are not
 * shown in this listing.
 */
4360 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4362 struct tcf_nlcb_context *ctx = arg;
4363 struct nlmsghdr *cmd;
4366 struct nlattr *na_ip = NULL;
4367 struct nlattr *na_mac = NULL;
4368 unsigned char family;
4371 if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4375 ndm = mnl_nlmsg_get_payload(nlh);
4376 family = ndm->ndm_family;
/*
 * Filter: the condition is true for entries of another interface, entries
 * without the permanent state bit and entries that are neither IPv4 nor
 * IPv6.
 */
4377 if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4378 !(ndm->ndm_state & NUD_PERMANENT) ||
4379 (family != AF_INET && family != AF_INET6))
/* Scan the attributes that follow the ndmsg header. */
4381 mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4382 switch (mnl_attr_get_type(na)) {
4390 if (na_mac && na_ip)
/* Both the IP and the link-layer attribute are needed to build the command. */
4393 if (!na_mac || !na_ip)
4395 /* Neigh rule with permanent attribute found. */
/*
 * Exact size of the delete command: message header, ndmsg, one link-layer
 * address attribute and one IP address attribute of the matching family.
 * The assert at the end checks the built message against this value.
 */
4396 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4397 MNL_ALIGN(sizeof(struct ndmsg)) +
4398 SZ_NLATTR_DATA_OF(RTE_ETHER_ADDR_LEN) +
4399 (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4400 : SZ_NLATTR_TYPE_OF(uint32_t));
4401 cmd = flow_tcf_alloc_nlcmd(ctx, size);
/* Build the RTM_DELNEIGH command in the reserved space; ndm now points into it. */
4406 cmd = mnl_nlmsg_put_header(cmd);
4407 cmd->nlmsg_type = RTM_DELNEIGH;
4408 cmd->nlmsg_flags = NLM_F_REQUEST;
4409 ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4410 ndm->ndm_ifindex = ctx->ifindex;
4411 ndm->ndm_state = NUD_PERMANENT;
/* Copy the destination IP address from the reply. */
4414 if (family == AF_INET) {
4415 ndm->ndm_family = AF_INET;
4416 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4418 ndm->ndm_family = AF_INET6;
4419 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4420 mnl_attr_get_payload(na_ip));
/* The link-layer address is copied for both families. */
4422 mnl_attr_put(cmd, NDA_LLADDR, RTE_ETHER_ADDR_LEN,
4423 mnl_attr_get_payload(na_mac));
4424 assert(size == cmd->nlmsg_len);
4429 * Cleanup the neigh rules on outer interface.
4432 * Context object initialized by mlx5_flow_tcf_context_create().
4433 * @param[in] ifindex
4434 * Network interface index to perform cleanup.
/*
 * Dump the neighbour entries known to the kernel, let
 * flow_tcf_collect_neigh_cb() queue delete commands for the matching
 * ones, then send those commands. Failures are only logged as warnings
 * in the visible code.
 * NOTE(review): the embedded line numbers skip values, so some lines
 * (for example the conditions guarding the log calls) are not shown here.
 */
4437 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4438 unsigned int ifindex)
4440 struct nlmsghdr *nlh;
4442 struct tcf_nlcb_context ctx = {
4444 .bufsize = MNL_REQUEST_SIZE,
4445 .nlbuf = LIST_HEAD_INITIALIZER(),
4450 /* Seek and destroy leftovers of neigh rules. */
/* Build an RTM_GETNEIGH dump request in the context's own buffer. */
4451 nlh = mnl_nlmsg_put_header(tcf->buf);
4452 nlh->nlmsg_type = RTM_GETNEIGH;
4453 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4454 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4455 ndm->ndm_family = AF_UNSPEC;
4456 ndm->ndm_ifindex = ifindex;
4457 ndm->ndm_state = NUD_PERMANENT;
/* The callback fills ctx with one delete command per matching reply. */
4458 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
/*
 * NOTE(review): both log texts below mention "device" although this
 * request dumps neighbour entries; the wording looks copied from the
 * interface cleanup routine. Runtime strings are left untouched.
 */
4460 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4461 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4463 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4467 * Collect indices of VXLAN encap/decap interfaces associated with device.
4468 * This is callback routine called by libmnl mnl_cb_run() in loop for
4469 * every message in received packet.
4472 * Pointer to reply header.
4473 * @param[in, out] arg
4474 * Opaque data pointer for this callback.
4477 * A positive, nonzero value on success, negative errno value otherwise
4478 * and rte_errno is set.
/*
 * mnl_cb_run() callback for a link dump. For a reply describing a "vxlan"
 * link whose IFLA_VXLAN_LINK attribute equals ctx->ifindex, an RTM_DELLINK
 * command for that link is stored in the command buffers of the context
 * given in arg.
 * NOTE(review): the embedded line numbers skip values, so the return
 * statements, some declarations and assignments are not shown here.
 */
4481 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4483 struct tcf_nlcb_context *ctx = arg;
4484 struct nlmsghdr *cmd;
4485 struct ifinfomsg *ifm;
4487 struct nlattr *na_info = NULL;
4488 struct nlattr *na_vxlan = NULL;
4490 unsigned int vxindex;
4493 if (nlh->nlmsg_type != RTM_NEWLINK) {
4497 ifm = mnl_nlmsg_get_payload(nlh);
4498 if (!ifm->ifi_index) {
/* Look for the IFLA_LINKINFO attribute among the top-level attributes. */
4502 mnl_attr_for_each(na, nlh, sizeof(*ifm))
4503 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
/* Inside the link info: check the kind string and look at the data nest. */
4509 mnl_attr_for_each_nested(na, na_info) {
4510 switch (mnl_attr_get_type(na)) {
4511 case IFLA_INFO_KIND:
/*
 * NOTE(review): the comparison bound is mnl_attr_get_len(), which libmnl
 * documents as header plus payload length, so it is larger than the
 * string itself - confirm the kind string is always NUL terminated.
 */
4512 if (!strncmp("vxlan", mnl_attr_get_str(na),
4513 mnl_attr_get_len(na)))
4516 case IFLA_INFO_DATA:
4520 if (found && na_vxlan)
/* Both a matching kind and the data nest are required to go on. */
4523 if (!found || !na_vxlan)
/* Search the vxlan data for a lower link equal to the base interface. */
4526 mnl_attr_for_each_nested(na, na_vxlan) {
4527 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4528 mnl_attr_get_u32(na) == ctx->ifindex) {
4535 /* Attached VXLAN device found, store the command to delete. */
/* Save the index now: ifm is re-pointed at the new command further down. */
4536 vxindex = ifm->ifi_index;
/* The delete command consists of the message header and ifinfomsg only. */
4537 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4538 MNL_ALIGN(sizeof(struct ifinfomsg));
4539 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4544 cmd = mnl_nlmsg_put_header(cmd);
4545 cmd->nlmsg_type = RTM_DELLINK;
4546 cmd->nlmsg_flags = NLM_F_REQUEST;
4547 ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4548 ifm->ifi_family = AF_UNSPEC;
4549 ifm->ifi_index = vxindex;
4550 assert(size == cmd->nlmsg_len);
4555 * Cleanup the outer interface. Removes all found vxlan devices
4556 * attached to specified index, flushes the neigh and local IP
4560 * Context object initialized by mlx5_flow_tcf_context_create().
4561 * @param[in] ifindex
4562 * Network interface index to perform cleanup.
/*
 * Dump all links known to the kernel, let flow_tcf_collect_vxlan_cb()
 * queue delete commands for the matching VXLAN links, then send those
 * commands. Failures are only logged as warnings in the visible code.
 * NOTE(review): the embedded line numbers skip values, so some statements
 * of this function are not shown in this listing.
 */
4565 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4566 unsigned int ifindex)
4568 struct nlmsghdr *nlh;
4569 struct ifinfomsg *ifm;
4570 struct tcf_nlcb_context ctx = {
4572 .bufsize = MNL_REQUEST_SIZE,
4573 .nlbuf = LIST_HEAD_INITIALIZER(),
4579 * Seek and destroy leftover VXLAN encap/decap interfaces with
4580 * matching properties.
/*
 * Build an RTM_GETLINK dump request in the context's own buffer. No
 * interface index is put into the request; the comparison with the base
 * interface is done by the callback.
 */
4582 nlh = mnl_nlmsg_put_header(tcf->buf);
4583 nlh->nlmsg_type = RTM_GETLINK;
4584 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4585 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4586 ifm->ifi_family = AF_UNSPEC;
/* The callback fills ctx with one delete command per matching link. */
4587 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4589 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4590 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4592 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4596 * Emit Netlink message to add/remove local address to the outer device.
4597 * The address being added is visible within the link only (scope link).
4599 * Note that an implicit route is maintained by the kernel due to the
4600 * presence of a peer address (IFA_ADDRESS).
4602 * These rules are used for encapsulation only and allow assigning
4603 * the outer tunnel source IP address.
4606 * Libmnl socket context object.
4608 * Encapsulation properties (source address and its peer).
4609 * @param[in] ifindex
4610 * Network interface to apply rule.
4612 * Toggle between add and remove.
4614 * Perform verbose error reporting if not NULL.
4617 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Build one RTM_NEWADDR (enable) or RTM_DELADDR (disable) request for the
 * encapsulation source address on interface ifindex and send it through
 * flow_tcf_nl_ack(). On failure the error object is filled using rte_errno.
 * NOTE(review): the embedded line numbers skip values, so some lines
 * (for example the "enable" parameter and several call arguments) are
 * not shown in this listing.
 */
4620 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4621 const struct flow_tcf_vxlan_encap *encap,
4622 unsigned int ifindex,
4624 struct rte_flow_error *error)
4626 struct nlmsghdr *nlh;
4627 struct ifaddrmsg *ifa;
/* On-stack request buffer: message header, ifaddrmsg and 128 spare bytes. */
4628 alignas(struct nlmsghdr)
4629 uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4631 nlh = mnl_nlmsg_put_header(buf);
4632 nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4634 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
/* The address is marked permanent and limited to link scope. */
4636 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4637 ifa->ifa_flags = IFA_F_PERMANENT;
4638 ifa->ifa_scope = RT_SCOPE_LINK;
4639 ifa->ifa_index = ifindex;
/*
 * IPv4 is used when the mask carries an IPv4 source; otherwise an IPv6
 * source is asserted. The peer address is added only when the matching
 * destination bit is set in the mask.
 */
4640 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4641 ifa->ifa_family = AF_INET;
4642 ifa->ifa_prefixlen = 32;
4643 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4644 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4645 mnl_attr_put_u32(nlh, IFA_ADDRESS,
4648 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4649 ifa->ifa_family = AF_INET6;
4650 ifa->ifa_prefixlen = 128;
4651 mnl_attr_put(nlh, IFA_LOCAL,
4652 sizeof(encap->ipv6.src),
4654 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4655 mnl_attr_put(nlh, IFA_ADDRESS,
4656 sizeof(encap->ipv6.dst),
/* flow_tcf_nl_ack() returns 0 on success, so this branch is the success path. */
4659 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4661 return rte_flow_error_set(error, rte_errno,
4662 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4663 "netlink: cannot complete IFA request"
4668 * Emit Netlink message to add/remove neighbor.
4671 * Libmnl socket context object.
4673 * Encapsulation properties (destination address).
4674 * @param[in] ifindex
4675 * Network interface.
4677 * Toggle between add and remove.
4679 * Perform verbose error reporting if not NULL.
4682 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Build one RTM_NEWNEIGH (enable) or RTM_DELNEIGH (disable) request for
 * the encapsulation destination address on interface ifindex and send it
 * through flow_tcf_nl_ack(). On failure the error object is filled using
 * rte_errno.
 * NOTE(review): the embedded line numbers skip values, so some lines
 * (for example the "enable" parameter and several call arguments) are
 * not shown in this listing.
 */
4685 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4686 const struct flow_tcf_vxlan_encap *encap,
4687 unsigned int ifindex,
4689 struct rte_flow_error *error)
4691 struct nlmsghdr *nlh;
/* On-stack request buffer: message header, ndmsg and 128 spare bytes. */
4693 alignas(struct nlmsghdr)
4694 uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4696 nlh = mnl_nlmsg_put_header(buf);
4697 nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4699 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
/* The neighbour entry is bound to the interface and marked permanent. */
4701 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4702 ndm->ndm_ifindex = ifindex;
4703 ndm->ndm_state = NUD_PERMANENT;
/*
 * IPv4 is used when the mask carries an IPv4 destination; otherwise an
 * IPv6 destination is asserted.
 */
4706 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4707 ndm->ndm_family = AF_INET;
4708 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4710 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4711 ndm->ndm_family = AF_INET6;
4712 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
/* A requested outer source MAC is only reported (message below) when enabling. */
4715 if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4717 "outer ethernet source address cannot be "
4718 "forced for VXLAN encapsulation");
/* The destination MAC becomes the link-layer address of the entry. */
4719 if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4720 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
/* flow_tcf_nl_ack() returns 0 on success, so this branch is the success path. */
4722 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4724 return rte_flow_error_set(error, rte_errno,
4725 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4726 "netlink: cannot complete ND request"
4731 * Manage the local IP addresses and their peers IP addresses on the
4732 * outer interface for encapsulation purposes. The kernel searches the
4733 * appropriate device for tunnel egress traffic using the outer source
4734 * IP, this IP should be assigned to the outer network device, otherwise
4735 * kernel rejects the rule.
4737 * Adds or removes the addresses using the Netlink command like this:
4738 * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4740 * The addresses are local to the netdev ("scope link"), this reduces
4741 * the risk of conflicts. Note that an implicit route is maintained by
4742 * the kernel due to the presence of a peer address (IFA_ADDRESS).
4745 * Libmnl socket context object.
4747 * Object, contains rule database and ifouter index.
4748 * @param[in] dev_flow
4749 * Flow object, contains the tunnel parameters (for encap only).
4751 * Toggle between add and remove.
4753 * Perform verbose error reporting if not NULL.
4756 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Maintain the per-interface list of local address rules (iface->local)
 * for the VXLAN encapsulation described by dev_flow->tcf.vxlan_encap.
 * The list is searched for a rule with the same source/destination pair;
 * the kernel address is removed through flow_tcf_rule_local() when the
 * reference count of a found rule drops to zero, and a new rule is
 * allocated, programmed and inserted when none exists.
 * NOTE(review): the embedded line numbers skip values, so the "enable"
 * parameter, the branches on it and the reference count increments are
 * not shown in this listing.
 */
4759 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4760 struct tcf_irule *iface,
4761 struct mlx5_flow *dev_flow,
4763 struct rte_flow_error *error)
4765 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4766 struct tcf_local_rule *rule = NULL;
4770 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
/* IPv4 lookup: both source and destination must match an existing rule. */
4771 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4772 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4773 LIST_FOREACH(rule, &iface->local, next) {
4774 if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4775 encap->ipv4.src == rule->ipv4.src &&
4776 encap->ipv4.dst == rule->ipv4.dst) {
/* IPv6 lookup: same idea, addresses are compared with memcmp(). */
4781 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4782 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4783 LIST_FOREACH(rule, &iface->local, next) {
4784 if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4785 !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4786 sizeof(encap->ipv6.src)) &&
4787 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4788 sizeof(encap->ipv6.dst))) {
/*
 * Last reference gone: unlink the rule and remove the kernel address.
 * NOTE(review): the embedded line numbers 4799-4801 are consecutive and
 * show no release of "rule" between LIST_REMOVE() and the return -
 * check whether the rule object is leaked here.
 */
4798 if (!rule->refcnt || !--rule->refcnt) {
4799 LIST_REMOVE(rule, next);
4800 return flow_tcf_rule_local(tcf, encap,
4801 iface->ifouter, false, error);
4806 DRV_LOG(WARNING, "disabling not existing local rule");
4807 rte_flow_error_set(error, ENOENT,
4808 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4809 "disabling not existing local rule");
/* No rule to reuse: allocate a zeroed one. */
4812 rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4813 alignof(struct tcf_local_rule));
4815 rte_flow_error_set(error, ENOMEM,
4816 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4817 "unable to allocate memory for local rule");
4820 *rule = (struct tcf_local_rule){.refcnt = 0,
/* Record the address pair of the active family in the new rule. */
4823 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4824 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4825 | FLOW_TCF_ENCAP_IPV4_DST;
4826 rule->ipv4.src = encap->ipv4.src;
4827 rule->ipv4.dst = encap->ipv4.dst;
4829 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4830 | FLOW_TCF_ENCAP_IPV6_DST;
4831 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4832 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
/* Program the address first; the list insertion follows this call. */
4834 ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4840 LIST_INSERT_HEAD(&iface->local, rule, next);
4845 * Manage the destination MAC/IP addresses neigh database, kernel uses
4846 * this one to determine the destination MAC address within encapsulation
4847 * header. Adds or removes the entries using the Netlink command like this:
4848 * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4851 * Libmnl socket context object.
4853 * Object, contains rule database and ifouter index.
4854 * @param[in] dev_flow
4855 * Flow object, contains the tunnel parameters (for encap only).
4857 * Toggle between add and remove.
4859 * Perform verbose error reporting if not NULL.
4862 * 0 on success, a negative errno value otherwise and rte_errno is set.
4865 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4866 struct tcf_irule *iface,
4867 struct mlx5_flow *dev_flow,
4869 struct rte_flow_error *error)
4871 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4872 struct tcf_neigh_rule *rule = NULL;
4876 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4877 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4878 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4879 LIST_FOREACH(rule, &iface->neigh, next) {
4880 if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4881 encap->ipv4.dst == rule->ipv4.dst) {
4886 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4887 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4888 LIST_FOREACH(rule, &iface->neigh, next) {
4889 if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4890 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4891 sizeof(encap->ipv6.dst))) {
4897 if (memcmp(&encap->eth.dst, &rule->eth,
4898 sizeof(encap->eth.dst))) {
4899 DRV_LOG(WARNING, "Destination MAC differs"
4901 rte_flow_error_set(error, EEXIST,
4902 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4903 NULL, "Different MAC address"
4904 " neigh rule for the same"
4912 if (!rule->refcnt || !--rule->refcnt) {
4913 LIST_REMOVE(rule, next);
4914 return flow_tcf_rule_neigh(tcf, encap,
4921 DRV_LOG(WARNING, "Disabling not existing neigh rule");
4922 rte_flow_error_set(error, ENOENT,
4923 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4924 "unable to allocate memory for neigh rule");
4927 rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4928 alignof(struct tcf_neigh_rule));
4930 rte_flow_error_set(error, ENOMEM,
4931 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4932 "unable to allocate memory for neigh rule");
4935 *rule = (struct tcf_neigh_rule){.refcnt = 0,
4938 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4939 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4940 rule->ipv4.dst = encap->ipv4.dst;
4942 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4943 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4945 memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4946 ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4952 LIST_INSERT_HEAD(&iface->neigh, rule, next);
4956 /* VXLAN encap rule database for outer interfaces. */
4957 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4959 /* VTEP device list is shared between PMD port instances. */
4960 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4961 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4964 * Acquire the VXLAN encap rules container for specified interface.
4965 * First looks for the container in the existing ones list, creates
4966 * and initializes the new container if existing not found.
4969 * Context object initialized by mlx5_flow_tcf_context_create().
4970 * @param[in] ifouter
4971 * Network interface index to create VXLAN encap rules on.
4973 * Perform verbose error reporting if not NULL.
4975 * Rule container pointer on success,
4976 * NULL otherwise and rte_errno is set.
4978 static struct tcf_irule*
4979 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4980 unsigned int ifouter,
4981 struct rte_flow_error *error)
4983 struct tcf_irule *iface;
4985 /* Look whether the container for encap rules is created. */
4987 LIST_FOREACH(iface, &iface_list_vxlan, next) {
4988 if (iface->ifouter == ifouter)
4992 /* Container already exists, just increment the reference. */
4996 /* Not found, we should create the new container. */
4997 iface = rte_zmalloc(__func__, sizeof(*iface),
4998 alignof(struct tcf_irule));
5000 rte_flow_error_set(error, ENOMEM,
5001 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5002 "unable to allocate memory for container");
5005 *iface = (struct tcf_irule){
5006 .local = LIST_HEAD_INITIALIZER(),
5007 .neigh = LIST_HEAD_INITIALIZER(),
5011 /* Interface cleanup for new container created. */
5012 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5013 flow_tcf_encap_local_cleanup(tcf, ifouter);
5014 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5015 LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5020 * Releases VXLAN encap rules container by pointer. Decrements the
5021 * reference counter and deletes the container if counter is zero.
5024 * VXLAN rule container pointer to release.
5027 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5029 assert(iface->refcnt);
5030 if (--iface->refcnt == 0) {
5031 /* Reference counter is zero, delete the container. */
5032 assert(LIST_EMPTY(&iface->local));
5033 assert(LIST_EMPTY(&iface->neigh));
5034 LIST_REMOVE(iface, next);
5040 * Deletes VTEP network device.
5043 * Context object initialized by mlx5_flow_tcf_context_create().
5045 * Object representing the network device to delete. Memory
5046 * allocated for this object is freed by routine.
5049 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5050 struct tcf_vtep *vtep)
5052 struct nlmsghdr *nlh;
5053 struct ifinfomsg *ifm;
5054 alignas(struct nlmsghdr)
5055 uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5056 MNL_BUF_EXTRA_SPACE];
5059 assert(!vtep->refcnt);
5060 /* Delete only ifaces those we actually created. */
5061 if (vtep->created && vtep->ifindex) {
5062 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5063 nlh = mnl_nlmsg_put_header(buf);
5064 nlh->nlmsg_type = RTM_DELLINK;
5065 nlh->nlmsg_flags = NLM_F_REQUEST;
5066 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5067 ifm->ifi_family = AF_UNSPEC;
5068 ifm->ifi_index = vtep->ifindex;
5069 assert(sizeof(buf) >= nlh->nlmsg_len);
5070 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5072 DRV_LOG(WARNING, "netlink: error deleting vxlan"
5073 " encap/decap ifindex %u",
5080 * Creates VTEP network device.
5083 * Context object initialized by mlx5_flow_tcf_context_create().
5085 * UDP port of created VTEP device.
5087 * Perform verbose error reporting if not NULL.
5090 * Pointer to created device structure on success,
5091 * NULL otherwise and rte_errno is set.
5093 static struct tcf_vtep*
5094 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5095 uint16_t port, struct rte_flow_error *error)
5097 struct tcf_vtep *vtep;
5098 struct nlmsghdr *nlh;
5099 struct ifinfomsg *ifm;
5100 char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5101 alignas(struct nlmsghdr)
5102 uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5103 SZ_NLATTR_DATA_OF(sizeof(name)) +
5104 SZ_NLATTR_NEST * 2 +
5105 SZ_NLATTR_STRZ_OF("vxlan") +
5106 SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5107 SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5108 SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5109 MNL_BUF_EXTRA_SPACE];
5110 struct nlattr *na_info;
5111 struct nlattr *na_vxlan;
5112 rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5115 vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5117 rte_flow_error_set(error, ENOMEM,
5118 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5119 "unable to allocate memory for VTEP");
5122 *vtep = (struct tcf_vtep){
5125 memset(buf, 0, sizeof(buf));
5126 nlh = mnl_nlmsg_put_header(buf);
5127 nlh->nlmsg_type = RTM_NEWLINK;
5128 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5129 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5130 ifm->ifi_family = AF_UNSPEC;
5133 ifm->ifi_flags = IFF_UP;
5134 ifm->ifi_change = 0xffffffff;
5135 snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5136 mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5137 na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5139 mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5140 na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5142 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5144 * RH 7.2 does not support metadata for tunnel device.
5145 * It does not matter because we are going to use the
5146 * hardware offload by mlx5 driver.
5148 mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5150 mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5151 mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5152 mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5153 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5155 * We must specify VNI explicitly if metadata not supported.
5156 * Note, VNI is transferred with native endianness format.
5158 mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5160 mnl_attr_nest_end(nlh, na_vxlan);
5161 mnl_attr_nest_end(nlh, na_info);
5162 assert(sizeof(buf) >= nlh->nlmsg_len);
5163 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5166 "netlink: VTEP %s create failure (%d)",
5168 if (rte_errno != EEXIST)
5170 * Some unhandled error occurred or device is
5171 * for encapsulation and cannot be shared.
5176 * Mark device we actually created.
5177 * We should explicitly delete
5178 * when we do not need it anymore.
5183 /* Try to get ifindex of created of pre-existing device. */
5184 ret = if_nametoindex(name);
5187 "VTEP %s failed to get index (%d)", name, errno);
5190 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5191 "netlink: failed to retrieve VTEP ifindex");
5194 vtep->ifindex = ret;
5195 memset(buf, 0, sizeof(buf));
5196 nlh = mnl_nlmsg_put_header(buf);
5197 nlh->nlmsg_type = RTM_NEWLINK;
5198 nlh->nlmsg_flags = NLM_F_REQUEST;
5199 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5200 ifm->ifi_family = AF_UNSPEC;
5202 ifm->ifi_index = vtep->ifindex;
5203 ifm->ifi_flags = IFF_UP;
5204 ifm->ifi_change = IFF_UP;
5205 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5207 rte_flow_error_set(error, -errno,
5208 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5209 "netlink: failed to set VTEP link up");
5210 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5214 ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5216 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5219 DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5223 flow_tcf_vtep_delete(tcf, vtep);
5231 * Acquire target interface index for VXLAN tunneling decapsulation.
5232 * In order to share the UDP port within the other interfaces the
5233 * VXLAN device created as not attached to any interface (if created).
5236 * Context object initialized by mlx5_flow_tcf_context_create().
5237 * @param[in] dev_flow
5238 * Flow tcf object with tunnel structure pointer set.
5240 * Perform verbose error reporting if not NULL.
5242 * Interface descriptor pointer on success,
5243 * NULL otherwise and rte_errno is set.
5245 static struct tcf_vtep*
5246 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5247 struct mlx5_flow *dev_flow,
5248 struct rte_flow_error *error)
5250 struct tcf_vtep *vtep;
5251 uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5253 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5254 if (vtep->port == port)
5258 /* Device exists, just increment the reference counter. */
5260 assert(vtep->ifindex);
5263 /* No decapsulation device exists, try to create the new one. */
5264 vtep = flow_tcf_vtep_create(tcf, port, error);
5266 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5271 * Acquire target interface index for VXLAN tunneling encapsulation.
5274 * Context object initialized by mlx5_flow_tcf_context_create().
5275 * @param[in] ifouter
5276 * Network interface index to attach VXLAN encap device to.
5277 * @param[in] dev_flow
5278 * Flow tcf object with tunnel structure pointer set.
5280 * Perform verbose error reporting if not NULL.
5282 * Interface descriptor pointer on success,
5283 * NULL otherwise and rte_errno is set.
5285 static struct tcf_vtep*
5286 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5287 unsigned int ifouter,
5288 struct mlx5_flow *dev_flow,
5289 struct rte_flow_error *error)
5291 static uint16_t port;
5292 struct tcf_vtep *vtep;
5293 struct tcf_irule *iface;
5297 /* Look whether the VTEP for specified port is created. */
5298 port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5299 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5300 if (vtep->port == port)
5304 /* VTEP already exists, just increment the reference. */
5307 /* Not found, we should create the new VTEP. */
5308 vtep = flow_tcf_vtep_create(tcf, port, error);
5311 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5313 assert(vtep->ifindex);
5314 iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5316 if (--vtep->refcnt == 0)
5317 flow_tcf_vtep_delete(tcf, vtep);
5320 dev_flow->tcf.vxlan_encap->iface = iface;
5321 /* Create local ipaddr with peer to specify the outer IPs. */
5322 ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5324 /* Create neigh rule to specify outer destination MAC. */
5325 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5327 flow_tcf_encap_local(tcf, iface,
5328 dev_flow, false, error);
5331 dev_flow->tcf.vxlan_encap->iface = NULL;
5332 flow_tcf_encap_irule_release(iface);
5333 if (--vtep->refcnt == 0)
5334 flow_tcf_vtep_delete(tcf, vtep);
5341 * Acquires target interface index for tunneling of any type.
5342 * Creates the new VTEP if needed.
5345 * Context object initialized by mlx5_flow_tcf_context_create().
5346 * @param[in] ifouter
5347 * Network interface index to create VXLAN encap rules on.
5348 * @param[in] dev_flow
5349 * Flow tcf object with tunnel structure pointer set.
5351 * Perform verbose error reporting if not NULL.
5353 * Interface descriptor pointer on success,
5354 * NULL otherwise and rte_errno is set.
5356 static struct tcf_vtep*
5357 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5358 unsigned int ifouter,
5359 struct mlx5_flow *dev_flow,
5360 struct rte_flow_error *error)
5362 struct tcf_vtep *vtep = NULL;
5364 assert(dev_flow->tcf.tunnel);
5365 pthread_mutex_lock(&vtep_list_mutex);
5366 switch (dev_flow->tcf.tunnel->type) {
5367 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5368 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5371 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5372 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5375 rte_flow_error_set(error, ENOTSUP,
5376 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5377 "unsupported tunnel type");
5380 pthread_mutex_unlock(&vtep_list_mutex);
5385 * Release tunneling interface by ifindex. Decrements reference
5386 * counter and actually removes the device if counter is zero.
5389 * Context object initialized by mlx5_flow_tcf_context_create().
5391 * VTEP device descriptor structure.
5392 * @param[in] dev_flow
5393 * Flow tcf object with tunnel structure pointer set.
5396 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5397 struct tcf_vtep *vtep,
5398 struct mlx5_flow *dev_flow)
5400 assert(dev_flow->tcf.tunnel);
5401 pthread_mutex_lock(&vtep_list_mutex);
5402 switch (dev_flow->tcf.tunnel->type) {
5403 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5405 case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5406 struct tcf_irule *iface;
5408 /* Remove the encap ancillary rules first. */
5409 iface = dev_flow->tcf.vxlan_encap->iface;
5411 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5412 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5413 flow_tcf_encap_irule_release(iface);
5414 dev_flow->tcf.vxlan_encap->iface = NULL;
5419 DRV_LOG(WARNING, "Unsupported tunnel type");
5422 assert(vtep->refcnt);
5423 if (--vtep->refcnt == 0) {
5424 LIST_REMOVE(vtep, next);
5425 flow_tcf_vtep_delete(tcf, vtep);
5427 pthread_mutex_unlock(&vtep_list_mutex);
5430 struct tcf_nlcb_query {
5433 uint32_t flags_valid:1;
5437 * Collect queried rule attributes. This is callback routine called by
5438 * libmnl mnl_cb_run() in loop for every message in received packet.
5439 * Current implementation collects the flower flags only.
5442 * Pointer to reply header.
5443 * @param[in, out] arg
5444 * Context pointer for this callback.
5447 * A positive, nonzero value on success (required by libmnl
5448 * to continue messages processing).
5451 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5453 struct tcf_nlcb_query *query = arg;
5454 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5455 struct nlattr *na, *na_opt;
5456 bool flower = false;
5458 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5459 tcm->tcm_handle != query->handle)
5461 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5462 switch (mnl_attr_get_type(na)) {
5464 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5465 /* Not flower filter, drop entire message. */
5472 /* Not flower options, drop entire message. */
5475 /* Check nested flower options. */
5476 mnl_attr_for_each_nested(na_opt, na) {
5477 switch (mnl_attr_get_type(na_opt)) {
5478 case TCA_FLOWER_FLAGS:
5479 query->flags_valid = 1;
5481 mnl_attr_get_u32(na_opt);
5492 * Query a TC flower rule flags via netlink.
5495 * Context object initialized by mlx5_flow_tcf_context_create().
5496 * @param[in] dev_flow
5497 * Pointer to the flow.
5498 * @param[out] pflags
5499 * pointer to the data retrieved by the query.
5502 * 0 on success, a negative errno value otherwise.
5505 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5506 struct mlx5_flow *dev_flow,
5509 struct nlmsghdr *nlh;
5511 struct tcf_nlcb_query query = {
5512 .handle = dev_flow->tcf.tcm->tcm_handle,
5515 nlh = mnl_nlmsg_put_header(tcf->buf);
5516 nlh->nlmsg_type = RTM_GETTFILTER;
5517 nlh->nlmsg_flags = NLM_F_REQUEST;
5518 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5519 memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5521 * Ignore Netlink error for filter query operations.
5522 * The reply length is sent by kernel as errno.
5523 * Just check we got the flags option.
5525 flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5526 if (!query.flags_valid) {
5530 *pflags = query.tc_flags;
5535 * Query and check the in_hw set for specified rule.
5538 * Context object initialized by mlx5_flow_tcf_context_create().
5539 * @param[in] dev_flow
5540 * Pointer to the flow to check.
5543 * 0 on success, a negative errno value otherwise.
5546 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5547 struct mlx5_flow *dev_flow)
5552 ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5555 return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5559 * Remove flow from E-Switch by sending Netlink message.
5562 * Pointer to Ethernet device.
5563 * @param[in, out] flow
5564 * Pointer to the sub flow.
5567 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5569 struct mlx5_priv *priv = dev->data->dev_private;
5570 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5571 struct mlx5_flow *dev_flow;
5572 struct nlmsghdr *nlh;
5577 dev_flow = LIST_FIRST(&flow->dev_flows);
5580 /* E-Switch flow can't be expanded. */
5581 assert(!LIST_NEXT(dev_flow, next));
5582 if (dev_flow->tcf.applied) {
5583 nlh = dev_flow->tcf.nlh;
5584 nlh->nlmsg_type = RTM_DELTFILTER;
5585 nlh->nlmsg_flags = NLM_F_REQUEST;
5586 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5587 if (dev_flow->tcf.tunnel) {
5588 assert(dev_flow->tcf.tunnel->vtep);
5589 flow_tcf_vtep_release(ctx,
5590 dev_flow->tcf.tunnel->vtep,
5592 dev_flow->tcf.tunnel->vtep = NULL;
5594 /* Cleanup the rule handle value. */
5595 tcm = mnl_nlmsg_get_payload(nlh);
5596 tcm->tcm_handle = 0;
5597 dev_flow->tcf.applied = 0;
5602 * Fetch the applied rule handle. This is callback routine called by
5603 * libmnl mnl_cb_run() in loop for every message in received packet.
5604 * When the NLM_F_ECHO flag is specified the kernel sends the created
5605 * rule descriptor back to the application and we can retrieve the
5606 * actual rule handle from updated descriptor.
5609 * Pointer to reply header.
5610 * @param[in, out] arg
5611 * Context pointer for this callback.
5614 * A positive, nonzero value on success (required by libmnl
5615 * to continue messages processing).
5618 flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg)
5620 struct nlmsghdr *nlhrq = arg;
5621 struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq);
5622 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5625 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5626 nlh->nlmsg_seq != nlhrq->nlmsg_seq)
5628 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5629 switch (mnl_attr_get_type(na)) {
5631 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5632 /* Not flower filter, drop entire message. */
5635 tcmrq->tcm_handle = tcm->tcm_handle;
5642 * Apply flow to E-Switch by sending Netlink message.
5645 * Pointer to Ethernet device.
5646 * @param[in, out] flow
5647 * Pointer to the sub flow.
5649 * Pointer to the error structure.
5652 * 0 on success, a negative errno value otherwise and rte_errno is set.
5655 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5656 struct rte_flow_error *error)
5658 struct mlx5_priv *priv = dev->data->dev_private;
5659 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5660 struct mlx5_flow *dev_flow;
5661 struct nlmsghdr *nlh;
5667 dev_flow = LIST_FIRST(&flow->dev_flows);
5668 /* E-Switch flow can't be expanded. */
5669 assert(!LIST_NEXT(dev_flow, next));
5670 if (dev_flow->tcf.applied)
5672 nlh = dev_flow->tcf.nlh;
5673 nlh->nlmsg_type = RTM_NEWTFILTER;
5674 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
5675 NLM_F_EXCL | NLM_F_ECHO;
5676 tcm = mnl_nlmsg_get_payload(nlh);
5677 /* Allow kernel to assign handle on its own. */
5678 tcm->tcm_handle = 0;
5679 if (dev_flow->tcf.tunnel) {
5681 * Replace the interface index, target for
5682 * encapsulation, source for decapsulation.
5684 assert(!dev_flow->tcf.tunnel->vtep);
5685 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5686 /* Acquire actual VTEP device when rule is being applied. */
5687 dev_flow->tcf.tunnel->vtep =
5688 flow_tcf_vtep_acquire(ctx,
5689 dev_flow->tcf.tunnel->ifindex_org,
5691 if (!dev_flow->tcf.tunnel->vtep)
5693 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5694 dev_flow->tcf.tunnel->vtep->ifindex,
5695 dev_flow->tcf.tunnel->ifindex_org);
5696 *dev_flow->tcf.tunnel->ifindex_ptr =
5697 dev_flow->tcf.tunnel->vtep->ifindex;
5698 if (dev_flow->tcf.tunnel->vtep->waitreg) {
5699 /* Clear wait flag for VXLAN port registration. */
5700 dev_flow->tcf.tunnel->vtep->waitreg = 0;
5701 twait = rte_get_timer_hz();
5702 assert(twait > MS_PER_S);
5703 twait = twait * MLX5_VXLAN_WAIT_PORT_REG_MS;
5704 twait = twait / MS_PER_S;
5705 start = rte_get_timer_cycles();
5709 * Kernel creates the VXLAN devices and registers UDP ports to
5710 * be hardware offloaded within the NIC kernel drivers. The
5711 * registration process is being performed into context of
5712 * working kernel thread and the race conditions might happen.
5713 * The VXLAN device is created and success is returned to
5714 * calling application, but the UDP port registration process
5715 * is not completed yet. The next applied rule may be rejected
5716 * by the driver with ENOSUP code. We are going to wait a bit,
5717 * allowing registration process to be completed. The waiting
5718 * is performed once after device been created.
5721 struct timespec onems;
5723 ret = flow_tcf_nl_ack(ctx, nlh,
5724 flow_tcf_collect_apply_cb, nlh);
5725 if (!ret || ret != -ENOTSUP || !twait)
5727 /* Wait one millisecond and try again till timeout. */
5729 onems.tv_nsec = NS_PER_S / MS_PER_S;
5730 nanosleep(&onems, 0);
5731 if ((rte_get_timer_cycles() - start) > twait) {
5732 /* Timeout elapsed, try once more and exit. */
5737 if (!tcm->tcm_handle) {
5738 flow_tcf_remove(dev, flow);
5739 return rte_flow_error_set
5741 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5742 "netlink: rule zero handle returned");
5744 dev_flow->tcf.applied = 1;
5745 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5748 * Rule was applied without skip_sw flag set.
5749 * We should check whether the rule was acctually
5750 * accepted by hardware (have look at in_hw flag).
5752 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5753 flow_tcf_remove(dev, flow);
5754 return rte_flow_error_set
5756 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5757 "netlink: rule has no in_hw flag set");
5761 if (dev_flow->tcf.tunnel) {
5762 /* Rollback the VTEP configuration if rule apply failed. */
5763 assert(dev_flow->tcf.tunnel->vtep);
5764 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5766 dev_flow->tcf.tunnel->vtep = NULL;
5768 return rte_flow_error_set(error, rte_errno,
5769 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5770 "netlink: failed to create TC flow rule");
5774 * Remove flow from E-Switch and release resources of the device flow.
5777 * Pointer to Ethernet device.
5778 * @param[in, out] flow
5779 * Pointer to the sub flow.
5782 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5784 struct mlx5_flow *dev_flow;
5788 flow_tcf_remove(dev, flow);
5789 if (flow->counter) {
5790 if (--flow->counter->ref_cnt == 0) {
5791 rte_free(flow->counter);
5792 flow->counter = NULL;
5795 dev_flow = LIST_FIRST(&flow->dev_flows);
5798 /* E-Switch flow can't be expanded. */
5799 assert(!LIST_NEXT(dev_flow, next));
5800 LIST_REMOVE(dev_flow, next);
/**
 * Helper routine for figuring the space size required for a parse buffer.
 *
 * @param array
 *   Array of values to use.
 * @param idx
 *   Current location in array; a negative index means "no array entry".
 * @param value
 *   Value to compare with.
 *
 * @return
 *   The maximum between the given value and the array value on index,
 *   or the given value alone when the index is negative.
 */
static uint16_t
flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
{
	uint16_t entry;

	if (idx < 0)
		return value;
	entry = array[idx];
	return entry > value ? entry : value;
}
/**
 * Parse rtnetlink message attributes filling the attribute table with the
 * info retrieved. Only the first attribute of each type is recorded and
 * attributes whose type exceeds the table are ignored.
 *
 * @param tb
 *   Attribute table to be filled.
 * @param[out] max
 *   Maximum entry in the attribute table.
 * @param rta
 *   The attributes section in the message to be parsed.
 * @param len
 *   The length of the attributes section in the message.
 */
static void
flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
			 struct rtattr *rta, int len)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
	for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
		unsigned short kind = rta->rta_type;

		/* Skip out-of-table types and already filled slots. */
		if (kind > max || tb[kind])
			continue;
		tb[kind] = rta;
	}
}
5851 * Extract flow counters from flower action.
5854 * flower action stats properties in the Netlink message received.
5856 * The backward sequence of rta_types, as written in the attribute table,
5857 * we need to traverse in order to get to the requested object.
5859 * Current location in rta_type table.
5861 * data holding the count statistics of the rte_flow retrieved from
5865 * 0 if data was found and retrieved, -1 otherwise.
5868 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5869 uint16_t rta_type[], int idx,
5870 struct gnet_stats_basic *data)
5872 int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5874 struct rtattr *tbs[tca_stats_max + 1];
5876 if (rta == NULL || idx < 0)
5878 flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5879 RTA_DATA(rta), RTA_PAYLOAD(rta));
5880 switch (rta_type[idx]) {
5881 case TCA_STATS_BASIC:
5882 if (tbs[TCA_STATS_BASIC]) {
5883 memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5884 RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5896 * Parse flower single action retrieving the requested action attribute,
5900 * flower action properties in the Netlink message received.
5902 * The backward sequence of rta_types, as written in the attribute table,
5903 * we need to traverse in order to get to the requested object.
5905 * Current location in rta_type table.
5907 * Count statistics retrieved from the message query.
5910 * 0 if data was found and retrieved, -1 otherwise.
5913 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5914 uint16_t rta_type[], int idx, void *data)
5916 int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5917 struct rtattr *tb[tca_act_max + 1];
5919 if (arg == NULL || idx < 0)
5921 flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5922 RTA_DATA(arg), RTA_PAYLOAD(arg));
5923 if (tb[TCA_ACT_KIND] == NULL)
5925 switch (rta_type[idx]) {
5927 if (tb[TCA_ACT_STATS])
5928 return flow_tcf_nl_action_stats_parse_and_get
5931 (struct gnet_stats_basic *)data);
5940 * Parse flower action section in the message retrieving the requested
5941 * attribute from the first action that provides it.
5944 * flower section in the Netlink message received.
5946 * The backward sequence of rta_types, as written in the attribute table,
5947 * we need to traverse in order to get to the requested object.
5949 * Current location in rta_type table.
5951 * data retrieved from the message query.
5954 * 0 if data was found and retrieved, -1 otherwise.
5957 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5958 uint16_t rta_type[], int idx, void *data)
5960 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5963 if (arg == NULL || idx < 0)
5965 flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5966 RTA_DATA(arg), RTA_PAYLOAD(arg));
5967 switch (rta_type[idx]) {
5969 * flow counters are stored in the actions defined by the flow
5970 * and not in the flow itself, therefore we need to traverse the
5971 * flower chain of actions in search for them.
5973 * Note that the index is not decremented here.
5976 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5978 !flow_tcf_nl_parse_one_action_and_get(tb[i],
5991 * Parse flower classifier options in the message, retrieving the requested
5992 * attribute if found.
5995 * flower section in the Netlink message received.
5997 * The backward sequence of rta_types, as written in the attribute table,
5998 * we need to traverse in order to get to the requested object.
6000 * Current location in rta_type table.
6002 * data retrieved from the message query.
6005 * 0 if data was found and retrieved, -1 otherwise.
6008 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
6009 uint16_t rta_type[], int idx, void *data)
6011 int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
6013 struct rtattr *tb[tca_flower_max + 1];
6015 if (!opt || idx < 0)
6017 flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
6018 RTA_DATA(opt), RTA_PAYLOAD(opt));
6019 switch (rta_type[idx]) {
6020 case TCA_FLOWER_ACT:
6021 if (tb[TCA_FLOWER_ACT])
6022 return flow_tcf_nl_action_parse_and_get
6023 (tb[TCA_FLOWER_ACT],
6024 rta_type, --idx, data);
6033 * Parse Netlink reply on filter query, retrieving the flow counters.
6036 * Message received from Netlink.
6038 * The backward sequence of rta_types, as written in the attribute table,
6039 * we need to traverse in order to get to the requested object.
6041 * Current location in rta_type table.
6043 * data retrieved from the message query.
6046 * 0 if data was found and retrieved, -1 otherwise.
/*
 * Top level of the backward walk over rta_type[]: checks that the Netlink
 * message is a TC filter message of kind "flower" and, when the current entry
 * is reached and TCA_OPTIONS is present, hands over to the options parser.
 */
6049 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
6050 uint16_t rta_type[], int idx, void *data)
6052 struct nlmsghdr *nlh = cnlh;
6053 struct tcmsg *t = NLMSG_DATA(nlh);
/* Starts as the full message length; reduced to the attribute length below. */
6054 int len = nlh->nlmsg_len;
/* tb[] is a variable-length array sized from flow_tcf_arr_val_max(). */
6055 int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
6056 struct rtattr *tb[tca_max + 1];
/* The condition holds when the message is none of the three TC filter types. */
6060 if (nlh->nlmsg_type != RTM_NEWTFILTER &&
6061 nlh->nlmsg_type != RTM_GETTFILTER &&
6062 nlh->nlmsg_type != RTM_DELTFILTER)
/* Drop the Netlink header and the struct tcmsg from the length. */
/*
 * NOTE(review): whatever validates len between this line and the parse call
 * is outside this view - confirm len cannot be negative when it is used.
 */
6064 len -= NLMSG_LENGTH(sizeof(*t));
/* Index the top-level TCA_* attributes that follow the tcmsg into tb[]. */
6067 flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
6068 /* Not a TC flower flow - bail out */
/* strcmp() is non-zero when the kind string differs from "flower". */
6069 if (!tb[TCA_KIND] ||
6070 strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
6072 switch (rta_type[idx]) {
/* Descend only when the message actually carries an options attribute. */
6074 if (tb[TCA_OPTIONS])
6075 return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
6086 * A callback to parse Netlink reply on TC flower query.
6089 * Message received from Netlink.
6091 * Pointer to data area to be filled by the parsing routine.
6092 * Assumed to be a pointer to struct flow_tcf_stats_basic.
/*
 * Callback handed to mnl_cb_run() by flow_tcf_query_count(): runs the
 * filter-level parser on one message and flags the caller's
 * struct flow_tcf_stats_basic as valid when the counters were retrieved.
 */
6098 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6101 * The backward sequence of rta_types to pass in order to get
/*
 * The array is ordered innermost attribute first; parsing starts from the
 * last element (TCA_OPTIONS) and moves toward index 0 (TCA_STATS_BASIC).
 */
6104 uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6105 TCA_FLOWER_ACT, TCA_OPTIONS };
6106 struct flow_tcf_stats_basic *sb_data = data;
/*
 * tnlh is initialized through its const member .c and read back through .nc,
 * which supplies the non-const pointer type the parser expects without a
 * cast. The opening line of this aggregate is outside this view - presumably
 * a union; confirm.
 */
6108 const struct nlmsghdr *c;
6109 struct nlmsghdr *nc;
6110 } tnlh = { .c = nlh };
/* A zero return from the parser means the counters were written. */
6112 if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6113 RTE_DIM(rta_type) - 1,
6114 (void *)&sb_data->counters))
6115 sb_data->valid = true;
6120 * Query a TC flower rule for its statistics via netlink.
6123 * Pointer to Ethernet device.
6125 * Pointer to the sub flow.
6127 * data retrieved by the query.
6129 * Perform verbose error reporting if not NULL.
6132 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Sends the flow's stored Netlink message back to the kernel as an
 * RTM_GETTFILTER request, parses the reply for the basic statistics and
 * reports packet/byte counts relative to the values kept in flow->counter.
 */
6135 flow_tcf_query_count(struct rte_eth_dev *dev,
6136 struct rte_flow *flow,
6138 struct rte_flow_error *error)
6140 struct flow_tcf_stats_basic sb_data;
/* data is interpreted as the caller's struct rte_flow_query_count. */
6141 struct rte_flow_query_count *qc = data;
6142 struct mlx5_priv *priv = dev->data->dev_private;
6143 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6144 struct mnl_socket *nl = ctx->nl;
6145 struct mlx5_flow *dev_flow;
6146 struct nlmsghdr *nlh;
/* Take the current sequence number and advance the context's counter. */
6147 uint32_t seq = priv->tcf_context->seq++;
/* Clears sb_data.valid, so it is set only if the callback sets it. */
6151 memset(&sb_data, 0, sizeof(sb_data));
6152 dev_flow = LIST_FIRST(&flow->dev_flows);
6153 /* E-Switch flow can't be expanded. */
6154 assert(!LIST_NEXT(dev_flow, next));
/* A flow created without a counter has nothing to query. */
6155 if (!dev_flow->flow->counter)
/*
 * The stored message header is rewritten in place (type, flags, sequence)
 * to turn it into a query for this filter.
 */
6157 nlh = dev_flow->tcf.nlh;
6158 nlh->nlmsg_type = RTM_GETTFILTER;
6159 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6160 nlh->nlmsg_seq = seq;
6161 if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
/*
 * Receive into the context buffer and run the callback on each message that
 * matches seq and this socket's port id. The enclosing loop and its exit
 * conditions are outside this view.
 */
6164 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6167 ret = mnl_cb_run(ctx->buf, ret, seq,
6168 mnl_socket_get_portid(nl),
6169 flow_tcf_nl_message_get_stats_basic,
6172 /* Counters are reported only when the callback flagged them as valid. */
6173 if (sb_data.valid) {
6174 /* Return the delta from last reset. */
6177 qc->hits = sb_data.counters.packets - flow->counter->hits;
6178 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
/*
 * Store the current kernel values as the new baseline.
 * NOTE(review): the condition guarding these two stores is outside this
 * view - presumably the caller's reset request; confirm.
 */
6180 flow->counter->hits = sb_data.counters.packets;
6181 flow->counter->bytes = sb_data.counters.bytes;
/* Three distinct failure reports follow: EINVAL, errno and ENOTSUP. */
6185 return rte_flow_error_set(error, EINVAL,
6186 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6188 "flow does not have counter");
/* This report forwards the C library errno, not rte_errno. */
6190 return rte_flow_error_set
6191 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6192 NULL, "netlink: failed to read flow rule counters");
6194 return rte_flow_error_set
6195 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6196 NULL, "counters are not available.");
6202 * @see rte_flow_query()
/*
 * Query entry point of the tcf driver: walks the action list up to
 * RTE_FLOW_ACTION_TYPE_END and dispatches on each action type.
 */
6206 flow_tcf_query(struct rte_eth_dev *dev,
6207 struct rte_flow *flow,
6208 const struct rte_flow_action *actions,
6210 struct rte_flow_error *error)
6214 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6215 switch (actions->type) {
6216 case RTE_FLOW_ACTION_TYPE_VOID:
/* COUNT is the only action answered with data, via flow_tcf_query_count(). */
6218 case RTE_FLOW_ACTION_TYPE_COUNT:
6219 ret = flow_tcf_query_count(dev, flow, data, error);
/*
 * NOTE(review): the label this return belongs to is outside this view -
 * presumably the default case for every other action type; confirm.
 */
6222 return rte_flow_error_set(error, ENOTSUP,
6223 RTE_FLOW_ERROR_TYPE_ACTION,
6225 "action not supported");
/*
 * Driver operations table of the tcf flow engine: each
 * struct mlx5_flow_driver_ops hook is bound to the matching flow_tcf_*
 * routine of this file.
 */
6231 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6232 .validate = flow_tcf_validate,
6233 .prepare = flow_tcf_prepare,
6234 .translate = flow_tcf_translate,
6235 .apply = flow_tcf_apply,
6236 .remove = flow_tcf_remove,
6237 .destroy = flow_tcf_destroy,
6238 .query = flow_tcf_query,
6242 * Create and configure a libmnl socket for Netlink flow rules.
6245 * A valid libmnl socket object pointer on success, NULL otherwise and
/*
 * Opens a NETLINK_ROUTE libmnl socket, enables NETLINK_CAP_ACK on it and
 * binds it with an automatically assigned port id.
 */
6248 static struct mnl_socket *
6249 flow_tcf_mnl_socket_create(void)
6251 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
/*
 * The option value 1 is passed through a compound literal. The result of
 * this call is discarded on the visible line, so a failure to set the
 * option does not by itself abort the creation.
 */
6254 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
/* A zero return from mnl_socket_bind() selects the branch below. */
6256 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
/* NOTE(review): presumably the failure path releasing the socket; confirm. */
6261 mnl_socket_close(nl);
6266 * Destroy a libmnl socket.
6269 * Libmnl socket of the @p NETLINK_ROUTE kind.
/* Releases a socket obtained from flow_tcf_mnl_socket_create(). */
6272 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
/* NOTE(review): any NULL guard before this call is outside this view. */
6275 mnl_socket_close(nl);
6279 * Initialize ingress qdisc of a given network interface.
6282 * Pointer to tc-flower context to use.
6284 * Index of network interface to initialize.
6286 * Perform verbose error reporting if not NULL.
6289 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Resets the ingress qdisc of interface ifindex with two Netlink requests
 * built in the same stack buffer: RTM_DELQDISC for any existing ingress
 * qdisc, then RTM_NEWQDISC to create a new one.
 */
6292 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6293 unsigned int ifindex, struct rte_flow_error *error)
6295 struct nlmsghdr *nlh;
/*
 * Buffer sized for a Netlink header plus struct tcmsg, the "ingress" string
 * attribute and MNL_BUF_EXTRA_SPACE, aligned for struct nlmsghdr.
 */
6297 alignas(struct nlmsghdr)
6298 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6299 SZ_NLATTR_STRZ_OF("ingress") +
6300 MNL_BUF_EXTRA_SPACE];
6302 /* Destroy existing ingress qdisc and everything attached to it. */
6303 nlh = mnl_nlmsg_put_header(buf);
6304 nlh->nlmsg_type = RTM_DELQDISC;
6305 nlh->nlmsg_flags = NLM_F_REQUEST;
6306 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6307 tcm->tcm_family = AF_UNSPEC;
6308 tcm->tcm_ifindex = ifindex;
6309 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6310 tcm->tcm_parent = TC_H_INGRESS;
6311 assert(sizeof(buf) >= nlh->nlmsg_len);
6312 /* Ignore errors when qdisc is already absent. */
/* Only rte_errno values other than EINVAL and ENOENT are reported. */
6313 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6314 rte_errno != EINVAL && rte_errno != ENOENT)
6315 return rte_flow_error_set(error, rte_errno,
6316 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6317 "netlink: failed to remove ingress"
6319 /* Create fresh ingress qdisc. */
/* The same buffer is reused; the header is written again from its start. */
6320 nlh = mnl_nlmsg_put_header(buf);
6321 nlh->nlmsg_type = RTM_NEWQDISC;
/* NLM_F_CREATE | NLM_F_EXCL: create the qdisc, exclusively. */
6322 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6323 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6324 tcm->tcm_family = AF_UNSPEC;
6325 tcm->tcm_ifindex = ifindex;
6326 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6327 tcm->tcm_parent = TC_H_INGRESS;
/*
 * The result of the bounds-checked put is not used on this line; the only
 * visible size check is the assert that follows.
 */
6328 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6329 assert(sizeof(buf) >= nlh->nlmsg_len);
/* Unlike the delete request, any failure here is reported. */
6330 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6331 return rte_flow_error_set(error, rte_errno,
6332 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6333 "netlink: failed to create ingress"
6339 * Create libmnl context for Netlink flow rules.
6342 * A valid tc-flower context object pointer on success, NULL otherwise and
/*
 * Allocates a zeroed tc-flower context and fills it with a Netlink socket,
 * a receive buffer and an initial sequence number.
 */
6345 struct mlx5_flow_tcf_context *
6346 mlx5_flow_tcf_context_create(void)
6348 struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6353 ctx->nl = flow_tcf_mnl_socket_create();
/* Buffer of MNL_SOCKET_BUFFER_SIZE bytes; buf_size records that size. */
6356 ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6357 ctx->buf = rte_zmalloc(__func__,
6358 ctx->buf_size, sizeof(uint32_t));
/* Sequence numbering starts from a random() value. */
6361 ctx->seq = random();
/*
 * NOTE(review): presumably the shared failure path, releasing whatever was
 * set up so far; the label and the checks leading here are outside this view.
 */
6364 mlx5_flow_tcf_context_destroy(ctx);
6369 * Destroy a libmnl context.
6372 * Context to destroy, including its libmnl socket.
6375 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6379 flow_tcf_mnl_socket_destroy(ctx->nl);