1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
23 #include <sys/socket.h>
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
33 #include "mlx5_flow.h"
34 #include "mlx5_autoconf.h"
36 #ifdef HAVE_TC_ACT_VLAN
38 #include <linux/tc_act/tc_vlan.h>
40 #else /* HAVE_TC_ACT_VLAN */
42 #define TCA_VLAN_ACT_POP 1
43 #define TCA_VLAN_ACT_PUSH 2
44 #define TCA_VLAN_ACT_MODIFY 3
45 #define TCA_VLAN_PARMS 2
46 #define TCA_VLAN_PUSH_VLAN_ID 3
47 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
48 #define TCA_VLAN_PAD 5
49 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
56 #endif /* HAVE_TC_ACT_VLAN */
58 #ifdef HAVE_TC_ACT_PEDIT
60 #include <linux/tc_act/tc_pedit.h>
62 #else /* HAVE_TC_ACT_PEDIT */
/*
 * Local fallback definitions for the pedit action, used when
 * HAVE_TC_ACT_PEDIT is not defined and <linux/tc_act/tc_pedit.h>
 * is therefore not included.
 */
76 TCA_PEDIT_KEY_EX_HTYPE = 1,
77 TCA_PEDIT_KEY_EX_CMD = 2,
78 __TCA_PEDIT_KEY_EX_MAX
/* Values stored in pedit_key_ex.htype by the key builders in this file. */
81 enum pedit_header_type {
82 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
83 TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
84 TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
85 TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
86 TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
87 TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
/*
 * Values stored in keys_ex[].cmd: SET is used by the address, port and
 * SET_TTL builders, ADD is used for DEC_TTL.
 */
92 TCA_PEDIT_KEY_EX_CMD_SET = 0,
93 TCA_PEDIT_KEY_EX_CMD_ADD = 1,
100 __u32 off; /* offset */
107 struct tc_pedit_sel {
/*
 * Zero-length trailing array. struct pedit_parser declares its own
 * keys[MAX_PEDIT_KEYS] member directly after the selector.
 */
111 struct tc_pedit_key keys[0];
114 #endif /* HAVE_TC_ACT_PEDIT */
116 #ifdef HAVE_TC_ACT_TUNNEL_KEY
118 #include <linux/tc_act/tc_tunnel_key.h>
120 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
121 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
124 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
125 #define TCA_TUNNEL_KEY_NO_CSUM 10
128 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
129 #define TCA_TUNNEL_KEY_ENC_TOS 12
132 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
133 #define TCA_TUNNEL_KEY_ENC_TTL 13
136 #else /* HAVE_TC_ACT_TUNNEL_KEY */
138 #define TCA_ACT_TUNNEL_KEY 17
139 #define TCA_TUNNEL_KEY_ACT_SET 1
140 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
141 #define TCA_TUNNEL_KEY_PARMS 2
142 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
143 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
144 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
145 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
146 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
147 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
148 #define TCA_TUNNEL_KEY_NO_CSUM 10
149 #define TCA_TUNNEL_KEY_ENC_TOS 12
150 #define TCA_TUNNEL_KEY_ENC_TTL 13
152 struct tc_tunnel_key {
157 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
159 /* Normally found in linux/netlink.h. */
160 #ifndef NETLINK_CAP_ACK
161 #define NETLINK_CAP_ACK 10
164 /* Normally found in linux/pkt_sched.h. */
165 #ifndef TC_H_MIN_INGRESS
166 #define TC_H_MIN_INGRESS 0xfff2u
169 /* Normally found in linux/pkt_cls.h. */
170 #ifndef TCA_CLS_FLAGS_SKIP_SW
171 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
173 #ifndef TCA_CLS_FLAGS_IN_HW
174 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
176 #ifndef HAVE_TCA_CHAIN
179 #ifndef HAVE_TCA_FLOWER_ACT
180 #define TCA_FLOWER_ACT 3
182 #ifndef HAVE_TCA_FLOWER_FLAGS
183 #define TCA_FLOWER_FLAGS 22
185 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
186 #define TCA_FLOWER_KEY_ETH_TYPE 8
188 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
189 #define TCA_FLOWER_KEY_ETH_DST 4
191 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
192 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
194 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
195 #define TCA_FLOWER_KEY_ETH_SRC 6
197 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
198 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
200 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
201 #define TCA_FLOWER_KEY_IP_PROTO 9
203 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
204 #define TCA_FLOWER_KEY_IPV4_SRC 10
206 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
207 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
209 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
210 #define TCA_FLOWER_KEY_IPV4_DST 12
212 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
213 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
215 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
216 #define TCA_FLOWER_KEY_IPV6_SRC 14
218 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
219 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
221 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
222 #define TCA_FLOWER_KEY_IPV6_DST 16
224 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
225 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
227 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
228 #define TCA_FLOWER_KEY_TCP_SRC 18
230 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
231 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
233 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
234 #define TCA_FLOWER_KEY_TCP_DST 19
236 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
237 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
239 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
240 #define TCA_FLOWER_KEY_UDP_SRC 20
242 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
243 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
245 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
246 #define TCA_FLOWER_KEY_UDP_DST 21
248 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
249 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
251 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
252 #define TCA_FLOWER_KEY_VLAN_ID 23
254 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
255 #define TCA_FLOWER_KEY_VLAN_PRIO 24
257 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
258 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
260 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
261 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
263 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
264 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
266 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
267 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
269 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
270 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
272 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
273 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
275 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
276 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
278 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
279 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
281 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
282 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
284 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
285 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
287 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
288 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
290 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
291 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
293 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
294 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
296 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
297 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
299 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
300 #define TCA_FLOWER_KEY_TCP_FLAGS 71
302 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
303 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
305 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
306 #define TCA_FLOWER_KEY_IP_TOS 73
308 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
309 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
311 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
312 #define TCA_FLOWER_KEY_IP_TTL 75
314 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
315 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
317 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
318 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
320 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
321 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
323 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
324 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
326 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
327 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
330 #ifndef HAVE_TC_ACT_GOTO_CHAIN
331 #define TC_ACT_GOTO_CHAIN 0x20000000
334 #ifndef IPV6_ADDR_LEN
335 #define IPV6_ADDR_LEN 16
338 #ifndef IPV4_ADDR_LEN
339 #define IPV4_ADDR_LEN 4
343 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
350 #ifndef TCA_ACT_MAX_PRIO
351 #define TCA_ACT_MAX_PRIO 32
354 /** Parameters of VXLAN devices created by driver. */
355 #define MLX5_VXLAN_DEFAULT_VNI 1
356 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
358 /** Tunnel action type, used for @p type in header structure. */
359 enum flow_tcf_tunact_type {
360 FLOW_TCF_TUNACT_VXLAN_DECAP,
361 FLOW_TCF_TUNACT_VXLAN_ENCAP,
364 /** Flags used for @p mask in tunnel action encap descriptors. */
365 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
366 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
367 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
368 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
369 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
370 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
371 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
372 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
373 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
374 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
375 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
378 * Structure for holding netlink context.
379 * Note the size of the message buffer which is MNL_SOCKET_BUFFER_SIZE.
380 * Using this (8KB) buffer size ensures that netlink messages will never be
383 struct mlx5_flow_tcf_context {
384 struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
385 uint32_t seq; /* Message sequence number. */
386 uint32_t buf_size; /* Message buffer size. */
387 uint8_t *buf; /* Message buffer. */
391 * Neigh rule structure. The neigh rule is applied via Netlink to
392 * outer tunnel iface in order to provide destination MAC address
393 * for the VXLAN encapsulation. The neigh rule is implicitly related
394 * to the Flow itself and can be shared by multiple Flows.
396 struct tcf_neigh_rule {
397 LIST_ENTRY(tcf_neigh_rule) next;
399 struct ether_addr eth;
406 uint8_t dst[IPV6_ADDR_LEN];
412 * Local rule structure. The local rule is applied via Netlink to
413 * outer tunnel iface in order to provide local and peer IP addresses
414 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
415 * related to the Flow itself and can be shared by multiple Flows.
417 struct tcf_local_rule {
418 LIST_ENTRY(tcf_local_rule) next;
427 uint8_t dst[IPV6_ADDR_LEN];
428 uint8_t src[IPV6_ADDR_LEN];
433 /** Outer interface VXLAN encapsulation rules container. */
435 LIST_ENTRY(tcf_irule) next;
436 LIST_HEAD(, tcf_neigh_rule) neigh;
437 LIST_HEAD(, tcf_local_rule) local;
439 unsigned int ifouter; /**< Own interface index. */
442 /** VXLAN virtual netdev. */
444 LIST_ENTRY(tcf_vtep) next;
446 unsigned int ifindex; /**< Own interface index. */
451 /** Tunnel descriptor header, common for all tunnel types. */
452 struct flow_tcf_tunnel_hdr {
453 uint32_t type; /**< Tunnel action type. */
454 struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
455 unsigned int ifindex_org; /**< Original dst/src interface */
456 unsigned int *ifindex_ptr; /**< Interface ptr in message. */
459 struct flow_tcf_vxlan_decap {
460 struct flow_tcf_tunnel_hdr hdr;
464 struct flow_tcf_vxlan_encap {
465 struct flow_tcf_tunnel_hdr hdr;
466 struct tcf_irule *iface;
471 struct ether_addr dst;
472 struct ether_addr src;
480 uint8_t dst[IPV6_ADDR_LEN];
481 uint8_t src[IPV6_ADDR_LEN];
493 /** Structure used when extracting the values of a flow counter
494 * from a netlink message.
496 struct flow_tcf_stats_basic {
498 struct gnet_stats_basic counters;
501 /** Empty masks for known item types. */
503 struct rte_flow_item_port_id port_id;
504 struct rte_flow_item_eth eth;
505 struct rte_flow_item_vlan vlan;
506 struct rte_flow_item_ipv4 ipv4;
507 struct rte_flow_item_ipv6 ipv6;
508 struct rte_flow_item_tcp tcp;
509 struct rte_flow_item_udp udp;
510 struct rte_flow_item_vxlan vxlan;
511 } flow_tcf_mask_empty = {
515 /** Supported masks for known item types. */
516 static const struct {
517 struct rte_flow_item_port_id port_id;
518 struct rte_flow_item_eth eth;
519 struct rte_flow_item_vlan vlan;
520 struct rte_flow_item_ipv4 ipv4;
521 struct rte_flow_item_ipv6 ipv6;
522 struct rte_flow_item_tcp tcp;
523 struct rte_flow_item_udp udp;
524 struct rte_flow_item_vxlan vxlan;
525 } flow_tcf_mask_supported = {
530 .type = RTE_BE16(0xffff),
531 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
532 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
535 /* PCP and VID only, no DEI. */
536 .tci = RTE_BE16(0xefff),
537 .inner_type = RTE_BE16(0xffff),
540 .next_proto_id = 0xff,
541 .src_addr = RTE_BE32(0xffffffff),
542 .dst_addr = RTE_BE32(0xffffffff),
547 "\xff\xff\xff\xff\xff\xff\xff\xff"
548 "\xff\xff\xff\xff\xff\xff\xff\xff",
550 "\xff\xff\xff\xff\xff\xff\xff\xff"
551 "\xff\xff\xff\xff\xff\xff\xff\xff",
554 .src_port = RTE_BE16(0xffff),
555 .dst_port = RTE_BE16(0xffff),
559 .src_port = RTE_BE16(0xffff),
560 .dst_port = RTE_BE16(0xffff),
563 .vni = "\xff\xff\xff",
/* Size of a netlink attribute header after MNL_ALIGN() rounding. */
567 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
/* A nest attribute is accounted as a bare attribute header. */
568 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
/* Aligned size of an attribute carrying `len` bytes of payload. */
569 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
/* Aligned size of an attribute whose payload is one object of type `typ`. */
570 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
/* Aligned size of an attribute carrying string `str` plus its NUL byte. */
571 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
/*
 * Value returned by mlx5_dev_to_port_id() for a NULL list of size 0,
 * plus 2.
 * NOTE(review): the two extra slots presumably cover the "at least one
 * port" case and the zero-ifindex terminator written by
 * flow_tcf_build_ptoi_table() -- confirm against the callers.
 */
573 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
575 /** DPDK port to network interface index (ifindex) conversion. */
576 struct flow_tcf_ptoi {
577 uint16_t port_id; /**< DPDK port ID. */
578 unsigned int ifindex; /**< Network interface index. */
581 /* Due to a limitation on driver/FW. */
582 #define MLX5_TCF_GROUP_ID_MAX 3
585 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
586 * Priority in rte_flow attribute starts from 0 and is added by 1 in
587 * translation. This is subject to be changed to determine the max priority
588 * based on trial-and-error like Verbs driver once the restriction is lifted or
589 * the range is extended.
591 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
593 #define MLX5_TCF_FATE_ACTIONS \
594 (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
595 MLX5_FLOW_ACTION_JUMP)
597 #define MLX5_TCF_VLAN_ACTIONS \
598 (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
599 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
601 #define MLX5_TCF_VXLAN_ACTIONS \
602 (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
604 #define MLX5_TCF_PEDIT_ACTIONS \
605 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
606 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
607 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
608 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
609 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
611 #define MLX5_TCF_CONFIG_ACTIONS \
612 (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
613 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
614 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
615 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
/* Capacity of the keys[] and keys_ex[] arrays in struct pedit_parser. */
617 #define MAX_PEDIT_KEYS 128
/* Number of bytes the key builders copy into one key's val field. */
618 #define SZ_PEDIT_KEY_VAL 4
/*
 * Number of pedit keys needed to cover `sz` bytes: sz divided by
 * SZ_PEDIT_KEY_VAL, rounded up.
 */
620 #define NUM_OF_PEDIT_KEYS(sz) \
621 (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
623 struct pedit_key_ex {
624 enum pedit_header_type htype;
628 struct pedit_parser {
629 struct tc_pedit_sel sel;
630 struct tc_pedit_key keys[MAX_PEDIT_KEYS];
631 struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
635 * Create space for using the implicitly created TC flow counter.
638 * Pointer to the Ethernet device structure.
641 * A pointer to the counter data structure, NULL otherwise and
644 static struct mlx5_flow_counter *
645 flow_tcf_counter_new(void)
/*
 * Takes no arguments and returns a pointer to a struct mlx5_flow_counter.
 * NOTE(review): this listing omits short lines (braces, the contents of
 * the tmpl initialiser, any NULL check and the return statement);
 * confirm the exact flow against the complete file.
 */
647 struct mlx5_flow_counter *cnt;
650 * eswitch counter cannot be shared and its id is unknown.
651 * currently returning all with id 0.
652 * in the future maybe better to switch to unique numbers.
654 struct mlx5_flow_counter tmpl = {
657 cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
/*
 * The call above requests storage for exactly one counter object and
 * passes this function's name as the first rte_calloc() argument.
 */
663 /* Implicit counter, do not add to list. */
668 * Set pedit key of MAC address
671 * pointer to action specification
672 * @param[in,out] p_parser
673 * pointer to pedit_parser
676 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
677 struct pedit_parser *p_parser)
/*
 * Fills pedit key entries for a SET_MAC_SRC / SET_MAC_DST action in
 * p_parser->keys[] and p_parser->keys_ex[], starting at index
 * p_parser->sel.nkeys, then stores the advanced index back in sel.nkeys.
 */
679 int idx = p_parser->sel.nkeys;
/* Byte offset of the source or destination address in struct ether_hdr. */
680 uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
681 offsetof(struct ether_hdr, s_addr) :
682 offsetof(struct ether_hdr, d_addr);
683 const struct rte_flow_action_set_mac *conf =
684 (const struct rte_flow_action_set_mac *)actions->conf;
/*
 * First key: the first SZ_PEDIT_KEY_VAL bytes of the address.
 * ~UINT32_MAX evaluates to 0, so this key's mask is all zeroes.
 */
686 p_parser->keys[idx].off = off;
687 p_parser->keys[idx].mask = ~UINT32_MAX;
688 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
689 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
690 memcpy(&p_parser->keys[idx].val,
691 conf->mac_addr, SZ_PEDIT_KEY_VAL);
/*
 * Second key: the remaining ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL bytes,
 * placed SZ_PEDIT_KEY_VAL bytes further into the header.
 * NOTE(review): idx has to advance between the two keys or the second
 * would overwrite the first; that statement is not visible in this
 * listing -- confirm against the complete file.
 * NOTE(review): mask 0xFFFF0000 presumably keeps the half of the 32-bit
 * word that is not part of the address -- confirm against the kernel
 * pedit semantics.
 */
693 p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
694 p_parser->keys[idx].mask = 0xFFFF0000;
695 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
696 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
697 memcpy(&p_parser->keys[idx].val,
698 conf->mac_addr + SZ_PEDIT_KEY_VAL,
699 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
/* Publish the new key count to the selector. */
700 p_parser->sel.nkeys = (++idx);
704 * Set pedit key of decrease/set ttl
707 * pointer to action specification
708 * @param[in,out] p_parser
709 * pointer to pedit_parser
710 * @param[in] item_flags
711 * flags of all items presented
714 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
715 struct pedit_parser *p_parser,
718 int idx = p_parser->sel.nkeys;
/*
 * One key is written at index sel.nkeys. Its mask has every bit set
 * except the low byte.
 */
720 p_parser->keys[idx].mask = 0xFFFFFF00;
/*
 * Header type and field offset follow the outer L3 layer recorded in
 * item_flags: time_to_live for IPv4, hop_limits for IPv6.
 */
721 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
722 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
723 p_parser->keys[idx].off =
724 offsetof(struct ipv4_hdr, time_to_live);
726 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
727 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
728 p_parser->keys[idx].off =
729 offsetof(struct ipv6_hdr, hop_limits);
/*
 * DEC_TTL is expressed as an ADD command with value 0xFF.
 * NOTE(review): presumably this relies on 0xFF acting as -1 on a
 * one-byte field -- confirm against the kernel pedit semantics.
 */
731 if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
732 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
733 p_parser->keys[idx].val = 0x000000FF;
/*
 * SET command carrying ttl_value from the action configuration.
 * NOTE(review): the line separating this from the DEC_TTL branch
 * (presumably an else) is not visible in this listing.
 */
735 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
736 p_parser->keys[idx].val =
737 (__u32)((const struct rte_flow_action_set_ttl *)
738 actions->conf)->ttl_value;
/* Publish the new key count to the selector. */
740 p_parser->sel.nkeys = (++idx);
744 * Set pedit key of transport (TCP/UDP) port value
747 * pointer to action specification
748 * @param[in,out] p_parser
749 * pointer to pedit_parser
750 * @param[in] item_flags
751 * flags of all items presented
754 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
755 struct pedit_parser *p_parser,
758 int idx = p_parser->sel.nkeys;
/*
 * Header type follows the outer L4 layer recorded in item_flags. The two
 * tests are independent, so if both flags are set the TCP assignment,
 * being the later one, is the one that remains. If neither flag is set
 * htype is not written here.
 */
760 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
761 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
762 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
763 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
764 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
765 /* offset of src/dst port is same for TCP and UDP */
766 p_parser->keys[idx].off =
767 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
768 offsetof(struct tcp_hdr, src_port) :
769 offsetof(struct tcp_hdr, dst_port);
/*
 * The port from the action configuration is widened to __u32 without
 * any byte swapping.
 * NOTE(review): mask 0xFFFF0000 presumably keeps the half of the 32-bit
 * word that is not the port -- confirm against the kernel pedit
 * semantics.
 */
770 p_parser->keys[idx].mask = 0xFFFF0000;
771 p_parser->keys[idx].val =
772 (__u32)((const struct rte_flow_action_set_tp *)
773 actions->conf)->port;
/* Publish the new key count to the selector. */
774 p_parser->sel.nkeys = (++idx);
778 * Set pedit key of ipv6 address
781 * pointer to action specification
782 * @param[in,out] p_parser
783 * pointer to pedit_parser
786 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
787 struct pedit_parser *p_parser)
/*
 * Fills one pedit key per SZ_PEDIT_KEY_VAL-byte slice of the IPv6
 * address taken from the action configuration, starting at index
 * p_parser->sel.nkeys.
 */
789 int idx = p_parser->sel.nkeys;
790 int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
792 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
793 offsetof(struct ipv6_hdr, src_addr) :
794 offsetof(struct ipv6_hdr, dst_addr);
795 const struct rte_flow_action_set_ipv6 *conf =
796 (const struct rte_flow_action_set_ipv6 *)actions->conf;
/*
 * Key i targets offset off_base + i * SZ_PEDIT_KEY_VAL and takes its
 * value from the same offset inside conf->ipv6_addr. ~UINT32_MAX
 * evaluates to 0, so every key's mask is all zeroes.
 * NOTE(review): the declaration of off_base and the final memcpy()
 * argument are on lines not visible in this listing.
 */
798 for (int i = 0; i < keys; i++, idx++) {
799 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
800 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
801 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
802 p_parser->keys[idx].mask = ~UINT32_MAX;
803 memcpy(&p_parser->keys[idx].val,
804 conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
/* Account for all the keys written by the loop. */
807 p_parser->sel.nkeys += keys;
811 * Set pedit key of ipv4 address
814 * pointer to action specification
815 * @param[in,out] p_parser
816 * pointer to pedit_parser
819 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
820 struct pedit_parser *p_parser)
/*
 * Fills a single pedit key at index p_parser->sel.nkeys that SETs the
 * IPv4 source or destination address, then bumps sel.nkeys by one.
 */
822 int idx = p_parser->sel.nkeys;
824 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
825 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
/* Field offset inside struct ipv4_hdr, chosen by the action type. */
826 p_parser->keys[idx].off =
827 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
828 offsetof(struct ipv4_hdr, src_addr) :
829 offsetof(struct ipv4_hdr, dst_addr);
/* ~UINT32_MAX evaluates to 0: the mask is all zeroes. */
830 p_parser->keys[idx].mask = ~UINT32_MAX;
/* The address is copied from the action configuration unchanged. */
831 p_parser->keys[idx].val =
832 ((const struct rte_flow_action_set_ipv4 *)
833 actions->conf)->ipv4_addr;
/* Publish the new key count to the selector. */
834 p_parser->sel.nkeys = (++idx);
838 * Create the pedit action's netlink attributes in a netlink message
839 * that is built in a pre-allocated message buffer
842 * pointer to pre-allocated netlink message buffer
843 * @param[in,out] actions
844 * pointer to pointer of actions specification.
845 * @param[in,out] action_flags
846 * pointer to actions flags
847 * @param[in] item_flags
848 * flags of all items presented
851 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
852 const struct rte_flow_action **actions,
855 struct pedit_parser p_parser;
856 struct nlattr *na_act_options;
857 struct nlattr *na_pedit_keys;
/* Start from an all-zero parser so that sel.nkeys begins at 0. */
859 memset(&p_parser, 0, sizeof(p_parser));
/* Action kind string, followed by the opening of the options nest. */
860 mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
861 na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
862 /* all modify header actions should be in one tc-pedit action */
/*
 * Walks the action list through the caller's pointer, so *actions is
 * advanced as actions are consumed. Each supported action type is
 * handed to the matching key builder.
 * NOTE(review): the case terminators and the label of the branch that
 * jumps to pedit_mnl_msg_done are not visible in this listing;
 * presumably that branch handles any other action type -- confirm.
 * NOTE(review): no check of sel.nkeys against MAX_PEDIT_KEYS is visible
 * here or in the key builders -- confirm the key count is bounded
 * elsewhere.
 */
863 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
864 switch ((*actions)->type) {
865 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
866 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
867 flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
869 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
870 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
871 flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
873 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
874 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
875 flow_tcf_pedit_key_set_tp_port(*actions,
876 &p_parser, item_flags);
878 case RTE_FLOW_ACTION_TYPE_SET_TTL:
879 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
880 flow_tcf_pedit_key_set_dec_ttl(*actions,
881 &p_parser, item_flags);
883 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
884 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
885 flow_tcf_pedit_key_set_mac(*actions, &p_parser);
888 goto pedit_mnl_msg_done;
/*
 * Emit the selector. The attribute length is the selector itself plus
 * one struct tc_pedit_key per collected key.
 */
892 p_parser.sel.action = TC_ACT_PIPE;
893 mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
894 sizeof(p_parser.sel) +
895 p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
898 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
/*
 * One nested TCA_PEDIT_KEY_EX attribute per key, each holding the
 * key's header type and command as 16-bit attributes.
 */
899 for (int i = 0; i < p_parser.sel.nkeys; i++) {
900 struct nlattr *na_pedit_key =
901 mnl_attr_nest_start(nl,
902 TCA_PEDIT_KEY_EX | NLA_F_NESTED);
903 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
904 p_parser.keys_ex[i].htype);
905 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
906 p_parser.keys_ex[i].cmd);
907 mnl_attr_nest_end(nl, na_pedit_key);
/* Close the keys nest, then the options nest opened at the top. */
909 mnl_attr_nest_end(nl, na_pedit_keys);
910 mnl_attr_nest_end(nl, na_act_options);
915 * Calculate max memory size of one TC-pedit actions.
916 * One TC-pedit action can contain set of keys each defining
917 * a rewrite element (rte_flow action)
919 * @param[in,out] actions
920 * actions specification.
921 * @param[in,out] action_flags
923 * @param[in,out] size
926 * Max memory size of one TC-pedit action
929 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
930 uint64_t *action_flags)
/*
 * Fixed part of the size: two nest headers and the "pedit" kind string.
 * NOTE(review): the declarations and initial values of pedit_size, keys
 * and flags are on lines not visible in this listing.
 */
936 pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
937 SZ_NLATTR_STRZ_OF("pedit") +
938 SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
/*
 * Walks the action list through the caller's pointer, so *actions is
 * advanced as actions are consumed. Each supported action adds the
 * number of keys needed for its field width and records its
 * MLX5_FLOW_ACTION_* flag.
 * NOTE(review): the case terminators and the label of the branch that
 * jumps to get_pedit_action_size_done are not visible in this listing.
 */
939 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
940 switch ((*actions)->type) {
941 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
942 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
943 flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
945 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
946 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
947 flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
949 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
950 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
951 flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
953 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
954 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
955 flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
957 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
958 /* Same key count for TCP and UDP ports. */
959 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
960 flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
962 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
963 /* Same key count for TCP and UDP ports. */
964 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
965 flags |= MLX5_FLOW_ACTION_SET_TP_DST;
967 case RTE_FLOW_ACTION_TYPE_SET_TTL:
968 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
969 flags |= MLX5_FLOW_ACTION_SET_TTL;
971 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
972 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
973 flags |= MLX5_FLOW_ACTION_DEC_TTL;
975 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
976 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
977 flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
979 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
980 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
981 flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
984 goto get_pedit_action_size_done;
987 get_pedit_action_size_done:
988 /* TCA_PEDIT_PARMS_EX */
990 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
991 keys * sizeof(struct tc_pedit_key));
992 pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS_EX */
994 /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
995 (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
996 SZ_NLATTR_DATA_OF(2));
/* Merge the flags collected above into the caller's action flags. */
997 (*action_flags) |= flags;
1003 * Retrieve mask for pattern item.
1005 * This function does basic sanity checks on a pattern item in order to
1006 * return the most appropriate mask for it.
1009 * Item specification.
1010 * @param[in] mask_default
1011 * Default mask for pattern item as specified by the flow API.
1012 * @param[in] mask_supported
1013 * Mask fields supported by the implementation.
1014 * @param[in] mask_empty
1015 * Empty mask to return when there is no specification.
1017 * Perform verbose error reporting if not NULL.
1020 * Either @p item->mask or one of the mask parameters on success, NULL
1021 * otherwise and rte_errno is set.
1024 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1025 const void *mask_supported, const void *mask_empty,
1026 size_t mask_size, struct rte_flow_error *error)
/*
 * NOTE(review): this listing omits short lines (braces, returns, the
 * declaration of i and parts of some conditions); the comments below
 * describe only what the visible lines establish.
 */
1028 const uint8_t *mask;
1031 /* item->last and item->mask cannot exist without item->spec. */
1032 if (!item->spec && (item->mask || item->last)) {
1033 rte_flow_error_set(error, EINVAL,
1034 RTE_FLOW_ERROR_TYPE_ITEM, item,
1035 "\"mask\" or \"last\" field provided without"
1036 " a corresponding \"spec\"");
1039 /* No spec, no mask, no problem. */
/* Use the item's own mask when present, mask_default otherwise. */
1042 mask = item->mask ? item->mask : mask_default;
1045 * Single-pass check to make sure that:
1046 * - Mask is supported, no bits are set outside mask_supported.
1047 * - Both item->spec and item->last are included in mask.
1049 for (i = 0; i != mask_size; ++i) {
/*
 * Byte-wise checks over mask_size bytes. The first comparison fails
 * when mask[i] has a bit that mask_supported[i] lacks. The second
 * fails when spec and last differ under the mask.
 */
1052 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1053 ((const uint8_t *)mask_supported)[i]) {
1054 rte_flow_error_set(error, ENOTSUP,
1055 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1056 "unsupported field found"
1061 (((const uint8_t *)item->spec)[i] & mask[i]) !=
1062 (((const uint8_t *)item->last)[i] & mask[i])) {
1063 rte_flow_error_set(error, EINVAL,
1064 RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1066 "range between \"spec\" and \"last\""
1067 " not comprised in \"mask\"");
1075 * Build a conversion table between port ID and ifindex.
1078 * Pointer to Ethernet device.
1080 * Pointer to ptoi table.
1082 * Size of ptoi table provided.
1085 * Size of ptoi table filled.
1088 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1091 unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
/*
 * n is taken from a call made with a NULL list of size 0.
 * NOTE(review): presumably that call only reports how many ports share
 * dev->device -- confirm against mlx5_dev_to_port_id().
 * The port_id array is sized n + 1, leaving room for the port_id[0]
 * assignment below.
 */
1092 uint16_t port_id[n + 1];
1094 unsigned int own = 0;
1096 /* At least one port is needed when no switch domain is present. */
1099 port_id[0] = dev->data->port_id;
/* Fill port_id[]; RTE_MIN keeps n from growing past the first value. */
1101 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
/*
 * Record the port ID and the if_index reported by
 * rte_eth_dev_info_get() for each collected port.
 * NOTE(review): the statement executed when the port matches the
 * current device is not visible in this listing; presumably it
 * remembers the index in `own` -- confirm.
 */
1105 for (i = 0; i != n; ++i) {
1106 struct rte_eth_dev_info dev_info;
1108 rte_eth_dev_info_get(port_id[i], &dev_info);
1109 if (port_id[i] == dev->data->port_id)
1111 ptoi[i].port_id = port_id[i];
1112 ptoi[i].ifindex = dev_info.if_index;
1114 /* Ensure first entry of ptoi[] is the current device. */
/*
 * NOTE(review): ptoi[n] is read here before the terminator is written
 * below, so it presumably serves as the temporary of a swap whose first
 * statement is not visible in this listing -- confirm.
 */
1117 ptoi[0] = ptoi[own];
1118 ptoi[own] = ptoi[n];
1120 /* An entry with zero ifindex terminates ptoi[]. */
1121 ptoi[n].port_id = 0;
1122 ptoi[n].ifindex = 0;
1127 * Verify the @p attr will be correctly understood by the E-switch.
1130 * Pointer to flow attributes
1132 * Pointer to error structure.
1135 * 0 on success, a negative errno value otherwise and rte_errno is set.
1138 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1139 struct rte_flow_error *error)
1142 * Supported attributes: groups, some priorities and ingress only.
1143 * group is supported only if kernel supports chain. Don't care about
1144 * transfer as it is the caller's problem.
/* Groups above MLX5_TCF_GROUP_ID_MAX are rejected with ENOTSUP. */
1146 if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1147 return rte_flow_error_set(error, ENOTSUP,
1148 RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1149 "group ID larger than "
1150 RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1151 " isn't supported");
/* Priorities above MLX5_TCF_GROUP_PRIORITY_MAX are rejected with ENOTSUP. */
1152 else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1153 return rte_flow_error_set(error, ENOTSUP,
1154 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1156 "priority more than "
1157 RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1158 " is not supported");
/*
 * Two direction errors follow: EINVAL "only ingress is supported" and
 * ENOTSUP "egress is not supported".
 * NOTE(review): the conditions guarding them are not visible in this
 * listing; presumably they test attr->ingress and attr->egress --
 * confirm against the complete file.
 */
1160 return rte_flow_error_set(error, EINVAL,
1161 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1162 attr, "only ingress is supported");
1164 return rte_flow_error_set(error, ENOTSUP,
1165 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1166 attr, "egress is not supported");
1171 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1172 * The routine checks the L2 fields to be used in encapsulation header.
1175 * Pointer to the item structure.
1177 * Pointer to the error structure.
1180 * 0 on success, a negative errno value otherwise and rte_errno is set.
1183 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1184 struct rte_flow_error *error)
/*
 * NOTE(review): this listing omits short lines (braces, the tests on
 * spec and mask, the errno arguments of the error calls and the
 * returns); the comments below describe only what the visible lines
 * establish.
 */
1186 const struct rte_flow_item_eth *spec = item->spec;
1187 const struct rte_flow_item_eth *mask = item->mask;
1191 * Specification for L2 addresses can be empty
1192 * because these ones are optional and not
1193 * required directly by tc rule. Kernel tries
1194 * to resolve these ones on its own
1199 /* If mask is not specified use the default one. */
1200 mask = &rte_flow_item_eth_mask;
/*
 * Destination address: a mask that is neither all-zero (equal to
 * flow_tcf_mask_empty.eth.dst) nor equal to the default
 * rte_flow_item_eth_mask.dst is reported as a partial mask.
 */
1202 if (memcmp(&mask->dst,
1203 &flow_tcf_mask_empty.eth.dst,
1204 sizeof(flow_tcf_mask_empty.eth.dst))) {
1205 if (memcmp(&mask->dst,
1206 &rte_flow_item_eth_mask.dst,
1207 sizeof(rte_flow_item_eth_mask.dst)))
1208 return rte_flow_error_set
1210 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1211 "no support for partial mask on"
1212 " \"eth.dst\" field");
/* Source address: same rule as for the destination address. */
1214 if (memcmp(&mask->src,
1215 &flow_tcf_mask_empty.eth.src,
1216 sizeof(flow_tcf_mask_empty.eth.src))) {
1217 if (memcmp(&mask->src,
1218 &rte_flow_item_eth_mask.src,
1219 sizeof(rte_flow_item_eth_mask.src)))
1220 return rte_flow_error_set
1222 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1223 "no support for partial mask on"
1224 " \"eth.type\" field");
/*
 * Ethernet type: a non-zero mask other than 0xffff is reported as a
 * partial mask.
 * NOTE(review): the call that emits the "parameter ignored" text is not
 * visible in this listing; presumably it is a log message rather than
 * an error return -- confirm.
 */
1226 if (mask->type != RTE_BE16(0x0000)) {
1227 if (mask->type != RTE_BE16(0xffff))
1228 return rte_flow_error_set
1230 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1231 "no support for partial mask on"
1232 " \"eth.type\" field");
1234 "outer ethernet type field"
1235 " cannot be forced for vxlan"
1236 " encapsulation, parameter ignored");
1242 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1243 * The routine checks the IPv4 fields to be used in encapsulation header.
1246 * Pointer to the item structure.
1248 * Pointer to the error structure.
1251 * 0 on success, a negative errno value otherwise and rte_errno is set.
1254 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1255 struct rte_flow_error *error)
1257 const struct rte_flow_item_ipv4 *spec = item->spec;
1258 const struct rte_flow_item_ipv4 *mask = item->mask;
1262 * Specification for IP addresses cannot be empty
1263 * because it is required by tunnel_key parameter.
1265 return rte_flow_error_set(error, EINVAL,
1266 RTE_FLOW_ERROR_TYPE_ITEM, item,
1267 "NULL outer ipv4 address"
1268 " specification for vxlan"
1272 mask = &rte_flow_item_ipv4_mask;
1273 if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1274 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1275 return rte_flow_error_set
1277 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1278 "no support for partial mask on"
1279 " \"ipv4.hdr.dst_addr\" field"
1280 " for vxlan encapsulation");
1281 /* More IPv4 address validations can be put here. */
1284 * Kernel uses the destination IP address to determine
1285 * the routing path and obtain the MAC destination
1286 * address, so IP destination address must be
1287 * specified in the tc rule.
1289 return rte_flow_error_set(error, EINVAL,
1290 RTE_FLOW_ERROR_TYPE_ITEM, item,
1291 "outer ipv4 destination address"
1292 " must be specified for"
1293 " vxlan encapsulation");
1295 if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1296 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1297 return rte_flow_error_set
1299 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1300 "no support for partial mask on"
1301 " \"ipv4.hdr.src_addr\" field"
1302 " for vxlan encapsulation");
1303 /* More IPv4 address validations can be put here. */
1306 * Kernel uses the source IP address to select the
1307 * interface for egress encapsulated traffic, so
1308 * it must be specified in the tc rule.
1310 return rte_flow_error_set(error, EINVAL,
1311 RTE_FLOW_ERROR_TYPE_ITEM, item,
1312 "outer ipv4 source address"
1313 " must be specified for"
1314 " vxlan encapsulation");
1316 if (mask->hdr.type_of_service &&
1317 mask->hdr.type_of_service != 0xff)
1318 return rte_flow_error_set(error, ENOTSUP,
1319 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1320 "no support for partial mask on"
1321 " \"ipv4.hdr.type_of_service\" field"
1322 " for vxlan encapsulation");
1323 if (mask->hdr.time_to_live &&
1324 mask->hdr.time_to_live != 0xff)
1325 return rte_flow_error_set(error, ENOTSUP,
1326 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1327 "no support for partial mask on"
1328 " \"ipv4.hdr.time_to_live\" field"
1329 " for vxlan encapsulation");
1334 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1335 * The routine checks the IPv6 fields to be used in encapsulation header.
1338 * Pointer to the item structure.
1340 * Pointer to the error structure.
1343 * 0 on success, a negative errno value otherwise and rte_errno is set.
1346 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1347 struct rte_flow_error *error)
1349 const struct rte_flow_item_ipv6 *spec = item->spec;
1350 const struct rte_flow_item_ipv6 *mask = item->mask;
1355 * Specification for IP addresses cannot be empty
1356 * because it is required by tunnel_key parameter.
1358 return rte_flow_error_set(error, EINVAL,
1359 RTE_FLOW_ERROR_TYPE_ITEM, item,
1360 "NULL outer ipv6 address"
1361 " specification for"
1362 " vxlan encapsulation");
1365 mask = &rte_flow_item_ipv6_mask;
1366 if (memcmp(&mask->hdr.dst_addr,
1367 &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1369 if (memcmp(&mask->hdr.dst_addr,
1370 &rte_flow_item_ipv6_mask.hdr.dst_addr,
1372 return rte_flow_error_set
1374 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1375 "no support for partial mask on"
1376 " \"ipv6.hdr.dst_addr\" field"
1377 " for vxlan encapsulation");
1378 /* More IPv6 address validations can be put here. */
1381 * Kernel uses the destination IP address to determine
1382 * the routing path and obtain the MAC destination
1383 * address (neigh or gate), so IP destination address
1384 * must be specified within the tc rule.
1386 return rte_flow_error_set(error, EINVAL,
1387 RTE_FLOW_ERROR_TYPE_ITEM, item,
1388 "outer ipv6 destination address"
1389 " must be specified for"
1390 " vxlan encapsulation");
1392 if (memcmp(&mask->hdr.src_addr,
1393 &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1395 if (memcmp(&mask->hdr.src_addr,
1396 &rte_flow_item_ipv6_mask.hdr.src_addr,
1398 return rte_flow_error_set
1400 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1401 "no support for partial mask on"
1402 " \"ipv6.hdr.src_addr\" field"
1403 " for vxlan encapsulation");
1404 /* More L3 address validation can be put here. */
1407 * Kernel uses the source IP address to select the
1408 * interface for egress encapsulated traffic, so
1409 * it must be specified in the tc rule.
1411 return rte_flow_error_set(error, EINVAL,
1412 RTE_FLOW_ERROR_TYPE_ITEM, item,
1413 "outer L3 source address"
1414 " must be specified for"
1415 " vxlan encapsulation");
1417 msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1418 IPV6_HDR_TC_SHIFT) & 0xff;
1419 if (msk6 && msk6 != 0xff)
1420 return rte_flow_error_set(error, ENOTSUP,
1421 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1422 "no support for partial mask on"
1423 " \"ipv6.hdr.vtc_flow.tos\" field"
1424 " for vxlan encapsulation");
1425 if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1426 return rte_flow_error_set(error, ENOTSUP,
1427 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1428 "no support for partial mask on"
1429 " \"ipv6.hdr.hop_limits\" field"
1430 " for vxlan encapsulation");
1435 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1436 * The routine checks the UDP fields to be used in encapsulation header.
1439 * Pointer to the item structure.
1441 * Pointer to the error structure.
1444 * 0 on success, a negative errno value otherwise and rte_errno is set.
1447 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1448 struct rte_flow_error *error)
1450 const struct rte_flow_item_udp *spec = item->spec;
1451 const struct rte_flow_item_udp *mask = item->mask;
1455 * Specification for UDP ports cannot be empty
1456 * because it is required by tunnel_key parameter.
1458 return rte_flow_error_set(error, EINVAL,
1459 RTE_FLOW_ERROR_TYPE_ITEM, item,
1460 "NULL UDP port specification "
1461 " for vxlan encapsulation");
1464 mask = &rte_flow_item_udp_mask;
1465 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1466 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1467 return rte_flow_error_set
1469 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1470 "no support for partial mask on"
1471 " \"udp.hdr.dst_port\" field"
1472 " for vxlan encapsulation");
1473 if (!spec->hdr.dst_port)
1474 return rte_flow_error_set
1476 RTE_FLOW_ERROR_TYPE_ITEM, item,
1477 "outer UDP remote port cannot be"
1478 " 0 for vxlan encapsulation");
1480 return rte_flow_error_set(error, EINVAL,
1481 RTE_FLOW_ERROR_TYPE_ITEM, item,
1482 "outer UDP remote port"
1483 " must be specified for"
1484 " vxlan encapsulation");
1486 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1487 if (mask->hdr.src_port != RTE_BE16(0xffff))
1488 return rte_flow_error_set
1490 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1491 "no support for partial mask on"
1492 " \"udp.hdr.src_port\" field"
1493 " for vxlan encapsulation");
1495 "outer UDP source port cannot be"
1496 " forced for vxlan encapsulation,"
1497 " parameter ignored");
1503 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1504 * The routine checks the VNI fields to be used in encapsulation header.
1507 * Pointer to the item structure.
1509 * Pointer to the error structure.
1512 * 0 on success, a negative errno value otherwise and rte_errno is set.
1515 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1516 struct rte_flow_error *error)
1518 const struct rte_flow_item_vxlan *spec = item->spec;
1519 const struct rte_flow_item_vxlan *mask = item->mask;
1522 /* Outer VNI is required by tunnel_key parameter. */
1523 return rte_flow_error_set(error, EINVAL,
1524 RTE_FLOW_ERROR_TYPE_ITEM, item,
1525 "NULL VNI specification"
1526 " for vxlan encapsulation");
1529 mask = &rte_flow_item_vxlan_mask;
1530 if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1531 return rte_flow_error_set(error, EINVAL,
1532 RTE_FLOW_ERROR_TYPE_ITEM, item,
1533 "outer VNI must be specified "
1534 "for vxlan encapsulation");
1535 if (mask->vni[0] != 0xff ||
1536 mask->vni[1] != 0xff ||
1537 mask->vni[2] != 0xff)
1538 return rte_flow_error_set(error, ENOTSUP,
1539 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1540 "no support for partial mask on"
1541 " \"vxlan.vni\" field");
1543 if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1544 return rte_flow_error_set(error, EINVAL,
1545 RTE_FLOW_ERROR_TYPE_ITEM, item,
1546 "vxlan vni cannot be 0");
1551 * Validate VXLAN_ENCAP action item list for E-Switch.
1552 * The routine checks items to be used in encapsulation header.
1555 * Pointer to the VXLAN_ENCAP action structure.
1557 * Pointer to the error structure.
1560 * 0 on success, a negative errno value otherwise and rte_errno is set.
1563 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1564 struct rte_flow_error *error)
1566 const struct rte_flow_item *items;
1568 uint32_t item_flags = 0;
1571 return rte_flow_error_set(error, EINVAL,
1572 RTE_FLOW_ERROR_TYPE_ACTION, action,
1573 "Missing vxlan tunnel"
1574 " action configuration");
1575 items = ((const struct rte_flow_action_vxlan_encap *)
1576 action->conf)->definition;
1578 return rte_flow_error_set(error, EINVAL,
1579 RTE_FLOW_ERROR_TYPE_ACTION, action,
1580 "Missing vxlan tunnel"
1581 " encapsulation parameters");
1582 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1583 switch (items->type) {
1584 case RTE_FLOW_ITEM_TYPE_VOID:
1586 case RTE_FLOW_ITEM_TYPE_ETH:
1587 ret = mlx5_flow_validate_item_eth(items, item_flags,
1591 ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1594 item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1597 case RTE_FLOW_ITEM_TYPE_IPV4:
1598 ret = mlx5_flow_validate_item_ipv4(items, item_flags,
1602 ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1605 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1607 case RTE_FLOW_ITEM_TYPE_IPV6:
1608 ret = mlx5_flow_validate_item_ipv6(items, item_flags,
1612 ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1615 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1617 case RTE_FLOW_ITEM_TYPE_UDP:
1618 ret = mlx5_flow_validate_item_udp(items, item_flags,
1622 ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1625 item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1627 case RTE_FLOW_ITEM_TYPE_VXLAN:
1628 ret = mlx5_flow_validate_item_vxlan(items,
1632 ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1635 item_flags |= MLX5_FLOW_LAYER_VXLAN;
1638 return rte_flow_error_set
1640 RTE_FLOW_ERROR_TYPE_ITEM, items,
1641 "vxlan encap item not supported");
1644 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1645 return rte_flow_error_set(error, EINVAL,
1646 RTE_FLOW_ERROR_TYPE_ACTION, action,
1647 "no outer IP layer found"
1648 " for vxlan encapsulation");
1649 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1650 return rte_flow_error_set(error, EINVAL,
1651 RTE_FLOW_ERROR_TYPE_ACTION, action,
1652 "no outer UDP layer found"
1653 " for vxlan encapsulation");
1654 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1655 return rte_flow_error_set(error, EINVAL,
1656 RTE_FLOW_ERROR_TYPE_ACTION, action,
1657 "no VXLAN VNI found"
1658 " for vxlan encapsulation");
1663 * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1664 * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1667 * Outer UDP layer item (if any, NULL otherwise).
1669 * Pointer to the error structure.
1672 * 0 on success, a negative errno value otherwise and rte_errno is set.
1675 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1676 struct rte_flow_error *error)
1678 const struct rte_flow_item_udp *spec = udp->spec;
1679 const struct rte_flow_item_udp *mask = udp->mask;
1683 * Specification for UDP ports cannot be empty
1684 * because it is required as decap parameter.
1686 return rte_flow_error_set(error, EINVAL,
1687 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1688 "NULL UDP port specification"
1689 " for VXLAN decapsulation");
1691 mask = &rte_flow_item_udp_mask;
1692 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1693 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1694 return rte_flow_error_set
1696 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1697 "no support for partial mask on"
1698 " \"udp.hdr.dst_port\" field");
1699 if (!spec->hdr.dst_port)
1700 return rte_flow_error_set
1702 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1703 "zero decap local UDP port");
1705 return rte_flow_error_set(error, EINVAL,
1706 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1707 "outer UDP destination port must be "
1708 "specified for vxlan decapsulation");
1710 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1711 if (mask->hdr.src_port != RTE_BE16(0xffff))
1712 return rte_flow_error_set
1714 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1715 "no support for partial mask on"
1716 " \"udp.hdr.src_port\" field");
1718 "outer UDP local port cannot be "
1719 "forced for VXLAN encapsulation, "
1720 "parameter ignored");
1726 * Validate flow for E-Switch.
1729 * Pointer to the priv structure.
1731 * Pointer to the flow attributes.
1733 * Pointer to the list of items.
1734 * @param[in] actions
1735 * Pointer to the list of actions.
1737 * Pointer to the error structure.
1740 * 0 on success, a negative errno value otherwise and rte_errno is set.
1743 flow_tcf_validate(struct rte_eth_dev *dev,
1744 const struct rte_flow_attr *attr,
1745 const struct rte_flow_item items[],
1746 const struct rte_flow_action actions[],
1747 struct rte_flow_error *error)
1750 const struct rte_flow_item_port_id *port_id;
1751 const struct rte_flow_item_eth *eth;
1752 const struct rte_flow_item_vlan *vlan;
1753 const struct rte_flow_item_ipv4 *ipv4;
1754 const struct rte_flow_item_ipv6 *ipv6;
1755 const struct rte_flow_item_tcp *tcp;
1756 const struct rte_flow_item_udp *udp;
1757 const struct rte_flow_item_vxlan *vxlan;
1760 const struct rte_flow_action_port_id *port_id;
1761 const struct rte_flow_action_jump *jump;
1762 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1763 const struct rte_flow_action_of_set_vlan_vid *
1765 const struct rte_flow_action_of_set_vlan_pcp *
1767 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1768 const struct rte_flow_action_set_ipv4 *set_ipv4;
1769 const struct rte_flow_action_set_ipv6 *set_ipv6;
1771 const struct rte_flow_item *outer_udp = NULL;
1772 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1773 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1774 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1775 uint64_t item_flags = 0;
1776 uint64_t action_flags = 0;
1777 uint8_t next_protocol = 0xff;
1778 unsigned int tcm_ifindex = 0;
1779 uint8_t pedit_validated = 0;
1780 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1781 struct rte_eth_dev *port_id_dev = NULL;
1782 bool in_port_id_set;
1785 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1786 PTOI_TABLE_SZ_MAX(dev)));
1787 ret = flow_tcf_validate_attributes(attr, error);
1790 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1792 uint64_t current_action_flag = 0;
1794 switch (actions->type) {
1795 case RTE_FLOW_ACTION_TYPE_VOID:
1797 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1798 current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1801 conf.port_id = actions->conf;
1802 if (conf.port_id->original)
1805 for (i = 0; ptoi[i].ifindex; ++i)
1806 if (ptoi[i].port_id == conf.port_id->id)
1808 if (!ptoi[i].ifindex)
1809 return rte_flow_error_set
1811 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1813 "missing data to convert port ID to"
1815 port_id_dev = &rte_eth_devices[conf.port_id->id];
1817 case RTE_FLOW_ACTION_TYPE_JUMP:
1818 current_action_flag = MLX5_FLOW_ACTION_JUMP;
1821 conf.jump = actions->conf;
1822 if (attr->group >= conf.jump->group)
1823 return rte_flow_error_set
1825 RTE_FLOW_ERROR_TYPE_ACTION,
1827 "can jump only to a group forward");
1829 case RTE_FLOW_ACTION_TYPE_DROP:
1830 current_action_flag = MLX5_FLOW_ACTION_DROP;
1832 case RTE_FLOW_ACTION_TYPE_COUNT:
1834 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1835 current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1837 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1838 rte_be16_t ethertype;
1840 current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1843 conf.of_push_vlan = actions->conf;
1844 ethertype = conf.of_push_vlan->ethertype;
1845 if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1846 ethertype != RTE_BE16(ETH_P_8021AD))
1847 return rte_flow_error_set
1849 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1850 "vlan push TPID must be "
1851 "802.1Q or 802.1AD");
1854 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1855 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1856 return rte_flow_error_set
1858 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1859 "vlan modify is not supported,"
1860 " set action must follow push action");
1861 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1863 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1864 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1865 return rte_flow_error_set
1867 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1868 "vlan modify is not supported,"
1869 " set action must follow push action");
1870 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1872 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1873 current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1875 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1876 ret = flow_tcf_validate_vxlan_encap(actions, error);
1879 current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1881 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1882 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1884 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1885 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1887 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1888 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1890 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1891 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1893 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1894 current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1896 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1897 current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1899 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1900 current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1902 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1903 current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1905 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1906 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1908 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1909 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1912 return rte_flow_error_set(error, ENOTSUP,
1913 RTE_FLOW_ERROR_TYPE_ACTION,
1915 "action not supported");
1917 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1919 return rte_flow_error_set
1921 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1923 "action configuration not set");
1925 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1927 return rte_flow_error_set(error, ENOTSUP,
1928 RTE_FLOW_ERROR_TYPE_ACTION,
1930 "set actions should be "
1931 "listed successively");
1932 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1933 (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1934 pedit_validated = 1;
1935 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1936 (action_flags & MLX5_TCF_FATE_ACTIONS))
1937 return rte_flow_error_set(error, EINVAL,
1938 RTE_FLOW_ERROR_TYPE_ACTION,
1940 "can't have multiple fate"
1942 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1943 (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1944 return rte_flow_error_set(error, EINVAL,
1945 RTE_FLOW_ERROR_TYPE_ACTION,
1947 "can't have multiple vxlan"
1949 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1950 (action_flags & MLX5_TCF_VLAN_ACTIONS))
1951 return rte_flow_error_set(error, ENOTSUP,
1952 RTE_FLOW_ERROR_TYPE_ACTION,
1954 "can't have vxlan and vlan"
1955 " actions in the same rule");
1956 action_flags |= current_action_flag;
1958 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1961 switch (items->type) {
1962 case RTE_FLOW_ITEM_TYPE_VOID:
1964 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1965 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1966 return rte_flow_error_set
1968 RTE_FLOW_ERROR_TYPE_ITEM, items,
1969 "inner tunnel port id"
1970 " item is not supported");
1971 mask.port_id = flow_tcf_item_mask
1972 (items, &rte_flow_item_port_id_mask,
1973 &flow_tcf_mask_supported.port_id,
1974 &flow_tcf_mask_empty.port_id,
1975 sizeof(flow_tcf_mask_supported.port_id),
1979 if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1983 spec.port_id = items->spec;
1984 if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1985 return rte_flow_error_set
1987 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
1989 "no support for partial mask on"
1991 if (!mask.port_id->id)
1994 for (i = 0; ptoi[i].ifindex; ++i)
1995 if (ptoi[i].port_id == spec.port_id->id)
1997 if (!ptoi[i].ifindex)
1998 return rte_flow_error_set
2000 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2002 "missing data to convert port ID to"
2004 if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2005 return rte_flow_error_set
2007 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2009 "cannot match traffic for"
2010 " several port IDs through"
2011 " a single flow rule");
2012 tcm_ifindex = ptoi[i].ifindex;
2015 case RTE_FLOW_ITEM_TYPE_ETH:
2016 ret = mlx5_flow_validate_item_eth(items, item_flags,
2020 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2021 MLX5_FLOW_LAYER_INNER_L2 :
2022 MLX5_FLOW_LAYER_OUTER_L2;
2024 * Redundant check due to different supported mask.
2025 * Same for the rest of items.
2027 mask.eth = flow_tcf_item_mask
2028 (items, &rte_flow_item_eth_mask,
2029 &flow_tcf_mask_supported.eth,
2030 &flow_tcf_mask_empty.eth,
2031 sizeof(flow_tcf_mask_supported.eth),
2035 if (mask.eth->type && mask.eth->type !=
2037 return rte_flow_error_set
2039 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2041 "no support for partial mask on"
2043 assert(items->spec);
2044 spec.eth = items->spec;
2045 if (mask.eth->type &&
2046 (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2047 inner_etype != RTE_BE16(ETH_P_ALL) &&
2048 inner_etype != spec.eth->type)
2049 return rte_flow_error_set
2051 RTE_FLOW_ERROR_TYPE_ITEM,
2053 "inner eth_type conflict");
2054 if (mask.eth->type &&
2055 !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2056 outer_etype != RTE_BE16(ETH_P_ALL) &&
2057 outer_etype != spec.eth->type)
2058 return rte_flow_error_set
2060 RTE_FLOW_ERROR_TYPE_ITEM,
2062 "outer eth_type conflict");
2063 if (mask.eth->type) {
2064 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2065 inner_etype = spec.eth->type;
2067 outer_etype = spec.eth->type;
2070 case RTE_FLOW_ITEM_TYPE_VLAN:
2071 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2072 return rte_flow_error_set
2074 RTE_FLOW_ERROR_TYPE_ITEM, items,
2076 " is not supported");
2077 ret = mlx5_flow_validate_item_vlan(items, item_flags,
2081 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2082 mask.vlan = flow_tcf_item_mask
2083 (items, &rte_flow_item_vlan_mask,
2084 &flow_tcf_mask_supported.vlan,
2085 &flow_tcf_mask_empty.vlan,
2086 sizeof(flow_tcf_mask_supported.vlan),
2090 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2091 (mask.vlan->tci & RTE_BE16(0xe000)) !=
2092 RTE_BE16(0xe000)) ||
2093 (mask.vlan->tci & RTE_BE16(0x0fff) &&
2094 (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2095 RTE_BE16(0x0fff)) ||
2096 (mask.vlan->inner_type &&
2097 mask.vlan->inner_type != RTE_BE16(0xffff)))
2098 return rte_flow_error_set
2100 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2102 "no support for partial masks on"
2103 " \"tci\" (PCP and VID parts) and"
2104 " \"inner_type\" fields");
2105 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2106 outer_etype != RTE_BE16(ETH_P_8021Q))
2107 return rte_flow_error_set
2109 RTE_FLOW_ERROR_TYPE_ITEM,
2111 "outer eth_type conflict,"
2113 outer_etype = RTE_BE16(ETH_P_8021Q);
2114 assert(items->spec);
2115 spec.vlan = items->spec;
2116 if (mask.vlan->inner_type &&
2117 vlan_etype != RTE_BE16(ETH_P_ALL) &&
2118 vlan_etype != spec.vlan->inner_type)
2119 return rte_flow_error_set
2121 RTE_FLOW_ERROR_TYPE_ITEM,
2123 "vlan eth_type conflict");
2124 if (mask.vlan->inner_type)
2125 vlan_etype = spec.vlan->inner_type;
2127 case RTE_FLOW_ITEM_TYPE_IPV4:
2128 ret = mlx5_flow_validate_item_ipv4(items, item_flags,
2132 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2133 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2134 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2135 mask.ipv4 = flow_tcf_item_mask
2136 (items, &rte_flow_item_ipv4_mask,
2137 &flow_tcf_mask_supported.ipv4,
2138 &flow_tcf_mask_empty.ipv4,
2139 sizeof(flow_tcf_mask_supported.ipv4),
2143 if (mask.ipv4->hdr.next_proto_id &&
2144 mask.ipv4->hdr.next_proto_id != 0xff)
2145 return rte_flow_error_set
2147 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2149 "no support for partial mask on"
2150 " \"hdr.next_proto_id\" field");
2151 else if (mask.ipv4->hdr.next_proto_id)
2153 ((const struct rte_flow_item_ipv4 *)
2154 (items->spec))->hdr.next_proto_id;
2155 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2156 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2157 inner_etype != RTE_BE16(ETH_P_IP))
2158 return rte_flow_error_set
2160 RTE_FLOW_ERROR_TYPE_ITEM,
2162 "inner eth_type conflict,"
2163 " IPv4 is required");
2164 inner_etype = RTE_BE16(ETH_P_IP);
2165 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2166 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2167 vlan_etype != RTE_BE16(ETH_P_IP))
2168 return rte_flow_error_set
2170 RTE_FLOW_ERROR_TYPE_ITEM,
2172 "vlan eth_type conflict,"
2173 " IPv4 is required");
2174 vlan_etype = RTE_BE16(ETH_P_IP);
2176 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2177 outer_etype != RTE_BE16(ETH_P_IP))
2178 return rte_flow_error_set
2180 RTE_FLOW_ERROR_TYPE_ITEM,
2182 "eth_type conflict,"
2183 " IPv4 is required");
2184 outer_etype = RTE_BE16(ETH_P_IP);
2187 case RTE_FLOW_ITEM_TYPE_IPV6:
2188 ret = mlx5_flow_validate_item_ipv6(items, item_flags,
2192 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2193 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2194 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2195 mask.ipv6 = flow_tcf_item_mask
2196 (items, &rte_flow_item_ipv6_mask,
2197 &flow_tcf_mask_supported.ipv6,
2198 &flow_tcf_mask_empty.ipv6,
2199 sizeof(flow_tcf_mask_supported.ipv6),
2203 if (mask.ipv6->hdr.proto &&
2204 mask.ipv6->hdr.proto != 0xff)
2205 return rte_flow_error_set
2207 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2209 "no support for partial mask on"
2210 " \"hdr.proto\" field");
2211 else if (mask.ipv6->hdr.proto)
2213 ((const struct rte_flow_item_ipv6 *)
2214 (items->spec))->hdr.proto;
2215 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2216 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2217 inner_etype != RTE_BE16(ETH_P_IPV6))
2218 return rte_flow_error_set
2220 RTE_FLOW_ERROR_TYPE_ITEM,
2222 "inner eth_type conflict,"
2223 " IPv6 is required");
2224 inner_etype = RTE_BE16(ETH_P_IPV6);
2225 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2226 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2227 vlan_etype != RTE_BE16(ETH_P_IPV6))
2228 return rte_flow_error_set
2230 RTE_FLOW_ERROR_TYPE_ITEM,
2232 "vlan eth_type conflict,"
2233 " IPv6 is required");
2234 vlan_etype = RTE_BE16(ETH_P_IPV6);
2236 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2237 outer_etype != RTE_BE16(ETH_P_IPV6))
2238 return rte_flow_error_set
2240 RTE_FLOW_ERROR_TYPE_ITEM,
2242 "eth_type conflict,"
2243 " IPv6 is required");
2244 outer_etype = RTE_BE16(ETH_P_IPV6);
2247 case RTE_FLOW_ITEM_TYPE_UDP:
2248 ret = mlx5_flow_validate_item_udp(items, item_flags,
2249 next_protocol, error);
2252 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2253 MLX5_FLOW_LAYER_INNER_L4_UDP :
2254 MLX5_FLOW_LAYER_OUTER_L4_UDP;
2255 mask.udp = flow_tcf_item_mask
2256 (items, &rte_flow_item_udp_mask,
2257 &flow_tcf_mask_supported.udp,
2258 &flow_tcf_mask_empty.udp,
2259 sizeof(flow_tcf_mask_supported.udp),
2264 * Save the presumed outer UDP item for extra check
2265 * if the tunnel item will be found later in the list.
2267 if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2270 case RTE_FLOW_ITEM_TYPE_TCP:
2271 ret = mlx5_flow_validate_item_tcp
2274 &flow_tcf_mask_supported.tcp,
2278 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2279 MLX5_FLOW_LAYER_INNER_L4_TCP :
2280 MLX5_FLOW_LAYER_OUTER_L4_TCP;
2281 mask.tcp = flow_tcf_item_mask
2282 (items, &rte_flow_item_tcp_mask,
2283 &flow_tcf_mask_supported.tcp,
2284 &flow_tcf_mask_empty.tcp,
2285 sizeof(flow_tcf_mask_supported.tcp),
2290 case RTE_FLOW_ITEM_TYPE_VXLAN:
2291 if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2292 return rte_flow_error_set
2294 RTE_FLOW_ERROR_TYPE_ITEM, items,
2295 "vxlan tunnel over vlan"
2296 " is not supported");
2297 ret = mlx5_flow_validate_item_vxlan(items,
2301 item_flags |= MLX5_FLOW_LAYER_VXLAN;
2302 mask.vxlan = flow_tcf_item_mask
2303 (items, &rte_flow_item_vxlan_mask,
2304 &flow_tcf_mask_supported.vxlan,
2305 &flow_tcf_mask_empty.vxlan,
2306 sizeof(flow_tcf_mask_supported.vxlan), error);
2309 if (mask.vxlan->vni[0] != 0xff ||
2310 mask.vxlan->vni[1] != 0xff ||
2311 mask.vxlan->vni[2] != 0xff)
2312 return rte_flow_error_set
2314 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2316 "no support for partial or "
2317 "empty mask on \"vxlan.vni\" field");
2319 * The VNI item implies a VXLAN tunnel, which requires
2320 * at least the outer destination UDP port to be
2321 * specified without wildcards so the kernel can select
2322 * the virtual VXLAN device by port. An outer IPv4
2323 * or IPv6 item must also be specified (wildcards or even
2324 * a zero mask are allowed) to let the driver know the tunnel
2325 * IP version and process UDP traffic correctly.
2328 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2329 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2330 return rte_flow_error_set
2332 RTE_FLOW_ERROR_TYPE_ACTION,
2334 "no outer IP pattern found"
2335 " for vxlan tunnel");
2336 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2337 return rte_flow_error_set
2339 RTE_FLOW_ERROR_TYPE_ACTION,
2341 "no outer UDP pattern found"
2342 " for vxlan tunnel");
2344 * All items preceding the tunnel item become outer
2345 * ones and we should do extra validation for them
2346 * due to tc limitations for tunnel outer parameters.
2347 * Currently only the outer UDP item requires an extra check;
2348 * use the saved pointer instead of rescanning the item list.
2351 ret = flow_tcf_validate_vxlan_decap_udp
2355 /* Reset L4 protocol for inner parameters. */
2356 next_protocol = 0xff;
2359 return rte_flow_error_set(error, ENOTSUP,
2360 RTE_FLOW_ERROR_TYPE_ITEM,
2361 items, "item not supported");
2364 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2365 (action_flags & MLX5_FLOW_ACTION_DROP))
2366 return rte_flow_error_set(error, ENOTSUP,
2367 RTE_FLOW_ERROR_TYPE_ACTION,
2369 "set action is not compatible with "
2371 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2372 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2373 return rte_flow_error_set(error, ENOTSUP,
2374 RTE_FLOW_ERROR_TYPE_ACTION,
2376 "set action must be followed by "
2379 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2380 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2381 return rte_flow_error_set(error, EINVAL,
2382 RTE_FLOW_ERROR_TYPE_ACTION,
2384 "no ipv4 item found in"
2388 (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2389 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2390 return rte_flow_error_set(error, EINVAL,
2391 RTE_FLOW_ERROR_TYPE_ACTION,
2393 "no ipv6 item found in"
2397 (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2399 (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2400 MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2401 return rte_flow_error_set(error, EINVAL,
2402 RTE_FLOW_ERROR_TYPE_ACTION,
2404 "no TCP/UDP item found in"
2408 * FW syndrome (0xA9C090):
2409 * set_flow_table_entry: push vlan action fte in fdb can ONLY be
2410 * forward to the uplink.
2412 if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2413 (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2414 ((struct priv *)port_id_dev->data->dev_private)->representor)
2415 return rte_flow_error_set(error, ENOTSUP,
2416 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2417 "vlan push can only be applied"
2418 " when forwarding to uplink port");
2420 * FW syndrome (0x294609):
2421 * set_flow_table_entry: modify/pop/push actions in fdb flow table
2422 * are supported only while forwarding to vport.
2424 if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2425 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2426 return rte_flow_error_set(error, ENOTSUP,
2427 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2428 "vlan actions are supported"
2429 " only with port_id action");
2430 if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2431 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2432 return rte_flow_error_set(error, ENOTSUP,
2433 RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2434 "vxlan actions are supported"
2435 " only with port_id action");
2436 if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2437 return rte_flow_error_set(error, EINVAL,
2438 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2439 "no fate action is found");
2441 (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2443 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2444 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2445 return rte_flow_error_set(error, EINVAL,
2446 RTE_FLOW_ERROR_TYPE_ACTION,
2448 "no IP found in pattern");
2451 (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2452 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2453 return rte_flow_error_set(error, ENOTSUP,
2454 RTE_FLOW_ERROR_TYPE_ACTION,
2456 "no ethernet found in"
2459 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2460 !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2461 return rte_flow_error_set(error, EINVAL,
2462 RTE_FLOW_ERROR_TYPE_ACTION,
2464 "no VNI pattern found"
2465 " for vxlan decap action");
2466 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2467 (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2468 return rte_flow_error_set(error, EINVAL,
2469 RTE_FLOW_ERROR_TYPE_ACTION,
2471 "vxlan encap not supported"
2472 " for tunneled traffic");
2477 * Calculate maximum size of memory for flow items of Linux TC flower.
2480 * Pointer to the flow attributes.
2482 * Pointer to the list of items.
2483 * @param[out] action_flags
2484 * Pointer to the detected actions.
2487 * Maximum size of memory for items.
/*
 * flow_tcf_get_items_size(): walks items[] up to RTE_FLOW_ITEM_TYPE_END and
 * adds to size the Netlink attribute space each pattern item may need.
 * A fixed part is added first, plus one uint32_t attribute when
 * attr->group is greater than zero. A VXLAN item additionally ORs
 * MLX5_FLOW_ACTION_VXLAN_DECAP into *action_flags.
 * NOTE(review): the declaration of size and the return statement are on
 * lines not shown in this excerpt.
 */
2490 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2491 const struct rte_flow_item items[],
2492 uint64_t *action_flags)
/* Fixed part of the message, independent of the pattern items. */
2496 size += SZ_NLATTR_STRZ_OF("flower") +
2497 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2498 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2499 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2500 if (attr->group > 0)
2501 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2502 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2503 switch (items->type) {
2504 case RTE_FLOW_ITEM_TYPE_VOID:
2506 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2508 case RTE_FLOW_ITEM_TYPE_ETH:
2509 size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2510 /* dst/src MAC addr and mask. */
2512 case RTE_FLOW_ITEM_TYPE_VLAN:
2513 size += SZ_NLATTR_TYPE_OF(uint16_t) +
2514 /* VLAN Ether type. */
2515 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2516 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2518 case RTE_FLOW_ITEM_TYPE_IPV4: {
/* The local pointer holds the item mask, not the spec. */
2519 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2521 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2522 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2523 /* dst/src IP addr and mask. */
/* Two more attributes are counted only if the mask selects TTL. */
2524 if (ipv4 && ipv4->hdr.time_to_live)
2525 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
/* Two more attributes are counted only if the mask selects TOS. */
2526 if (ipv4 && ipv4->hdr.type_of_service)
2527 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2530 case RTE_FLOW_ITEM_TYPE_IPV6: {
/* The local pointer holds the item mask, not the spec. */
2531 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2533 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2534 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2535 /* dst/src IP addr and mask. */
/* Two more attributes are counted only if the mask selects hop limit. */
2536 if (ipv6 && ipv6->hdr.hop_limits)
2537 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
/* Same for the 8-bit traffic class field inside vtc_flow. */
2538 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2539 (0xfful << IPV6_HDR_TC_SHIFT)))
2540 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2543 case RTE_FLOW_ITEM_TYPE_UDP:
2544 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2545 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2546 /* dst/src port and mask. */
2548 case RTE_FLOW_ITEM_TYPE_TCP:
2549 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2550 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2551 /* dst/src port and mask. */
2553 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* One uint32_t attribute; presumably the VNI key - TODO confirm. */
2554 size += SZ_NLATTR_TYPE_OF(uint32_t);
2556 * There might be no VXLAN decap action in the action
2557 * list, nonetheless the VXLAN tunnel flow requires
2558 * the decap structure to be correctly applied to
2559 * VXLAN device, set the flag to create the structure.
2560 * Translation routine will not put the decap action
2561 * in tne Netlink message if there is no actual action
2564 *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
/* Any other item type is only logged; validation should have rejected it. */
2568 "unsupported item %p type %d,"
2569 " items must be validated before flow creation",
2570 (const void *)items, items->type);
2578 * Calculate size of memory to store the VXLAN encapsulation
2579 * related items in the Netlink message buffer. Items list
2580 * is specified by RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2581 * The item list should be validated.
2584 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2585 * List of pattern items to scan data from.
2588 * The size of the part of the Netlink message buffer needed to store the
2589 * VXLAN encapsulation item attributes.
/*
 * flow_tcf_vxlan_encap_size(): sizes the Netlink attributes needed for the
 * item list stored in the definition field of a VXLAN_ENCAP action.
 * The action type and a non-NULL conf are only checked with assert().
 * NOTE(review): the declaration of size and the return statement are on
 * lines not shown in this excerpt.
 */
2592 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2594 const struct rte_flow_item *items;
2597 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2598 assert(action->conf);
/* The encapsulation header is described as a list of pattern items. */
2600 items = ((const struct rte_flow_action_vxlan_encap *)
2601 action->conf)->definition;
2603 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2604 switch (items->type) {
2605 case RTE_FLOW_ITEM_TYPE_VOID:
2607 case RTE_FLOW_ITEM_TYPE_ETH:
2608 /* This item does not require message buffer. */
2610 case RTE_FLOW_ITEM_TYPE_IPV4: {
/* The local pointer holds the item mask, not the spec. */
2611 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
/* Two address attributes are always counted. */
2613 size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
/* TTL and TOS are counted only when the mask selects them. */
2614 if (ipv4 && ipv4->hdr.time_to_live)
2615 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2616 if (ipv4 && ipv4->hdr.type_of_service)
2617 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2620 case RTE_FLOW_ITEM_TYPE_IPV6: {
/* The local pointer holds the item mask, not the spec. */
2621 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
/* Two address attributes are always counted. */
2623 size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
/* Hop limit and traffic class are counted only when the mask selects them. */
2624 if (ipv6 && ipv6->hdr.hop_limits)
2625 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2626 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2627 (0xfful << IPV6_HDR_TC_SHIFT)))
2628 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2631 case RTE_FLOW_ITEM_TYPE_UDP: {
2632 const struct rte_flow_item_udp *udp = items->mask;
/* One port attribute is always counted. */
2634 size += SZ_NLATTR_TYPE_OF(uint16_t);
/* A second one is counted when there is no mask or it selects src_port. */
2635 if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2636 size += SZ_NLATTR_TYPE_OF(uint16_t);
2639 case RTE_FLOW_ITEM_TYPE_VXLAN:
2640 size += SZ_NLATTR_TYPE_OF(uint32_t);
/* Any other item type is only logged; validation should have rejected it. */
2645 "unsupported item %p type %d,"
2646 " items must be validated"
2647 " before flow creation",
2648 (const void *)items, items->type);
2656 * Calculate maximum size of memory for flow actions of Linux TC flower and
2657 * extract specified actions.
2659 * @param[in] actions
2660 * Pointer to the list of actions.
2661 * @param[out] action_flags
2662 * Pointer to the detected actions.
2665 * Maximum size of memory for actions.
/*
 * flow_tcf_get_actions_and_size(): walks actions[] up to
 * RTE_FLOW_ACTION_TYPE_END, adds the Netlink space each action may need to
 * size and collects one MLX5_FLOW_ACTION_* bit per recognized action in
 * flags, which is stored to *action_flags at the end.
 * NOTE(review): the declarations of size and flags and the return statement
 * are on lines not shown. flow_tcf_prepare() calls this right after
 * flow_tcf_get_items_size(), which may already have set
 * MLX5_FLOW_ACTION_VXLAN_DECAP in the same variable; unless flags starts
 * from *action_flags that bit is overwritten by the final store - confirm.
 */
2668 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2669 uint64_t *action_flags)
2674 size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2675 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2676 switch (actions->type) {
2677 case RTE_FLOW_ACTION_TYPE_VOID:
2679 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2680 size += SZ_NLATTR_NEST + /* na_act_index. */
2681 SZ_NLATTR_STRZ_OF("mirred") +
2682 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2683 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2684 flags |= MLX5_FLOW_ACTION_PORT_ID;
2686 case RTE_FLOW_ACTION_TYPE_JUMP:
2687 size += SZ_NLATTR_NEST + /* na_act_index. */
2688 SZ_NLATTR_STRZ_OF("gact") +
2689 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2690 SZ_NLATTR_TYPE_OF(struct tc_gact);
2691 flags |= MLX5_FLOW_ACTION_JUMP;
2693 case RTE_FLOW_ACTION_TYPE_DROP:
2694 size += SZ_NLATTR_NEST + /* na_act_index. */
2695 SZ_NLATTR_STRZ_OF("gact") +
2696 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2697 SZ_NLATTR_TYPE_OF(struct tc_gact);
2698 flags |= MLX5_FLOW_ACTION_DROP;
2700 case RTE_FLOW_ACTION_TYPE_COUNT:
/* The four VLAN actions differ only in the flag and share one sizing path. */
2702 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2703 flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2704 goto action_of_vlan;
2705 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2706 flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2707 goto action_of_vlan;
2708 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2709 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2710 goto action_of_vlan;
2711 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2712 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2713 goto action_of_vlan;
/* Presumably the action_of_vlan label target (label line not shown). */
2715 size += SZ_NLATTR_NEST + /* na_act_index. */
2716 SZ_NLATTR_STRZ_OF("vlan") +
2717 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2718 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2719 SZ_NLATTR_TYPE_OF(uint16_t) +
2720 /* VLAN protocol. */
2721 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2722 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2724 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2725 size += SZ_NLATTR_NEST + /* na_act_index. */
2726 SZ_NLATTR_STRZ_OF("tunnel_key") +
2727 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2728 SZ_NLATTR_TYPE_OF(uint8_t);
2729 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
/* Encap item attributes plus room for the aligned encap parameter struct. */
2730 size += flow_tcf_vxlan_encap_size(actions) +
2731 RTE_ALIGN_CEIL /* preceding encap params. */
2732 (sizeof(struct flow_tcf_vxlan_encap),
2734 flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2736 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2737 size += SZ_NLATTR_NEST + /* na_act_index. */
2738 SZ_NLATTR_STRZ_OF("tunnel_key") +
2739 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2740 SZ_NLATTR_TYPE_OF(uint8_t);
2741 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2742 size += RTE_ALIGN_CEIL /* preceding decap params. */
2743 (sizeof(struct flow_tcf_vxlan_decap),
2745 flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
/* All header-rewrite actions are sized by one helper. */
2747 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2748 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2749 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2750 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2751 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2752 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2753 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2754 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2755 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2756 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
/* The iterator is passed by address; presumably the helper advances it - confirm. */
2757 size += flow_tcf_get_pedit_actions_size(&actions,
/* Any other action type is only logged; validation should have rejected it. */
2762 "unsupported action %p type %d,"
2763 " items must be validated before flow creation",
2764 (const void *)actions, actions->type);
/* Publish the collected action bits to the caller. */
2768 *action_flags = flags;
2773 * Brand rtnetlink buffer with unique handle.
2775 * This handle should be unique for a given network interface to avoid
2779 * Pointer to Netlink message.
2781 * Unique 32-bit handle to use.
/*
 * flow_tcf_nl_brand(): writes handle into the tcm_handle field of the
 * struct tcmsg found at the payload of nlh, then logs the assignment at
 * DEBUG level.
 */
2784 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
/* The tcmsg is the first thing in the Netlink message payload. */
2786 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
2788 tcm->tcm_handle = handle;
2789 DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
2790 (void *)nlh, handle);
2794 * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2795 * memory required, allocates the memory, initializes Netlink message headers
2796 * and sets a unique TC message handle.
2799 * Pointer to the flow attributes.
2801 * Pointer to the list of items.
2802 * @param[in] actions
2803 * Pointer to the list of actions.
2805 * Pointer to the error structure.
2808 * Pointer to mlx5_flow object on success,
2809 * otherwise NULL and rte_errno is set.
/*
 * flow_tcf_prepare(): computes the buffer size for one flow (mlx5_flow
 * struct, optional tunnel parameter struct, Netlink headers, item and
 * action attributes), allocates it zeroed with rte_zmalloc(), puts the
 * Netlink and TC headers in place, fills dev_flow->tcf and brands the
 * message with a handle derived from the buffer address.
 * NOTE(review): several lines (NULL check after allocation, the tun
 * assignments, the final return) are not shown in this excerpt.
 */
2811 static struct mlx5_flow *
2812 flow_tcf_prepare(const struct rte_flow_attr *attr,
2813 const struct rte_flow_item items[],
2814 const struct rte_flow_action actions[],
2815 struct rte_flow_error *error)
/* Base size: flow struct padded to tunnel header alignment plus both headers. */
2817 size_t size = RTE_ALIGN_CEIL
2818 (sizeof(struct mlx5_flow),
2819 alignof(struct flow_tcf_tunnel_hdr)) +
2820 MNL_ALIGN(sizeof(struct nlmsghdr)) +
2821 MNL_ALIGN(sizeof(struct tcmsg));
2822 struct mlx5_flow *dev_flow;
2823 uint64_t action_flags = 0;
2824 struct nlmsghdr *nlh;
2826 uint8_t *sp, *tun = NULL;
/* Both helpers report detected actions through the same action_flags. */
2828 size += flow_tcf_get_items_size(attr, items, &action_flags);
2829 size += flow_tcf_get_actions_and_size(actions, &action_flags);
2830 dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2832 rte_flow_error_set(error, ENOMEM,
2833 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2834 "not enough memory to create E-Switch flow");
/* sp walks the space that follows the mlx5_flow struct. */
2837 sp = (uint8_t *)(dev_flow + 1);
/* With a tunnel action, skip the aligned parameter struct and drop it from size. */
2838 if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2840 (sp, alignof(struct flow_tcf_tunnel_hdr));
2842 sp += RTE_ALIGN_CEIL
2843 (sizeof(struct flow_tcf_vxlan_encap),
2846 size -= RTE_ALIGN_CEIL
2847 (sizeof(struct flow_tcf_vxlan_encap),
2850 } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2852 (sp, alignof(struct flow_tcf_tunnel_hdr));
2854 sp += RTE_ALIGN_CEIL
2855 (sizeof(struct flow_tcf_vxlan_decap),
2858 size -= RTE_ALIGN_CEIL
2859 (sizeof(struct flow_tcf_vxlan_decap),
2863 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
/* The Netlink message starts at sp. */
2865 nlh = mnl_nlmsg_put_header(sp);
2866 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2867 *dev_flow = (struct mlx5_flow){
2868 .tcf = (struct mlx5_flow_tcf){
/* nlsize excludes the padded mlx5_flow struct. */
2870 .nlsize = size - RTE_ALIGN_CEIL
2871 (sizeof(struct mlx5_flow),
2872 alignof(struct flow_tcf_tunnel_hdr)),
2874 .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
/*
 * NOTE(review): the layout above tests ENCAP before DECAP while the type
 * assignment below tests DECAP first; confirm both flags are never set
 * together.
 */
2879 if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2880 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2881 else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2882 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2884 * Generate a reasonably unique handle based on the address of the
2887 * This is straightforward on 32-bit systems where the flow pointer can
2888 * be used directly. Otherwise, its least significant part is taken
2889 * after shifting it by the previous power of two of the pointed buffer
/* sizeof(dev_flow) is the pointer size, so this selects 32-bit targets. */
2892 if (sizeof(dev_flow) <= 4)
2893 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
2895 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
2896 rte_log2_u32(rte_align32prevpow2(size)));
2901 * Make adjustments for supporting count actions.
2904 * Pointer to the Ethernet device structure.
2905 * @param[in] dev_flow
2906 * Pointer to mlx5_flow.
2908 * Pointer to error structure.
2911 * 0 on success, otherwise a negative errno value is returned and rte_errno is set.
/*
 * flow_tcf_translate_action_count(): makes sure the rte_flow that owns
 * dev_flow has a counter. A counter is requested from
 * flow_tcf_counter_new() only when flow->counter is not set yet; the dev
 * argument is unused.
 * NOTE(review): the condition guarding the error return and the success
 * return are on lines not shown in this excerpt.
 */
2914 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2915 struct mlx5_flow *dev_flow,
2916 struct rte_flow_error *error)
/* The counter lives on the parent flow, not on the device sub-flow. */
2918 struct rte_flow *flow = dev_flow->flow;
2920 if (!flow->counter) {
2921 flow->counter = flow_tcf_counter_new();
/* The error is reported with the current rte_errno value. */
2923 return rte_flow_error_set(error, rte_errno,
2924 RTE_FLOW_ERROR_TYPE_ACTION,
2926 "cannot get counter"
2933 * Convert VXLAN VNI to 32-bit integer.
2936 * VXLAN VNI in 24-bit wire format.
2939 * VXLAN VNI as a 32-bit integer value in network endian.
/*
 * vxlan_vni_as_be32(): widens a 3-byte VNI to a 4-byte value.
 * NOTE(review): only the initializer line of the body is visible here; the
 * enclosing object and the return statement are on lines not shown.
 */
2941 static inline rte_be32_t
2942 vxlan_vni_as_be32(const uint8_t vni[3])
/* A zero byte is placed first, followed by the three VNI bytes in order. */
2948 .vni = { 0, vni[0], vni[1], vni[2] },
2954 * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2955 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2956 * in the encapsulation parameters structure. The item must be prevalidated,
2957 * no validation checks are performed by the function.
2960 * RTE_FLOW_ITEM_TYPE_ETH entry specification.
2962 * RTE_FLOW_ITEM_TYPE_ETH entry mask.
2964 * Structure to fill the gathered MAC address data.
/*
 * flow_tcf_parse_vxlan_encap_eth(): copies the MAC addresses of an ETH item
 * from a VXLAN encap definition into encap->eth and sets the matching
 * FLOW_TCF_ENCAP_ETH_* bits in encap->mask. Each address is taken only when
 * mask is NULL or that address mask equals the one in
 * rte_flow_item_eth_mask.
 */
2967 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2968 const struct rte_flow_item_eth *mask,
2969 struct flow_tcf_vxlan_encap *encap)
2971 /* Item must be validated before. No redundant checks. */
/* memcmp() returning zero means the dst mask equals the default mask. */
2973 if (!mask || !memcmp(&mask->dst,
2974 &rte_flow_item_eth_mask.dst,
2975 sizeof(rte_flow_item_eth_mask.dst))) {
2977 * Ethernet addresses are not supported by
2978 * tc as tunnel_key parameters. Destination
2979 * address is needed to form encap packet
2980 * header and retrieved by kernel from
2981 * implicit sources (ARP table, etc),
2982 * address masks are not supported at all.
2984 encap->eth.dst = spec->dst;
2985 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
/* Same test for the source address. */
2987 if (!mask || !memcmp(&mask->src,
2988 &rte_flow_item_eth_mask.src,
2989 sizeof(rte_flow_item_eth_mask.src))) {
2991 * Ethernet addresses are not supported by
2992 * tc as tunnel_key parameters. Source ethernet
2993 * address is ignored anyway.
2995 encap->eth.src = spec->src;
2996 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
3001 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
3002 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
3003 * in the encapsulation parameters structure. The item must be prevalidated,
3004 * no validation checks are performed by the function.
3007 * RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
3009 * RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
3011 * Structure to fill the gathered IPV4 address data.
/*
 * flow_tcf_parse_vxlan_encap_ipv4(): copies the IPv4 addresses of a VXLAN
 * encap definition into encap->ipv4 and flags both as present. TOS and TTL
 * are copied, and flagged, only when mask is non-NULL and selects them.
 */
3014 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
3015 const struct rte_flow_item_ipv4 *mask,
3016 struct flow_tcf_vxlan_encap *encap)
3018 /* Item must be validated before. No redundant checks. */
/* Both addresses are taken unconditionally from spec. */
3020 encap->ipv4.dst = spec->hdr.dst_addr;
3021 encap->ipv4.src = spec->hdr.src_addr;
3022 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
3023 FLOW_TCF_ENCAP_IPV4_DST;
3024 if (mask && mask->hdr.type_of_service) {
3025 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3026 encap->ip_tos = spec->hdr.type_of_service;
3028 if (mask && mask->hdr.time_to_live) {
3029 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
/* ip_ttl_hop is the same field the IPv6 parser fills with hop_limits. */
3030 encap->ip_ttl_hop = spec->hdr.time_to_live;
3035 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3036 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3037 * in the encapsulation parameters structure. The item must be prevalidated,
3038 * no validation checks are performed by the function.
3041 * RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3043 * RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3045 * Structure to fill the gathered IPV6 address data.
/*
 * flow_tcf_parse_vxlan_encap_ipv6(): copies the IPv6 addresses of a VXLAN
 * encap definition into encap->ipv6 and flags both as present. The traffic
 * class and hop limit are copied, and flagged, only when the mask selects
 * them.
 * NOTE(review): mask is dereferenced on the visible lines; any NULL guard is
 * on a line not shown in this excerpt - confirm.
 */
3048 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3049 const struct rte_flow_item_ipv6 *mask,
3050 struct flow_tcf_vxlan_encap *encap)
3052 /* Item must be validated before. No redundant checks. */
/* Both addresses are taken unconditionally from spec. */
3054 memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3055 memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3056 encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3057 FLOW_TCF_ENCAP_IPV6_DST;
/* The traffic class is an 8-bit field inside the host-order vtc_flow word. */
3059 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3060 IPV6_HDR_TC_SHIFT) & 0xff) {
3061 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3062 encap->ip_tos = (rte_be_to_cpu_32
3063 (spec->hdr.vtc_flow) >>
3064 IPV6_HDR_TC_SHIFT) & 0xff;
3066 if (mask->hdr.hop_limits) {
3067 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
/* ip_ttl_hop is the same field the IPv4 parser fills with time_to_live. */
3068 encap->ip_ttl_hop = spec->hdr.hop_limits;
3074 * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3075 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3076 * in the encapsulation parameters structure. The item must be prevalidated,
3077 * no validation checks are performed by the function.
3080 * RTE_FLOW_ITEM_TYPE_UDP entry specification.
3082 * RTE_FLOW_ITEM_TYPE_UDP entry mask.
3084 * Structure to fill the gathered UDP port data.
/*
 * flow_tcf_parse_vxlan_encap_udp(): copies the UDP ports of a VXLAN encap
 * definition into encap->udp and records in encap->mask which ones were
 * provided. The destination port is always taken from spec. The source port
 * is taken when mask is NULL or when the mask selects src_port.
 */
3087 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3088 const struct rte_flow_item_udp *mask,
3089 struct flow_tcf_vxlan_encap *encap)
3092 encap->udp.dst = spec->hdr.dst_port;
3093 encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3094 if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3095 encap->udp.src = spec->hdr.src_port;
/* Flag the UDP source port; the IPv4 source bit belongs to the IPv4 item. */
3096 encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
3101 * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3102 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3103 * in the encapsulation parameters structure. The item must be prevalidated,
3104 * no validation checks are performed by the function.
3107 * RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3109 * Structure to fill the gathered VNI address data.
/*
 * flow_tcf_parse_vxlan_encap_vni(): copies the VNI bytes of a VXLAN item
 * from a VXLAN encap definition into encap->vxlan.vni and flags the VNI as
 * present in encap->mask.
 */
3112 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3113 struct flow_tcf_vxlan_encap *encap)
3115 /* Item must be validated before. No redundant checks are done. */
/* The copy length is the size of the destination field. */
3117 memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3118 encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3122 * Populate consolidated encapsulation object from list of pattern items.
3124 * Helper function to process configuration of action such as
3125 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3126 * validated, there is no way to return a meaningful error.
3129 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3130 * List of pattern items to gather data from.
3132 * Structure to fill gathered data.
/*
 * flow_tcf_vxlan_encap_parse(): walks the item list stored in the definition
 * field of a VXLAN_ENCAP action and hands each ETH, IPV4, IPV6, UDP and
 * VXLAN item to its flow_tcf_parse_vxlan_encap_*() helper, which fills
 * encap. The action type and a non-NULL conf are only checked with
 * assert().
 * NOTE(review): the spec and mask objects are declared on lines only partly
 * shown in this excerpt.
 */
3135 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3136 struct flow_tcf_vxlan_encap *encap)
3139 const struct rte_flow_item_eth *eth;
3140 const struct rte_flow_item_ipv4 *ipv4;
3141 const struct rte_flow_item_ipv6 *ipv6;
3142 const struct rte_flow_item_udp *udp;
3143 const struct rte_flow_item_vxlan *vxlan;
3145 const struct rte_flow_item *items;
3147 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3148 assert(action->conf);
/* The encapsulation header is described as a list of pattern items. */
3150 items = ((const struct rte_flow_action_vxlan_encap *)
3151 action->conf)->definition;
3153 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3154 switch (items->type) {
3155 case RTE_FLOW_ITEM_TYPE_VOID:
3157 case RTE_FLOW_ITEM_TYPE_ETH:
3158 mask.eth = items->mask;
3159 spec.eth = items->spec;
3160 flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3163 case RTE_FLOW_ITEM_TYPE_IPV4:
3164 spec.ipv4 = items->spec;
3165 mask.ipv4 = items->mask;
3166 flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3169 case RTE_FLOW_ITEM_TYPE_IPV6:
3170 spec.ipv6 = items->spec;
3171 mask.ipv6 = items->mask;
3172 flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3175 case RTE_FLOW_ITEM_TYPE_UDP:
3176 mask.udp = items->mask;
3177 spec.udp = items->spec;
3178 flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3181 case RTE_FLOW_ITEM_TYPE_VXLAN:
/* Only the spec is used for the VNI; the item mask is not read. */
3182 spec.vxlan = items->spec;
3183 flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
/* Any other item type is only logged; validation should have rejected it. */
3188 "unsupported item %p type %d,"
3189 " items must be validated"
3190 " before flow creation",
3191 (const void *)items, items->type);
3199 * Translate flow for Linux TC flower and construct Netlink message.
3202 * Pointer to the priv structure.
3203 * @param[in, out] flow
3204 * Pointer to the sub flow.
3206 * Pointer to the flow attributes.
3208 * Pointer to the list of items.
3209 * @param[in] actions
3210 * Pointer to the list of actions.
3212 * Pointer to the error structure.
3215 * 0 on success, a negative errno value otherwise and rte_errno is set.
3218 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3219 const struct rte_flow_attr *attr,
3220 const struct rte_flow_item items[],
3221 const struct rte_flow_action actions[],
3222 struct rte_flow_error *error)
3225 const struct rte_flow_item_port_id *port_id;
3226 const struct rte_flow_item_eth *eth;
3227 const struct rte_flow_item_vlan *vlan;
3228 const struct rte_flow_item_ipv4 *ipv4;
3229 const struct rte_flow_item_ipv6 *ipv6;
3230 const struct rte_flow_item_tcp *tcp;
3231 const struct rte_flow_item_udp *udp;
3232 const struct rte_flow_item_vxlan *vxlan;
3235 const struct rte_flow_action_port_id *port_id;
3236 const struct rte_flow_action_jump *jump;
3237 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3238 const struct rte_flow_action_of_set_vlan_vid *
3240 const struct rte_flow_action_of_set_vlan_pcp *
3244 struct flow_tcf_tunnel_hdr *hdr;
3245 struct flow_tcf_vxlan_decap *vxlan;
3250 struct flow_tcf_tunnel_hdr *hdr;
3251 struct flow_tcf_vxlan_encap *vxlan;
3255 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3256 struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3257 struct tcmsg *tcm = dev_flow->tcf.tcm;
3258 uint32_t na_act_index_cur;
3259 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3260 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3261 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3262 bool ip_proto_set = 0;
3263 bool tunnel_outer = 0;
3264 struct nlattr *na_flower;
3265 struct nlattr *na_flower_act;
3266 struct nlattr *na_vlan_id = NULL;
3267 struct nlattr *na_vlan_priority = NULL;
3268 uint64_t item_flags = 0;
3271 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3272 PTOI_TABLE_SZ_MAX(dev)));
3273 if (dev_flow->tcf.tunnel) {
3274 switch (dev_flow->tcf.tunnel->type) {
3275 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3276 decap.vxlan = dev_flow->tcf.vxlan_decap;
3279 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3280 encap.vxlan = dev_flow->tcf.vxlan_encap;
3282 /* New tunnel actions can be added here. */
3288 nlh = dev_flow->tcf.nlh;
3289 tcm = dev_flow->tcf.tcm;
3290 /* Prepare API must have been called beforehand. */
3291 assert(nlh != NULL && tcm != NULL);
3292 tcm->tcm_family = AF_UNSPEC;
3293 tcm->tcm_ifindex = ptoi[0].ifindex;
3294 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3296 * Priority cannot be zero to prevent the kernel from picking one
3299 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3300 if (attr->group > 0)
3301 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3302 mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3303 na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3304 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3307 switch (items->type) {
3308 case RTE_FLOW_ITEM_TYPE_VOID:
3310 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3311 mask.port_id = flow_tcf_item_mask
3312 (items, &rte_flow_item_port_id_mask,
3313 &flow_tcf_mask_supported.port_id,
3314 &flow_tcf_mask_empty.port_id,
3315 sizeof(flow_tcf_mask_supported.port_id),
3317 assert(mask.port_id);
3318 if (mask.port_id == &flow_tcf_mask_empty.port_id)
3320 spec.port_id = items->spec;
3321 if (!mask.port_id->id)
3324 for (i = 0; ptoi[i].ifindex; ++i)
3325 if (ptoi[i].port_id == spec.port_id->id)
3327 assert(ptoi[i].ifindex);
3328 tcm->tcm_ifindex = ptoi[i].ifindex;
3330 case RTE_FLOW_ITEM_TYPE_ETH:
3331 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3332 MLX5_FLOW_LAYER_INNER_L2 :
3333 MLX5_FLOW_LAYER_OUTER_L2;
3334 mask.eth = flow_tcf_item_mask
3335 (items, &rte_flow_item_eth_mask,
3336 &flow_tcf_mask_supported.eth,
3337 &flow_tcf_mask_empty.eth,
3338 sizeof(flow_tcf_mask_supported.eth),
3341 if (mask.eth == &flow_tcf_mask_empty.eth)
3343 spec.eth = items->spec;
3344 if (mask.eth->type) {
3345 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3346 inner_etype = spec.eth->type;
3348 outer_etype = spec.eth->type;
3352 "outer L2 addresses cannot be"
3353 " forced is outer ones for tunnel,"
3354 " parameter is ignored");
3357 if (!is_zero_ether_addr(&mask.eth->dst)) {
3358 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3360 spec.eth->dst.addr_bytes);
3361 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3363 mask.eth->dst.addr_bytes);
3365 if (!is_zero_ether_addr(&mask.eth->src)) {
3366 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3368 spec.eth->src.addr_bytes);
3369 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3371 mask.eth->src.addr_bytes);
3373 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3375 case RTE_FLOW_ITEM_TYPE_VLAN:
3378 assert(!tunnel_outer);
3379 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3380 mask.vlan = flow_tcf_item_mask
3381 (items, &rte_flow_item_vlan_mask,
3382 &flow_tcf_mask_supported.vlan,
3383 &flow_tcf_mask_empty.vlan,
3384 sizeof(flow_tcf_mask_supported.vlan),
3387 if (mask.vlan == &flow_tcf_mask_empty.vlan)
3389 spec.vlan = items->spec;
3390 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3391 outer_etype == RTE_BE16(ETH_P_8021Q));
3392 outer_etype = RTE_BE16(ETH_P_8021Q);
3393 if (mask.vlan->inner_type)
3394 vlan_etype = spec.vlan->inner_type;
3395 if (mask.vlan->tci & RTE_BE16(0xe000))
3396 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3398 (spec.vlan->tci) >> 13) & 0x7);
3399 if (mask.vlan->tci & RTE_BE16(0x0fff))
3400 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3404 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3406 case RTE_FLOW_ITEM_TYPE_IPV4:
3407 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3408 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3409 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3410 mask.ipv4 = flow_tcf_item_mask
3411 (items, &rte_flow_item_ipv4_mask,
3412 &flow_tcf_mask_supported.ipv4,
3413 &flow_tcf_mask_empty.ipv4,
3414 sizeof(flow_tcf_mask_supported.ipv4),
3417 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3418 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3419 inner_etype == RTE_BE16(ETH_P_IP));
3420 inner_etype = RTE_BE16(ETH_P_IP);
3421 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3422 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3423 vlan_etype == RTE_BE16(ETH_P_IP));
3424 vlan_etype = RTE_BE16(ETH_P_IP);
3426 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3427 outer_etype == RTE_BE16(ETH_P_IP));
3428 outer_etype = RTE_BE16(ETH_P_IP);
3430 spec.ipv4 = items->spec;
3431 if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3433 * No way to set IP protocol for outer tunnel
3434 * layers. Usually it is fixed, for example,
3435 * to UDP for VXLAN/GPE.
3437 assert(spec.ipv4); /* Mask is not empty. */
3438 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3439 spec.ipv4->hdr.next_proto_id);
3442 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3443 (!mask.ipv4->hdr.src_addr &&
3444 !mask.ipv4->hdr.dst_addr)) {
3448 * For tunnel outer we must set outer IP key
3449 * anyway, even if the specification/mask is
3450 * empty. There is no other way to tell the
3451 * kernel about the outer layer protocol.
3454 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3455 mask.ipv4->hdr.src_addr);
3457 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3458 mask.ipv4->hdr.src_addr);
3459 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3462 if (mask.ipv4->hdr.src_addr) {
3464 (nlh, tunnel_outer ?
3465 TCA_FLOWER_KEY_ENC_IPV4_SRC :
3466 TCA_FLOWER_KEY_IPV4_SRC,
3467 spec.ipv4->hdr.src_addr);
3469 (nlh, tunnel_outer ?
3470 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3471 TCA_FLOWER_KEY_IPV4_SRC_MASK,
3472 mask.ipv4->hdr.src_addr);
3474 if (mask.ipv4->hdr.dst_addr) {
3476 (nlh, tunnel_outer ?
3477 TCA_FLOWER_KEY_ENC_IPV4_DST :
3478 TCA_FLOWER_KEY_IPV4_DST,
3479 spec.ipv4->hdr.dst_addr);
3481 (nlh, tunnel_outer ?
3482 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3483 TCA_FLOWER_KEY_IPV4_DST_MASK,
3484 mask.ipv4->hdr.dst_addr);
3486 if (mask.ipv4->hdr.time_to_live) {
3488 (nlh, tunnel_outer ?
3489 TCA_FLOWER_KEY_ENC_IP_TTL :
3490 TCA_FLOWER_KEY_IP_TTL,
3491 spec.ipv4->hdr.time_to_live);
3493 (nlh, tunnel_outer ?
3494 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3495 TCA_FLOWER_KEY_IP_TTL_MASK,
3496 mask.ipv4->hdr.time_to_live);
3498 if (mask.ipv4->hdr.type_of_service) {
3500 (nlh, tunnel_outer ?
3501 TCA_FLOWER_KEY_ENC_IP_TOS :
3502 TCA_FLOWER_KEY_IP_TOS,
3503 spec.ipv4->hdr.type_of_service);
3505 (nlh, tunnel_outer ?
3506 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3507 TCA_FLOWER_KEY_IP_TOS_MASK,
3508 mask.ipv4->hdr.type_of_service);
3510 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3512 case RTE_FLOW_ITEM_TYPE_IPV6: {
3513 bool ipv6_src, ipv6_dst;
3516 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3517 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3518 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3519 mask.ipv6 = flow_tcf_item_mask
3520 (items, &rte_flow_item_ipv6_mask,
3521 &flow_tcf_mask_supported.ipv6,
3522 &flow_tcf_mask_empty.ipv6,
3523 sizeof(flow_tcf_mask_supported.ipv6),
3526 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3527 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3528 inner_etype == RTE_BE16(ETH_P_IPV6));
3529 inner_etype = RTE_BE16(ETH_P_IPV6);
3530 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3531 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3532 vlan_etype == RTE_BE16(ETH_P_IPV6));
3533 vlan_etype = RTE_BE16(ETH_P_IPV6);
3535 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3536 outer_etype == RTE_BE16(ETH_P_IPV6));
3537 outer_etype = RTE_BE16(ETH_P_IPV6);
3539 spec.ipv6 = items->spec;
3540 if (!tunnel_outer && mask.ipv6->hdr.proto) {
3542 * No way to set IP protocol for outer tunnel
3543 * layers. Usually it is fixed, for example,
3544 * to UDP for VXLAN/GPE.
3546 assert(spec.ipv6); /* Mask is not empty. */
3547 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3548 spec.ipv6->hdr.proto);
3551 ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3552 (mask.ipv6->hdr.dst_addr);
3553 ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3554 (mask.ipv6->hdr.src_addr);
3555 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3556 (!ipv6_dst && !ipv6_src)) {
3560 * For tunnel outer we must set outer IP key
3561 * anyway, even if the specification/mask is
3562 * empty. There is no other way to tell the
3563 * kernel about the outer layer protocol.
3566 TCA_FLOWER_KEY_ENC_IPV6_SRC,
3568 mask.ipv6->hdr.src_addr);
3570 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3572 mask.ipv6->hdr.src_addr);
3573 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3577 mnl_attr_put(nlh, tunnel_outer ?
3578 TCA_FLOWER_KEY_ENC_IPV6_SRC :
3579 TCA_FLOWER_KEY_IPV6_SRC,
3581 spec.ipv6->hdr.src_addr);
3582 mnl_attr_put(nlh, tunnel_outer ?
3583 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3584 TCA_FLOWER_KEY_IPV6_SRC_MASK,
3586 mask.ipv6->hdr.src_addr);
3589 mnl_attr_put(nlh, tunnel_outer ?
3590 TCA_FLOWER_KEY_ENC_IPV6_DST :
3591 TCA_FLOWER_KEY_IPV6_DST,
3593 spec.ipv6->hdr.dst_addr);
3594 mnl_attr_put(nlh, tunnel_outer ?
3595 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3596 TCA_FLOWER_KEY_IPV6_DST_MASK,
3598 mask.ipv6->hdr.dst_addr);
3600 if (mask.ipv6->hdr.hop_limits) {
3602 (nlh, tunnel_outer ?
3603 TCA_FLOWER_KEY_ENC_IP_TTL :
3604 TCA_FLOWER_KEY_IP_TTL,
3605 spec.ipv6->hdr.hop_limits);
3607 (nlh, tunnel_outer ?
3608 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3609 TCA_FLOWER_KEY_IP_TTL_MASK,
3610 mask.ipv6->hdr.hop_limits);
3612 msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3613 IPV6_HDR_TC_SHIFT) & 0xff;
3615 tos6 = (rte_be_to_cpu_32
3616 (spec.ipv6->hdr.vtc_flow) >>
3617 IPV6_HDR_TC_SHIFT) & 0xff;
3619 (nlh, tunnel_outer ?
3620 TCA_FLOWER_KEY_ENC_IP_TOS :
3621 TCA_FLOWER_KEY_IP_TOS, tos6);
3623 (nlh, tunnel_outer ?
3624 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3625 TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3627 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3630 case RTE_FLOW_ITEM_TYPE_UDP:
3631 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3632 MLX5_FLOW_LAYER_INNER_L4_UDP :
3633 MLX5_FLOW_LAYER_OUTER_L4_UDP;
3634 mask.udp = flow_tcf_item_mask
3635 (items, &rte_flow_item_udp_mask,
3636 &flow_tcf_mask_supported.udp,
3637 &flow_tcf_mask_empty.udp,
3638 sizeof(flow_tcf_mask_supported.udp),
3641 spec.udp = items->spec;
3642 if (!tunnel_outer) {
3645 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3647 if (mask.udp == &flow_tcf_mask_empty.udp)
3650 assert(mask.udp != &flow_tcf_mask_empty.udp);
3651 decap.vxlan->udp_port =
3653 (spec.udp->hdr.dst_port);
3655 if (mask.udp->hdr.src_port) {
3657 (nlh, tunnel_outer ?
3658 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3659 TCA_FLOWER_KEY_UDP_SRC,
3660 spec.udp->hdr.src_port);
3662 (nlh, tunnel_outer ?
3663 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3664 TCA_FLOWER_KEY_UDP_SRC_MASK,
3665 mask.udp->hdr.src_port);
3667 if (mask.udp->hdr.dst_port) {
3669 (nlh, tunnel_outer ?
3670 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3671 TCA_FLOWER_KEY_UDP_DST,
3672 spec.udp->hdr.dst_port);
3674 (nlh, tunnel_outer ?
3675 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3676 TCA_FLOWER_KEY_UDP_DST_MASK,
3677 mask.udp->hdr.dst_port);
3679 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3681 case RTE_FLOW_ITEM_TYPE_TCP:
3682 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3683 MLX5_FLOW_LAYER_INNER_L4_TCP :
3684 MLX5_FLOW_LAYER_OUTER_L4_TCP;
3685 mask.tcp = flow_tcf_item_mask
3686 (items, &rte_flow_item_tcp_mask,
3687 &flow_tcf_mask_supported.tcp,
3688 &flow_tcf_mask_empty.tcp,
3689 sizeof(flow_tcf_mask_supported.tcp),
3693 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3695 if (mask.tcp == &flow_tcf_mask_empty.tcp)
3697 spec.tcp = items->spec;
3698 if (mask.tcp->hdr.src_port) {
3699 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3700 spec.tcp->hdr.src_port);
3701 mnl_attr_put_u16(nlh,
3702 TCA_FLOWER_KEY_TCP_SRC_MASK,
3703 mask.tcp->hdr.src_port);
3705 if (mask.tcp->hdr.dst_port) {
3706 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3707 spec.tcp->hdr.dst_port);
3708 mnl_attr_put_u16(nlh,
3709 TCA_FLOWER_KEY_TCP_DST_MASK,
3710 mask.tcp->hdr.dst_port);
3712 if (mask.tcp->hdr.tcp_flags) {
3715 TCA_FLOWER_KEY_TCP_FLAGS,
3717 (spec.tcp->hdr.tcp_flags));
3720 TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3722 (mask.tcp->hdr.tcp_flags));
3724 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3726 case RTE_FLOW_ITEM_TYPE_VXLAN:
3727 assert(decap.vxlan);
3729 item_flags |= MLX5_FLOW_LAYER_VXLAN;
3730 spec.vxlan = items->spec;
3731 mnl_attr_put_u32(nlh,
3732 TCA_FLOWER_KEY_ENC_KEY_ID,
3733 vxlan_vni_as_be32(spec.vxlan->vni));
3734 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3737 return rte_flow_error_set(error, ENOTSUP,
3738 RTE_FLOW_ERROR_TYPE_ITEM,
3739 NULL, "item not supported");
3743 * Set the ether_type flower key and tc rule protocol:
3744 * - if there is neither VLAN nor VXLAN the key is taken from
3745 * eth item directly or deduced from L3 items.
3746 * - if there is vlan item then key is fixed to 802.1q.
3747 * - if there is vxlan item then key is set to inner tunnel type.
3748 * - simultaneous vlan and vxlan items are prohibited.
3750 if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3751 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3753 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3754 if (inner_etype != RTE_BE16(ETH_P_ALL))
3755 mnl_attr_put_u16(nlh,
3756 TCA_FLOWER_KEY_ETH_TYPE,
3759 mnl_attr_put_u16(nlh,
3760 TCA_FLOWER_KEY_ETH_TYPE,
3762 if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3763 vlan_etype != RTE_BE16(ETH_P_ALL))
3764 mnl_attr_put_u16(nlh,
3765 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3768 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3770 na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3771 na_act_index_cur = 1;
3772 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3773 struct nlattr *na_act_index;
3774 struct nlattr *na_act;
3775 unsigned int vlan_act;
3778 switch (actions->type) {
3779 case RTE_FLOW_ACTION_TYPE_VOID:
3781 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3782 conf.port_id = actions->conf;
3783 if (conf.port_id->original)
3786 for (i = 0; ptoi[i].ifindex; ++i)
3787 if (ptoi[i].port_id == conf.port_id->id)
3789 assert(ptoi[i].ifindex);
3791 mnl_attr_nest_start(nlh, na_act_index_cur++);
3792 assert(na_act_index);
3793 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3794 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3797 assert(dev_flow->tcf.tunnel);
3798 dev_flow->tcf.tunnel->ifindex_ptr =
3799 &((struct tc_mirred *)
3800 mnl_attr_get_payload
3801 (mnl_nlmsg_get_payload_tail
3804 mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3805 sizeof(struct tc_mirred),
3806 &(struct tc_mirred){
3807 .action = TC_ACT_STOLEN,
3808 .eaction = TCA_EGRESS_REDIR,
3809 .ifindex = ptoi[i].ifindex,
3811 mnl_attr_nest_end(nlh, na_act);
3812 mnl_attr_nest_end(nlh, na_act_index);
3814 case RTE_FLOW_ACTION_TYPE_JUMP:
3815 conf.jump = actions->conf;
3817 mnl_attr_nest_start(nlh, na_act_index_cur++);
3818 assert(na_act_index);
3819 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3820 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3822 mnl_attr_put(nlh, TCA_GACT_PARMS,
3823 sizeof(struct tc_gact),
3825 .action = TC_ACT_GOTO_CHAIN |
3828 mnl_attr_nest_end(nlh, na_act);
3829 mnl_attr_nest_end(nlh, na_act_index);
3831 case RTE_FLOW_ACTION_TYPE_DROP:
3833 mnl_attr_nest_start(nlh, na_act_index_cur++);
3834 assert(na_act_index);
3835 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3836 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3838 mnl_attr_put(nlh, TCA_GACT_PARMS,
3839 sizeof(struct tc_gact),
3841 .action = TC_ACT_SHOT,
3843 mnl_attr_nest_end(nlh, na_act);
3844 mnl_attr_nest_end(nlh, na_act_index);
3846 case RTE_FLOW_ACTION_TYPE_COUNT:
3848 * Driver adds the count action implicitly for
3849 * each rule it creates.
3851 ret = flow_tcf_translate_action_count(dev,
3856 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3857 conf.of_push_vlan = NULL;
3858 vlan_act = TCA_VLAN_ACT_POP;
3859 goto action_of_vlan;
3860 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3861 conf.of_push_vlan = actions->conf;
3862 vlan_act = TCA_VLAN_ACT_PUSH;
3863 goto action_of_vlan;
3864 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3865 conf.of_set_vlan_vid = actions->conf;
3867 goto override_na_vlan_id;
3868 vlan_act = TCA_VLAN_ACT_MODIFY;
3869 goto action_of_vlan;
3870 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3871 conf.of_set_vlan_pcp = actions->conf;
3872 if (na_vlan_priority)
3873 goto override_na_vlan_priority;
3874 vlan_act = TCA_VLAN_ACT_MODIFY;
3875 goto action_of_vlan;
3878 mnl_attr_nest_start(nlh, na_act_index_cur++);
3879 assert(na_act_index);
3880 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3881 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3883 mnl_attr_put(nlh, TCA_VLAN_PARMS,
3884 sizeof(struct tc_vlan),
3886 .action = TC_ACT_PIPE,
3887 .v_action = vlan_act,
3889 if (vlan_act == TCA_VLAN_ACT_POP) {
3890 mnl_attr_nest_end(nlh, na_act);
3891 mnl_attr_nest_end(nlh, na_act_index);
3894 if (vlan_act == TCA_VLAN_ACT_PUSH)
3895 mnl_attr_put_u16(nlh,
3896 TCA_VLAN_PUSH_VLAN_PROTOCOL,
3897 conf.of_push_vlan->ethertype);
3898 na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3899 mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3900 na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3901 mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3902 mnl_attr_nest_end(nlh, na_act);
3903 mnl_attr_nest_end(nlh, na_act_index);
3904 if (actions->type ==
3905 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3906 override_na_vlan_id:
3907 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3908 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3910 (conf.of_set_vlan_vid->vlan_vid);
3911 } else if (actions->type ==
3912 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3913 override_na_vlan_priority:
3914 na_vlan_priority->nla_type =
3915 TCA_VLAN_PUSH_VLAN_PRIORITY;
3916 *(uint8_t *)mnl_attr_get_payload
3917 (na_vlan_priority) =
3918 conf.of_set_vlan_pcp->vlan_pcp;
3921 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3922 assert(decap.vxlan);
3923 assert(dev_flow->tcf.tunnel);
3924 dev_flow->tcf.tunnel->ifindex_ptr =
3925 (unsigned int *)&tcm->tcm_ifindex;
3927 mnl_attr_nest_start(nlh, na_act_index_cur++);
3928 assert(na_act_index);
3929 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3930 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3932 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3933 sizeof(struct tc_tunnel_key),
3934 &(struct tc_tunnel_key){
3935 .action = TC_ACT_PIPE,
3936 .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3938 mnl_attr_nest_end(nlh, na_act);
3939 mnl_attr_nest_end(nlh, na_act_index);
3940 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3942 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3943 assert(encap.vxlan);
3944 flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3946 mnl_attr_nest_start(nlh, na_act_index_cur++);
3947 assert(na_act_index);
3948 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3949 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3951 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3952 sizeof(struct tc_tunnel_key),
3953 &(struct tc_tunnel_key){
3954 .action = TC_ACT_PIPE,
3955 .t_action = TCA_TUNNEL_KEY_ACT_SET,
3957 if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3958 mnl_attr_put_u16(nlh,
3959 TCA_TUNNEL_KEY_ENC_DST_PORT,
3960 encap.vxlan->udp.dst);
3961 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3962 mnl_attr_put_u32(nlh,
3963 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3964 encap.vxlan->ipv4.src);
3965 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3966 mnl_attr_put_u32(nlh,
3967 TCA_TUNNEL_KEY_ENC_IPV4_DST,
3968 encap.vxlan->ipv4.dst);
3969 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3971 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3972 sizeof(encap.vxlan->ipv6.src),
3973 &encap.vxlan->ipv6.src);
3974 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3976 TCA_TUNNEL_KEY_ENC_IPV6_DST,
3977 sizeof(encap.vxlan->ipv6.dst),
3978 &encap.vxlan->ipv6.dst);
3979 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3980 mnl_attr_put_u8(nlh,
3981 TCA_TUNNEL_KEY_ENC_TTL,
3982 encap.vxlan->ip_ttl_hop);
3983 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3984 mnl_attr_put_u8(nlh,
3985 TCA_TUNNEL_KEY_ENC_TOS,
3986 encap.vxlan->ip_tos);
3987 if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3988 mnl_attr_put_u32(nlh,
3989 TCA_TUNNEL_KEY_ENC_KEY_ID,
3991 (encap.vxlan->vxlan.vni));
3992 mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3993 mnl_attr_nest_end(nlh, na_act);
3994 mnl_attr_nest_end(nlh, na_act_index);
3995 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3997 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3998 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3999 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
4000 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
4001 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
4002 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
4003 case RTE_FLOW_ACTION_TYPE_SET_TTL:
4004 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
4005 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
4006 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
4008 mnl_attr_nest_start(nlh, na_act_index_cur++);
4009 flow_tcf_create_pedit_mnl_msg(nlh,
4010 &actions, item_flags);
4011 mnl_attr_nest_end(nlh, na_act_index);
4014 return rte_flow_error_set(error, ENOTSUP,
4015 RTE_FLOW_ERROR_TYPE_ACTION,
4017 "action not supported");
4021 assert(na_flower_act);
4022 mnl_attr_nest_end(nlh, na_flower_act);
4023 dev_flow->tcf.ptc_flags = mnl_attr_get_payload
4024 (mnl_nlmsg_get_payload_tail(nlh));
4025 mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
4026 0 : TCA_CLS_FLAGS_SKIP_SW);
4027 mnl_attr_nest_end(nlh, na_flower);
4028 if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4029 dev_flow->tcf.tunnel->ifindex_org =
4030 *dev_flow->tcf.tunnel->ifindex_ptr;
4031 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4036 * Send Netlink message with acknowledgment.
4039 * Flow context to use.
4041 * Message to send. This function always raises the NLM_F_ACK flag before
4044 * Callback handler for received message.
4046 * Context pointer for callback handler.
4049 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Send one Netlink request and process its replies.
 *
 * Visible steps: stamp nlh with a sequence number taken from tcf->seq,
 * raise NLM_F_ACK, send it on tcf->nl, then receive replies into tcf->buf
 * and pass them to mnl_cb_run() together with cb/arg.
 *
 * NOTE(review): this view omits some lines of the function (braces, short
 * declarations, branch/loop headers, returns); the comments below describe
 * only the statements that are visible.
 */
4052 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4053 struct nlmsghdr *nlh,
4054 mnl_cb_t cb, void *arg)
/* Port id of the Netlink socket; handed to mnl_cb_run() below. */
4056 unsigned int portid = mnl_socket_get_portid(tcf->nl);
/* Use the current counter value for this request and advance the counter. */
4057 uint32_t seq = tcf->seq++;
4063 /* seq 0 is reserved for kernel event-driven notifications. */
4066 nlh->nlmsg_seq = seq;
/* Ask for an acknowledgment of this message. */
4067 nlh->nlmsg_flags |= NLM_F_ACK;
4068 ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4070 /* Message send error occurs. */
/* From here on nlh refers to the receive buffer, not to the request. */
4074 nlh = (struct nlmsghdr *)(tcf->buf);
4076 * The following loop postpones non-fatal errors until multipart
4077 * messages are complete.
/* Receive the next portion of the reply into the context buffer. */
4080 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4084 * In case of overflow Will receive till
4085 * end of multipart message. We may lost part
4086 * of reply messages but mark and return an error.
4088 if (err != ENOSPC ||
4089 !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4090 nlh->nlmsg_type == NLMSG_DONE)
/*
 * Run cb over the received data; seq and portid are supplied so that
 * libmnl can match the replies against this request.
 */
4093 ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4096 * libmnl returns 0 if DONE or
4097 * success ACK message found.
4103 * ACK message with error found
4104 * or some error occurred.
4109 /* We should continue receiving. */
/* NOTE(review): the use of MNL_BUF_EXTRA_SPACE is not visible in this part of the file. */
4118 #define MNL_BUF_EXTRA_SPACE 16
/* Lower and upper bounds applied by MNL_REQUEST_SIZE below. */
4119 #define MNL_REQUEST_SIZE_MIN 256
4120 #define MNL_REQUEST_SIZE_MAX 2048
/*
 * Request buffer size: the system page size clamped to the range
 * [MNL_REQUEST_SIZE_MIN, MNL_REQUEST_SIZE_MAX]. Note that sysconf() is
 * evaluated at every expansion of the macro.
 */
4121 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4122 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
4124 /* Data structures used by flow_tcf_xxx_cb() routines. */
/*
 * One buffer of queued Netlink commands. Buffers are chained through
 * `next` into the list headed by struct tcf_nlcb_context::nlbuf.
 */
4125 struct tcf_nlcb_buf {
/* List linkage; buffers are inserted with LIST_INSERT_HEAD(). */
4126 LIST_ENTRY(tcf_nlcb_buf) next;
/* Storage is aligned as struct nlmsghdr so messages can be built in place. */
4128 alignas(struct nlmsghdr)
4129 uint8_t msg[]; /**< Netlink message data. */
/*
 * Context handed as `arg` to the flow_tcf_collect_*_cb() callbacks: the
 * interface being cleaned up plus the list of queued command buffers.
 */
4132 struct tcf_nlcb_context {
4133 unsigned int ifindex; /**< Base interface index. */
/* Head of the command buffer list; the newest buffer is first. */
4135 LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4139 * Allocate space for netlink command in buffer list
4141 * @param[in, out] ctx
4142 * Pointer to callback context with command buffers list.
4144 * Required size of data buffer to be allocated.
4147 * Pointer to allocated memory, aligned as message header.
4148 * NULL if some error occurred.
/*
 * Reserve `size` bytes (rounded up with NLMSG_ALIGN) for one Netlink command.
 *
 * The space is taken from the first buffer of ctx->nlbuf when the command
 * still fits within ctx->bufsize; otherwise a new buffer is obtained with
 * rte_malloc() and inserted at the head of the list. A request larger than
 * ctx->bufsize, or an allocation failure, is reported with a warning.
 *
 * NOTE(review): some lines of this function (braces, returns, the update of
 * buf->size) are not visible in this view.
 */
4150 static struct nlmsghdr *
4151 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4153 struct tcf_nlcb_buf *buf;
4154 struct nlmsghdr *nlh;
4156 size = NLMSG_ALIGN(size);
4157 buf = LIST_FIRST(&ctx->nlbuf);
/* Fast path: the head buffer still has room for this command. */
4158 if (buf && (buf->size + size) <= ctx->bufsize) {
4159 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
/* A single command can never exceed the capacity of one buffer. */
4163 if (size > ctx->bufsize) {
4164 DRV_LOG(WARNING, "netlink: too long command buffer requested");
/* Allocate the buffer header plus bufsize bytes of message storage. */
4167 buf = rte_malloc(__func__,
4168 ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4169 alignof(struct tcf_nlcb_buf));
4171 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4174 LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
/* In a fresh buffer the command starts at the beginning of msg[]. */
4176 nlh = (struct nlmsghdr *)&buf->msg[0];
4181 * Send the buffers with prepared netlink commands. Scans the list and
4182 * sends all found buffers. Buffers are sent and freed anyway in order
4183 * to prevent memory leakage even if sending some of the messages fails.
4186 * Context object initialized by mlx5_flow_tcf_context_create().
4187 * @param[in, out] ctx
4188 * Pointer to callback context with command buffers list.
4191 * Zero value on success, negative errno value otherwise
4192 * and rte_errno is set.
/*
 * Walk the command buffers starting from the head of ctx->nlbuf and send
 * every queued command through flow_tcf_nl_ack(), one command per call.
 * The list head is re-initialized at the end.
 *
 * NOTE(review): the outer loop header, the error check around the log call
 * and the release of each buffer are not visible in this view.
 */
4195 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4196 struct tcf_nlcb_context *ctx)
4198 struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
/* Fetch the successor up front, before the current buffer is processed. */
4202 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4203 struct nlmsghdr *nlh;
/* msg is the byte offset of the next command inside bc->msg[]. */
4207 while (msg < bc->size) {
4209 * Send Netlink commands from buffer in one by one
4210 * fashion. If we send multiple rule deletion commands
4211 * in one Netlink message and some error occurs it may
4212 * cause multiple ACK error messages and break sequence
4213 * numbers of Netlink communication, because we expect
4214 * the only one ACK reply.
/* At least a complete header must remain in the buffer. */
4216 assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4217 nlh = (struct nlmsghdr *)&bc->msg[msg];
/* The whole command must lie inside the used part of the buffer. */
4218 assert((bc->size - msg) >= nlh->nlmsg_len);
/* Step to the following command before sending this one. */
4219 msg += nlh->nlmsg_len;
4220 rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4223 "netlink: cleanup error %d", rc);
/* Leave the context with an empty buffer list. */
4231 LIST_INIT(&ctx->nlbuf);
4236 * Collect local IP address rules with scope link attribute on specified
4237 * network device. This is callback routine called by libmnl mnl_cb_run()
4238 * in loop for every message in received packet.
4241 * Pointer to reply header.
4242 * @param[in, out] arg
4243 * Opaque data pointer for this callback.
4246 * A positive, nonzero value on success, negative errno value otherwise
4247 * and rte_errno is set.
/*
 * mnl_cb_run() callback for an address dump. For an RTM_NEWADDR message
 * describing a permanent, link-scope IPv4/IPv6 address on ctx->ifindex that
 * carries both a local and a peer attribute, a matching RTM_DELADDR command
 * is queued in ctx via flow_tcf_alloc_nlcmd().
 *
 * NOTE(review): the return statements and the attribute switch cases are not
 * visible in this view; na_local/na_peer are presumably filled from the
 * IFA_LOCAL/IFA_ADDRESS attributes — confirm.
 */
4250 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4252 struct tcf_nlcb_context *ctx = arg;
4253 struct nlmsghdr *cmd;
4254 struct ifaddrmsg *ifa;
4256 struct nlattr *na_local = NULL;
4257 struct nlattr *na_peer = NULL;
4258 unsigned char family;
/* Only RTM_NEWADDR messages are examined further. */
4261 if (nlh->nlmsg_type != RTM_NEWADDR) {
4265 ifa = mnl_nlmsg_get_payload(nlh);
4266 family = ifa->ifa_family;
/* Filter: same interface, link scope, permanent, IPv4 or IPv6. */
4267 if (ifa->ifa_index != ctx->ifindex ||
4268 ifa->ifa_scope != RT_SCOPE_LINK ||
4269 !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4270 (family != AF_INET && family != AF_INET6))
/* Scan the attributes that follow the ifaddrmsg header. */
4272 mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4273 switch (mnl_attr_get_type(na)) {
4281 if (na_local && na_peer)
/* Both attributes are required to build the delete command. */
4284 if (!na_local || !na_peer)
4286 /* Local rule found with scope link, permanent and assigned peer. */
/* Exact command size: both headers plus two address attributes. */
4287 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4288 MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4289 (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4290 : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4291 cmd = flow_tcf_alloc_nlcmd(ctx, size);
/* Build the RTM_DELADDR request in the reserved space. */
4296 cmd = mnl_nlmsg_put_header(cmd);
4297 cmd->nlmsg_type = RTM_DELADDR;
4298 cmd->nlmsg_flags = NLM_F_REQUEST;
4299 ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4300 ifa->ifa_flags = IFA_F_PERMANENT;
4301 ifa->ifa_scope = RT_SCOPE_LINK;
4302 ifa->ifa_index = ctx->ifindex;
/* Copy the local and peer addresses from the reply into the command. */
4303 if (family == AF_INET) {
4304 ifa->ifa_family = AF_INET;
4305 ifa->ifa_prefixlen = 32;
4306 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4307 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4309 ifa->ifa_family = AF_INET6;
4310 ifa->ifa_prefixlen = 128;
4311 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4312 mnl_attr_get_payload(na_local));
4313 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4314 mnl_attr_get_payload(na_peer));
/* The size computed above must match what was actually written. */
4316 assert(size == cmd->nlmsg_len);
4321 * Cleanup the local IP addresses on outer interface.
4324 * Context object initialized by mlx5_flow_tcf_context_create().
4325 * @param[in] ifindex
4326 * Network interface index to perform cleanup.
/*
 * Remove leftover link-scope local addresses from interface `ifindex`.
 *
 * An RTM_GETADDR dump is issued with flow_tcf_collect_local_cb() as the
 * reply callback, which queues delete commands in ctx; the queued commands
 * are then sent with flow_tcf_send_nlcmd(). A warning is logged after each
 * step (the guarding conditions are not visible in this view).
 */
4329 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4330 unsigned int ifindex)
4332 struct nlmsghdr *nlh;
4333 struct ifaddrmsg *ifa;
4334 struct tcf_nlcb_context ctx = {
/* Each command buffer holds up to MNL_REQUEST_SIZE bytes. */
4336 .bufsize = MNL_REQUEST_SIZE,
4337 .nlbuf = LIST_HEAD_INITIALIZER(),
4343 * Seek and destroy leftovers of local IP addresses with
4344 * matching properties "scope link".
/* Build the dump request in the context buffer. */
4346 nlh = mnl_nlmsg_put_header(tcf->buf);
4347 nlh->nlmsg_type = RTM_GETADDR;
4348 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4349 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4350 ifa->ifa_family = AF_UNSPEC;
4351 ifa->ifa_index = ifindex;
4352 ifa->ifa_scope = RT_SCOPE_LINK;
/* The callback fills ctx with RTM_DELADDR commands. */
4353 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4355 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
/* Send whatever delete commands were collected. */
4356 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4358 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4362 * Collect permanent neigh rules on specified network device.
4363 * This is callback routine called by libmnl mnl_cb_run() in loop for
4364 * every message in received packet.
4367 * Pointer to reply header.
4368 * @param[in, out] arg
4369 * Opaque data pointer for this callback.
4372 * A positive, nonzero value on success, negative errno value otherwise
4373 * and rte_errno is set.
/*
 * mnl_cb_run() callback for a neighbour dump. For an RTM_NEWNEIGH message
 * describing a permanent IPv4/IPv6 entry on ctx->ifindex that carries both
 * an IP and a MAC attribute, a matching RTM_DELNEIGH command is queued in
 * ctx via flow_tcf_alloc_nlcmd().
 *
 * NOTE(review): the return statements and the attribute switch cases are not
 * visible in this view; na_ip/na_mac are presumably filled from the
 * NDA_DST/NDA_LLADDR attributes — confirm.
 */
4376 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4378 struct tcf_nlcb_context *ctx = arg;
4379 struct nlmsghdr *cmd;
4382 struct nlattr *na_ip = NULL;
4383 struct nlattr *na_mac = NULL;
4384 unsigned char family;
/* Only RTM_NEWNEIGH messages are examined further. */
4387 if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4391 ndm = mnl_nlmsg_get_payload(nlh);
4392 family = ndm->ndm_family;
/* Filter: same interface, permanent state, IPv4 or IPv6. */
4393 if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4394 !(ndm->ndm_state & NUD_PERMANENT) ||
4395 (family != AF_INET && family != AF_INET6))
/* Scan the attributes that follow the ndmsg header. */
4397 mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4398 switch (mnl_attr_get_type(na)) {
4406 if (na_mac && na_ip)
/* Both attributes are required to build the delete command. */
4409 if (!na_mac || !na_ip)
4411 /* Neigh rule with permanent attribute found. */
/* Exact command size: both headers, one MAC and one IP attribute. */
4412 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4413 MNL_ALIGN(sizeof(struct ndmsg)) +
4414 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4415 (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4416 : SZ_NLATTR_TYPE_OF(uint32_t));
4417 cmd = flow_tcf_alloc_nlcmd(ctx, size);
/* Build the RTM_DELNEIGH request in the reserved space. */
4422 cmd = mnl_nlmsg_put_header(cmd);
4423 cmd->nlmsg_type = RTM_DELNEIGH;
4424 cmd->nlmsg_flags = NLM_F_REQUEST;
4425 ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4426 ndm->ndm_ifindex = ctx->ifindex;
4427 ndm->ndm_state = NUD_PERMANENT;
/* Copy the destination IP and MAC from the reply into the command. */
4430 if (family == AF_INET) {
4431 ndm->ndm_family = AF_INET;
4432 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4434 ndm->ndm_family = AF_INET6;
4435 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4436 mnl_attr_get_payload(na_ip));
4438 mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4439 mnl_attr_get_payload(na_mac));
/* The size computed above must match what was actually written. */
4440 assert(size == cmd->nlmsg_len);
4445 * Cleanup the neigh rules on outer interface.
4448 * Context object initialized by mlx5_flow_tcf_context_create().
4449 * @param[in] ifindex
4450 * Network interface index to perform cleanup.
/*
 * Remove leftover permanent neighbour entries from interface `ifindex`.
 *
 * An RTM_GETNEIGH dump is issued with flow_tcf_collect_neigh_cb() as the
 * reply callback, which queues delete commands in ctx; the queued commands
 * are then sent with flow_tcf_send_nlcmd(). A warning is logged after each
 * step (the guarding conditions are not visible in this view).
 */
4453 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4454 unsigned int ifindex)
4456 struct nlmsghdr *nlh;
4458 struct tcf_nlcb_context ctx = {
/* Each command buffer holds up to MNL_REQUEST_SIZE bytes. */
4460 .bufsize = MNL_REQUEST_SIZE,
4461 .nlbuf = LIST_HEAD_INITIALIZER(),
4466 /* Seek and destroy leftovers of neigh rules. */
/* Build the dump request in the context buffer. */
4467 nlh = mnl_nlmsg_put_header(tcf->buf);
4468 nlh->nlmsg_type = RTM_GETNEIGH;
4469 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4470 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4471 ndm->ndm_family = AF_UNSPEC;
4472 ndm->ndm_ifindex = ifindex;
4473 ndm->ndm_state = NUD_PERMANENT;
/* The callback fills ctx with RTM_DELNEIGH commands. */
4474 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4476 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
/* Send whatever delete commands were collected. */
4477 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4479 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4483 * Collect indices of VXLAN encap/decap interfaces associated with device.
4484 * This is callback routine called by libmnl mnl_cb_run() in loop for
4485 * every message in received packet.
4488 * Pointer to reply header.
4489 * @param[in, out] arg
4490 * Opaque data pointer for this callback.
4493 * A positive, nonzero value on success, negative errno value otherwise
4494 * and rte_errno is set.
/*
 * mnl_cb_run() callback for a link dump. For an RTM_NEWLINK message the
 * IFLA_LINKINFO attribute is searched for kind "vxlan" and its info data;
 * the VXLAN data is then searched for an IFLA_VXLAN_LINK attribute equal to
 * ctx->ifindex. For such a device an RTM_DELLINK command is queued in ctx
 * via flow_tcf_alloc_nlcmd().
 *
 * NOTE(review): return statements and several assignments (na_info,
 * na_vxlan, found) are not visible in this view.
 */
4497 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4499 struct tcf_nlcb_context *ctx = arg;
4500 struct nlmsghdr *cmd;
4501 struct ifinfomsg *ifm;
4503 struct nlattr *na_info = NULL;
4504 struct nlattr *na_vxlan = NULL;
4506 unsigned int vxindex;
/* Only RTM_NEWLINK messages are examined further. */
4509 if (nlh->nlmsg_type != RTM_NEWLINK) {
4513 ifm = mnl_nlmsg_get_payload(nlh);
4514 if (!ifm->ifi_index) {
/* Locate the IFLA_LINKINFO attribute of the link. */
4518 mnl_attr_for_each(na, nlh, sizeof(*ifm))
4519 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
/* Inspect the nested link info: kind string and kind-specific data. */
4525 mnl_attr_for_each_nested(na, na_info) {
4526 switch (mnl_attr_get_type(na)) {
4527 case IFLA_INFO_KIND:
/* Compare the kind against "vxlan", bounded by the attribute length. */
4528 if (!strncmp("vxlan", mnl_attr_get_str(na),
4529 mnl_attr_get_len(na)))
4532 case IFLA_INFO_DATA:
4536 if (found && na_vxlan)
/* Both the "vxlan" kind and its data are required to continue. */
4539 if (!found || !na_vxlan)
/* Search the VXLAN data for a link attribute naming our interface. */
4542 mnl_attr_for_each_nested(na, na_vxlan) {
4543 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4544 mnl_attr_get_u32(na) == ctx->ifindex) {
4551 /* Attached VXLAN device found, store the command to delete. */
/* Save the index now: ifm is reused below for the new command header. */
4552 vxindex = ifm->ifi_index;
4553 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4554 MNL_ALIGN(sizeof(struct ifinfomsg));
4555 cmd = flow_tcf_alloc_nlcmd(ctx, size);
/* Build the RTM_DELLINK request in the reserved space. */
4560 cmd = mnl_nlmsg_put_header(cmd);
4561 cmd->nlmsg_type = RTM_DELLINK;
4562 cmd->nlmsg_flags = NLM_F_REQUEST;
4563 ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4564 ifm->ifi_family = AF_UNSPEC;
4565 ifm->ifi_index = vxindex;
/* The size computed above must match what was actually written. */
4566 assert(size == cmd->nlmsg_len);
4571 * Cleanup the outer interface. Removes all found vxlan devices
4572 * attached to specified index, flushes the neigh and local IP
4576 * Context object initialized by mlx5_flow_tcf_context_create().
4577 * @param[in] ifindex
4578 * Network interface index to perform cleanup.
/*
 * Remove leftover VXLAN devices attached to interface `ifindex`.
 *
 * An RTM_GETLINK dump is issued with flow_tcf_collect_vxlan_cb() as the
 * reply callback, which queues delete commands in ctx; the queued commands
 * are then sent with flow_tcf_send_nlcmd(). A warning is logged after each
 * step (the guarding conditions are not visible in this view).
 */
4581 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4582 unsigned int ifindex)
4584 struct nlmsghdr *nlh;
4585 struct ifinfomsg *ifm;
4586 struct tcf_nlcb_context ctx = {
/* Each command buffer holds up to MNL_REQUEST_SIZE bytes. */
4588 .bufsize = MNL_REQUEST_SIZE,
4589 .nlbuf = LIST_HEAD_INITIALIZER(),
4595 * Seek and destroy leftover VXLAN encap/decap interfaces with
4596 * matching properties.
/* Build the dump request in the context buffer. */
4598 nlh = mnl_nlmsg_put_header(tcf->buf);
4599 nlh->nlmsg_type = RTM_GETLINK;
4600 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4601 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4602 ifm->ifi_family = AF_UNSPEC;
/* The callback fills ctx with RTM_DELLINK commands. */
4603 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4605 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
/* Send whatever delete commands were collected. */
4606 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4608 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4612 * Emit Netlink message to add/remove local address to the outer device.
4613 * The address being added is visible within the link only (scope link).
4615 * Note that an implicit route is maintained by the kernel due to the
4616 * presence of a peer address (IFA_ADDRESS).
4618 * These rules are used for encapsulation only and allow assigning
4619 * the outer tunnel source IP address.
4622 * Libmnl socket context object.
4624 * Encapsulation properties (source address and its peer).
4625 * @param[in] ifindex
4626 * Network interface to apply rule.
4628 * Toggle between add and remove.
4630 * Perform verbose error reporting if not NULL.
4633 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Build and send an RTM_NEWADDR (enable) or RTM_DELADDR (disable) request
 * for a permanent, link-scope address on `ifindex`. IFA_LOCAL is taken from
 * the encap source address and, when the destination is present in
 * encap->mask, IFA_ADDRESS is added as well. The request is sent with
 * flow_tcf_nl_ack(); when that returns non-zero the error is reported
 * through rte_flow_error_set() using rte_errno.
 *
 * NOTE(review): a few lines (the `enable` parameter, some call arguments,
 * the success return) are not visible in this view.
 */
4636 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4637 const struct flow_tcf_vxlan_encap *encap,
4638 unsigned int ifindex,
4640 struct rte_flow_error *error)
4642 struct nlmsghdr *nlh;
4643 struct ifaddrmsg *ifa;
/* On-stack message buffer: ifaddrmsg plus 128 bytes for attributes. */
4644 alignas(struct nlmsghdr)
4645 uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4647 nlh = mnl_nlmsg_put_header(buf);
4648 nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
/* The create/replace flags are added only when enabling. */
4650 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4652 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4653 ifa->ifa_flags = IFA_F_PERMANENT;
4654 ifa->ifa_scope = RT_SCOPE_LINK;
4655 ifa->ifa_index = ifindex;
/* IPv4: host address (prefix length 32) with an optional peer. */
4656 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4657 ifa->ifa_family = AF_INET;
4658 ifa->ifa_prefixlen = 32;
4659 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4660 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4661 mnl_attr_put_u32(nlh, IFA_ADDRESS,
/* Otherwise an IPv6 source is expected (prefix length 128). */
4664 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4665 ifa->ifa_family = AF_INET6;
4666 ifa->ifa_prefixlen = 128;
4667 mnl_attr_put(nlh, IFA_LOCAL,
4668 sizeof(encap->ipv6.src),
4670 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4671 mnl_attr_put(nlh, IFA_ADDRESS,
4672 sizeof(encap->ipv6.dst),
/* A zero result from flow_tcf_nl_ack() means the request succeeded. */
4675 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4677 return rte_flow_error_set(error, rte_errno,
4678 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4679 "netlink: cannot complete IFA request"
4684 * Emit Netlink message to add/remove neighbor.
4687 * Libmnl socket context object.
4689 * Encapsulation properties (destination address).
4690 * @param[in] ifindex
4691 * Network interface.
4693 * Toggle between add and remove.
4695 * Perform verbose error reporting if not NULL.
4698 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Build and send an RTM_NEWNEIGH (enable) or RTM_DELNEIGH (disable) request
 * for a permanent neighbour entry on `ifindex`. NDA_DST is taken from the
 * encap destination address and NDA_LLADDR from encap->eth.dst when the
 * latter is present in encap->mask. The request is sent with
 * flow_tcf_nl_ack(); when that returns non-zero the error is reported
 * through rte_flow_error_set() using rte_errno.
 *
 * NOTE(review): a few lines (the `enable` parameter, some call arguments,
 * the success return) are not visible in this view.
 */
4701 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4702 const struct flow_tcf_vxlan_encap *encap,
4703 unsigned int ifindex,
4705 struct rte_flow_error *error)
4707 struct nlmsghdr *nlh;
/* On-stack message buffer: ndmsg plus 128 bytes for attributes. */
4709 alignas(struct nlmsghdr)
4710 uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4712 nlh = mnl_nlmsg_put_header(buf);
4713 nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
/* The create/replace flags are added only when enabling. */
4715 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4717 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4718 ndm->ndm_ifindex = ifindex;
4719 ndm->ndm_state = NUD_PERMANENT;
/* The destination IP selects the address family of the entry. */
4722 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4723 ndm->ndm_family = AF_INET;
4724 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4726 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4727 ndm->ndm_family = AF_INET6;
4728 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
/* A requested source MAC only produces a message when enabling. */
4731 if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4733 "outer ethernet source address cannot be "
4734 "forced for VXLAN encapsulation");
4735 if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4736 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
/* A zero result from flow_tcf_nl_ack() means the request succeeded. */
4738 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4740 return rte_flow_error_set(error, rte_errno,
4741 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4742 "netlink: cannot complete ND request"
4747 * Manage the local IP addresses and their peers IP addresses on the
4748 * outer interface for encapsulation purposes. The kernel searches the
4749 * appropriate device for tunnel egress traffic using the outer source
4750 * IP, this IP should be assigned to the outer network device, otherwise
4751 * kernel rejects the rule.
4753 * Adds or removes the addresses using the Netlink command like this:
4754 * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4756 * The addresses are local to the netdev ("scope link"), this reduces
4757 * the risk of conflicts. Note that an implicit route is maintained by
4758 * the kernel due to the presence of a peer address (IFA_ADDRESS).
4761 * Libmnl socket context object.
4763 * Object, contains rule database and ifouter index.
4764 * @param[in] dev_flow
4765 * Flow object, contains the tunnel parameters (for encap only).
4767 * Toggle between add and remove.
4769 * Perform verbose error reporting if not NULL.
4772 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Maintain the local address rule (source/peer pair) of the flow's VXLAN
 * encapsulation in the per-interface database iface->local.
 *
 * The list is searched for a rule with the same source and destination
 * addresses as dev_flow->tcf.vxlan_encap. Visible outcomes:
 *  - an existing rule whose reference count drops to zero is unlinked and
 *    the address is removed with flow_tcf_rule_local(..., false, ...);
 *  - otherwise a new rule is allocated with rte_zmalloc(), filled from the
 *    encap addresses, applied with flow_tcf_rule_local(..., true, ...) and
 *    inserted at the head of iface->local.
 *
 * NOTE(review): the `enable` parameter, the branches selecting between these
 * paths and several returns are not visible in this view.
 */
4775 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4776 struct tcf_irule *iface,
4777 struct mlx5_flow *dev_flow,
4779 struct rte_flow_error *error)
4781 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4782 struct tcf_local_rule *rule = NULL;
4786 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
/* Look up an existing rule with the same IPv4 source/destination pair. */
4787 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4788 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4789 LIST_FOREACH(rule, &iface->local, next) {
4790 if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4791 encap->ipv4.src == rule->ipv4.src &&
4792 encap->ipv4.dst == rule->ipv4.dst) {
/* IPv6 variant of the lookup; both addresses are compared bytewise. */
4797 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4798 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4799 LIST_FOREACH(rule, &iface->local, next) {
4800 if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4801 !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4802 sizeof(encap->ipv6.src)) &&
4803 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4804 sizeof(encap->ipv6.dst))) {
/* Drop one reference; the rule is removed together with the last one. */
4814 if (!rule->refcnt || !--rule->refcnt) {
4815 LIST_REMOVE(rule, next);
/*
 * NOTE(review): the rule is unlinked above but no rte_free(rule) is visible
 * before this return — looks like the entry is leaked; confirm.
 */
4816 return flow_tcf_rule_local(tcf, encap,
4817 iface->ifouter, false, error);
/* Presumably reached when a disable request finds no matching rule. */
4822 DRV_LOG(WARNING, "disabling not existing local rule");
4823 rte_flow_error_set(error, ENOENT,
4824 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4825 "disabling not existing local rule");
/* Create a new zero-initialized rule entry. */
4828 rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4829 alignof(struct tcf_local_rule));
4831 rte_flow_error_set(error, ENOMEM,
4832 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4833 "unable to allocate memory for local rule");
4836 *rule = (struct tcf_local_rule){.refcnt = 0,
/* Record the address pair so later requests can find this rule. */
4839 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4840 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4841 | FLOW_TCF_ENCAP_IPV4_DST;
4842 rule->ipv4.src = encap->ipv4.src;
4843 rule->ipv4.dst = encap->ipv4.dst;
4845 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4846 | FLOW_TCF_ENCAP_IPV6_DST;
4847 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4848 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
/* Install the address on the outer interface. */
4850 ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
/* Track the new rule in the interface database. */
4856 LIST_INSERT_HEAD(&iface->local, rule, next);
4861 * Manage the destination MAC/IP addresses neigh database, kernel uses
4862 * this one to determine the destination MAC address within encapsulation
4863 * header. Adds or removes the entries using the Netlink command like this:
4864 * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4867 * Libmnl socket context object.
4869 * Object, contains rule database and ifouter index.
4870 * @param[in] dev_flow
4871 * Flow object, contains the tunnel parameters (for encap only).
4873 * Toggle between add and remove.
4875 * Perform verbose error reporting if not NULL.
4878 * 0 on success, a negative errno value otherwise and rte_errno is set.
4881 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4882 struct tcf_irule *iface,
4883 struct mlx5_flow *dev_flow,
4885 struct rte_flow_error *error)
4887 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4888 struct tcf_neigh_rule *rule = NULL;
4892 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4893 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4894 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4895 LIST_FOREACH(rule, &iface->neigh, next) {
4896 if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4897 encap->ipv4.dst == rule->ipv4.dst) {
4902 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4903 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4904 LIST_FOREACH(rule, &iface->neigh, next) {
4905 if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4906 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4907 sizeof(encap->ipv6.dst))) {
4913 if (memcmp(&encap->eth.dst, &rule->eth,
4914 sizeof(encap->eth.dst))) {
4915 DRV_LOG(WARNING, "Destination MAC differs"
4917 rte_flow_error_set(error, EEXIST,
4918 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4919 NULL, "Different MAC address"
4920 " neigh rule for the same"
4928 if (!rule->refcnt || !--rule->refcnt) {
4929 LIST_REMOVE(rule, next);
4930 return flow_tcf_rule_neigh(tcf, encap,
4937 DRV_LOG(WARNING, "Disabling not existing neigh rule");
4938 rte_flow_error_set(error, ENOENT,
4939 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4940 "unable to allocate memory for neigh rule");
4943 rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4944 alignof(struct tcf_neigh_rule));
4946 rte_flow_error_set(error, ENOMEM,
4947 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4948 "unable to allocate memory for neigh rule");
4951 *rule = (struct tcf_neigh_rule){.refcnt = 0,
4954 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4955 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4956 rule->ipv4.dst = encap->ipv4.dst;
4958 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4959 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4961 memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4962 ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4968 LIST_INSERT_HEAD(&iface->neigh, rule, next);
4972 /* VXLAN encap rule database for outer interfaces. */
4973 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4975 /* VTEP device list is shared between PMD port instances. */
4976 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4977 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4980 * Acquire the VXLAN encap rules container for specified interface.
4981 * First looks for the container in the existing ones list, creates
4982 * and initializes the new container if existing not found.
4985 * Context object initialized by mlx5_flow_tcf_context_create().
4986 * @param[in] ifouter
4987 * Network interface index to create VXLAN encap rules on.
4989 * Perform verbose error reporting if not NULL.
4991 * Rule container pointer on success,
4992 * NULL otherwise and rte_errno is set.
4994 static struct tcf_irule*
4995 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4996 unsigned int ifouter,
4997 struct rte_flow_error *error)
4999 struct tcf_irule *iface;
5001 /* Look whether the container for encap rules is created. */
5003 LIST_FOREACH(iface, &iface_list_vxlan, next) {
5004 if (iface->ifouter == ifouter)
5008 /* Container already exists, just increment the reference. */
5012 /* Not found, we should create the new container. */
5013 iface = rte_zmalloc(__func__, sizeof(*iface),
5014 alignof(struct tcf_irule));
5016 rte_flow_error_set(error, ENOMEM,
5017 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5018 "unable to allocate memory for container");
5021 *iface = (struct tcf_irule){
5022 .local = LIST_HEAD_INITIALIZER(),
5023 .neigh = LIST_HEAD_INITIALIZER(),
5027 /* Interface cleanup for new container created. */
5028 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5029 flow_tcf_encap_local_cleanup(tcf, ifouter);
5030 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5031 LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5036 * Releases VXLAN encap rules container by pointer. Decrements the
5037 * reference cointer and deletes the container if counter is zero.
5040 * VXLAN rule container pointer to release.
5043 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5045 assert(iface->refcnt);
5046 if (--iface->refcnt == 0) {
5047 /* Reference counter is zero, delete the container. */
5048 assert(LIST_EMPTY(&iface->local));
5049 assert(LIST_EMPTY(&iface->neigh));
5050 LIST_REMOVE(iface, next);
5056 * Deletes VTEP network device.
5059 * Context object initialized by mlx5_flow_tcf_context_create().
5061 * Object represinting the network device to delete. Memory
5062 * allocated for this object is freed by routine.
5065 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5066 struct tcf_vtep *vtep)
5068 struct nlmsghdr *nlh;
5069 struct ifinfomsg *ifm;
5070 alignas(struct nlmsghdr)
5071 uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5072 MNL_BUF_EXTRA_SPACE];
5075 assert(!vtep->refcnt);
5076 /* Delete only ifaces those we actually created. */
5077 if (vtep->created && vtep->ifindex) {
5078 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5079 nlh = mnl_nlmsg_put_header(buf);
5080 nlh->nlmsg_type = RTM_DELLINK;
5081 nlh->nlmsg_flags = NLM_F_REQUEST;
5082 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5083 ifm->ifi_family = AF_UNSPEC;
5084 ifm->ifi_index = vtep->ifindex;
5085 assert(sizeof(buf) >= nlh->nlmsg_len);
5086 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5088 DRV_LOG(WARNING, "netlink: error deleting vxlan"
5089 " encap/decap ifindex %u",
5096 * Creates VTEP network device.
5099 * Context object initialized by mlx5_flow_tcf_context_create().
5101 * UDP port of created VTEP device.
5103 * Perform verbose error reporting if not NULL.
5106 * Pointer to created device structure on success,
5107 * NULL otherwise and rte_errno is set.
5109 static struct tcf_vtep*
5110 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5111 uint16_t port, struct rte_flow_error *error)
5113 struct tcf_vtep *vtep;
5114 struct nlmsghdr *nlh;
5115 struct ifinfomsg *ifm;
5116 char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5117 alignas(struct nlmsghdr)
5118 uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5119 SZ_NLATTR_DATA_OF(sizeof(name)) +
5120 SZ_NLATTR_NEST * 2 +
5121 SZ_NLATTR_STRZ_OF("vxlan") +
5122 SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5123 SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5124 SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5125 MNL_BUF_EXTRA_SPACE];
5126 struct nlattr *na_info;
5127 struct nlattr *na_vxlan;
5128 rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5131 vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5133 rte_flow_error_set(error, ENOMEM,
5134 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5135 "unable to allocate memory for VTEP");
5138 *vtep = (struct tcf_vtep){
5141 memset(buf, 0, sizeof(buf));
5142 nlh = mnl_nlmsg_put_header(buf);
5143 nlh->nlmsg_type = RTM_NEWLINK;
5144 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5145 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5146 ifm->ifi_family = AF_UNSPEC;
5149 ifm->ifi_flags = IFF_UP;
5150 ifm->ifi_change = 0xffffffff;
5151 snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5152 mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5153 na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5155 mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5156 na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5158 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5160 * RH 7.2 does not support metadata for tunnel device.
5161 * It does not matter because we are going to use the
5162 * hardware offload by mlx5 driver.
5164 mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5166 mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5167 mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5168 mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5169 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5171 * We must specify VNI explicitly if metadata not supported.
5172 * Note, VNI is transferred with native endianness format.
5174 mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5176 mnl_attr_nest_end(nlh, na_vxlan);
5177 mnl_attr_nest_end(nlh, na_info);
5178 assert(sizeof(buf) >= nlh->nlmsg_len);
5179 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5182 "netlink: VTEP %s create failure (%d)",
5184 if (rte_errno != EEXIST)
5186 * Some unhandled error occurred or device is
5187 * for encapsulation and cannot be shared.
5192 * Mark device we actually created.
5193 * We should explicitly delete
5194 * when we do not need it anymore.
5198 /* Try to get ifindex of created of pre-existing device. */
5199 ret = if_nametoindex(name);
5202 "VTEP %s failed to get index (%d)", name, errno);
5205 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5206 "netlink: failed to retrieve VTEP ifindex");
5209 vtep->ifindex = ret;
5210 memset(buf, 0, sizeof(buf));
5211 nlh = mnl_nlmsg_put_header(buf);
5212 nlh->nlmsg_type = RTM_NEWLINK;
5213 nlh->nlmsg_flags = NLM_F_REQUEST;
5214 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5215 ifm->ifi_family = AF_UNSPEC;
5217 ifm->ifi_index = vtep->ifindex;
5218 ifm->ifi_flags = IFF_UP;
5219 ifm->ifi_change = IFF_UP;
5220 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5222 rte_flow_error_set(error, -errno,
5223 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5224 "netlink: failed to set VTEP link up");
5225 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5229 ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5231 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5234 DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5238 flow_tcf_vtep_delete(tcf, vtep);
5246 * Acquire target interface index for VXLAN tunneling decapsulation.
5247 * In order to share the UDP port within the other interfaces the
5248 * VXLAN device created as not attached to any interface (if created).
5251 * Context object initialized by mlx5_flow_tcf_context_create().
5252 * @param[in] dev_flow
5253 * Flow tcf object with tunnel structure pointer set.
5255 * Perform verbose error reporting if not NULL.
5257 * Interface descriptor pointer on success,
5258 * NULL otherwise and rte_errno is set.
5260 static struct tcf_vtep*
5261 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5262 struct mlx5_flow *dev_flow,
5263 struct rte_flow_error *error)
5265 struct tcf_vtep *vtep;
5266 uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5268 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5269 if (vtep->port == port)
5273 /* Device exists, just increment the reference counter. */
5275 assert(vtep->ifindex);
5278 /* No decapsulation device exists, try to create the new one. */
5279 vtep = flow_tcf_vtep_create(tcf, port, error);
5281 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5286 * Aqcuire target interface index for VXLAN tunneling encapsulation.
5289 * Context object initialized by mlx5_flow_tcf_context_create().
5290 * @param[in] ifouter
5291 * Network interface index to attach VXLAN encap device to.
5292 * @param[in] dev_flow
5293 * Flow tcf object with tunnel structure pointer set.
5295 * Perform verbose error reporting if not NULL.
5297 * Interface descriptor pointer on success,
5298 * NULL otherwise and rte_errno is set.
5300 static struct tcf_vtep*
5301 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5302 unsigned int ifouter,
5303 struct mlx5_flow *dev_flow,
5304 struct rte_flow_error *error)
5306 static uint16_t port;
5307 struct tcf_vtep *vtep;
5308 struct tcf_irule *iface;
5312 /* Look whether the VTEP for specified port is created. */
5313 port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5314 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5315 if (vtep->port == port)
5319 /* VTEP already exists, just increment the reference. */
5322 /* Not found, we should create the new VTEP. */
5323 vtep = flow_tcf_vtep_create(tcf, port, error);
5326 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5328 assert(vtep->ifindex);
5329 iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5331 if (--vtep->refcnt == 0)
5332 flow_tcf_vtep_delete(tcf, vtep);
5335 dev_flow->tcf.vxlan_encap->iface = iface;
5336 /* Create local ipaddr with peer to specify the outer IPs. */
5337 ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5339 /* Create neigh rule to specify outer destination MAC. */
5340 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5342 flow_tcf_encap_local(tcf, iface,
5343 dev_flow, false, error);
5346 dev_flow->tcf.vxlan_encap->iface = NULL;
5347 flow_tcf_encap_irule_release(iface);
5348 if (--vtep->refcnt == 0)
5349 flow_tcf_vtep_delete(tcf, vtep);
5356 * Acquires target interface index for tunneling of any type.
5357 * Creates the new VTEP if needed.
5360 * Context object initialized by mlx5_flow_tcf_context_create().
5361 * @param[in] ifouter
5362 * Network interface index to create VXLAN encap rules on.
5363 * @param[in] dev_flow
5364 * Flow tcf object with tunnel structure pointer set.
5366 * Perform verbose error reporting if not NULL.
5368 * Interface descriptor pointer on success,
5369 * NULL otherwise and rte_errno is set.
5371 static struct tcf_vtep*
5372 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5373 unsigned int ifouter,
5374 struct mlx5_flow *dev_flow,
5375 struct rte_flow_error *error)
5377 struct tcf_vtep *vtep = NULL;
5379 assert(dev_flow->tcf.tunnel);
5380 pthread_mutex_lock(&vtep_list_mutex);
5381 switch (dev_flow->tcf.tunnel->type) {
5382 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5383 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5386 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5387 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5390 rte_flow_error_set(error, ENOTSUP,
5391 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5392 "unsupported tunnel type");
5395 pthread_mutex_unlock(&vtep_list_mutex);
5400 * Release tunneling interface by ifindex. Decrements reference
5401 * counter and actually removes the device if counter is zero.
5404 * Context object initialized by mlx5_flow_tcf_context_create().
5406 * VTEP device descriptor structure.
5407 * @param[in] dev_flow
5408 * Flow tcf object with tunnel structure pointer set.
5411 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5412 struct tcf_vtep *vtep,
5413 struct mlx5_flow *dev_flow)
5415 assert(dev_flow->tcf.tunnel);
5416 pthread_mutex_lock(&vtep_list_mutex);
5417 switch (dev_flow->tcf.tunnel->type) {
5418 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5420 case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5421 struct tcf_irule *iface;
5423 /* Remove the encap ancillary rules first. */
5424 iface = dev_flow->tcf.vxlan_encap->iface;
5426 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5427 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5428 flow_tcf_encap_irule_release(iface);
5429 dev_flow->tcf.vxlan_encap->iface = NULL;
5434 DRV_LOG(WARNING, "Unsupported tunnel type");
5437 assert(vtep->refcnt);
5438 if (--vtep->refcnt == 0) {
5439 LIST_REMOVE(vtep, next);
5440 flow_tcf_vtep_delete(tcf, vtep);
5442 pthread_mutex_unlock(&vtep_list_mutex);
5445 struct tcf_nlcb_query {
5448 uint32_t flags_valid:1;
5452 * Collect queried rule attributes. This is callback routine called by
5453 * libmnl mnl_cb_run() in loop for every message in received packet.
5454 * Current implementation collects the flower flags only.
5457 * Pointer to reply header.
5458 * @param[in, out] arg
5459 * Context pointer for this callback.
5462 * A positive, nonzero value on success (required by libmnl
5463 * to continue messages processing).
5466 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5468 struct tcf_nlcb_query *query = arg;
5469 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5470 struct nlattr *na, *na_opt;
5471 bool flower = false;
5473 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5474 tcm->tcm_handle != query->handle)
5476 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5477 switch (mnl_attr_get_type(na)) {
5479 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5480 /* Not flower filter, drop entire message. */
5487 /* Not flower options, drop entire message. */
5490 /* Check nested flower options. */
5491 mnl_attr_for_each_nested(na_opt, na) {
5492 switch (mnl_attr_get_type(na_opt)) {
5493 case TCA_FLOWER_FLAGS:
5494 query->flags_valid = 1;
5496 mnl_attr_get_u32(na_opt);
5507 * Query a TC flower rule flags via netlink.
5510 * Context object initialized by mlx5_flow_tcf_context_create().
5511 * @param[in] dev_flow
5512 * Pointer to the flow.
5513 * @param[out] pflags
5514 * pointer to the data retrieved by the query.
5517 * 0 on success, a negative errno value otherwise.
5520 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5521 struct mlx5_flow *dev_flow,
5524 struct nlmsghdr *nlh;
5526 struct tcf_nlcb_query query = {
5527 .handle = dev_flow->tcf.tcm->tcm_handle,
5530 nlh = mnl_nlmsg_put_header(tcf->buf);
5531 nlh->nlmsg_type = RTM_GETTFILTER;
5532 nlh->nlmsg_flags = NLM_F_REQUEST;
5533 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5534 memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5536 * Ignore Netlink error for filter query operations.
5537 * The reply length is sent by kernel as errno.
5538 * Just check we got the flags option.
5540 flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5541 if (!query.flags_valid) {
5545 *pflags = query.tc_flags;
/**
 * Query and check the in_hw set for specified rule.
 *
 * @param[in] tcf
 *   Context object initialized by mlx5_flow_tcf_context_create().
 * @param[in] dev_flow
 *   Pointer to the flow to check.
 *
 * @return
 *   0 on success, a negative errno value otherwise.
 */
static int
flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
		    struct mlx5_flow *dev_flow)
{
	uint32_t tc_flags;
	int rc = flow_tcf_query_flags(tcf, dev_flow, &tc_flags);

	if (rc != 0)
		return rc;
	if (tc_flags & TCA_CLS_FLAGS_IN_HW)
		return 0;
	return -ENOENT;
}
5574 * Remove flow from E-Switch by sending Netlink message.
5577 * Pointer to Ethernet device.
5578 * @param[in, out] flow
5579 * Pointer to the sub flow.
5582 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5584 struct priv *priv = dev->data->dev_private;
5585 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5586 struct mlx5_flow *dev_flow;
5587 struct nlmsghdr *nlh;
5591 dev_flow = LIST_FIRST(&flow->dev_flows);
5594 /* E-Switch flow can't be expanded. */
5595 assert(!LIST_NEXT(dev_flow, next));
5596 if (dev_flow->tcf.applied) {
5597 nlh = dev_flow->tcf.nlh;
5598 nlh->nlmsg_type = RTM_DELTFILTER;
5599 nlh->nlmsg_flags = NLM_F_REQUEST;
5600 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5601 if (dev_flow->tcf.tunnel) {
5602 assert(dev_flow->tcf.tunnel->vtep);
5603 flow_tcf_vtep_release(ctx,
5604 dev_flow->tcf.tunnel->vtep,
5606 dev_flow->tcf.tunnel->vtep = NULL;
5608 dev_flow->tcf.applied = 0;
5613 * Apply flow to E-Switch by sending Netlink message.
5616 * Pointer to Ethernet device.
5617 * @param[in, out] flow
5618 * Pointer to the sub flow.
5620 * Pointer to the error structure.
5623 * 0 on success, a negative errno value otherwise and rte_errno is set.
5626 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5627 struct rte_flow_error *error)
5629 struct priv *priv = dev->data->dev_private;
5630 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5631 struct mlx5_flow *dev_flow;
5632 struct nlmsghdr *nlh;
5634 dev_flow = LIST_FIRST(&flow->dev_flows);
5635 /* E-Switch flow can't be expanded. */
5636 assert(!LIST_NEXT(dev_flow, next));
5637 if (dev_flow->tcf.applied)
5639 nlh = dev_flow->tcf.nlh;
5640 nlh->nlmsg_type = RTM_NEWTFILTER;
5641 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5642 if (dev_flow->tcf.tunnel) {
5644 * Replace the interface index, target for
5645 * encapsulation, source for decapsulation.
5647 assert(!dev_flow->tcf.tunnel->vtep);
5648 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5649 /* Acquire actual VTEP device when rule is being applied. */
5650 dev_flow->tcf.tunnel->vtep =
5651 flow_tcf_vtep_acquire(ctx,
5652 dev_flow->tcf.tunnel->ifindex_org,
5654 if (!dev_flow->tcf.tunnel->vtep)
5656 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5657 dev_flow->tcf.tunnel->vtep->ifindex,
5658 dev_flow->tcf.tunnel->ifindex_org);
5659 *dev_flow->tcf.tunnel->ifindex_ptr =
5660 dev_flow->tcf.tunnel->vtep->ifindex;
5662 if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) {
5663 dev_flow->tcf.applied = 1;
5664 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5667 * Rule was applied without skip_sw flag set.
5668 * We should check whether the rule was acctually
5669 * accepted by hardware (have look at in_hw flag).
5671 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5672 flow_tcf_remove(dev, flow);
5673 return rte_flow_error_set
5675 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5676 "netlink: rule has no in_hw flag set");
5680 if (dev_flow->tcf.tunnel) {
5681 /* Rollback the VTEP configuration if rule apply failed. */
5682 assert(dev_flow->tcf.tunnel->vtep);
5683 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5685 dev_flow->tcf.tunnel->vtep = NULL;
5687 return rte_flow_error_set(error, rte_errno,
5688 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5689 "netlink: failed to create TC flow rule");
5693 * Remove flow from E-Switch and release resources of the device flow.
5696 * Pointer to Ethernet device.
5697 * @param[in, out] flow
5698 * Pointer to the sub flow.
5701 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5703 struct mlx5_flow *dev_flow;
5707 flow_tcf_remove(dev, flow);
5708 if (flow->counter) {
5709 if (--flow->counter->ref_cnt == 0) {
5710 rte_free(flow->counter);
5711 flow->counter = NULL;
5714 dev_flow = LIST_FIRST(&flow->dev_flows);
5717 /* E-Switch flow can't be expanded. */
5718 assert(!LIST_NEXT(dev_flow, next));
5719 LIST_REMOVE(dev_flow, next);
/**
 * Helper routine for figuring the space size required for a parse buffer.
 *
 * @param array
 *   array of values to use.
 * @param idx
 *   Current location in array.
 * @param value
 *   Value to compare with.
 *
 * @return
 *   The maximum between the given value and the array value on index;
 *   the given value itself when the index is negative.
 */
static uint16_t
flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
{
	uint16_t stored;

	/* A negative index means there is no array entry to compare with. */
	if (idx < 0)
		return value;
	stored = array[idx];
	return stored > value ? stored : value;
}
/**
 * Parse rtnetlink message attributes filling the attribute table with
 * the attributes found. Only the first occurrence of every attribute
 * type is recorded; types above @p max are ignored.
 *
 * @param tb
 *   Attribute table to be filled (max + 1 entries, cleared first).
 * @param[out] max
 *   Maximum entry in the attribute table.
 * @param rta
 *   The attributes section in the message to be parsed.
 * @param len
 *   The length of the attributes section in the message.
 */
static void
flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
			 struct rtattr *rta, int len)
{
	struct rtattr *attr;

	memset(tb, 0, sizeof(tb[0]) * (max + 1));
	/* RTA_NEXT() also decreases len by the size of the attribute. */
	for (attr = rta; RTA_OK(attr, len); attr = RTA_NEXT(attr, len)) {
		unsigned short kind = attr->rta_type;

		if (kind > max || tb[kind])
			continue;
		tb[kind] = attr;
	}
}
/**
 * Extract flow counters from flower action.
 *
 * @param rta
 *   flower action stats properties in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   data holding the count statistics of the rte_flow retrieved from
 *   the message.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
				       uint16_t rta_type[], int idx,
				       struct gnet_stats_basic *data)
{
	int attr_max = flow_tcf_arr_val_max(rta_type, idx,
					    TCA_STATS_BASIC);
	struct rtattr *table[attr_max + 1];

	if (!rta || idx < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(table, attr_max,
				 RTA_DATA(rta), RTA_PAYLOAD(rta));
	/* Only the basic statistics object is understood here. */
	if (rta_type[idx] != TCA_STATS_BASIC || !table[TCA_STATS_BASIC])
		return -1;
	/* Never copy more than the destination structure can hold. */
	memcpy(data, RTA_DATA(table[TCA_STATS_BASIC]),
	       RTE_MIN(RTA_PAYLOAD(table[TCA_STATS_BASIC]),
		       sizeof(*data)));
	return 0;
}
/**
 * Parse flower single action retrieving the requested action attribute,
 * if found.
 *
 * @param arg
 *   flower action properties in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   Count statistics retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
				     uint16_t rta_type[], int idx, void *data)
{
	int attr_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
	struct rtattr *table[attr_max + 1];

	if (!arg || idx < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(table, attr_max,
				 RTA_DATA(arg), RTA_PAYLOAD(arg));
	/* An action without a kind attribute is not usable. */
	if (!table[TCA_ACT_KIND])
		return -1;
	if (rta_type[idx] != TCA_ACT_STATS || !table[TCA_ACT_STATS])
		return -1;
	/* Descend one level: the stats object is the next table entry. */
	return flow_tcf_nl_action_stats_parse_and_get
			(table[TCA_ACT_STATS], rta_type, idx - 1,
			 (struct gnet_stats_basic *)data);
}
/**
 * Parse flower action section in the message retrieving the requested
 * attribute from the first action that provides it.
 *
 * @param arg
 *   flower section in the Netlink message received.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
				 uint16_t rta_type[], int idx, void *data)
{
	struct rtattr *actions[TCA_ACT_MAX_PRIO + 1];
	int prio;

	if (!arg || idx < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(actions, TCA_ACT_MAX_PRIO,
				 RTA_DATA(arg), RTA_PAYLOAD(arg));
	if (rta_type[idx] != TCA_ACT_STATS)
		return -1;
	/*
	 * flow counters are stored in the actions defined by the flow
	 * and not in the flow itself, therefore we need to traverse the
	 * flower chain of actions in search for them.
	 *
	 * Note that the index is not decremented here.
	 */
	for (prio = 0; prio <= TCA_ACT_MAX_PRIO; prio++) {
		if (!actions[prio])
			continue;
		if (flow_tcf_nl_parse_one_action_and_get(actions[prio],
							 rta_type,
							 idx, data) == 0)
			return 0;
	}
	return -1;
}
5910 * Parse flower classifier options in the message, retrieving the requested
5911 * attribute if found.
5914 * flower section in the Netlink message received.
5916 * The backward sequence of rta_types, as written in the attribute table,
5917 * we need to traverse in order to get to the requested object.
5919 * Current location in rta_type table.
5921 * data retrieved from the message query.
5924 * 0 if data was found and retrieved, -1 otherwise.
5927 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
5928 uint16_t rta_type[], int idx, void *data)
5930 int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
5932 struct rtattr *tb[tca_flower_max + 1];
5934 if (!opt || idx < 0)
5936 flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
5937 RTA_DATA(opt), RTA_PAYLOAD(opt));
5938 switch (rta_type[idx]) {
5939 case TCA_FLOWER_ACT:
5940 if (tb[TCA_FLOWER_ACT])
5941 return flow_tcf_nl_action_parse_and_get
5942 (tb[TCA_FLOWER_ACT],
5943 rta_type, --idx, data);
/**
 * Parse Netlink reply on filter query, retrieving the flow counters.
 *
 * @param cnlh
 *   Message received from Netlink.
 * @param rta_type
 *   The backward sequence of rta_types, as written in the attribute table,
 *   we need to traverse in order to get to the requested object.
 * @param idx
 *   Current location in rta_type table.
 * @param[out] data
 *   data retrieved from the message query.
 *
 * @return
 *   0 if data was found and retrieved, -1 otherwise.
 */
static int
flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
				 uint16_t rta_type[], int idx, void *data)
{
	struct tcmsg *tcm = NLMSG_DATA(cnlh);
	int attrs_len = cnlh->nlmsg_len;
	int attr_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
	struct rtattr *table[attr_max + 1];

	if (idx < 0)
		return -1;
	/* Only traffic-control filter messages are handled. */
	switch (cnlh->nlmsg_type) {
	case RTM_NEWTFILTER:
	case RTM_GETTFILTER:
	case RTM_DELTFILTER:
		break;
	default:
		return -1;
	}
	attrs_len -= NLMSG_LENGTH(sizeof(*tcm));
	if (attrs_len < 0)
		return -1;
	flow_tcf_nl_parse_rtattr(table, attr_max, TCA_RTA(tcm), attrs_len);
	/* Not a TC flower flow - bail out */
	if (!table[TCA_KIND])
		return -1;
	if (strcmp(RTA_DATA(table[TCA_KIND]), "flower") != 0)
		return -1;
	if (rta_type[idx] != TCA_OPTIONS || !table[TCA_OPTIONS])
		return -1;
	return flow_tcf_nl_opts_parse_and_get(table[TCA_OPTIONS],
					      rta_type, idx - 1, data);
}
6005 * A callback to parse Netlink reply on TC flower query.
6008 * Message received from Netlink.
6010 * Pointer to data area to be filled by the parsing routine.
6011 * assumed to be a pointer to struct flow_tcf_stats_basic.
6017 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6020 * The backward sequence of rta_types to pass in order to get
6023 uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6024 TCA_FLOWER_ACT, TCA_OPTIONS };
6025 struct flow_tcf_stats_basic *sb_data = data;
6027 const struct nlmsghdr *c;
6028 struct nlmsghdr *nc;
6029 } tnlh = { .c = nlh };
6031 if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6032 RTE_DIM(rta_type) - 1,
6033 (void *)&sb_data->counters))
6034 sb_data->valid = true;
6039 * Query a TC flower rule for its statistics via netlink.
6042 * Pointer to Ethernet device.
6044 * Pointer to the sub flow.
6046 * data retrieved by the query.
6048 * Perform verbose error reporting if not NULL.
6051 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Reads the counters of a flow by re-sending the netlink message stored
 * with the flow as a RTM_GETTFILTER request and parsing the reply with
 * flow_tcf_nl_message_get_stats_basic(). The result is written to data,
 * which is interpreted as a struct rte_flow_query_count.
 */
6054 flow_tcf_query_count(struct rte_eth_dev *dev,
6055 struct rte_flow *flow,
6057 struct rte_flow_error *error)
6059 struct flow_tcf_stats_basic sb_data;
6060 struct rte_flow_query_count *qc = data;
6061 struct priv *priv = dev->data->dev_private;
6062 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6063 struct mnl_socket *nl = ctx->nl;
6064 struct mlx5_flow *dev_flow;
6065 struct nlmsghdr *nlh;
/* Take the context's current sequence number and advance it. */
6066 uint32_t seq = priv->tcf_context->seq++;
/* Start with valid == 0 so an unparsed reply is reported as an error. */
6070 memset(&sb_data, 0, sizeof(sb_data));
6071 dev_flow = LIST_FIRST(&flow->dev_flows);
6072 /* E-Switch flow can't be expanded. */
6073 assert(!LIST_NEXT(dev_flow, next));
6074 if (!dev_flow->flow->counter)
/*
 * The message kept in dev_flow->tcf.nlh is modified in place: its type,
 * flags and sequence number are overwritten before it is sent.
 */
6076 nlh = dev_flow->tcf.nlh;
6077 nlh->nlmsg_type = RTM_GETTFILTER;
6078 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6079 nlh->nlmsg_seq = seq;
6080 if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
/* The reply is received into the context buffer and parsed from there. */
6083 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6086 ret = mnl_cb_run(ctx->buf, ret, seq,
6087 mnl_socket_get_portid(nl),
6088 flow_tcf_nl_message_get_stats_basic,
6091 /* Report counters only if the parsing callback marked them valid. */
6092 if (sb_data.valid) {
6093 /* Return the delta from last reset. */
6096 qc->hits = sb_data.counters.packets - flow->counter->hits;
6097 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
/*
 * Store the raw counters as the new baseline for later queries.
 * NOTE(review): any condition guarding this update is not visible in
 * this excerpt.
 */
6099 flow->counter->hits = sb_data.counters.packets;
6100 flow->counter->bytes = sb_data.counters.bytes;
/* Three distinct failures follow: EINVAL, errno from netlink, ENOTSUP. */
6104 return rte_flow_error_set(error, EINVAL,
6105 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6107 "flow does not have counter");
6109 return rte_flow_error_set
6110 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6111 NULL, "netlink: failed to read flow rule counters");
6113 return rte_flow_error_set
6114 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6115 NULL, "counters are not available.");
6121 * @see rte_flow_query()
/*
 * Query entry point installed as .query in mlx5_flow_tcf_drv_ops. It
 * walks the action list up to RTE_FLOW_ACTION_TYPE_END and forwards COUNT
 * actions to flow_tcf_query_count().
 */
6125 flow_tcf_query(struct rte_eth_dev *dev,
6126 struct rte_flow *flow,
6127 const struct rte_flow_action *actions,
6129 struct rte_flow_error *error)
6133 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6134 switch (actions->type) {
6135 case RTE_FLOW_ACTION_TYPE_VOID:
6137 case RTE_FLOW_ACTION_TYPE_COUNT:
/* The same data pointer is handed to the counter query unchanged. */
6138 ret = flow_tcf_query_count(dev, flow, data, error);
/*
 * NOTE(review): presumably the default branch for every other action
 * type; its case label is not visible in this excerpt.
 */
6141 return rte_flow_error_set(error, ENOTSUP,
6142 RTE_FLOW_ERROR_TYPE_ACTION,
6144 "action not supported");
/*
 * Driver operations table for the TC-flower (tcf) flow engine: each
 * member of struct mlx5_flow_driver_ops is bound to the matching
 * flow_tcf_* handler.
 */
6150 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6151 .validate = flow_tcf_validate,
6152 .prepare = flow_tcf_prepare,
6153 .translate = flow_tcf_translate,
6154 .apply = flow_tcf_apply,
6155 .remove = flow_tcf_remove,
6156 .destroy = flow_tcf_destroy,
6157 .query = flow_tcf_query,
6161 * Create and configure a libmnl socket for Netlink flow rules.
6164 * A valid libmnl socket object pointer on success, NULL otherwise and
/*
 * Opens a NETLINK_ROUTE libmnl socket, sets the NETLINK_CAP_ACK option
 * to 1 on it and binds it with MNL_SOCKET_AUTOPID.
 */
6167 static struct mnl_socket *
6168 flow_tcf_mnl_socket_create(void)
6170 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
/* The result of this call is not checked. */
6173 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
/* A zero return from mnl_socket_bind() selects the branch below. */
6175 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
/*
 * NOTE(review): presumably the failure path; the surrounding control
 * flow is not visible in this excerpt.
 */
6180 mnl_socket_close(nl);
6185 * Destroy a libmnl socket.
6188 * Libmnl socket of the @p NETLINK_ROUTE kind.
/* Releases a libmnl socket by closing it with mnl_socket_close(). */
6191 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6194 mnl_socket_close(nl);
6198 * Initialize ingress qdisc of a given network interface.
6201 * Pointer to tc-flower context to use.
6203 * Index of network interface to initialize.
6205 * Perform verbose error reporting if not NULL.
6208 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Resets the ingress qdisc of interface ifindex: sends RTM_DELQDISC for
 * the existing one, then RTM_NEWQDISC to create a fresh "ingress" qdisc.
 * Both requests go through flow_tcf_nl_ack() on the given context.
 */
6211 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6212 unsigned int ifindex, struct rte_flow_error *error)
6214 struct nlmsghdr *nlh;
/*
 * Stack buffer sized for one header of sizeof(*tcm) plus the "ingress"
 * string attribute and extra space; it is reused for both requests.
 */
6216 alignas(struct nlmsghdr)
6217 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6218 SZ_NLATTR_STRZ_OF("ingress") +
6219 MNL_BUF_EXTRA_SPACE];
6221 /* Destroy existing ingress qdisc and everything attached to it. */
6222 nlh = mnl_nlmsg_put_header(buf);
6223 nlh->nlmsg_type = RTM_DELQDISC;
6224 nlh->nlmsg_flags = NLM_F_REQUEST;
6225 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6226 tcm->tcm_family = AF_UNSPEC;
6227 tcm->tcm_ifindex = ifindex;
6228 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6229 tcm->tcm_parent = TC_H_INGRESS;
6230 assert(sizeof(buf) >= nlh->nlmsg_len);
6231 /* Ignore errors when qdisc is already absent. */
/* Only rte_errno values other than EINVAL and ENOENT abort here. */
6232 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6233 rte_errno != EINVAL && rte_errno != ENOENT)
6234 return rte_flow_error_set(error, rte_errno,
6235 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6236 "netlink: failed to remove ingress"
6238 /* Create fresh ingress qdisc. */
6239 nlh = mnl_nlmsg_put_header(buf);
6240 nlh->nlmsg_type = RTM_NEWQDISC;
6241 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6242 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6243 tcm->tcm_family = AF_UNSPEC;
6244 tcm->tcm_ifindex = ifindex;
6245 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6246 tcm->tcm_parent = TC_H_INGRESS;
/*
 * NOTE(review): the result of mnl_attr_put_strz_check() is not tested;
 * the assert that follows is the only visible size guard.
 */
6247 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6248 assert(sizeof(buf) >= nlh->nlmsg_len);
/* Unlike the delete request, any failure to create the qdisc is fatal. */
6249 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6250 return rte_flow_error_set(error, rte_errno,
6251 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6252 "netlink: failed to create ingress"
6258 * Create libmnl context for Netlink flow rules.
6261 * A valid context object pointer on success, NULL otherwise and
/*
 * Allocates a struct mlx5_flow_tcf_context and fills in its members:
 * a libmnl socket, a message buffer of MNL_SOCKET_BUFFER_SIZE bytes and
 * an initial sequence number.
 */
6264 struct mlx5_flow_tcf_context *
6265 mlx5_flow_tcf_context_create(void)
6267 struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6272 ctx->nl = flow_tcf_mnl_socket_create();
6275 ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
/* The buffer is requested with sizeof(uint32_t) alignment. */
6276 ctx->buf = rte_zmalloc(__func__,
6277 ctx->buf_size, sizeof(uint32_t));
/* The sequence counter starts from a random() value. */
6280 ctx->seq = random();
/*
 * NOTE(review): presumably the error path, tearing down a partially
 * built context; the label and checks are not visible in this excerpt.
 */
6283 mlx5_flow_tcf_context_destroy(ctx);
6288 * Destroy a libmnl context.
6291 * Pointer to the context to destroy.
6294 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6298 flow_tcf_mnl_socket_destroy(ctx->nl);