X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_flow_tcf.c;h=2270ae3044c0d822b539deee19ed830d2a559b7a;hb=24ac604ef7469eb5773c2504b313dd00257f8df3;hp=ee614b3f1ddae4f145272d6dcebffbf2fe8dd602;hpb=e7d2e32b26f83b69128f80a7d025dca149c5a126;p=dpdk.git diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c index ee614b3f1d..2270ae3044 100644 --- a/drivers/net/mlx5/mlx5_flow_tcf.c +++ b/drivers/net/mlx5/mlx5_flow_tcf.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "mlx5.h" #include "mlx5_flow.h" @@ -125,6 +126,14 @@ struct tc_pedit_sel { #define TCA_TUNNEL_KEY_NO_CSUM 10 #endif +#ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS +#define TCA_TUNNEL_KEY_ENC_TOS 12 +#endif + +#ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL +#define TCA_TUNNEL_KEY_ENC_TTL 13 +#endif + #else /* HAVE_TC_ACT_TUNNEL_KEY */ #define TCA_ACT_TUNNEL_KEY 17 @@ -138,6 +147,8 @@ struct tc_pedit_sel { #define TCA_TUNNEL_KEY_ENC_KEY_ID 7 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9 #define TCA_TUNNEL_KEY_NO_CSUM 10 +#define TCA_TUNNEL_KEY_ENC_TOS 12 +#define TCA_TUNNEL_KEY_ENC_TTL 13 struct tc_tunnel_key { tc_gen; @@ -160,6 +171,9 @@ struct tc_tunnel_key { #ifndef TCA_CLS_FLAGS_SKIP_SW #define TCA_CLS_FLAGS_SKIP_SW (1 << 1) #endif +#ifndef TCA_CLS_FLAGS_IN_HW +#define TCA_CLS_FLAGS_IN_HW (1 << 2) +#endif #ifndef HAVE_TCA_CHAIN #define TCA_CHAIN 11 #endif @@ -289,6 +303,31 @@ struct tc_tunnel_key { #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72 #endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS +#define TCA_FLOWER_KEY_IP_TOS 73 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK +#define TCA_FLOWER_KEY_IP_TOS_MASK 74 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL +#define TCA_FLOWER_KEY_IP_TTL 75 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK +#define TCA_FLOWER_KEY_IP_TTL_MASK 76 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS +#define TCA_FLOWER_KEY_ENC_IP_TOS 80 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK +#define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL +#define TCA_FLOWER_KEY_ENC_IP_TTL 82 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK +#define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83 +#endif + #ifndef HAVE_TC_ACT_GOTO_CHAIN #define TC_ACT_GOTO_CHAIN 0x20000000 #endif @@ -313,10 +352,14 @@ struct tc_tunnel_key { #define TCA_ACT_MAX_PRIO 32 #endif -/** UDP port range of VXLAN devices created by driver. */ -#define MLX5_VXLAN_PORT_MIN 30000 -#define MLX5_VXLAN_PORT_MAX 60000 +/** Parameters of VXLAN devices created by driver. */ +#define MLX5_VXLAN_DEFAULT_VNI 1 #define MLX5_VXLAN_DEVICE_PFX "vmlx_" +/** + * Timeout in milliseconds to wait VXLAN UDP offloaded port + * registration completed within the mlx5 driver. + */ +#define MLX5_VXLAN_WAIT_PORT_REG_MS 250 /** Tunnel action type, used for @p type in header structure. */ enum flow_tcf_tunact_type { @@ -334,6 +377,8 @@ enum flow_tcf_tunact_type { #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6) #define FLOW_TCF_ENCAP_UDP_DST (1u << 7) #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8) +#define FLOW_TCF_ENCAP_IP_TTL (1u << 9) +#define FLOW_TCF_ENCAP_IP_TOS (1u << 10) /** * Structure for holding netlink context. @@ -357,7 +402,7 @@ struct mlx5_flow_tcf_context { struct tcf_neigh_rule { LIST_ENTRY(tcf_neigh_rule) next; uint32_t refcnt; - struct ether_addr eth; + struct rte_ether_addr eth; uint16_t mask; union { struct { @@ -391,16 +436,23 @@ struct tcf_local_rule { }; }; +/** Outer interface VXLAN encapsulation rules container. 
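 * A minimal sketch of how such a per-interface container can be looked
 * up and reference-counted; the helper name flow_tcf_irule_acquire()
 * and the list head are hypothetical, shown only to illustrate the
 * refcnt/ifouter fields below:
 *
 *   static LIST_HEAD(, tcf_irule) irules = LIST_HEAD_INITIALIZER(irules);
 *
 *   static struct tcf_irule *
 *   flow_tcf_irule_acquire(unsigned int ifouter)
 *   {
 *           struct tcf_irule *iface;
 *
 *           LIST_FOREACH(iface, &irules, next)
 *                   if (iface->ifouter == ifouter) {
 *                           iface->refcnt++;
 *                           return iface;
 *                   }
 *           iface = rte_zmalloc(__func__, sizeof(*iface), 0);
 *           if (!iface)
 *                   return NULL;
 *           // rte_zmalloc zeroes the memory, so the embedded neigh and
 *           // local LIST_HEADs start out empty.
 *           iface->refcnt = 1;
 *           iface->ifouter = ifouter;
 *           LIST_INSERT_HEAD(&irules, iface, next);
 *           return iface;
 *   }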
*/ +struct tcf_irule { + LIST_ENTRY(tcf_irule) next; + LIST_HEAD(, tcf_neigh_rule) neigh; + LIST_HEAD(, tcf_local_rule) local; + uint32_t refcnt; + unsigned int ifouter; /**< Own interface index. */ +}; + /** VXLAN virtual netdev. */ struct tcf_vtep { LIST_ENTRY(tcf_vtep) next; - LIST_HEAD(, tcf_neigh_rule) neigh; - LIST_HEAD(, tcf_local_rule) local; uint32_t refcnt; unsigned int ifindex; /**< Own interface index. */ - unsigned int ifouter; /**< Index of device attached to. */ uint16_t port; - uint8_t created; + uint32_t created:1; /**< Actually created by PMD. */ + uint32_t waitreg:1; /**< Wait for VXLAN UDP port registration. */ }; /** Tunnel descriptor header, common for all tunnel types. */ @@ -418,10 +470,13 @@ struct flow_tcf_vxlan_decap { struct flow_tcf_vxlan_encap { struct flow_tcf_tunnel_hdr hdr; + struct tcf_irule *iface; uint32_t mask; + uint8_t ip_tos; + uint8_t ip_ttl_hop; struct { - struct ether_addr dst; - struct ether_addr src; + struct rte_ether_addr dst; + struct rte_ether_addr src; } eth; union { struct { @@ -433,7 +488,7 @@ struct flow_tcf_vxlan_encap { uint8_t src[IPV6_ADDR_LEN]; } ipv6; }; -struct { + struct { rte_be16_t src; rte_be16_t dst; } udp; @@ -460,7 +515,9 @@ static const union { struct rte_flow_item_tcp tcp; struct rte_flow_item_udp udp; struct rte_flow_item_vxlan vxlan; -} flow_tcf_mask_empty; +} flow_tcf_mask_empty = { + {0}, +}; /** Supported masks for known item types. */ static const struct { @@ -488,11 +545,15 @@ static const struct { }, .ipv4.hdr = { .next_proto_id = 0xff, + .time_to_live = 0xff, + .type_of_service = 0xff, .src_addr = RTE_BE32(0xffffffff), .dst_addr = RTE_BE32(0xffffffff), }, .ipv6.hdr = { .proto = 0xff, + .vtc_flow = RTE_BE32(0xfful << RTE_IPV6_HDR_FL_SHIFT), + .hop_limits = 0xff, .src_addr = "\xff\xff\xff\xff\xff\xff\xff\xff" "\xff\xff\xff\xff\xff\xff\xff\xff", @@ -628,8 +689,8 @@ flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions, { int idx = p_parser->sel.nkeys; uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? - offsetof(struct ether_hdr, s_addr) : - offsetof(struct ether_hdr, d_addr); + offsetof(struct rte_ether_hdr, s_addr) : + offsetof(struct rte_ether_hdr, d_addr); const struct rte_flow_action_set_mac *conf = (const struct rte_flow_action_set_mac *)actions->conf; @@ -646,7 +707,7 @@ flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions, p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; memcpy(&p_parser->keys[idx].val, conf->mac_addr + SZ_PEDIT_KEY_VAL, - ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL); + RTE_ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL); p_parser->sel.nkeys = (++idx); } @@ -671,12 +732,12 @@ flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions, if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) { p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4; p_parser->keys[idx].off = - offsetof(struct ipv4_hdr, time_to_live); + offsetof(struct rte_ipv4_hdr, time_to_live); } if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) { p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6; p_parser->keys[idx].off = - offsetof(struct ipv6_hdr, hop_limits); + offsetof(struct rte_ipv6_hdr, hop_limits); } if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) { p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD; @@ -740,8 +801,8 @@ flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions, int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN); int off_base = actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? 
- offsetof(struct ipv6_hdr, src_addr) : - offsetof(struct ipv6_hdr, dst_addr); + offsetof(struct rte_ipv6_hdr, src_addr) : + offsetof(struct rte_ipv6_hdr, dst_addr); const struct rte_flow_action_set_ipv6 *conf = (const struct rte_flow_action_set_ipv6 *)actions->conf; @@ -775,8 +836,8 @@ flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions, p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; p_parser->keys[idx].off = actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? - offsetof(struct ipv4_hdr, src_addr) : - offsetof(struct ipv4_hdr, dst_addr); + offsetof(struct rte_ipv4_hdr, src_addr) : + offsetof(struct rte_ipv4_hdr, dst_addr); p_parser->keys[idx].mask = ~UINT32_MAX; p_parser->keys[idx].val = ((const struct rte_flow_action_set_ipv4 *) @@ -923,11 +984,11 @@ flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions, flags |= MLX5_FLOW_ACTION_DEC_TTL; break; case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: - keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN); + keys += NUM_OF_PEDIT_KEYS(RTE_ETHER_ADDR_LEN); flags |= MLX5_FLOW_ACTION_SET_MAC_SRC; break; case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: - keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN); + keys += NUM_OF_PEDIT_KEYS(RTE_ETHER_ADDR_LEN); flags |= MLX5_FLOW_ACTION_SET_MAC_DST; break; default: @@ -1263,6 +1324,20 @@ flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item, " must be specified for" " vxlan encapsulation"); } + if (mask->hdr.type_of_service && + mask->hdr.type_of_service != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.type_of_service\" field" + " for vxlan encapsulation"); + if (mask->hdr.time_to_live && + mask->hdr.time_to_live != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.time_to_live\" field" + " for vxlan encapsulation"); return 0; } @@ -1276,7 +1351,7 @@ flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. **/ static int flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, @@ -1284,6 +1359,7 @@ flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, { const struct rte_flow_item_ipv6 *spec = item->spec; const struct rte_flow_item_ipv6 *mask = item->mask; + uint8_t msk6; if (!spec) { /* @@ -1349,6 +1425,20 @@ flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, " must be specified for" " vxlan encapsulation"); } + msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >> + RTE_IPV6_HDR_TC_SHIFT) & 0xff; + if (msk6 && msk6 != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.vtc_flow.tos\" field" + " for vxlan encapsulation"); + if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.hop_limits\" field" + " for vxlan encapsulation"); return 0; } @@ -1362,7 +1452,7 @@ flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. 
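 * A condensed sketch of the full-mask-or-nothing idiom this helper
 * applies (field names from struct rte_flow_item_udp; the destination
 * port check is representative, the source port is handled likewise):
 *
 *   if (mask->hdr.dst_port &&
 *       mask->hdr.dst_port != RTE_BE16(0xffff))
 *           return rte_flow_error_set(error, ENOTSUP,
 *                           RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
 *                           "no support for partial mask on"
 *                           " \"udp.hdr.dst_port\" field"
 *                           " for vxlan encapsulation");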
**/ static int flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item, @@ -1430,7 +1520,7 @@ flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. **/ static int flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item, @@ -1478,7 +1568,7 @@ flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. **/ static int flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, @@ -1516,8 +1606,9 @@ flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, break; break; case RTE_FLOW_ITEM_TYPE_IPV4: - ret = mlx5_flow_validate_item_ipv4(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv4 + (items, item_flags, + &flow_tcf_mask_supported.ipv4, error); if (ret < 0) return ret; ret = flow_tcf_validate_vxlan_encap_ipv4(items, error); @@ -1526,8 +1617,9 @@ flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; break; case RTE_FLOW_ITEM_TYPE_IPV6: - ret = mlx5_flow_validate_item_ipv6(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv6 + (items, item_flags, + &flow_tcf_mask_supported.ipv6, error); if (ret < 0) return ret; ret = flow_tcf_validate_vxlan_encap_ipv6(items, error); @@ -1581,141 +1673,8 @@ flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, } /** - * Validate RTE_FLOW_ITEM_TYPE_IPV4 item if VXLAN_DECAP action - * is present in actions list. - * - * @param[in] ipv4 - * Outer IPv4 address item (if any, NULL otherwise). - * @param[out] error - * Pointer to the error structure. - * - * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. - **/ -static int -flow_tcf_validate_vxlan_decap_ipv4(const struct rte_flow_item *ipv4, - struct rte_flow_error *error) -{ - const struct rte_flow_item_ipv4 *spec = ipv4->spec; - const struct rte_flow_item_ipv4 *mask = ipv4->mask; - - if (!spec) { - /* - * Specification for IP addresses cannot be empty - * because it is required as decap parameter. - */ - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, ipv4, - "NULL outer ipv4 address" - " specification for vxlan" - " for vxlan decapsulation"); - } - if (!mask) - mask = &rte_flow_item_ipv4_mask; - if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) { - if (mask->hdr.dst_addr != RTE_BE32(0xffffffff)) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, - "no support for partial mask on" - " \"ipv4.hdr.dst_addr\" field"); - /* More IP address validations can be put here. */ - } else { - /* - * Kernel uses the destination IP address - * to determine the ingress network interface - * for traffic being decapsulated. - */ - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, ipv4, - "outer ipv4 destination address" - " must be specified for" - " vxlan decapsulation"); - } - /* Source IP address is optional for decap. 
*/ - if (mask->hdr.src_addr != RTE_BE32(0x00000000) && - mask->hdr.src_addr != RTE_BE32(0xffffffff)) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, - "no support for partial mask on" - " \"ipv4.hdr.src_addr\" field"); - return 0; -} - -/** - * Validate RTE_FLOW_ITEM_TYPE_IPV6 item if VXLAN_DECAP action - * is present in actions list. - * - * @param[in] ipv6 - * Outer IPv6 address item (if any, NULL otherwise). - * @param[out] error - * Pointer to the error structure. - * - * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. - **/ -static int -flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6, - struct rte_flow_error *error) -{ - const struct rte_flow_item_ipv6 *spec = ipv6->spec; - const struct rte_flow_item_ipv6 *mask = ipv6->mask; - - if (!spec) { - /* - * Specification for IP addresses cannot be empty - * because it is required as decap parameter. - */ - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, ipv6, - "NULL outer ipv6 address" - " specification for vxlan" - " decapsulation"); - } - if (!mask) - mask = &rte_flow_item_ipv6_mask; - if (memcmp(&mask->hdr.dst_addr, - &flow_tcf_mask_empty.ipv6.hdr.dst_addr, - IPV6_ADDR_LEN)) { - if (memcmp(&mask->hdr.dst_addr, - &rte_flow_item_ipv6_mask.hdr.dst_addr, - IPV6_ADDR_LEN)) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, - "no support for partial mask on" - " \"ipv6.hdr.dst_addr\" field"); - /* More IP address validations can be put here. */ - } else { - /* - * Kernel uses the destination IP address - * to determine the ingress network interface - * for traffic being decapsulated. - */ - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ITEM, ipv6, - "outer ipv6 destination address must be " - "specified for vxlan decapsulation"); - } - /* Source IP address is optional for decap. */ - if (memcmp(&mask->hdr.src_addr, - &flow_tcf_mask_empty.ipv6.hdr.src_addr, - IPV6_ADDR_LEN)) { - if (memcmp(&mask->hdr.src_addr, - &rte_flow_item_ipv6_mask.hdr.src_addr, - IPV6_ADDR_LEN)) - return rte_flow_error_set - (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, - "no support for partial mask on" - " \"ipv6.hdr.src_addr\" field"); - } - return 0; -} - -/** - * Validate RTE_FLOW_ITEM_TYPE_UDP item if VXLAN_DECAP action - * is present in actions list. + * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item + * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list. * * @param[in] udp * Outer UDP layer item (if any, NULL otherwise). @@ -1723,7 +1682,7 @@ flow_tcf_validate_vxlan_decap_ipv6(const struct rte_flow_item *ipv6, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. **/ static int flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp, @@ -1791,7 +1750,7 @@ flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. 
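 * A bird's-eye sketch of the control flow implemented below; the flag
 * names are real, current_action_flag is a real local of the function,
 * and detected_layer_flag stands in for the per-item flag selection:
 *
 *   uint64_t item_flags = 0, action_flags = 0;
 *
 *   for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++)
 *           action_flags |= current_action_flag; // per-action checks
 *   for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++)
 *           item_flags |= detected_layer_flag;   // per-item checks
 *   if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
 *       !(item_flags & MLX5_FLOW_LAYER_VXLAN))
 *           return rte_flow_error_set(error, EINVAL,
 *                           RTE_FLOW_ERROR_TYPE_ACTION, NULL,
 *                           "no VNI pattern found"
 *                           " for vxlan decap action");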
*/ static int flow_tcf_validate(struct rte_eth_dev *dev, @@ -1822,9 +1781,13 @@ flow_tcf_validate(struct rte_eth_dev *dev, const struct rte_flow_action_set_ipv4 *set_ipv4; const struct rte_flow_action_set_ipv6 *set_ipv6; } conf; + const struct rte_flow_item *outer_udp = NULL; + rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL); + rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL); + rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL); uint64_t item_flags = 0; uint64_t action_flags = 0; - uint8_t next_protocol = -1; + uint8_t next_protocol = 0xff; unsigned int tcm_ifindex = 0; uint8_t pedit_validated = 0; struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)]; @@ -1884,9 +1847,23 @@ flow_tcf_validate(struct rte_eth_dev *dev, case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN; break; - case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: { + rte_be16_t ethertype; + current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN; + if (!actions->conf) + break; + conf.of_push_vlan = actions->conf; + ethertype = conf.of_push_vlan->ethertype; + if (ethertype != RTE_BE16(ETH_P_8021Q) && + ethertype != RTE_BE16(ETH_P_8021AD)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan push TPID must be " + "802.1Q or 802.1AD"); break; + } case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) return rte_flow_error_set @@ -1994,17 +1971,16 @@ flow_tcf_validate(struct rte_eth_dev *dev, for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { unsigned int i; - if ((item_flags & MLX5_FLOW_LAYER_TUNNEL) && - items->type != RTE_FLOW_ITEM_TYPE_ETH) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - items, - "only L2 inner item" - " is supported"); switch (items->type) { case RTE_FLOW_ITEM_TYPE_VOID: break; case RTE_FLOW_ITEM_TYPE_PORT_ID: + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, items, + "inner tunnel port id" + " item is not supported"); mask.port_id = flow_tcf_item_mask (items, &rte_flow_item_port_id_mask, &flow_tcf_mask_supported.port_id, @@ -2055,8 +2031,8 @@ flow_tcf_validate(struct rte_eth_dev *dev, if (ret < 0) return ret; item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? - MLX5_FLOW_LAYER_INNER_L2 : - MLX5_FLOW_LAYER_OUTER_L2; + MLX5_FLOW_LAYER_INNER_L2 : + MLX5_FLOW_LAYER_OUTER_L2; /* TODO: * Redundant check due to different supported mask. * Same for the rest of items. 
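 * The inner/outer flag selection above is the idiom used for every
 * L2-L4 item from here on; factored out for illustration only (a
 * hypothetical helper, the driver open-codes the ternary):
 *
 *   static uint64_t
 *   layer_flag(uint64_t item_flags, uint64_t inner, uint64_t outer)
 *   {
 *           return (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
 *                  inner : outer;
 *   }
 *
 *   item_flags |= layer_flag(item_flags,
 *                            MLX5_FLOW_LAYER_INNER_L2,
 *                            MLX5_FLOW_LAYER_OUTER_L2);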
@@ -2077,8 +2053,40 @@ flow_tcf_validate(struct rte_eth_dev *dev, mask.eth, "no support for partial mask on" " \"type\" field"); + assert(items->spec); + spec.eth = items->spec; + if (mask.eth->type && + (item_flags & MLX5_FLOW_LAYER_TUNNEL) && + inner_etype != RTE_BE16(ETH_P_ALL) && + inner_etype != spec.eth->type) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "inner eth_type conflict"); + if (mask.eth->type && + !(item_flags & MLX5_FLOW_LAYER_TUNNEL) && + outer_etype != RTE_BE16(ETH_P_ALL) && + outer_etype != spec.eth->type) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "outer eth_type conflict"); + if (mask.eth->type) { + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + inner_etype = spec.eth->type; + else + outer_etype = spec.eth->type; + } break; case RTE_FLOW_ITEM_TYPE_VLAN: + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + return rte_flow_error_set + (error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, items, + "inner tunnel VLAN" + " is not supported"); ret = mlx5_flow_validate_item_vlan(items, item_flags, error); if (ret < 0) @@ -2107,13 +2115,37 @@ flow_tcf_validate(struct rte_eth_dev *dev, "no support for partial masks on" " \"tci\" (PCP and VID parts) and" " \"inner_type\" fields"); + if (outer_etype != RTE_BE16(ETH_P_ALL) && + outer_etype != RTE_BE16(ETH_P_8021Q)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "outer eth_type conflict," + " must be 802.1Q"); + outer_etype = RTE_BE16(ETH_P_8021Q); + assert(items->spec); + spec.vlan = items->spec; + if (mask.vlan->inner_type && + vlan_etype != RTE_BE16(ETH_P_ALL) && + vlan_etype != spec.vlan->inner_type) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "vlan eth_type conflict"); + if (mask.vlan->inner_type) + vlan_etype = spec.vlan->inner_type; break; case RTE_FLOW_ITEM_TYPE_IPV4: - ret = mlx5_flow_validate_item_ipv4(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv4 + (items, item_flags, + &flow_tcf_mask_supported.ipv4, error); if (ret < 0) return ret; - item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? 
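/*
 * A condensed restatement of the ether-type bookkeeping introduced
 * above; the slot pointer is an editorial simplification, the driver
 * writes inner_etype/outer_etype directly:
 *
 *   rte_be16_t *slot = (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
 *                      &inner_etype : &outer_etype;
 *   if (mask.eth->type &&
 *       *slot != RTE_BE16(ETH_P_ALL) && *slot != spec.eth->type)
 *           return rte_flow_error_set(error, EINVAL,
 *                           RTE_FLOW_ERROR_TYPE_ITEM, items,
 *                           "eth_type conflict");
 *   if (mask.eth->type)
 *           *slot = spec.eth->type;
 */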
+ MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; mask.ipv4 = flow_tcf_item_mask (items, &rte_flow_item_ipv4_mask, &flow_tcf_mask_supported.ipv4, @@ -2134,19 +2166,47 @@ flow_tcf_validate(struct rte_eth_dev *dev, next_protocol = ((const struct rte_flow_item_ipv4 *) (items->spec))->hdr.next_proto_id; - if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { - ret = flow_tcf_validate_vxlan_decap_ipv4 - (items, error); - if (ret < 0) - return ret; + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) { + if (inner_etype != RTE_BE16(ETH_P_ALL) && + inner_etype != RTE_BE16(ETH_P_IP)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "inner eth_type conflict," + " IPv4 is required"); + inner_etype = RTE_BE16(ETH_P_IP); + } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) { + if (vlan_etype != RTE_BE16(ETH_P_ALL) && + vlan_etype != RTE_BE16(ETH_P_IP)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "vlan eth_type conflict," + " IPv4 is required"); + vlan_etype = RTE_BE16(ETH_P_IP); + } else { + if (outer_etype != RTE_BE16(ETH_P_ALL) && + outer_etype != RTE_BE16(ETH_P_IP)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "eth_type conflict," + " IPv4 is required"); + outer_etype = RTE_BE16(ETH_P_IP); } break; case RTE_FLOW_ITEM_TYPE_IPV6: - ret = mlx5_flow_validate_item_ipv6(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv6 + (items, item_flags, + &flow_tcf_mask_supported.ipv6, error); if (ret < 0) return ret; - item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? + MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; mask.ipv6 = flow_tcf_item_mask (items, &rte_flow_item_ipv6_mask, &flow_tcf_mask_supported.ipv6, @@ -2167,11 +2227,36 @@ flow_tcf_validate(struct rte_eth_dev *dev, next_protocol = ((const struct rte_flow_item_ipv6 *) (items->spec))->hdr.proto; - if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { - ret = flow_tcf_validate_vxlan_decap_ipv6 - (items, error); - if (ret < 0) - return ret; + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) { + if (inner_etype != RTE_BE16(ETH_P_ALL) && + inner_etype != RTE_BE16(ETH_P_IPV6)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "inner eth_type conflict," + " IPv6 is required"); + inner_etype = RTE_BE16(ETH_P_IPV6); + } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) { + if (vlan_etype != RTE_BE16(ETH_P_ALL) && + vlan_etype != RTE_BE16(ETH_P_IPV6)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "vlan eth_type conflict," + " IPv6 is required"); + vlan_etype = RTE_BE16(ETH_P_IPV6); + } else { + if (outer_etype != RTE_BE16(ETH_P_ALL) && + outer_etype != RTE_BE16(ETH_P_IPV6)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, + items, + "eth_type conflict," + " IPv6 is required"); + outer_etype = RTE_BE16(ETH_P_IPV6); } break; case RTE_FLOW_ITEM_TYPE_UDP: @@ -2179,7 +2264,9 @@ flow_tcf_validate(struct rte_eth_dev *dev, next_protocol, error); if (ret < 0) return ret; - item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? 
+ MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; mask.udp = flow_tcf_item_mask (items, &rte_flow_item_udp_mask, &flow_tcf_mask_supported.udp, @@ -2188,12 +2275,12 @@ flow_tcf_validate(struct rte_eth_dev *dev, error); if (!mask.udp) return -rte_errno; - if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { - ret = flow_tcf_validate_vxlan_decap_udp - (items, error); - if (ret < 0) - return ret; - } + /* + * Save the presumed outer UDP item for extra check + * if the tunnel item will be found later in the list. + */ + if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL)) + outer_udp = items; break; case RTE_FLOW_ITEM_TYPE_TCP: ret = mlx5_flow_validate_item_tcp @@ -2203,7 +2290,9 @@ flow_tcf_validate(struct rte_eth_dev *dev, error); if (ret < 0) return ret; - item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? + MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; mask.tcp = flow_tcf_item_mask (items, &rte_flow_item_tcp_mask, &flow_tcf_mask_supported.tcp, @@ -2214,13 +2303,12 @@ flow_tcf_validate(struct rte_eth_dev *dev, return -rte_errno; break; case RTE_FLOW_ITEM_TYPE_VXLAN: - if (!(action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)) + if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) return rte_flow_error_set (error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ITEM, - items, - "vni pattern should be followed by" - " vxlan decapsulation action"); + RTE_FLOW_ERROR_TYPE_ITEM, items, + "vxlan tunnel over vlan" + " is not supported"); ret = mlx5_flow_validate_item_vxlan(items, item_flags, error); if (ret < 0) @@ -2242,6 +2330,45 @@ flow_tcf_validate(struct rte_eth_dev *dev, mask.vxlan, "no support for partial or " "empty mask on \"vxlan.vni\" field"); + /* + * The VNI item assumes the VXLAN tunnel, it requires + * at least the outer destination UDP port must be + * specified without wildcards to allow kernel select + * the virtual VXLAN device by port. Also outer IPv4 + * or IPv6 item must be specified (wilcards or even + * zero mask are allowed) to let driver know the tunnel + * IP version and process UDP traffic correctly. + */ + if (!(item_flags & + (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | + MLX5_FLOW_LAYER_OUTER_L3_IPV6))) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no outer IP pattern found" + " for vxlan tunnel"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no outer UDP pattern found" + " for vxlan tunnel"); + /* + * All items preceding the tunnel item become outer + * ones and we should do extra validation for them + * due to tc limitations for tunnel outer parameters. + * Currently only outer UDP item requres extra check, + * use the saved pointer instead of item list rescan. + */ + assert(outer_udp); + ret = flow_tcf_validate_vxlan_decap_udp + (outer_udp, error); + if (ret < 0) + return ret; + /* Reset L4 protocol for inner parameters. 
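 * The full 24-bit VNI matched above is later converted to the 32-bit
 * network-endian value used for TCA_FLOWER_KEY_ENC_KEY_ID and
 * TCA_TUNNEL_KEY_ENC_KEY_ID by vxlan_vni_as_be32() (defined further
 * down in this file); the conversion amounts to:
 *
 *   rte_be32_t vni32 = RTE_BE32(0);
 *
 *   memcpy((uint8_t *)&vni32 + 1, spec.vxlan->vni, 3);
 *   // 24-bit VNI lands in the three low-order bytes of the be32 word.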
*/ + next_protocol = 0xff; break; default: return rte_flow_error_set(error, ENOTSUP, @@ -2299,7 +2426,7 @@ flow_tcf_validate(struct rte_eth_dev *dev, */ if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) && (action_flags & MLX5_FLOW_ACTION_PORT_ID) && - ((struct priv *)port_id_dev->data->dev_private)->representor) + ((struct mlx5_priv *)port_id_dev->data->dev_private)->representor) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, actions, "vlan push can only be applied" @@ -2344,52 +2471,45 @@ flow_tcf_validate(struct rte_eth_dev *dev, "no ethernet found in" " pattern"); } - if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { - if (!(item_flags & - (MLX5_FLOW_LAYER_OUTER_L3_IPV4 | - MLX5_FLOW_LAYER_OUTER_L3_IPV6))) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - NULL, - "no outer IP pattern found" - " for vxlan decap action"); - if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - NULL, - "no outer UDP pattern found" - " for vxlan decap action"); - if (!(item_flags & MLX5_FLOW_LAYER_VXLAN)) - return rte_flow_error_set(error, EINVAL, - RTE_FLOW_ERROR_TYPE_ACTION, - NULL, - "no VNI pattern found" - " for vxlan decap action"); - } + if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) && + !(item_flags & MLX5_FLOW_LAYER_VXLAN)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "no VNI pattern found" + " for vxlan decap action"); + if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) && + (item_flags & MLX5_FLOW_LAYER_TUNNEL)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, + "vxlan encap not supported" + " for tunneled traffic"); return 0; } /** - * Calculate maximum size of memory for flow items of Linux TC flower and - * extract specified items. + * Calculate maximum size of memory for flow items of Linux TC flower. * + * @param[in] attr + * Pointer to the flow attributes. * @param[in] items * Pointer to the list of items. - * @param[out] item_flags - * Pointer to the detected items. + * @param[out] action_flags + * Pointer to the detected actions. * * @return * Maximum size of memory for items. */ static int -flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - uint64_t *item_flags) +flow_tcf_get_items_size(const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + uint64_t *action_flags) { int size = 0; - uint64_t flags = 0; size += SZ_NLATTR_STRZ_OF("flower") + + SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */ SZ_NLATTR_NEST + /* TCA_OPTIONS. */ SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */ if (attr->group > 0) @@ -2401,48 +2521,62 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, case RTE_FLOW_ITEM_TYPE_PORT_ID: break; case RTE_FLOW_ITEM_TYPE_ETH: - size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ - SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4; + size += SZ_NLATTR_DATA_OF(RTE_ETHER_ADDR_LEN) * 4; /* dst/src MAC addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L2; break; case RTE_FLOW_ITEM_TYPE_VLAN: - size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ - SZ_NLATTR_TYPE_OF(uint16_t) + + size += SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */ SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */ - flags |= MLX5_FLOW_LAYER_OUTER_VLAN; break; - case RTE_FLOW_ITEM_TYPE_IPV4: - size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ - SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. 
*/ + case RTE_FLOW_ITEM_TYPE_IPV4: { + const struct rte_flow_item_ipv4 *ipv4 = items->mask; + + size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint32_t) * 4; /* dst/src IP addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + if (ipv4 && ipv4->hdr.time_to_live) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv4 && ipv4->hdr.type_of_service) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; - case RTE_FLOW_ITEM_TYPE_IPV6: - size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ - SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ + } + case RTE_FLOW_ITEM_TYPE_IPV6: { + const struct rte_flow_item_ipv6 *ipv6 = items->mask; + + size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4; /* dst/src IP addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + if (ipv6 && ipv6->hdr.hop_limits) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) & + (0xfful << RTE_IPV6_HDR_TC_SHIFT))) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; + } case RTE_FLOW_ITEM_TYPE_UDP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; /* dst/src port and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; break; case RTE_FLOW_ITEM_TYPE_TCP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; /* dst/src port and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; break; case RTE_FLOW_ITEM_TYPE_VXLAN: size += SZ_NLATTR_TYPE_OF(uint32_t); - flags |= MLX5_FLOW_LAYER_VXLAN; + /* + * There might be no VXLAN decap action in the action + * list, nonetheless the VXLAN tunnel flow requires + * the decap structure to be correctly applied to + * VXLAN device, set the flag to create the structure. + * Translation routine will not put the decap action + * in tne Netlink message if there is no actual action + * in the list. + */ + *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP; break; default: DRV_LOG(WARNING, @@ -2452,7 +2586,6 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, break; } } - *item_flags = flags; return size; } @@ -2489,12 +2622,27 @@ flow_tcf_vxlan_encap_size(const struct rte_flow_action *action) case RTE_FLOW_ITEM_TYPE_ETH: /* This item does not require message buffer. */ break; - case RTE_FLOW_ITEM_TYPE_IPV4: + case RTE_FLOW_ITEM_TYPE_IPV4: { + const struct rte_flow_item_ipv4 *ipv4 = items->mask; + size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2; + if (ipv4 && ipv4->hdr.time_to_live) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv4 && ipv4->hdr.type_of_service) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; - case RTE_FLOW_ITEM_TYPE_IPV6: + } + case RTE_FLOW_ITEM_TYPE_IPV6: { + const struct rte_flow_item_ipv6 *ipv6 = items->mask; + size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2; + if (ipv6 && ipv6->hdr.hop_limits) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) & + (0xfful << RTE_IPV6_HDR_TC_SHIFT))) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; + } case RTE_FLOW_ITEM_TYPE_UDP: { const struct rte_flow_item_udp *udp = items->mask; @@ -2536,7 +2684,7 @@ flow_tcf_get_actions_and_size(const struct rte_flow_action actions[], uint64_t *action_flags) { int size = 0; - uint64_t flags = 0; + uint64_t flags = *action_flags; size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { @@ -2637,30 +2785,9 @@ action_of_vlan: } /** - * Brand rtnetlink buffer with unique handle. 
- * - * This handle should be unique for a given network interface to avoid - * collisions. - * - * @param nlh - * Pointer to Netlink message. - * @param handle - * Unique 32-bit handle to use. - */ -static void -flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle) -{ - struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); - - tcm->tcm_handle = handle; - DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x", - (void *)nlh, handle); -} - -/** - * Prepare a flow object for Linux TC flower. It calculates the maximum size of - * memory required, allocates the memory, initializes Netlink message headers - * and set unique TC message handle. + * Prepare a flow object for Linux TC flower. It calculates the maximum size of + * memory required, allocates the memory, initializes Netlink message headers + * and set unique TC message handle. * * @param[in] attr * Pointer to the flow attributes. @@ -2668,22 +2795,17 @@ flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle) * Pointer to the list of items. * @param[in] actions * Pointer to the list of actions. - * @param[out] item_flags - * Pointer to bit mask of all items detected. - * @param[out] action_flags - * Pointer to bit mask of all actions detected. * @param[out] error * Pointer to the error structure. * * @return * Pointer to mlx5_flow object on success, - * otherwise NULL and rte_ernno is set. + * otherwise NULL and rte_errno is set. */ static struct mlx5_flow * flow_tcf_prepare(const struct rte_flow_attr *attr, const struct rte_flow_item items[], const struct rte_flow_action actions[], - uint64_t *item_flags, uint64_t *action_flags, struct rte_flow_error *error) { size_t size = RTE_ALIGN_CEIL @@ -2692,12 +2814,13 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, MNL_ALIGN(sizeof(struct nlmsghdr)) + MNL_ALIGN(sizeof(struct tcmsg)); struct mlx5_flow *dev_flow; + uint64_t action_flags = 0; struct nlmsghdr *nlh; struct tcmsg *tcm; uint8_t *sp, *tun = NULL; - size += flow_tcf_get_items_and_size(attr, items, item_flags); - size += flow_tcf_get_actions_and_size(actions, action_flags); + size += flow_tcf_get_items_size(attr, items, &action_flags); + size += flow_tcf_get_actions_and_size(actions, &action_flags); dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO); if (!dev_flow) { rte_flow_error_set(error, ENOMEM, @@ -2706,7 +2829,7 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, return NULL; } sp = (uint8_t *)(dev_flow + 1); - if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) { + if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) { sp = RTE_PTR_ALIGN (sp, alignof(struct flow_tcf_tunnel_hdr)); tun = sp; @@ -2718,7 +2841,7 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, (sizeof(struct flow_tcf_vxlan_encap), MNL_ALIGNTO); #endif - } else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { sp = RTE_PTR_ALIGN (sp, alignof(struct flow_tcf_tunnel_hdr)); tun = sp; @@ -2747,24 +2870,10 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, .tcm = tcm, }, }; - if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP; - else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) + else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP; - /* - * Generate a reasonably unique handle based on the address of the - * target buffer. - * - * This is straightforward on 32-bit systems where the flow pointer can - * be used directly. 
Otherwise, its least significant part is taken - * after shifting it by the previous power of two of the pointed buffer - * size. - */ - if (sizeof(dev_flow) <= 4) - flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow); - else - flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >> - rte_log2_u32(rte_align32prevpow2(size))); return dev_flow; } @@ -2807,7 +2916,7 @@ flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused, * VXLAN VNI in 24-bit wire format. * * @return - * VXLAN VNI as a 32-bit integer value in network endian. + * VXLAN VNI as a 32-bit integer value in network endianness. */ static inline rte_be32_t vxlan_vni_as_be32(const uint8_t vni[3]) @@ -2876,11 +2985,14 @@ flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec, * * @param[in] spec * RTE_FLOW_ITEM_TYPE_IPV4 entry specification. + * @param[in] mask + * RTE_FLOW_ITEM_TYPE_IPV4 entry mask. * @param[out] encap * Structure to fill the gathered IPV4 address data. */ static void flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, + const struct rte_flow_item_ipv4 *mask, struct flow_tcf_vxlan_encap *encap) { /* Item must be validated before. No redundant checks. */ @@ -2889,6 +3001,14 @@ flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, encap->ipv4.src = spec->hdr.src_addr; encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST; + if (mask && mask->hdr.type_of_service) { + encap->mask |= FLOW_TCF_ENCAP_IP_TOS; + encap->ip_tos = spec->hdr.type_of_service; + } + if (mask && mask->hdr.time_to_live) { + encap->mask |= FLOW_TCF_ENCAP_IP_TTL; + encap->ip_ttl_hop = spec->hdr.time_to_live; + } } /** @@ -2899,11 +3019,14 @@ flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, * * @param[in] spec * RTE_FLOW_ITEM_TYPE_IPV6 entry specification. + * @param[in] mask + * RTE_FLOW_ITEM_TYPE_IPV6 entry mask. * @param[out] encap * Structure to fill the gathered IPV6 address data. */ static void flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec, + const struct rte_flow_item_ipv6 *mask, struct flow_tcf_vxlan_encap *encap) { /* Item must be validated before. No redundant checks. */ @@ -2912,6 +3035,19 @@ flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec, memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN); encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC | FLOW_TCF_ENCAP_IPV6_DST; + if (mask) { + if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >> + RTE_IPV6_HDR_TC_SHIFT) & 0xff) { + encap->mask |= FLOW_TCF_ENCAP_IP_TOS; + encap->ip_tos = (rte_be_to_cpu_32 + (spec->hdr.vtc_flow) >> + RTE_IPV6_HDR_TC_SHIFT) & 0xff; + } + if (mask->hdr.hop_limits) { + encap->mask |= FLOW_TCF_ENCAP_IP_TTL; + encap->ip_ttl_hop = spec->hdr.hop_limits; + } + } } /** @@ -3006,11 +3142,15 @@ flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action, break; case RTE_FLOW_ITEM_TYPE_IPV4: spec.ipv4 = items->spec; - flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap); + mask.ipv4 = items->mask; + flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4, + encap); break; case RTE_FLOW_ITEM_TYPE_IPV6: spec.ipv6 = items->spec; - flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap); + mask.ipv6 = items->mask; + flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6, + encap); break; case RTE_FLOW_ITEM_TYPE_UDP: mask.udp = items->mask; @@ -3052,7 +3192,7 @@ flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. 
+ * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, @@ -3096,10 +3236,11 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, struct nlmsghdr *nlh = dev_flow->tcf.nlh; struct tcmsg *tcm = dev_flow->tcf.tcm; uint32_t na_act_index_cur; - bool eth_type_set = 0; - bool vlan_present = 0; - bool vlan_eth_type_set = 0; + rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL); + rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL); + rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL); bool ip_proto_set = 0; + bool tunnel_outer = 0; struct nlattr *na_flower; struct nlattr *na_flower_act; struct nlattr *na_vlan_id = NULL; @@ -3113,6 +3254,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, switch (dev_flow->tcf.tunnel->type) { case FLOW_TCF_TUNACT_VXLAN_DECAP: decap.vxlan = dev_flow->tcf.vxlan_decap; + tunnel_outer = 1; break; case FLOW_TCF_TUNACT_VXLAN_ENCAP: encap.vxlan = dev_flow->tcf.vxlan_encap; @@ -3134,8 +3276,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, * Priority cannot be zero to prevent the kernel from picking one * automatically. */ - tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, - RTE_BE16(ETH_P_ALL)); + tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype); if (attr->group > 0) mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group); mnl_attr_put_strz(nlh, TCA_KIND, "flower"); @@ -3167,7 +3308,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, tcm->tcm_ifindex = ptoi[i].ifindex; break; case RTE_FLOW_ITEM_TYPE_ETH: - item_flags |= (item_flags & MLX5_FLOW_LAYER_VXLAN) ? + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? MLX5_FLOW_LAYER_INNER_L2 : MLX5_FLOW_LAYER_OUTER_L2; mask.eth = flow_tcf_item_mask @@ -3180,33 +3321,33 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, if (mask.eth == &flow_tcf_mask_empty.eth) break; spec.eth = items->spec; - if (decap.vxlan && - !(item_flags & MLX5_FLOW_LAYER_VXLAN)) { + if (mask.eth->type) { + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) + inner_etype = spec.eth->type; + else + outer_etype = spec.eth->type; + } + if (tunnel_outer) { DRV_LOG(WARNING, - "outer L2 addresses cannot be forced" - " for vxlan decapsulation, parameter" - " ignored"); + "outer L2 addresses cannot be" + " forced is outer ones for tunnel," + " parameter is ignored"); break; } - if (mask.eth->type) { - mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE, - spec.eth->type); - eth_type_set = 1; - } - if (!is_zero_ether_addr(&mask.eth->dst)) { + if (!rte_is_zero_ether_addr(&mask.eth->dst)) { mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, spec.eth->dst.addr_bytes); mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, mask.eth->dst.addr_bytes); } - if (!is_zero_ether_addr(&mask.eth->src)) { + if (!rte_is_zero_ether_addr(&mask.eth->src)) { mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, spec.eth->src.addr_bytes); mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, mask.eth->src.addr_bytes); } assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); @@ -3214,6 +3355,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, case RTE_FLOW_ITEM_TYPE_VLAN: assert(!encap.hdr); assert(!decap.hdr); + assert(!tunnel_outer); item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN; mask.vlan = flow_tcf_item_mask (items, &rte_flow_item_vlan_mask, @@ 
-3222,20 +3364,14 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, sizeof(flow_tcf_mask_supported.vlan), error); assert(mask.vlan); - if (!eth_type_set) - mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE, - RTE_BE16(ETH_P_8021Q)); - eth_type_set = 1; - vlan_present = 1; if (mask.vlan == &flow_tcf_mask_empty.vlan) break; spec.vlan = items->spec; - if (mask.vlan->inner_type) { - mnl_attr_put_u16(nlh, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, - spec.vlan->inner_type); - vlan_eth_type_set = 1; - } + assert(outer_etype == RTE_BE16(ETH_P_ALL) || + outer_etype == RTE_BE16(ETH_P_8021Q)); + outer_etype = RTE_BE16(ETH_P_8021Q); + if (mask.vlan->inner_type) + vlan_etype = spec.vlan->inner_type; if (mask.vlan->tci & RTE_BE16(0xe000)) mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO, (rte_be_to_cpu_16 @@ -3248,7 +3384,9 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_IPV4: - item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? + MLX5_FLOW_LAYER_INNER_L3_IPV4 : + MLX5_FLOW_LAYER_OUTER_L3_IPV4; mask.ipv4 = flow_tcf_item_mask (items, &rte_flow_item_ipv4_mask, &flow_tcf_mask_supported.ipv4, @@ -3256,56 +3394,108 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, sizeof(flow_tcf_mask_supported.ipv4), error); assert(mask.ipv4); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) { + assert(inner_etype == RTE_BE16(ETH_P_ALL) || + inner_etype == RTE_BE16(ETH_P_IP)); + inner_etype = RTE_BE16(ETH_P_IP); + } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) { + assert(vlan_etype == RTE_BE16(ETH_P_ALL) || + vlan_etype == RTE_BE16(ETH_P_IP)); + vlan_etype = RTE_BE16(ETH_P_IP); + } else { + assert(outer_etype == RTE_BE16(ETH_P_ALL) || + outer_etype == RTE_BE16(ETH_P_IP)); + outer_etype = RTE_BE16(ETH_P_IP); + } spec.ipv4 = items->spec; - if (!decap.vxlan) { - if (!eth_type_set && !vlan_eth_type_set) - mnl_attr_put_u16 - (nlh, - vlan_present ? - TCA_FLOWER_KEY_VLAN_ETH_TYPE : - TCA_FLOWER_KEY_ETH_TYPE, - RTE_BE16(ETH_P_IP)); - eth_type_set = 1; - vlan_eth_type_set = 1; - if (mask.ipv4 == &flow_tcf_mask_empty.ipv4) + if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) { + /* + * No way to set IP protocol for outer tunnel + * layers. Usually it is fixed, for example, + * to UDP for VXLAN/GPE. + */ + assert(spec.ipv4); /* Mask is not empty. */ + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv4->hdr.next_proto_id); + ip_proto_set = 1; + } + if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 || + (!mask.ipv4->hdr.src_addr && + !mask.ipv4->hdr.dst_addr)) { + if (!tunnel_outer) break; - if (mask.ipv4->hdr.next_proto_id) { - mnl_attr_put_u8 - (nlh, TCA_FLOWER_KEY_IP_PROTO, - spec.ipv4->hdr.next_proto_id); - ip_proto_set = 1; - } - } else { - assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4); + /* + * For tunnel outer we must set outer IP key + * anyway, even if the specification/mask is + * empty. There is no another way to tell + * kernel about he outer layer protocol. + */ + mnl_attr_put_u32 + (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC, + mask.ipv4->hdr.src_addr); + mnl_attr_put_u32 + (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + mask.ipv4->hdr.src_addr); + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); + break; } if (mask.ipv4->hdr.src_addr) { mnl_attr_put_u32 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV4_SRC : TCA_FLOWER_KEY_IPV4_SRC, spec.ipv4->hdr.src_addr); mnl_attr_put_u32 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? 
TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK : TCA_FLOWER_KEY_IPV4_SRC_MASK, mask.ipv4->hdr.src_addr); } if (mask.ipv4->hdr.dst_addr) { mnl_attr_put_u32 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV4_DST : TCA_FLOWER_KEY_IPV4_DST, spec.ipv4->hdr.dst_addr); mnl_attr_put_u32 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV4_DST_MASK : TCA_FLOWER_KEY_IPV4_DST_MASK, mask.ipv4->hdr.dst_addr); } + if (mask.ipv4->hdr.time_to_live) { + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL : + TCA_FLOWER_KEY_IP_TTL, + spec.ipv4->hdr.time_to_live); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL_MASK : + TCA_FLOWER_KEY_IP_TTL_MASK, + mask.ipv4->hdr.time_to_live); + } + if (mask.ipv4->hdr.type_of_service) { + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS : + TCA_FLOWER_KEY_IP_TOS, + spec.ipv4->hdr.type_of_service); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS_MASK : + TCA_FLOWER_KEY_IP_TOS_MASK, + mask.ipv4->hdr.type_of_service); + } assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; - case RTE_FLOW_ITEM_TYPE_IPV6: - item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; + case RTE_FLOW_ITEM_TYPE_IPV6: { + bool ipv6_src, ipv6_dst; + uint8_t msk6, tos6; + + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? + MLX5_FLOW_LAYER_INNER_L3_IPV6 : + MLX5_FLOW_LAYER_OUTER_L3_IPV6; mask.ipv6 = flow_tcf_item_mask (items, &rte_flow_item_ipv6_mask, &flow_tcf_mask_supported.ipv6, @@ -3313,57 +3503,114 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, sizeof(flow_tcf_mask_supported.ipv6), error); assert(mask.ipv6); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) { + assert(inner_etype == RTE_BE16(ETH_P_ALL) || + inner_etype == RTE_BE16(ETH_P_IPV6)); + inner_etype = RTE_BE16(ETH_P_IPV6); + } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) { + assert(vlan_etype == RTE_BE16(ETH_P_ALL) || + vlan_etype == RTE_BE16(ETH_P_IPV6)); + vlan_etype = RTE_BE16(ETH_P_IPV6); + } else { + assert(outer_etype == RTE_BE16(ETH_P_ALL) || + outer_etype == RTE_BE16(ETH_P_IPV6)); + outer_etype = RTE_BE16(ETH_P_IPV6); + } spec.ipv6 = items->spec; - if (!decap.vxlan) { - if (!eth_type_set || !vlan_eth_type_set) { - mnl_attr_put_u16 - (nlh, - vlan_present ? - TCA_FLOWER_KEY_VLAN_ETH_TYPE : - TCA_FLOWER_KEY_ETH_TYPE, - RTE_BE16(ETH_P_IPV6)); - } - eth_type_set = 1; - vlan_eth_type_set = 1; - if (mask.ipv6 == &flow_tcf_mask_empty.ipv6) + if (!tunnel_outer && mask.ipv6->hdr.proto) { + /* + * No way to set IP protocol for outer tunnel + * layers. Usually it is fixed, for example, + * to UDP for VXLAN/GPE. + */ + assert(spec.ipv6); /* Mask is not empty. */ + mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO, + spec.ipv6->hdr.proto); + ip_proto_set = 1; + } + ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED + (mask.ipv6->hdr.dst_addr); + ipv6_src = !IN6_IS_ADDR_UNSPECIFIED + (mask.ipv6->hdr.src_addr); + if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 || + (!ipv6_dst && !ipv6_src)) { + if (!tunnel_outer) break; - if (mask.ipv6->hdr.proto) { - mnl_attr_put_u8 - (nlh, TCA_FLOWER_KEY_IP_PROTO, - spec.ipv6->hdr.proto); - ip_proto_set = 1; - } - } else { - assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6); + /* + * For tunnel outer we must set outer IP key + * anyway, even if the specification/mask is + * empty. There is no another way to tell + * kernel about he outer layer protocol. 
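 * Note: for IPv6 the "TOS" is the 8-bit traffic-class field packed
 * into vtc_flow (be32 layout: version:4 | traffic class:8 | flow
 * label:20), hence the msk6/tos6 extraction used below:
 *
 *   msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
 *           RTE_IPV6_HDR_TC_SHIFT) & 0xff;
 *   tos6 = (rte_be_to_cpu_32(spec.ipv6->hdr.vtc_flow) >>
 *           RTE_IPV6_HDR_TC_SHIFT) & 0xff;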
+ */ + mnl_attr_put(nlh, + TCA_FLOWER_KEY_ENC_IPV6_SRC, + IPV6_ADDR_LEN, + mask.ipv6->hdr.src_addr); + mnl_attr_put(nlh, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + IPV6_ADDR_LEN, + mask.ipv6->hdr.src_addr); + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); + break; } - if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) { - mnl_attr_put(nlh, decap.vxlan ? + if (ipv6_src) { + mnl_attr_put(nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV6_SRC : TCA_FLOWER_KEY_IPV6_SRC, IPV6_ADDR_LEN, spec.ipv6->hdr.src_addr); - mnl_attr_put(nlh, decap.vxlan ? + mnl_attr_put(nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK : TCA_FLOWER_KEY_IPV6_SRC_MASK, IPV6_ADDR_LEN, mask.ipv6->hdr.src_addr); } - if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) { - mnl_attr_put(nlh, decap.vxlan ? + if (ipv6_dst) { + mnl_attr_put(nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV6_DST : TCA_FLOWER_KEY_IPV6_DST, IPV6_ADDR_LEN, spec.ipv6->hdr.dst_addr); - mnl_attr_put(nlh, decap.vxlan ? + mnl_attr_put(nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_IPV6_DST_MASK : TCA_FLOWER_KEY_IPV6_DST_MASK, IPV6_ADDR_LEN, mask.ipv6->hdr.dst_addr); } + if (mask.ipv6->hdr.hop_limits) { + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL : + TCA_FLOWER_KEY_IP_TTL, + spec.ipv6->hdr.hop_limits); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL_MASK : + TCA_FLOWER_KEY_IP_TTL_MASK, + mask.ipv6->hdr.hop_limits); + } + msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >> + RTE_IPV6_HDR_TC_SHIFT) & 0xff; + if (msk6) { + tos6 = (rte_be_to_cpu_32 + (spec.ipv6->hdr.vtc_flow) >> + RTE_IPV6_HDR_TC_SHIFT) & 0xff; + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS : + TCA_FLOWER_KEY_IP_TOS, tos6); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS_MASK : + TCA_FLOWER_KEY_IP_TOS_MASK, msk6); + } assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; + } case RTE_FLOW_ITEM_TYPE_UDP: - item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? + MLX5_FLOW_LAYER_INNER_L4_UDP : + MLX5_FLOW_LAYER_OUTER_L4_UDP; mask.udp = flow_tcf_item_mask (items, &rte_flow_item_udp_mask, &flow_tcf_mask_supported.udp, @@ -3372,7 +3619,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, error); assert(mask.udp); spec.udp = items->spec; - if (!decap.vxlan) { + if (!tunnel_outer) { if (!ip_proto_set) mnl_attr_put_u8 (nlh, TCA_FLOWER_KEY_IP_PROTO, @@ -3387,24 +3634,24 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, } if (mask.udp->hdr.src_port) { mnl_attr_put_u16 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_UDP_SRC_PORT : TCA_FLOWER_KEY_UDP_SRC, spec.udp->hdr.src_port); mnl_attr_put_u16 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK : TCA_FLOWER_KEY_UDP_SRC_MASK, mask.udp->hdr.src_port); } if (mask.udp->hdr.dst_port) { mnl_attr_put_u16 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_UDP_DST_PORT : TCA_FLOWER_KEY_UDP_DST, spec.udp->hdr.dst_port); mnl_attr_put_u16 - (nlh, decap.vxlan ? + (nlh, tunnel_outer ? TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK : TCA_FLOWER_KEY_UDP_DST_MASK, mask.udp->hdr.dst_port); @@ -3412,7 +3659,9 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_TCP: - item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; + item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? 
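/*
 * For completeness, the TCP case that follows also translates TCP
 * flags when masked; presumably, as with the other 16-bit flower keys
 * in this file, both value and mask are put as big-endian u16:
 *
 *   if (mask.tcp->hdr.tcp_flags) {
 *           mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_FLAGS,
 *                            rte_cpu_to_be_16
 *                                   (spec.tcp->hdr.tcp_flags));
 *           mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_FLAGS_MASK,
 *                            rte_cpu_to_be_16
 *                                   (mask.tcp->hdr.tcp_flags));
 *   }
 */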
+ MLX5_FLOW_LAYER_INNER_L4_TCP : + MLX5_FLOW_LAYER_OUTER_L4_TCP; mask.tcp = flow_tcf_item_mask (items, &rte_flow_item_tcp_mask, &flow_tcf_mask_supported.tcp, @@ -3456,6 +3705,7 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, break; case RTE_FLOW_ITEM_TYPE_VXLAN: assert(decap.vxlan); + tunnel_outer = 0; item_flags |= MLX5_FLOW_LAYER_VXLAN; spec.vxlan = items->spec; mnl_attr_put_u32(nlh, @@ -3469,6 +3719,34 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, NULL, "item not supported"); } } + /* + * Set the ether_type flower key and tc rule protocol: + * - if there is nor VLAN neither VXLAN the key is taken from + * eth item directly or deduced from L3 items. + * - if there is vlan item then key is fixed to 802.1q. + * - if there is vxlan item then key is set to inner tunnel type. + * - simultaneous vlan and vxlan items are prohibited. + */ + if (outer_etype != RTE_BE16(ETH_P_ALL)) { + tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, + outer_etype); + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) { + if (inner_etype != RTE_BE16(ETH_P_ALL)) + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_ETH_TYPE, + inner_etype); + } else { + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_ETH_TYPE, + outer_etype); + if (outer_etype == RTE_BE16(ETH_P_8021Q) && + vlan_etype != RTE_BE16(ETH_P_ALL)) + mnl_attr_put_u16(nlh, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + vlan_etype); + } + assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); + } na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT); na_act_index_cur = 1; for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { @@ -3502,6 +3780,10 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, mnl_attr_get_payload (mnl_nlmsg_get_payload_tail (nlh)))->ifindex; + } else if (decap.hdr) { + assert(dev_flow->tcf.tunnel); + dev_flow->tcf.tunnel->ifindex_ptr = + (unsigned int *)&tcm->tcm_ifindex; } mnl_attr_put(nlh, TCA_MIRRED_PARMS, sizeof(struct tc_mirred), @@ -3678,6 +3960,14 @@ override_na_vlan_priority: TCA_TUNNEL_KEY_ENC_IPV6_DST, sizeof(encap.vxlan->ipv6.dst), &encap.vxlan->ipv6.dst); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL) + mnl_attr_put_u8(nlh, + TCA_TUNNEL_KEY_ENC_TTL, + encap.vxlan->ip_ttl_hop); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS) + mnl_attr_put_u8(nlh, + TCA_TUNNEL_KEY_ENC_TOS, + encap.vxlan->ip_tos); if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI) mnl_attr_put_u32(nlh, TCA_TUNNEL_KEY_ENC_KEY_ID, @@ -3714,6 +4004,8 @@ override_na_vlan_priority: assert(na_flower); assert(na_flower_act); mnl_attr_nest_end(nlh, na_flower_act); + dev_flow->tcf.ptc_flags = mnl_attr_get_payload + (mnl_nlmsg_get_payload_tail(nlh)); mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ? 0 : TCA_CLS_FLAGS_SKIP_SW); mnl_attr_nest_end(nlh, na_flower); @@ -3732,10 +4024,6 @@ override_na_vlan_priority: * @param nlh * Message to send. This function always raises the NLM_F_ACK flag before * sending. - * @param[in] msglen - * Message length. Message buffer may contain multiple commands and - * nlmsg_len field not always corresponds to actual message length. - * If 0 specified the nlmsg_len field in header is used as message length. * @param[in] cb * Callback handler for received message. 
@@ -3732,10 +4024,6 @@ override_na_vlan_priority:
  * @param nlh
  *   Message to send. This function always raises the NLM_F_ACK flag before
  *   sending.
- * @param[in] msglen
- *   Message length. Message buffer may contain multiple commands and
- *   nlmsg_len field not always corresponds to actual message length.
- *   If 0 specified the nlmsg_len field in header is used as message length.
  * @param[in] cb
  *   Callback handler for received message.
  * @param[in] arg
@@ -3747,52 +4035,64 @@ static int
 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf, struct nlmsghdr *nlh,
-		uint32_t msglen,
 		mnl_cb_t cb, void *arg)
 {
 	unsigned int portid = mnl_socket_get_portid(tcf->nl);
 	uint32_t seq = tcf->seq++;
-	int err, ret;
+	int ret, err = 0;

 	assert(tcf->nl);
 	assert(tcf->buf);
-	if (!seq)
+	if (!seq) {
 		/* seq 0 is reserved for kernel event-driven notifications. */
 		seq = tcf->seq++;
+	}
 	nlh->nlmsg_seq = seq;
-	if (!msglen) {
-		msglen = nlh->nlmsg_len;
-		nlh->nlmsg_flags |= NLM_F_ACK;
+	nlh->nlmsg_flags |= NLM_F_ACK;
+	ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
+	if (ret <= 0) {
+		/* Message send error occurred. */
+		rte_errno = errno;
+		return -rte_errno;
 	}
-	ret = mnl_socket_sendto(tcf->nl, nlh, msglen);
-	err = (ret <= 0) ? errno : 0;
 	nlh = (struct nlmsghdr *)(tcf->buf);
 	/*
 	 * The following loop postpones non-fatal errors until multipart
 	 * messages are complete.
 	 */
-	if (ret > 0)
-		while (true) {
-			ret = mnl_socket_recvfrom(tcf->nl, tcf->buf,
-						  tcf->buf_size);
+	while (true) {
+		ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
+		if (ret < 0) {
+			err = errno;
+			/*
+			 * In case of overflow we will receive up to the
+			 * end of the multipart message. We may lose part
+			 * of the reply messages but mark and return an error.
+			 */
+			if (err != ENOSPC ||
+			    !(nlh->nlmsg_flags & NLM_F_MULTI) ||
+			    nlh->nlmsg_type == NLMSG_DONE)
+				break;
+		} else {
+			ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
+			if (!ret) {
+				/*
+				 * libmnl returns 0 if DONE or
+				 * success ACK message found.
+				 */
+				break;
+			}
 			if (ret < 0) {
+				/*
+				 * ACK message with error found
+				 * or some error occurred.
+				 */
 				err = errno;
-				if (err != ENOSPC)
-					break;
-			}
-			if (!err) {
-				ret = mnl_cb_run(nlh, ret, seq, portid,
-						 cb, arg);
-				if (ret < 0) {
-					err = errno;
-					break;
-				}
-			}
-			/* Will receive till end of multipart message */
-			if (!(nlh->nlmsg_flags & NLM_F_MULTI) ||
-			      nlh->nlmsg_type == NLMSG_DONE)
 				break;
+			}
+			/* We should continue receiving. */
 		}
+	}
 	if (!err)
 		return 0;
 	rte_errno = err;
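The reworked flow_tcf_nl_ack() above follows the canonical libmnl request/acknowledge pattern. For reference, a stripped-down sketch assuming an already connected mnl socket (nl_request_ack() is an illustrative name; the multipart/ENOSPC error postponing of the real function is intentionally omitted):

    #include <errno.h>
    #include <libmnl/libmnl.h>

    /* Send one request and consume replies until libmnl reports the
     * terminating ACK or DONE (mnl_cb_run() returns 0) or an error
     * (mnl_cb_run() returns a negative value and sets errno).
     */
    static int
    nl_request_ack(struct mnl_socket *nl, struct nlmsghdr *nlh,
                   void *buf, size_t bufsz,
                   unsigned int seq, unsigned int portid)
    {
        int ret;

        nlh->nlmsg_seq = seq;
        nlh->nlmsg_flags |= NLM_F_ACK;
        if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) <= 0)
            return -errno;
        do {
            ret = mnl_socket_recvfrom(nl, buf, bufsz);
            if (ret < 0)
                return -errno;
            ret = mnl_cb_run(buf, ret, seq, portid, NULL, NULL);
        } while (ret > 0);
        return ret;
    }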
@@ -3861,30 +4161,6 @@ flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
 	return nlh;
 }

-/**
- * Set NLM_F_ACK flags in the last netlink command in buffer.
- * Only last command in the buffer will be acked by system.
- *
- * @param[in, out] buf
- *   Pointer to buffer with netlink commands.
- */
-static void
-flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
-{
-	struct nlmsghdr *nlh;
-	uint32_t size = 0;
-
-	assert(buf->size);
-	do {
-		nlh = (struct nlmsghdr *)&buf->msg[size];
-		size += NLMSG_ALIGN(nlh->nlmsg_len);
-		if (size >= buf->size) {
-			nlh->nlmsg_flags |= NLM_F_ACK;
-			break;
-		}
-	} while (true);
-}
-
 /**
  * Send the buffers with prepared netlink commands. Scans the list and
  * sends all found buffers. Buffers are sent and freed anyway in order
@@ -3903,21 +4179,35 @@ static int
 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
 		    struct tcf_nlcb_context *ctx)
 {
-	struct tcf_nlcb_buf *bc, *bn;
-	struct nlmsghdr *nlh;
+	struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
 	int ret = 0;

-	bc = LIST_FIRST(&ctx->nlbuf);
 	while (bc) {
+		struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
+		struct nlmsghdr *nlh;
+		uint32_t msg = 0;
 		int rc;

-		bn = LIST_NEXT(bc, next);
-		if (bc->size) {
-			flow_tcf_setack_nlcmd(bc);
-			nlh = (struct nlmsghdr *)&bc->msg;
-			rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL);
-			if (rc && !ret)
-				ret = rc;
+		while (msg < bc->size) {
+			/*
+			 * Send the Netlink commands from the buffer one by
+			 * one. If we send multiple rule deletion commands in
+			 * one Netlink message and some error occurs, it may
+			 * cause multiple ACK error messages and break the
+			 * sequence numbering of the Netlink communication,
+			 * because we expect only one ACK reply.
+			 */
+			assert((bc->size - msg) >= sizeof(struct nlmsghdr));
+			nlh = (struct nlmsghdr *)&bc->msg[msg];
+			assert((bc->size - msg) >= nlh->nlmsg_len);
+			msg += nlh->nlmsg_len;
+			rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
+			if (rc) {
+				DRV_LOG(WARNING,
+					"netlink: cleanup error %d", rc);
+				if (!ret)
+					ret = rc;
+			}
 		}
 		rte_free(bc);
 		bc = bn;
@@ -3950,6 +4240,7 @@ flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
 	struct nlattr *na_local = NULL;
 	struct nlattr *na_peer = NULL;
 	unsigned char family;
+	uint32_t size;

 	if (nlh->nlmsg_type != RTM_NEWADDR) {
 		rte_errno = EINVAL;
@@ -3977,11 +4268,11 @@ flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
 	if (!na_local || !na_peer)
 		return 1;
 	/* Local rule found with scope link, permanent and assigned peer. */
-	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
-				   MNL_ALIGN(sizeof(struct ifaddrmsg)) +
-				   (family == AF_INET6
-				    ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
-				    : 2 * SZ_NLATTR_TYPE_OF(uint32_t)));
+	size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
+	       MNL_ALIGN(sizeof(struct ifaddrmsg)) +
+	       (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
+				   : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
+	cmd = flow_tcf_alloc_nlcmd(ctx, size);
 	if (!cmd) {
 		rte_errno = ENOMEM;
 		return -rte_errno;
@@ -4006,6 +4297,7 @@ flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
 		mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
 			     mnl_attr_get_payload(na_peer));
 	}
+	assert(size == cmd->nlmsg_len);
 	return 1;
 }

@@ -4015,7 +4307,7 @@ flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
  * @param[in] tcf
  *   Context object initialized by mlx5_flow_tcf_context_create().
  * @param[in] ifindex
- *   Network inferface index to perform cleanup.
+ *   Network interface index to perform cleanup.
  */
 static void
 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
@@ -4042,7 +4334,7 @@ flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
 	ifa->ifa_family = AF_UNSPEC;
 	ifa->ifa_index = ifindex;
 	ifa->ifa_scope = RT_SCOPE_LINK;
-	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
+	ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
 	if (ret)
 		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
 	ret = flow_tcf_send_nlcmd(tcf, &ctx);
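The size/assert pattern introduced by these collect callbacks is easy to verify by hand, since every netlink object is rounded up to 4-byte alignment and the message length is therefore fully predictable. For the IPv4 local-address delete command, for instance (SZ_NLATTR_TYPE_OF() is the helper used throughout this file; struct sizes are per the Linux uapi headers):

    /* nlmsghdr (16 bytes) + ifaddrmsg (8 bytes) + two u32 attributes
     * (4-byte nlattr header + 4-byte payload = 8 bytes each) = 40 bytes,
     * which is exactly what assert(size == cmd->nlmsg_len) checks
     * after the command has been built.
     */
    uint32_t size = MNL_ALIGN(sizeof(struct nlmsghdr)) +   /* 16 */
                    MNL_ALIGN(sizeof(struct ifaddrmsg)) +  /*  8 */
                    2 * SZ_NLATTR_TYPE_OF(uint32_t);       /* 16 */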
@@ -4051,7 +4343,7 @@ flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
 }

 /**
- * Collect neigh permament rules on specified network device.
+ * Collect neigh permanent rules on specified network device.
  * This is callback routine called by libmnl mnl_cb_run() in loop for
  * every message in received packet.
  *
@@ -4074,6 +4366,7 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
 	struct nlattr *na_ip = NULL;
 	struct nlattr *na_mac = NULL;
 	unsigned char family;
+	uint32_t size;

 	if (nlh->nlmsg_type != RTM_NEWNEIGH) {
 		rte_errno = EINVAL;
@@ -4099,13 +4392,13 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
 	}
 	if (!na_mac || !na_ip)
 		return 1;
-	/* Neigh rule with permenent attribute found. */
-	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
-				   MNL_ALIGN(sizeof(struct ndmsg)) +
-				   SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
-				   (family == AF_INET6
-				    ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
-				    : SZ_NLATTR_TYPE_OF(uint32_t)));
+	/* Neigh rule with permanent attribute found. */
+	size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
+	       MNL_ALIGN(sizeof(struct ndmsg)) +
+	       SZ_NLATTR_DATA_OF(RTE_ETHER_ADDR_LEN) +
+	       (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
+				   : SZ_NLATTR_TYPE_OF(uint32_t));
+	cmd = flow_tcf_alloc_nlcmd(ctx, size);
 	if (!cmd) {
 		rte_errno = ENOMEM;
 		return -rte_errno;
@@ -4126,8 +4419,9 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
 		mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
 			     mnl_attr_get_payload(na_ip));
 	}
-	mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
+	mnl_attr_put(cmd, NDA_LLADDR, RTE_ETHER_ADDR_LEN,
 		     mnl_attr_get_payload(na_mac));
+	assert(size == cmd->nlmsg_len);
 	return 1;
 }

@@ -4137,7 +4431,7 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
  * @param[in] tcf
  *   Context object initialized by mlx5_flow_tcf_context_create().
  * @param[in] ifindex
- *   Network inferface index to perform cleanup.
+ *   Network interface index to perform cleanup.
  */
 static void
 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
@@ -4161,7 +4455,7 @@ flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
 	ndm->ndm_family = AF_UNSPEC;
 	ndm->ndm_ifindex = ifindex;
 	ndm->ndm_state = NUD_PERMANENT;
-	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
+	ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
 	if (ret)
 		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
 	ret = flow_tcf_send_nlcmd(tcf, &ctx);
@@ -4194,6 +4488,7 @@ flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
 	struct nlattr *na_vxlan = NULL;
 	bool found = false;
 	unsigned int vxindex;
+	uint32_t size;

 	if (nlh->nlmsg_type != RTM_NEWLINK) {
 		rte_errno = EINVAL;
@@ -4239,9 +4534,10 @@ flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
 		return 1;
 	/* Attached VXLAN device found, store the command to delete. */
 	vxindex = ifm->ifi_index;
-	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
-				   MNL_ALIGN(sizeof(struct ifinfomsg)));
-	if (!nlh) {
+	size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
+	       MNL_ALIGN(sizeof(struct ifinfomsg));
+	cmd = flow_tcf_alloc_nlcmd(ctx, size);
+	if (!cmd) {
 		rte_errno = ENOMEM;
 		return -rte_errno;
 	}
@@ -4251,13 +4547,14 @@ flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
 	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
 	ifm->ifi_family = AF_UNSPEC;
 	ifm->ifi_index = vxindex;
+	assert(size == cmd->nlmsg_len);
 	return 1;
 }

 /**
  * Cleanup the outer interface. Removes all found vxlan devices
- * attached to specified index, flushes the meigh and local IP
- * datavase.
+ * attached to specified index, flushes the neigh and local IP
+ * database.
  *
  * @param[in] tcf
  *   Context object initialized by mlx5_flow_tcf_context_create().
@@ -4287,7 +4584,7 @@ flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
 	ifm->ifi_family = AF_UNSPEC;
-	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx);
+	ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
 	if (ret)
 		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
 	ret = flow_tcf_send_nlcmd(tcf, &ctx);
@@ -4302,7 +4599,7 @@ flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
  * Note that an implicit route is maintained by the kernel due to the
  * presence of a peer address (IFA_ADDRESS).
  *
- * These rules are used for encapsultion only and allow to assign
+ * These rules are used for encapsulation only and allow assigning
 * the outer tunnel source IP address.
* * @param[in] tcf @@ -4359,7 +4656,7 @@ flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf, sizeof(encap->ipv6.dst), &encap->ipv6.dst); } - if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL)) + if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL)) return 0; return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, @@ -4422,7 +4719,7 @@ flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf, if (encap->mask & FLOW_TCF_ENCAP_ETH_DST) mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst), &encap->eth.dst); - if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL)) + if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL)) return 0; return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, @@ -4446,8 +4743,8 @@ flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf, * * @param[in] tcf * Libmnl socket context object. - * @param[in] vtep - * VTEP object, contains rule database and ifouter index. + * @param[in] iface + * Object, contains rule database and ifouter index. * @param[in] dev_flow * Flow object, contains the tunnel parameters (for encap only). * @param[in] enable @@ -4460,43 +4757,40 @@ flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf, */ static int flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, - struct tcf_vtep *vtep, + struct tcf_irule *iface, struct mlx5_flow *dev_flow, bool enable, struct rte_flow_error *error) { const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap; - struct tcf_local_rule *rule; - bool found = false; + struct tcf_local_rule *rule = NULL; int ret; assert(encap); assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP); if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST); - LIST_FOREACH(rule, &vtep->local, next) { + LIST_FOREACH(rule, &iface->local, next) { if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC && encap->ipv4.src == rule->ipv4.src && encap->ipv4.dst == rule->ipv4.dst) { - found = true; break; } } } else { assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); - LIST_FOREACH(rule, &vtep->local, next) { + LIST_FOREACH(rule, &iface->local, next) { if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC && !memcmp(&encap->ipv6.src, &rule->ipv6.src, sizeof(encap->ipv6.src)) && !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, sizeof(encap->ipv6.dst))) { - found = true; break; } } } - if (found) { + if (rule) { if (enable) { rule->refcnt++; return 0; @@ -4504,7 +4798,7 @@ flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, if (!rule->refcnt || !--rule->refcnt) { LIST_REMOVE(rule, next); return flow_tcf_rule_local(tcf, encap, - vtep->ifouter, false, error); + iface->ifouter, false, error); } return 0; } @@ -4537,13 +4831,13 @@ flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN); memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN); } - ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error); + ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error); if (ret) { rte_free(rule); return ret; } rule->refcnt++; - LIST_INSERT_HEAD(&vtep->local, rule, next); + LIST_INSERT_HEAD(&iface->local, rule, next); return 0; } @@ -4555,8 +4849,8 @@ flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, * * @param[in] tcf * Libmnl socket context object. - * @param[in] vtep - * VTEP object, contains rule database and ifouter index. + * @param[in] iface + * Object, contains rule database and ifouter index. * @param[in] dev_flow * Flow object, contains the tunnel parameters (for encap only). 
* @param[in] enable @@ -4569,40 +4863,37 @@ flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, */ static int flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf, - struct tcf_vtep *vtep, + struct tcf_irule *iface, struct mlx5_flow *dev_flow, bool enable, struct rte_flow_error *error) { const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap; - struct tcf_neigh_rule *rule; - bool found = false; + struct tcf_neigh_rule *rule = NULL; int ret; assert(encap); assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP); if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC); - LIST_FOREACH(rule, &vtep->neigh, next) { + LIST_FOREACH(rule, &iface->neigh, next) { if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST && encap->ipv4.dst == rule->ipv4.dst) { - found = true; break; } } } else { assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); - LIST_FOREACH(rule, &vtep->neigh, next) { + LIST_FOREACH(rule, &iface->neigh, next) { if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST && !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, sizeof(encap->ipv6.dst))) { - found = true; break; } } } - if (found) { + if (rule) { if (memcmp(&encap->eth.dst, &rule->eth, sizeof(encap->eth.dst))) { DRV_LOG(WARNING, "Destination MAC differs" @@ -4621,7 +4912,7 @@ flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf, if (!rule->refcnt || !--rule->refcnt) { LIST_REMOVE(rule, next); return flow_tcf_rule_neigh(tcf, encap, - vtep->ifouter, + iface->ifouter, false, error); } return 0; @@ -4652,27 +4943,106 @@ flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf, memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN); } memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth)); - ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error); + ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error); if (ret) { rte_free(rule); return ret; } rule->refcnt++; - LIST_INSERT_HEAD(&vtep->neigh, rule, next); + LIST_INSERT_HEAD(&iface->neigh, rule, next); return 0; } +/* VXLAN encap rule database for outer interfaces. */ +static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER(); + /* VTEP device list is shared between PMD port instances. */ static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER(); static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER; +/** + * Acquire the VXLAN encap rules container for specified interface. + * First looks for the container in the existing ones list, creates + * and initializes the new container if existing not found. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Network interface index to create VXLAN encap rules on. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Rule container pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_irule* +flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + struct rte_flow_error *error) +{ + struct tcf_irule *iface; + + /* Look whether the container for encap rules is created. */ + assert(ifouter); + LIST_FOREACH(iface, &iface_list_vxlan, next) { + if (iface->ifouter == ifouter) + break; + } + if (iface) { + /* Container already exists, just increment the reference. */ + iface->refcnt++; + return iface; + } + /* Not found, we should create the new container. 
*/ + iface = rte_zmalloc(__func__, sizeof(*iface), + alignof(struct tcf_irule)); + if (!iface) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for container"); + return NULL; + } + *iface = (struct tcf_irule){ + .local = LIST_HEAD_INITIALIZER(), + .neigh = LIST_HEAD_INITIALIZER(), + .ifouter = ifouter, + .refcnt = 1, + }; + /* Interface cleanup for new container created. */ + flow_tcf_encap_iface_cleanup(tcf, ifouter); + flow_tcf_encap_local_cleanup(tcf, ifouter); + flow_tcf_encap_neigh_cleanup(tcf, ifouter); + LIST_INSERT_HEAD(&iface_list_vxlan, iface, next); + return iface; +} + +/** + * Releases VXLAN encap rules container by pointer. Decrements the + * reference counter and deletes the container if counter is zero. + * + * @param[in] irule + * VXLAN rule container pointer to release. + */ +static void +flow_tcf_encap_irule_release(struct tcf_irule *iface) +{ + assert(iface->refcnt); + if (--iface->refcnt == 0) { + /* Reference counter is zero, delete the container. */ + assert(LIST_EMPTY(&iface->local)); + assert(LIST_EMPTY(&iface->neigh)); + LIST_REMOVE(iface, next); + rte_free(iface); + } +} + /** * Deletes VTEP network device. * * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). * @param[in] vtep - * Object represinting the network device to delete. Memory + * Object representing the network device to delete. Memory * allocated for this object is freed by routine. */ static void @@ -4697,7 +5067,7 @@ flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf, ifm->ifi_family = AF_UNSPEC; ifm->ifi_index = vtep->ifindex; assert(sizeof(buf) >= nlh->nlmsg_len); - ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL); + ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL); if (ret) DRV_LOG(WARNING, "netlink: error deleting vxlan" " encap/decap ifindex %u", @@ -4711,11 +5081,6 @@ flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf, * * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). - * @param[in] ifouter - * Outer interface to attach new-created VXLAN device - * If zero the VXLAN device will not be attached to any device. - * These VTEPs are used for decapsulation and can be precreated - * and shared between processes. * @param[in] port * UDP port of created VTEP device. * @param[out] error @@ -4725,10 +5090,8 @@ flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf, * Pointer to created device structure on success, * NULL otherwise and rte_errno is set. */ -#ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA static struct tcf_vtep* flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, - unsigned int ifouter, uint16_t port, struct rte_flow_error *error) { struct tcf_vtep *vtep; @@ -4758,8 +5121,6 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, } *vtep = (struct tcf_vtep){ .port = port, - .local = LIST_HEAD_INITIALIZER(), - .neigh = LIST_HEAD_INITIALIZER(), }; memset(buf, 0, sizeof(buf)); nlh = mnl_nlmsg_put_header(buf); @@ -4777,22 +5138,34 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, assert(na_info); mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan"); na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA); - if (ifouter) - mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter); assert(na_vxlan); +#ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA + /* + * RH 7.2 does not support metadata for tunnel device. + * It does not matter because we are going to use the + * hardware offload by mlx5 driver. 
+	 */
	mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
+#endif
	mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
	mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
	mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
+#ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
+	/*
+	 * We must specify VNI explicitly if metadata not supported.
+	 * Note, VNI is transferred with native endianness format.
+	 */
+	mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
+#endif
	mnl_attr_nest_end(nlh, na_vxlan);
	mnl_attr_nest_end(nlh, na_info);
	assert(sizeof(buf) >= nlh->nlmsg_len);
-	ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
+	ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
	if (ret) {
		DRV_LOG(WARNING,
			"netlink: VTEP %s create failure (%d)",
			name, rte_errno);
-		if (rte_errno != EEXIST || ifouter)
+		if (rte_errno != EEXIST)
			/*
			 * Some unhandled error occurred or device is
			 * for encapsulation and cannot be shared.
@@ -4805,6 +5178,7 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
		 * when we do not need it anymore.
		 */
		vtep->created = 1;
+		vtep->waitreg = 1;
	}
	/* Try to get ifindex of the created or pre-existing device. */
	ret = if_nametoindex(name);
@@ -4818,7 +5192,6 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
		goto error;
	}
	vtep->ifindex = ret;
-	vtep->ifouter = ifouter;
	memset(buf, 0, sizeof(buf));
	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_NEWLINK;
@@ -4829,7 +5202,7 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
	ifm->ifi_index = vtep->ifindex;
	ifm->ifi_flags = IFF_UP;
	ifm->ifi_change = IFF_UP;
-	ret = flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL);
+	ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
	if (ret) {
		rte_flow_error_set(error, -errno,
				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
@@ -4853,20 +5226,6 @@ error:
	rte_free(vtep);
	return NULL;
}
-#else
-static struct tcf_vtep*
-flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused,
-		     unsigned int ifouter __rte_unused,
-		     uint16_t port __rte_unused,
-		     struct rte_flow_error *error)
-{
-	rte_flow_error_set(error, ENOTSUP,
-			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
-			   "netlink: failed to create VTEP, "
-			   "vxlan metadata are not supported by kernel");
-	return NULL;
-}
-#endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */

/**
 * Acquire target interface index for VXLAN tunneling decapsulation.
@@ -4895,13 +5254,6 @@ flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
		if (vtep->port == port)
			break;
	}
-	if (vtep && vtep->ifouter) {
-		rte_flow_error_set(error, -errno,
-				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
-				   "Failed to create decap VTEP with specified"
-				   " UDP port, atatched device exists");
-		return NULL;
-	}
	if (vtep) {
		/* Device exists, just increment the reference counter. */
		vtep->refcnt++;
@@ -4909,14 +5261,14 @@ flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
		return vtep;
	}
	/* No decapsulation device exists, try to create the new one. */
-	vtep = flow_tcf_vtep_create(tcf, 0, port, error);
+	vtep = flow_tcf_vtep_create(tcf, port, error);
	if (vtep)
		LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
	return vtep;
}

/**
- * Aqcuire target interface index for VXLAN tunneling encapsulation.
+ * Acquire target interface index for VXLAN tunneling encapsulation.
 *
 * @param[in] tcf
 *   Context object initialized by mlx5_flow_tcf_context_create().
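The encap rework in the next hunks applies the same lookup-or-create reference counting idiom to both the shared VTEP devices and the new tcf_irule containers: find the object by key and take a reference, create it on a miss, and tear it down on the last release. Schematically (obj_acquire()/obj_release() and struct obj are illustrative, not part of the patch; BSD sys/queue.h lists are used just as in the driver):

    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct obj {
        LIST_ENTRY(obj) next;
        uint32_t refcnt;
        uint16_t key;
    };

    static LIST_HEAD(, obj) obj_list = LIST_HEAD_INITIALIZER(obj_list);

    /* Look up by key and take a reference, or create on miss. */
    static struct obj *
    obj_acquire(uint16_t key)
    {
        struct obj *o;

        LIST_FOREACH(o, &obj_list, next)
            if (o->key == key)
                break;
        if (!o) {
            o = calloc(1, sizeof(*o));
            if (!o)
                return NULL;
            o->key = key;
            LIST_INSERT_HEAD(&obj_list, o, next);
        }
        o->refcnt++;
        return o;
    }

    /* Drop a reference; the last release destroys the object. */
    static void
    obj_release(struct obj *o)
    {
        if (--o->refcnt == 0) {
            LIST_REMOVE(o, next);
            free(o);
        }
    }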
@@ -4933,70 +5285,51 @@ flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, static struct tcf_vtep* flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, unsigned int ifouter, - struct mlx5_flow *dev_flow __rte_unused, + struct mlx5_flow *dev_flow, struct rte_flow_error *error) { - static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1; + static uint16_t port; struct tcf_vtep *vtep; + struct tcf_irule *iface; int ret; assert(ifouter); - /* Look whether the attached VTEP for encap is created. */ + /* Look whether the VTEP for specified port is created. */ + port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst); LIST_FOREACH(vtep, &vtep_list_vxlan, next) { - if (vtep->ifouter == ifouter) + if (vtep->port == port) break; } if (vtep) { /* VTEP already exists, just increment the reference. */ vtep->refcnt++; } else { - uint16_t pcnt; - - /* Not found, we should create the new attached VTEP. */ - flow_tcf_encap_iface_cleanup(tcf, ifouter); - flow_tcf_encap_local_cleanup(tcf, ifouter); - flow_tcf_encap_neigh_cleanup(tcf, ifouter); - for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX - - MLX5_VXLAN_PORT_MIN); pcnt++) { - encap_port++; - /* Wraparound the UDP port index. */ - if (encap_port < MLX5_VXLAN_PORT_MIN || - encap_port > MLX5_VXLAN_PORT_MAX) - encap_port = MLX5_VXLAN_PORT_MIN; - /* Check whether UDP port is in already in use. */ - LIST_FOREACH(vtep, &vtep_list_vxlan, next) { - if (vtep->port == encap_port) - break; - } - if (vtep) { - /* Port is in use, try the next one. */ - vtep = NULL; - continue; - } - vtep = flow_tcf_vtep_create(tcf, ifouter, - encap_port, error); - if (vtep) { - LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next); - break; - } - if (rte_errno != EEXIST) - break; - } + /* Not found, we should create the new VTEP. */ + vtep = flow_tcf_vtep_create(tcf, port, error); if (!vtep) return NULL; + LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next); } - assert(vtep->ifouter == ifouter); assert(vtep->ifindex); + iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error); + if (!iface) { + if (--vtep->refcnt == 0) + flow_tcf_vtep_delete(tcf, vtep); + return NULL; + } + dev_flow->tcf.vxlan_encap->iface = iface; /* Create local ipaddr with peer to specify the outer IPs. */ - ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error); + ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error); if (!ret) { /* Create neigh rule to specify outer destination MAC. */ - ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error); + ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error); if (ret) - flow_tcf_encap_local(tcf, vtep, + flow_tcf_encap_local(tcf, iface, dev_flow, false, error); } if (ret) { + dev_flow->tcf.vxlan_encap->iface = NULL; + flow_tcf_encap_irule_release(iface); if (--vtep->refcnt == 0) flow_tcf_vtep_delete(tcf, vtep); return NULL; @@ -5011,7 +5344,7 @@ flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). * @param[in] ifouter - * Network interface index to attach VXLAN encap device to. + * Network interface index to create VXLAN encap rules on. * @param[in] dev_flow * Flow tcf object with tunnel structure pointer set. * @param[out] error @@ -5069,11 +5402,18 @@ flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf, switch (dev_flow->tcf.tunnel->type) { case FLOW_TCF_TUNACT_VXLAN_DECAP: break; - case FLOW_TCF_TUNACT_VXLAN_ENCAP: + case FLOW_TCF_TUNACT_VXLAN_ENCAP: { + struct tcf_irule *iface; + /* Remove the encap ancillary rules first. 
*/ - flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL); - flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL); + iface = dev_flow->tcf.vxlan_encap->iface; + assert(iface); + flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL); + flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL); + flow_tcf_encap_irule_release(iface); + dev_flow->tcf.vxlan_encap->iface = NULL; break; + } default: assert(false); DRV_LOG(WARNING, "Unsupported tunnel type"); @@ -5087,7 +5427,217 @@ flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf, pthread_mutex_unlock(&vtep_list_mutex); } +struct tcf_nlcb_query { + uint32_t handle; + uint32_t tc_flags; + uint32_t flags_valid:1; +}; + +/** + * Collect queried rule attributes. This is callback routine called by + * libmnl mnl_cb_run() in loop for every message in received packet. + * Current implementation collects the flower flags only. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Context pointer for this callback. + * + * @return + * A positive, nonzero value on success (required by libmnl + * to continue messages processing). + */ +static int +flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_query *query = arg; + struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); + struct nlattr *na, *na_opt; + bool flower = false; + + if (nlh->nlmsg_type != RTM_NEWTFILTER || + tcm->tcm_handle != query->handle) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*tcm)) { + switch (mnl_attr_get_type(na)) { + case TCA_KIND: + if (strcmp(mnl_attr_get_payload(na), "flower")) { + /* Not flower filter, drop entire message. */ + return 1; + } + flower = true; + break; + case TCA_OPTIONS: + if (!flower) { + /* Not flower options, drop entire message. */ + return 1; + } + /* Check nested flower options. */ + mnl_attr_for_each_nested(na_opt, na) { + switch (mnl_attr_get_type(na_opt)) { + case TCA_FLOWER_FLAGS: + query->flags_valid = 1; + query->tc_flags = + mnl_attr_get_u32(na_opt); + break; + } + } + break; + } + } + return 1; +} + +/** + * Query a TC flower rule flags via netlink. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] dev_flow + * Pointer to the flow. + * @param[out] pflags + * pointer to the data retrieved by the query. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf, + struct mlx5_flow *dev_flow, + uint32_t *pflags) +{ + struct nlmsghdr *nlh; + struct tcmsg *tcm; + struct tcf_nlcb_query query = { + .handle = dev_flow->tcf.tcm->tcm_handle, + }; + + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm)); + /* + * Ignore Netlink error for filter query operations. + * The reply length is sent by kernel as errno. + * Just check we got the flags option. + */ + flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query); + if (!query.flags_valid) { + *pflags = 0; + return -ENOENT; + } + *pflags = query.tc_flags; + return 0; +} + +/** + * Query and check the in_hw set for specified rule. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] dev_flow + * Pointer to the flow to check. + * + * @return + * 0 on success, a negative errno value otherwise. 
+ */ +static int +flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf, + struct mlx5_flow *dev_flow) +{ + uint32_t flags; + int ret; + + ret = flow_tcf_query_flags(tcf, dev_flow, &flags); + if (ret) + return ret; + return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT; +} + +/** + * Remove flow from E-Switch by sending Netlink message. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + */ +static void +flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_flow_tcf_context *ctx = priv->tcf_context; + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + struct tcmsg *tcm; + + if (!flow) + return; + dev_flow = LIST_FIRST(&flow->dev_flows); + if (!dev_flow) + return; + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + if (dev_flow->tcf.applied) { + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_DELTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + flow_tcf_nl_ack(ctx, nlh, NULL, NULL); + if (dev_flow->tcf.tunnel) { + assert(dev_flow->tcf.tunnel->vtep); + flow_tcf_vtep_release(ctx, + dev_flow->tcf.tunnel->vtep, + dev_flow); + dev_flow->tcf.tunnel->vtep = NULL; + } + /* Cleanup the rule handle value. */ + tcm = mnl_nlmsg_get_payload(nlh); + tcm->tcm_handle = 0; + dev_flow->tcf.applied = 0; + } +} + +/** + * Fetch the applied rule handle. This is callback routine called by + * libmnl mnl_cb_run() in loop for every message in received packet. + * When the NLM_F_ECHO flag is specified the kernel sends the created + * rule descriptor back to the application and we can retrieve the + * actual rule handle from updated descriptor. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Context pointer for this callback. + * + * @return + * A positive, nonzero value on success (required by libmnl + * to continue messages processing). + */ +static int +flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct nlmsghdr *nlhrq = arg; + struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq); + struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); + struct nlattr *na; + if (nlh->nlmsg_type != RTM_NEWTFILTER || + nlh->nlmsg_seq != nlhrq->nlmsg_seq) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*tcm)) { + switch (mnl_attr_get_type(na)) { + case TCA_KIND: + if (strcmp(mnl_attr_get_payload(na), "flower")) { + /* Not flower filter, drop entire message. */ + return 1; + } + tcmrq->tcm_handle = tcm->tcm_handle; + return 1; + } + } + return 1; +} /** * Apply flow to E-Switch by sending Netlink message. * @@ -5099,16 +5649,20 @@ flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf, * Pointer to the error structure. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; + struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_flow_tcf_context *ctx = priv->tcf_context; struct mlx5_flow *dev_flow; struct nlmsghdr *nlh; + struct tcmsg *tcm; + uint64_t start = 0; + uint64_t twait = 0; + int ret; dev_flow = LIST_FIRST(&flow->dev_flows); /* E-Switch flow can't be expanded. 
*/
@@ -5117,7 +5671,11 @@ flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
		return 0;
	nlh = dev_flow->tcf.nlh;
	nlh->nlmsg_type = RTM_NEWTFILTER;
-	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_EXCL | NLM_F_ECHO;
+	tcm = mnl_nlmsg_get_payload(nlh);
+	/* Allow kernel to assign handle on its own. */
+	tcm->tcm_handle = 0;
	if (dev_flow->tcf.tunnel) {
		/*
		 * Replace the interface index, target for
@@ -5137,55 +5695,81 @@ flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
			dev_flow->tcf.tunnel->ifindex_org);
		*dev_flow->tcf.tunnel->ifindex_ptr =
			dev_flow->tcf.tunnel->vtep->ifindex;
+		if (dev_flow->tcf.tunnel->vtep->waitreg) {
+			/* Clear wait flag for VXLAN port registration. */
+			dev_flow->tcf.tunnel->vtep->waitreg = 0;
+			twait = rte_get_timer_hz();
+			assert(twait > MS_PER_S);
+			twait = twait * MLX5_VXLAN_WAIT_PORT_REG_MS;
+			twait = twait / MS_PER_S;
+			start = rte_get_timer_cycles();
+		}
	}
-	if (!flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) {
+	/*
+	 * The kernel creates the VXLAN devices and registers the UDP
+	 * ports to be hardware offloaded within the NIC kernel drivers.
+	 * The registration is performed in the context of a kernel
+	 * worker thread, so race conditions might happen: the VXLAN
+	 * device is created and success is returned to the calling
+	 * application, but the UDP port registration is not completed
+	 * yet. The next applied rule may then be rejected by the driver
+	 * with an ENOTSUP code. We are going to wait a bit, allowing
+	 * the registration process to complete. The waiting is
+	 * performed once, after the device has been created.
+	 */
+	do {
+		struct timespec onems;
+
+		ret = flow_tcf_nl_ack(ctx, nlh,
+				      flow_tcf_collect_apply_cb, nlh);
+		if (!ret || ret != -ENOTSUP || !twait)
+			break;
+		/* Wait one millisecond and try again till timeout. */
+		onems.tv_sec = 0;
+		onems.tv_nsec = NS_PER_S / MS_PER_S;
+		nanosleep(&onems, 0);
+		if ((rte_get_timer_cycles() - start) > twait) {
+			/* Timeout elapsed, try once more and exit. */
+			twait = 0;
+		}
+	} while (true);
+	if (!ret) {
+		if (!tcm->tcm_handle) {
+			flow_tcf_remove(dev, flow);
+			return rte_flow_error_set
+				(error, ENOENT,
+				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				 "netlink: rule zero handle returned");
+		}
		dev_flow->tcf.applied = 1;
+		if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
+			return 0;
+		/*
+		 * Rule was applied without the skip_sw flag set.
+		 * We should check whether the rule was actually
+		 * accepted by hardware (have a look at the in_hw flag).
+		 */
+		if (flow_tcf_check_inhw(ctx, dev_flow)) {
+			flow_tcf_remove(dev, flow);
+			return rte_flow_error_set
+				(error, ENOENT,
+				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				 "netlink: rule has no in_hw flag set");
+		}
		return 0;
	}
+	if (dev_flow->tcf.tunnel) {
+		/* Rollback the VTEP configuration if rule apply failed. */
+		assert(dev_flow->tcf.tunnel->vtep);
+		flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
+				      dev_flow);
+		dev_flow->tcf.tunnel->vtep = NULL;
+	}
	return rte_flow_error_set(error, rte_errno,
				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
				  "netlink: failed to create TC flow rule");
}

-/**
- * Remove flow from E-Switch by sending Netlink message.
- *
- * @param[in] dev
- *   Pointer to Ethernet device.
- * @param[in, out] flow
- *   Pointer to the sub flow.
- */ -static void -flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) -{ - struct priv *priv = dev->data->dev_private; - struct mlx5_flow_tcf_context *ctx = priv->tcf_context; - struct mlx5_flow *dev_flow; - struct nlmsghdr *nlh; - - if (!flow) - return; - dev_flow = LIST_FIRST(&flow->dev_flows); - if (!dev_flow) - return; - /* E-Switch flow can't be expanded. */ - assert(!LIST_NEXT(dev_flow, next)); - if (dev_flow->tcf.applied) { - nlh = dev_flow->tcf.nlh; - nlh->nlmsg_type = RTM_DELTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST; - flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL); - if (dev_flow->tcf.tunnel) { - assert(dev_flow->tcf.tunnel->vtep); - flow_tcf_vtep_release(ctx, - dev_flow->tcf.tunnel->vtep, - dev_flow); - dev_flow->tcf.tunnel->vtep = NULL; - } - dev_flow->tcf.applied = 0; - } -} - /** * Remove flow from E-Switch and release resources of the device flow. * @@ -5505,7 +6089,7 @@ flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh, * Message received from Netlink. * @param[out] data * Pointer to data area to be filled by the parsing routine. - * assumed to be a pinter to struct flow_tcf_stats_basic. + * assumed to be a pointer to struct flow_tcf_stats_basic. * * @return * MNL_CB_OK value. @@ -5553,9 +6137,9 @@ flow_tcf_query_count(struct rte_eth_dev *dev, void *data, struct rte_flow_error *error) { - struct flow_tcf_stats_basic sb_data = { 0 }; + struct flow_tcf_stats_basic sb_data; struct rte_flow_query_count *qc = data; - struct priv *priv = dev->data->dev_private; + struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_flow_tcf_context *ctx = priv->tcf_context; struct mnl_socket *nl = ctx->nl; struct mlx5_flow *dev_flow; @@ -5564,6 +6148,7 @@ flow_tcf_query_count(struct rte_eth_dev *dev, ssize_t ret; assert(qc); + memset(&sb_data, 0, sizeof(sb_data)); dev_flow = LIST_FIRST(&flow->dev_flows); /* E-Switch flow can't be expanded. */ assert(!LIST_NEXT(dev_flow, next)); @@ -5725,7 +6310,7 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, tcm->tcm_parent = TC_H_INGRESS; assert(sizeof(buf) >= nlh->nlmsg_len); /* Ignore errors when qdisc is already absent. */ - if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL) && + if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) && rte_errno != EINVAL && rte_errno != ENOENT) return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, @@ -5742,7 +6327,7 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, tcm->tcm_parent = TC_H_INGRESS; mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); assert(sizeof(buf) >= nlh->nlmsg_len); - if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) + if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "netlink: failed to create ingress"