X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_flow_tcf.c;h=e021ea5e043827e589e22d167530090bf2a4b2c3;hb=a7c528e5d71ff3f569898d268f9de129fdfc152b;hp=1fd3f3f8295e83f1c409df0190e0fec2253a0948;hpb=8ba325b051eaad293c6eae2f5dbbe1d3e997d636;p=dpdk.git diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c index 1fd3f3f829..e021ea5e04 100644 --- a/drivers/net/mlx5/mlx5_flow_tcf.c +++ b/drivers/net/mlx5/mlx5_flow_tcf.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "mlx5.h" #include "mlx5_flow.h" @@ -125,6 +126,14 @@ struct tc_pedit_sel { #define TCA_TUNNEL_KEY_NO_CSUM 10 #endif +#ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS +#define TCA_TUNNEL_KEY_ENC_TOS 12 +#endif + +#ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL +#define TCA_TUNNEL_KEY_ENC_TTL 13 +#endif + #else /* HAVE_TC_ACT_TUNNEL_KEY */ #define TCA_ACT_TUNNEL_KEY 17 @@ -138,6 +147,8 @@ struct tc_pedit_sel { #define TCA_TUNNEL_KEY_ENC_KEY_ID 7 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9 #define TCA_TUNNEL_KEY_NO_CSUM 10 +#define TCA_TUNNEL_KEY_ENC_TOS 12 +#define TCA_TUNNEL_KEY_ENC_TTL 13 struct tc_tunnel_key { tc_gen; @@ -292,6 +303,31 @@ struct tc_tunnel_key { #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72 #endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS +#define TCA_FLOWER_KEY_IP_TOS 73 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK +#define TCA_FLOWER_KEY_IP_TOS_MASK 74 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL +#define TCA_FLOWER_KEY_IP_TTL 75 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK +#define TCA_FLOWER_KEY_IP_TTL_MASK 76 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS +#define TCA_FLOWER_KEY_ENC_IP_TOS 80 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK +#define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL +#define TCA_FLOWER_KEY_ENC_IP_TTL 82 +#endif +#ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK +#define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83 +#endif + #ifndef HAVE_TC_ACT_GOTO_CHAIN #define TC_ACT_GOTO_CHAIN 0x20000000 #endif @@ -316,10 +352,14 @@ struct tc_tunnel_key { #define TCA_ACT_MAX_PRIO 32 #endif -/** UDP port range of VXLAN devices created by driver. */ -#define MLX5_VXLAN_PORT_MIN 30000 -#define MLX5_VXLAN_PORT_MAX 60000 +/** Parameters of VXLAN devices created by driver. */ +#define MLX5_VXLAN_DEFAULT_VNI 1 #define MLX5_VXLAN_DEVICE_PFX "vmlx_" +/** + * Timeout in milliseconds to wait VXLAN UDP offloaded port + * registration completed within the mlx5 driver. + */ +#define MLX5_VXLAN_WAIT_PORT_REG_MS 250 /** Tunnel action type, used for @p type in header structure. */ enum flow_tcf_tunact_type { @@ -337,6 +377,8 @@ enum flow_tcf_tunact_type { #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6) #define FLOW_TCF_ENCAP_UDP_DST (1u << 7) #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8) +#define FLOW_TCF_ENCAP_IP_TTL (1u << 9) +#define FLOW_TCF_ENCAP_IP_TOS (1u << 10) /** * Structure for holding netlink context. @@ -360,7 +402,7 @@ struct mlx5_flow_tcf_context { struct tcf_neigh_rule { LIST_ENTRY(tcf_neigh_rule) next; uint32_t refcnt; - struct ether_addr eth; + struct rte_ether_addr eth; uint16_t mask; union { struct { @@ -409,7 +451,8 @@ struct tcf_vtep { uint32_t refcnt; unsigned int ifindex; /**< Own interface index. */ uint16_t port; - uint8_t created; + uint32_t created:1; /**< Actually created by PMD. */ + uint32_t waitreg:1; /**< Wait for VXLAN UDP port registration. */ }; /** Tunnel descriptor header, common for all tunnel types. */ @@ -429,9 +472,11 @@ struct flow_tcf_vxlan_encap { struct flow_tcf_tunnel_hdr hdr; struct tcf_irule *iface; uint32_t mask; + uint8_t ip_tos; + uint8_t ip_ttl_hop; struct { - struct ether_addr dst; - struct ether_addr src; + struct rte_ether_addr dst; + struct rte_ether_addr src; } eth; union { struct { @@ -500,11 +545,15 @@ static const struct { }, .ipv4.hdr = { .next_proto_id = 0xff, + .time_to_live = 0xff, + .type_of_service = 0xff, .src_addr = RTE_BE32(0xffffffff), .dst_addr = RTE_BE32(0xffffffff), }, .ipv6.hdr = { .proto = 0xff, + .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_FL_SHIFT), + .hop_limits = 0xff, .src_addr = "\xff\xff\xff\xff\xff\xff\xff\xff" "\xff\xff\xff\xff\xff\xff\xff\xff", @@ -640,8 +689,8 @@ flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions, { int idx = p_parser->sel.nkeys; uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? - offsetof(struct ether_hdr, s_addr) : - offsetof(struct ether_hdr, d_addr); + offsetof(struct rte_ether_hdr, s_addr) : + offsetof(struct rte_ether_hdr, d_addr); const struct rte_flow_action_set_mac *conf = (const struct rte_flow_action_set_mac *)actions->conf; @@ -658,7 +707,7 @@ flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions, p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; memcpy(&p_parser->keys[idx].val, conf->mac_addr + SZ_PEDIT_KEY_VAL, - ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL); + RTE_ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL); p_parser->sel.nkeys = (++idx); } @@ -683,12 +732,12 @@ flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions, if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) { p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4; p_parser->keys[idx].off = - offsetof(struct ipv4_hdr, time_to_live); + offsetof(struct rte_ipv4_hdr, time_to_live); } if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) { p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6; p_parser->keys[idx].off = - offsetof(struct ipv6_hdr, hop_limits); + offsetof(struct rte_ipv6_hdr, hop_limits); } if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) { p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD; @@ -752,8 +801,8 @@ flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions, int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN); int off_base = actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? - offsetof(struct ipv6_hdr, src_addr) : - offsetof(struct ipv6_hdr, dst_addr); + offsetof(struct rte_ipv6_hdr, src_addr) : + offsetof(struct rte_ipv6_hdr, dst_addr); const struct rte_flow_action_set_ipv6 *conf = (const struct rte_flow_action_set_ipv6 *)actions->conf; @@ -787,8 +836,8 @@ flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions, p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET; p_parser->keys[idx].off = actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? - offsetof(struct ipv4_hdr, src_addr) : - offsetof(struct ipv4_hdr, dst_addr); + offsetof(struct rte_ipv4_hdr, src_addr) : + offsetof(struct rte_ipv4_hdr, dst_addr); p_parser->keys[idx].mask = ~UINT32_MAX; p_parser->keys[idx].val = ((const struct rte_flow_action_set_ipv4 *) @@ -935,11 +984,11 @@ flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions, flags |= MLX5_FLOW_ACTION_DEC_TTL; break; case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: - keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN); + keys += NUM_OF_PEDIT_KEYS(RTE_ETHER_ADDR_LEN); flags |= MLX5_FLOW_ACTION_SET_MAC_SRC; break; case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: - keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN); + keys += NUM_OF_PEDIT_KEYS(RTE_ETHER_ADDR_LEN); flags |= MLX5_FLOW_ACTION_SET_MAC_DST; break; default: @@ -1275,6 +1324,20 @@ flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item, " must be specified for" " vxlan encapsulation"); } + if (mask->hdr.type_of_service && + mask->hdr.type_of_service != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.type_of_service\" field" + " for vxlan encapsulation"); + if (mask->hdr.time_to_live && + mask->hdr.time_to_live != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv4.hdr.time_to_live\" field" + " for vxlan encapsulation"); return 0; } @@ -1296,6 +1359,7 @@ flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, { const struct rte_flow_item_ipv6 *spec = item->spec; const struct rte_flow_item_ipv6 *mask = item->mask; + uint8_t msk6; if (!spec) { /* @@ -1361,6 +1425,20 @@ flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item, " must be specified for" " vxlan encapsulation"); } + msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >> + IPV6_HDR_TC_SHIFT) & 0xff; + if (msk6 && msk6 != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.vtc_flow.tos\" field" + " for vxlan encapsulation"); + if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask, + "no support for partial mask on" + " \"ipv6.hdr.hop_limits\" field" + " for vxlan encapsulation"); return 0; } @@ -1528,8 +1606,9 @@ flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, break; break; case RTE_FLOW_ITEM_TYPE_IPV4: - ret = mlx5_flow_validate_item_ipv4(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv4 + (items, item_flags, + &flow_tcf_mask_supported.ipv4, error); if (ret < 0) return ret; ret = flow_tcf_validate_vxlan_encap_ipv4(items, error); @@ -1538,8 +1617,9 @@ flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action, item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; break; case RTE_FLOW_ITEM_TYPE_IPV6: - ret = mlx5_flow_validate_item_ipv6(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv6 + (items, item_flags, + &flow_tcf_mask_supported.ipv6, error); if (ret < 0) return ret; ret = flow_tcf_validate_vxlan_encap_ipv6(items, error); @@ -2058,8 +2138,9 @@ flow_tcf_validate(struct rte_eth_dev *dev, vlan_etype = spec.vlan->inner_type; break; case RTE_FLOW_ITEM_TYPE_IPV4: - ret = mlx5_flow_validate_item_ipv4(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv4 + (items, item_flags, + &flow_tcf_mask_supported.ipv4, error); if (ret < 0) return ret; item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? @@ -2118,8 +2199,9 @@ flow_tcf_validate(struct rte_eth_dev *dev, } break; case RTE_FLOW_ITEM_TYPE_IPV6: - ret = mlx5_flow_validate_item_ipv6(items, item_flags, - error); + ret = mlx5_flow_validate_item_ipv6 + (items, item_flags, + &flow_tcf_mask_supported.ipv6, error); if (ret < 0) return ret; item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? @@ -2344,7 +2426,7 @@ flow_tcf_validate(struct rte_eth_dev *dev, */ if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) && (action_flags & MLX5_FLOW_ACTION_PORT_ID) && - ((struct priv *)port_id_dev->data->dev_private)->representor) + ((struct mlx5_priv *)port_id_dev->data->dev_private)->representor) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, actions, "vlan push can only be applied" @@ -2439,7 +2521,7 @@ flow_tcf_get_items_size(const struct rte_flow_attr *attr, case RTE_FLOW_ITEM_TYPE_PORT_ID: break; case RTE_FLOW_ITEM_TYPE_ETH: - size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4; + size += SZ_NLATTR_DATA_OF(RTE_ETHER_ADDR_LEN) * 4; /* dst/src MAC addr and mask. */ break; case RTE_FLOW_ITEM_TYPE_VLAN: @@ -2448,16 +2530,31 @@ flow_tcf_get_items_size(const struct rte_flow_attr *attr, SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */ SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */ break; - case RTE_FLOW_ITEM_TYPE_IPV4: + case RTE_FLOW_ITEM_TYPE_IPV4: { + const struct rte_flow_item_ipv4 *ipv4 = items->mask; + size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint32_t) * 4; /* dst/src IP addr and mask. */ + if (ipv4 && ipv4->hdr.time_to_live) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv4 && ipv4->hdr.type_of_service) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; - case RTE_FLOW_ITEM_TYPE_IPV6: + } + case RTE_FLOW_ITEM_TYPE_IPV6: { + const struct rte_flow_item_ipv6 *ipv6 = items->mask; + size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4; /* dst/src IP addr and mask. */ + if (ipv6 && ipv6->hdr.hop_limits) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) & + (0xfful << IPV6_HDR_TC_SHIFT))) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; + } case RTE_FLOW_ITEM_TYPE_UDP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; @@ -2525,12 +2622,27 @@ flow_tcf_vxlan_encap_size(const struct rte_flow_action *action) case RTE_FLOW_ITEM_TYPE_ETH: /* This item does not require message buffer. */ break; - case RTE_FLOW_ITEM_TYPE_IPV4: + case RTE_FLOW_ITEM_TYPE_IPV4: { + const struct rte_flow_item_ipv4 *ipv4 = items->mask; + size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2; + if (ipv4 && ipv4->hdr.time_to_live) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv4 && ipv4->hdr.type_of_service) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; - case RTE_FLOW_ITEM_TYPE_IPV6: + } + case RTE_FLOW_ITEM_TYPE_IPV6: { + const struct rte_flow_item_ipv6 *ipv6 = items->mask; + size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2; + if (ipv6 && ipv6->hdr.hop_limits) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; + if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) & + (0xfful << IPV6_HDR_TC_SHIFT))) + size += SZ_NLATTR_TYPE_OF(uint8_t) * 2; break; + } case RTE_FLOW_ITEM_TYPE_UDP: { const struct rte_flow_item_udp *udp = items->mask; @@ -2572,7 +2684,7 @@ flow_tcf_get_actions_and_size(const struct rte_flow_action actions[], uint64_t *action_flags) { int size = 0; - uint64_t flags = 0; + uint64_t flags = *action_flags; size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */ for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { @@ -2672,27 +2784,6 @@ action_of_vlan: return size; } -/** - * Brand rtnetlink buffer with unique handle. - * - * This handle should be unique for a given network interface to avoid - * collisions. - * - * @param nlh - * Pointer to Netlink message. - * @param handle - * Unique 32-bit handle to use. - */ -static void -flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle) -{ - struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); - - tcm->tcm_handle = handle; - DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x", - (void *)nlh, handle); -} - /** * Prepare a flow object for Linux TC flower. It calculates the maximum size of * memory required, allocates the memory, initializes Netlink message headers @@ -2783,20 +2874,6 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP; else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP; - /* - * Generate a reasonably unique handle based on the address of the - * target buffer. - * - * This is straightforward on 32-bit systems where the flow pointer can - * be used directly. Otherwise, its least significant part is taken - * after shifting it by the previous power of two of the pointed buffer - * size. - */ - if (sizeof(dev_flow) <= 4) - flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow); - else - flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >> - rte_log2_u32(rte_align32prevpow2(size))); return dev_flow; } @@ -2839,7 +2916,7 @@ flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused, * VXLAN VNI in 24-bit wire format. * * @return - * VXLAN VNI as a 32-bit integer value in network endian. + * VXLAN VNI as a 32-bit integer value in network endianness. */ static inline rte_be32_t vxlan_vni_as_be32(const uint8_t vni[3]) @@ -2908,11 +2985,14 @@ flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec, * * @param[in] spec * RTE_FLOW_ITEM_TYPE_IPV4 entry specification. + * @param[in] mask + * RTE_FLOW_ITEM_TYPE_IPV4 entry mask. * @param[out] encap * Structure to fill the gathered IPV4 address data. */ static void flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, + const struct rte_flow_item_ipv4 *mask, struct flow_tcf_vxlan_encap *encap) { /* Item must be validated before. No redundant checks. */ @@ -2921,6 +3001,14 @@ flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, encap->ipv4.src = spec->hdr.src_addr; encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC | FLOW_TCF_ENCAP_IPV4_DST; + if (mask && mask->hdr.type_of_service) { + encap->mask |= FLOW_TCF_ENCAP_IP_TOS; + encap->ip_tos = spec->hdr.type_of_service; + } + if (mask && mask->hdr.time_to_live) { + encap->mask |= FLOW_TCF_ENCAP_IP_TTL; + encap->ip_ttl_hop = spec->hdr.time_to_live; + } } /** @@ -2931,11 +3019,14 @@ flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec, * * @param[in] spec * RTE_FLOW_ITEM_TYPE_IPV6 entry specification. + * @param[in] mask + * RTE_FLOW_ITEM_TYPE_IPV6 entry mask. * @param[out] encap * Structure to fill the gathered IPV6 address data. */ static void flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec, + const struct rte_flow_item_ipv6 *mask, struct flow_tcf_vxlan_encap *encap) { /* Item must be validated before. No redundant checks. */ @@ -2944,6 +3035,19 @@ flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec, memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN); encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC | FLOW_TCF_ENCAP_IPV6_DST; + if (mask) { + if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >> + IPV6_HDR_TC_SHIFT) & 0xff) { + encap->mask |= FLOW_TCF_ENCAP_IP_TOS; + encap->ip_tos = (rte_be_to_cpu_32 + (spec->hdr.vtc_flow) >> + IPV6_HDR_TC_SHIFT) & 0xff; + } + if (mask->hdr.hop_limits) { + encap->mask |= FLOW_TCF_ENCAP_IP_TTL; + encap->ip_ttl_hop = spec->hdr.hop_limits; + } + } } /** @@ -3038,11 +3142,15 @@ flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action, break; case RTE_FLOW_ITEM_TYPE_IPV4: spec.ipv4 = items->spec; - flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, encap); + mask.ipv4 = items->mask; + flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4, + encap); break; case RTE_FLOW_ITEM_TYPE_IPV6: spec.ipv6 = items->spec; - flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, encap); + mask.ipv6 = items->mask; + flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6, + encap); break; case RTE_FLOW_ITEM_TYPE_UDP: mask.udp = items->mask; @@ -3226,20 +3334,20 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, " parameter is ignored"); break; } - if (!is_zero_ether_addr(&mask.eth->dst)) { + if (!rte_is_zero_ether_addr(&mask.eth->dst)) { mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, spec.eth->dst.addr_bytes); mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, mask.eth->dst.addr_bytes); } - if (!is_zero_ether_addr(&mask.eth->src)) { + if (!rte_is_zero_ether_addr(&mask.eth->src)) { mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, spec.eth->src.addr_bytes); mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK, - ETHER_ADDR_LEN, + RTE_ETHER_ADDR_LEN, mask.eth->src.addr_bytes); } assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); @@ -3355,10 +3463,35 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, TCA_FLOWER_KEY_IPV4_DST_MASK, mask.ipv4->hdr.dst_addr); } + if (mask.ipv4->hdr.time_to_live) { + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL : + TCA_FLOWER_KEY_IP_TTL, + spec.ipv4->hdr.time_to_live); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL_MASK : + TCA_FLOWER_KEY_IP_TTL_MASK, + mask.ipv4->hdr.time_to_live); + } + if (mask.ipv4->hdr.type_of_service) { + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS : + TCA_FLOWER_KEY_IP_TOS, + spec.ipv4->hdr.type_of_service); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS_MASK : + TCA_FLOWER_KEY_IP_TOS_MASK, + mask.ipv4->hdr.type_of_service); + } assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; case RTE_FLOW_ITEM_TYPE_IPV6: { bool ipv6_src, ipv6_dst; + uint8_t msk6, tos6; item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ? MLX5_FLOW_LAYER_INNER_L3_IPV6 : @@ -3444,6 +3577,33 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, IPV6_ADDR_LEN, mask.ipv6->hdr.dst_addr); } + if (mask.ipv6->hdr.hop_limits) { + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL : + TCA_FLOWER_KEY_IP_TTL, + spec.ipv6->hdr.hop_limits); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TTL_MASK : + TCA_FLOWER_KEY_IP_TTL_MASK, + mask.ipv6->hdr.hop_limits); + } + msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >> + IPV6_HDR_TC_SHIFT) & 0xff; + if (msk6) { + tos6 = (rte_be_to_cpu_32 + (spec.ipv6->hdr.vtc_flow) >> + IPV6_HDR_TC_SHIFT) & 0xff; + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS : + TCA_FLOWER_KEY_IP_TOS, tos6); + mnl_attr_put_u8 + (nlh, tunnel_outer ? + TCA_FLOWER_KEY_ENC_IP_TOS_MASK : + TCA_FLOWER_KEY_IP_TOS_MASK, msk6); + } assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len); break; } @@ -3620,6 +3780,10 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, mnl_attr_get_payload (mnl_nlmsg_get_payload_tail (nlh)))->ifindex; + } else if (decap.hdr) { + assert(dev_flow->tcf.tunnel); + dev_flow->tcf.tunnel->ifindex_ptr = + (unsigned int *)&tcm->tcm_ifindex; } mnl_attr_put(nlh, TCA_MIRRED_PARMS, sizeof(struct tc_mirred), @@ -3796,6 +3960,14 @@ override_na_vlan_priority: TCA_TUNNEL_KEY_ENC_IPV6_DST, sizeof(encap.vxlan->ipv6.dst), &encap.vxlan->ipv6.dst); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL) + mnl_attr_put_u8(nlh, + TCA_TUNNEL_KEY_ENC_TTL, + encap.vxlan->ip_ttl_hop); + if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS) + mnl_attr_put_u8(nlh, + TCA_TUNNEL_KEY_ENC_TOS, + encap.vxlan->ip_tos); if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI) mnl_attr_put_u32(nlh, TCA_TUNNEL_KEY_ENC_KEY_ID, @@ -3879,7 +4051,7 @@ flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf, nlh->nlmsg_flags |= NLM_F_ACK; ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len); if (ret <= 0) { - /* Message send error occurres. */ + /* Message send error occurred. */ rte_errno = errno; return -rte_errno; } @@ -4135,7 +4307,7 @@ flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg) * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). * @param[in] ifindex - * Network inferface index to perform cleanup. + * Network interface index to perform cleanup. */ static void flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf, @@ -4171,7 +4343,7 @@ flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf, } /** - * Collect neigh permament rules on specified network device. + * Collect neigh permanent rules on specified network device. * This is callback routine called by libmnl mnl_cb_run() in loop for * every message in received packet. * @@ -4220,10 +4392,10 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg) } if (!na_mac || !na_ip) return 1; - /* Neigh rule with permenent attribute found. */ + /* Neigh rule with permanent attribute found. */ size = MNL_ALIGN(sizeof(struct nlmsghdr)) + MNL_ALIGN(sizeof(struct ndmsg)) + - SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) + + SZ_NLATTR_DATA_OF(RTE_ETHER_ADDR_LEN) + (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) : SZ_NLATTR_TYPE_OF(uint32_t)); cmd = flow_tcf_alloc_nlcmd(ctx, size); @@ -4247,7 +4419,7 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg) mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN, mnl_attr_get_payload(na_ip)); } - mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN, + mnl_attr_put(cmd, NDA_LLADDR, RTE_ETHER_ADDR_LEN, mnl_attr_get_payload(na_mac)); assert(size == cmd->nlmsg_len); return 1; @@ -4259,7 +4431,7 @@ flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg) * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). * @param[in] ifindex - * Network inferface index to perform cleanup. + * Network interface index to perform cleanup. */ static void flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf, @@ -4427,7 +4599,7 @@ flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf, * Note that an implicit route is maintained by the kernel due to the * presence of a peer address (IFA_ADDRESS). * - * These rules are used for encapsultion only and allow to assign + * These rules are used for encapsulation only and allow to assign * the outer tunnel source IP address. * * @param[in] tcf @@ -4846,7 +5018,7 @@ flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf, /** * Releases VXLAN encap rules container by pointer. Decrements the - * reference cointer and deletes the container if counter is zero. + * reference counter and deletes the container if counter is zero. * * @param[in] irule * VXLAN rule container pointer to release. @@ -4870,7 +5042,7 @@ flow_tcf_encap_irule_release(struct tcf_irule *iface) * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). * @param[in] vtep - * Object represinting the network device to delete. Memory + * Object representing the network device to delete. Memory * allocated for this object is freed by routine. */ static void @@ -4918,7 +5090,6 @@ flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf, * Pointer to created device structure on success, * NULL otherwise and rte_errno is set. */ -#ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA static struct tcf_vtep* flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, uint16_t port, struct rte_flow_error *error) @@ -4968,10 +5139,24 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan"); na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA); assert(na_vxlan); +#ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA + /* + * RH 7.2 does not support metadata for tunnel device. + * It does not matter because we are going to use the + * hardware offload by mlx5 driver. + */ mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1); +#endif mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1); mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0); mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port); +#ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA + /* + * We must specify VNI explicitly if metadata not supported. + * Note, VNI is transferred with native endianness format. + */ + mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI); +#endif mnl_attr_nest_end(nlh, na_vxlan); mnl_attr_nest_end(nlh, na_info); assert(sizeof(buf) >= nlh->nlmsg_len); @@ -4993,6 +5178,7 @@ flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, * when we do not need it anymore. */ vtep->created = 1; + vtep->waitreg = 1; } /* Try to get ifindex of created of pre-existing device. */ ret = if_nametoindex(name); @@ -5040,19 +5226,6 @@ error: rte_free(vtep); return NULL; } -#else -static struct tcf_vtep* -flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused, - uint16_t port __rte_unused, - struct rte_flow_error *error) -{ - rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "netlink: failed to create VTEP, " - "vxlan metadata are not supported by kernel"); - return NULL; -} -#endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */ /** * Acquire target interface index for VXLAN tunneling decapsulation. @@ -5095,7 +5268,7 @@ flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, } /** - * Aqcuire target interface index for VXLAN tunneling encapsulation. + * Acquire target interface index for VXLAN tunneling encapsulation. * * @param[in] tcf * Context object initialized by mlx5_flow_tcf_context_create(). @@ -5393,10 +5566,11 @@ flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf, static void flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) { - struct priv *priv = dev->data->dev_private; + struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_flow_tcf_context *ctx = priv->tcf_context; struct mlx5_flow *dev_flow; struct nlmsghdr *nlh; + struct tcmsg *tcm; if (!flow) return; @@ -5417,10 +5591,53 @@ flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) dev_flow); dev_flow->tcf.tunnel->vtep = NULL; } + /* Cleanup the rule handle value. */ + tcm = mnl_nlmsg_get_payload(nlh); + tcm->tcm_handle = 0; dev_flow->tcf.applied = 0; } } +/** + * Fetch the applied rule handle. This is callback routine called by + * libmnl mnl_cb_run() in loop for every message in received packet. + * When the NLM_F_ECHO flag is specified the kernel sends the created + * rule descriptor back to the application and we can retrieve the + * actual rule handle from updated descriptor. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Context pointer for this callback. + * + * @return + * A positive, nonzero value on success (required by libmnl + * to continue messages processing). + */ +static int +flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct nlmsghdr *nlhrq = arg; + struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq); + struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); + struct nlattr *na; + + if (nlh->nlmsg_type != RTM_NEWTFILTER || + nlh->nlmsg_seq != nlhrq->nlmsg_seq) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*tcm)) { + switch (mnl_attr_get_type(na)) { + case TCA_KIND: + if (strcmp(mnl_attr_get_payload(na), "flower")) { + /* Not flower filter, drop entire message. */ + return 1; + } + tcmrq->tcm_handle = tcm->tcm_handle; + return 1; + } + } + return 1; +} /** * Apply flow to E-Switch by sending Netlink message. * @@ -5438,10 +5655,14 @@ static int flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; + struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_flow_tcf_context *ctx = priv->tcf_context; struct mlx5_flow *dev_flow; struct nlmsghdr *nlh; + struct tcmsg *tcm; + uint64_t start = 0; + uint64_t twait = 0; + int ret; dev_flow = LIST_FIRST(&flow->dev_flows); /* E-Switch flow can't be expanded. */ @@ -5450,7 +5671,11 @@ flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, return 0; nlh = dev_flow->tcf.nlh; nlh->nlmsg_type = RTM_NEWTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | + NLM_F_EXCL | NLM_F_ECHO; + tcm = mnl_nlmsg_get_payload(nlh); + /* Allow kernel to assign handle on its own. */ + tcm->tcm_handle = 0; if (dev_flow->tcf.tunnel) { /* * Replace the interface index, target for @@ -5470,8 +5695,52 @@ flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, dev_flow->tcf.tunnel->ifindex_org); *dev_flow->tcf.tunnel->ifindex_ptr = dev_flow->tcf.tunnel->vtep->ifindex; + if (dev_flow->tcf.tunnel->vtep->waitreg) { + /* Clear wait flag for VXLAN port registration. */ + dev_flow->tcf.tunnel->vtep->waitreg = 0; + twait = rte_get_timer_hz(); + assert(twait > MS_PER_S); + twait = twait * MLX5_VXLAN_WAIT_PORT_REG_MS; + twait = twait / MS_PER_S; + start = rte_get_timer_cycles(); + } } - if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) { + /* + * Kernel creates the VXLAN devices and registers UDP ports to + * be hardware offloaded within the NIC kernel drivers. The + * registration process is being performed into context of + * working kernel thread and the race conditions might happen. + * The VXLAN device is created and success is returned to + * calling application, but the UDP port registration process + * is not completed yet. The next applied rule may be rejected + * by the driver with ENOSUP code. We are going to wait a bit, + * allowing registration process to be completed. The waiting + * is performed once after device been created. + */ + do { + struct timespec onems; + + ret = flow_tcf_nl_ack(ctx, nlh, + flow_tcf_collect_apply_cb, nlh); + if (!ret || ret != -ENOTSUP || !twait) + break; + /* Wait one millisecond and try again till timeout. */ + onems.tv_sec = 0; + onems.tv_nsec = NS_PER_S / MS_PER_S; + nanosleep(&onems, 0); + if ((rte_get_timer_cycles() - start) > twait) { + /* Timeout elapsed, try once more and exit. */ + twait = 0; + } + } while (true); + if (!ret) { + if (!tcm->tcm_handle) { + flow_tcf_remove(dev, flow); + return rte_flow_error_set + (error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: rule zero handle returned"); + } dev_flow->tcf.applied = 1; if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW) return 0; @@ -5870,7 +6139,7 @@ flow_tcf_query_count(struct rte_eth_dev *dev, { struct flow_tcf_stats_basic sb_data; struct rte_flow_query_count *qc = data; - struct priv *priv = dev->data->dev_private; + struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_flow_tcf_context *ctx = priv->tcf_context; struct mnl_socket *nl = ctx->nl; struct mlx5_flow *dev_flow;