X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_flow_tcf.c;h=fb284c3558c2da14c6ede02a264ff4e0d42190e0;hb=eccbe2ffdc755a9e759c6ba480f6c15ccb1163c7;hp=26a91c5679a1ad0de627640c70a15fcaf96c35b6;hpb=6e74990b3463a34669e09ccae1d34727a83b082b;p=dpdk.git diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c index 26a91c5679..fb284c3558 100644 --- a/drivers/net/mlx5/mlx5_flow_tcf.c +++ b/drivers/net/mlx5/mlx5_flow_tcf.c @@ -160,6 +160,9 @@ struct tc_tunnel_key { #ifndef TCA_CLS_FLAGS_SKIP_SW #define TCA_CLS_FLAGS_SKIP_SW (1 << 1) #endif +#ifndef TCA_CLS_FLAGS_IN_HW +#define TCA_CLS_FLAGS_IN_HW (1 << 2) +#endif #ifndef HAVE_TCA_CHAIN #define TCA_CHAIN 11 #endif @@ -530,7 +533,15 @@ struct flow_tcf_ptoi { /* Due to a limitation on driver/FW. */ #define MLX5_TCF_GROUP_ID_MAX 3 -#define MLX5_TCF_GROUP_PRIORITY_MAX 14 + +/* + * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel. + * Priority in rte_flow attribute starts from 0 and is added by 1 in + * translation. This is subject to be changed to determine the max priority + * based on trial-and-error like Verbs driver once the restriction is lifted or + * the range is extended. + */ +#define MLX5_TCF_GROUP_PRIORITY_MAX 15 #define MLX5_TCF_FATE_ACTIONS \ (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \ @@ -1091,19 +1102,13 @@ flow_tcf_validate_attributes(const struct rte_flow_attr *attr, "group ID larger than " RTE_STR(MLX5_TCF_GROUP_ID_MAX) " isn't supported"); - else if (attr->group > 0 && - attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX) + else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX) return rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, attr, - "lowest priority level is " + "priority more than " RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX) - " when group is configured"); - else if (attr->priority > 0xfffe) - return rte_flow_error_set(error, ENOTSUP, - RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, - attr, - "lowest priority level is 0xfffe"); + " is not supported"); if (!attr->ingress) return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, @@ -1882,9 +1887,23 @@ flow_tcf_validate(struct rte_eth_dev *dev, case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN: current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN; break; - case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: { + rte_be16_t ethertype; + current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN; + if (!actions->conf) + break; + conf.of_push_vlan = actions->conf; + ethertype = conf.of_push_vlan->ethertype; + if (ethertype != RTE_BE16(ETH_P_8021Q) && + ethertype != RTE_BE16(ETH_P_8021AD)) + return rte_flow_error_set + (error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, + "vlan push TPID must be " + "802.1Q or 802.1AD"); break; + } case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN)) return rte_flow_error_set @@ -2368,24 +2387,21 @@ flow_tcf_validate(struct rte_eth_dev *dev, } /** - * Calculate maximum size of memory for flow items of Linux TC flower and - * extract specified items. + * Calculate maximum size of memory for flow items of Linux TC flower. * + * @param[in] attr + * Pointer to the flow attributes. * @param[in] items * Pointer to the list of items. - * @param[out] item_flags - * Pointer to the detected items. * * @return * Maximum size of memory for items. */ static int -flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, - const struct rte_flow_item items[], - uint64_t *item_flags) +flow_tcf_get_items_size(const struct rte_flow_attr *attr, + const struct rte_flow_item items[]) { int size = 0; - uint64_t flags = 0; size += SZ_NLATTR_STRZ_OF("flower") + SZ_NLATTR_NEST + /* TCA_OPTIONS. */ @@ -2402,7 +2418,6 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4; /* dst/src MAC addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L2; break; case RTE_FLOW_ITEM_TYPE_VLAN: size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ @@ -2410,37 +2425,31 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, /* VLAN Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */ SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */ - flags |= MLX5_FLOW_LAYER_OUTER_VLAN; break; case RTE_FLOW_ITEM_TYPE_IPV4: size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint32_t) * 4; /* dst/src IP addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; break; case RTE_FLOW_ITEM_TYPE_IPV6: size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */ SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4; /* dst/src IP addr and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; break; case RTE_FLOW_ITEM_TYPE_UDP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; /* dst/src port and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP; break; case RTE_FLOW_ITEM_TYPE_TCP: size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */ SZ_NLATTR_TYPE_OF(uint16_t) * 4; /* dst/src port and mask. */ - flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP; break; case RTE_FLOW_ITEM_TYPE_VXLAN: size += SZ_NLATTR_TYPE_OF(uint32_t); - flags |= MLX5_FLOW_LAYER_VXLAN; break; default: DRV_LOG(WARNING, @@ -2450,7 +2459,6 @@ flow_tcf_get_items_and_size(const struct rte_flow_attr *attr, break; } } - *item_flags = flags; return size; } @@ -2666,10 +2674,6 @@ flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle) * Pointer to the list of items. * @param[in] actions * Pointer to the list of actions. - * @param[out] item_flags - * Pointer to bit mask of all items detected. - * @param[out] action_flags - * Pointer to bit mask of all actions detected. * @param[out] error * Pointer to the error structure. * @@ -2681,7 +2685,6 @@ static struct mlx5_flow * flow_tcf_prepare(const struct rte_flow_attr *attr, const struct rte_flow_item items[], const struct rte_flow_action actions[], - uint64_t *item_flags, uint64_t *action_flags, struct rte_flow_error *error) { size_t size = RTE_ALIGN_CEIL @@ -2690,12 +2693,13 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, MNL_ALIGN(sizeof(struct nlmsghdr)) + MNL_ALIGN(sizeof(struct tcmsg)); struct mlx5_flow *dev_flow; + uint64_t action_flags = 0; struct nlmsghdr *nlh; struct tcmsg *tcm; uint8_t *sp, *tun = NULL; - size += flow_tcf_get_items_and_size(attr, items, item_flags); - size += flow_tcf_get_actions_and_size(actions, action_flags); + size += flow_tcf_get_items_size(attr, items); + size += flow_tcf_get_actions_and_size(actions, &action_flags); dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO); if (!dev_flow) { rte_flow_error_set(error, ENOMEM, @@ -2704,7 +2708,7 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, return NULL; } sp = (uint8_t *)(dev_flow + 1); - if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) { + if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) { sp = RTE_PTR_ALIGN (sp, alignof(struct flow_tcf_tunnel_hdr)); tun = sp; @@ -2716,7 +2720,7 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, (sizeof(struct flow_tcf_vxlan_encap), MNL_ALIGNTO); #endif - } else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { + } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) { sp = RTE_PTR_ALIGN (sp, alignof(struct flow_tcf_tunnel_hdr)); tun = sp; @@ -2745,9 +2749,9 @@ flow_tcf_prepare(const struct rte_flow_attr *attr, .tcm = tcm, }, }; - if (*action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) + if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP; - else if (*action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) + else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP; /* * Generate a reasonably unique handle based on the address of the @@ -3256,7 +3260,8 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, assert(mask.ipv4); spec.ipv4 = items->spec; if (!decap.vxlan) { - if (!eth_type_set && !vlan_eth_type_set) + if (!eth_type_set || + (!vlan_eth_type_set && vlan_present)) mnl_attr_put_u16 (nlh, vlan_present ? @@ -3313,14 +3318,14 @@ flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow, assert(mask.ipv6); spec.ipv6 = items->spec; if (!decap.vxlan) { - if (!eth_type_set || !vlan_eth_type_set) { + if (!eth_type_set || + (!vlan_eth_type_set && vlan_present)) mnl_attr_put_u16 (nlh, vlan_present ? TCA_FLOWER_KEY_VLAN_ETH_TYPE : TCA_FLOWER_KEY_ETH_TYPE, RTE_BE16(ETH_P_IPV6)); - } eth_type_set = 1; vlan_eth_type_set = 1; if (mask.ipv6 == &flow_tcf_mask_empty.ipv6) @@ -3712,6 +3717,8 @@ override_na_vlan_priority: assert(na_flower); assert(na_flower_act); mnl_attr_nest_end(nlh, na_flower_act); + dev_flow->tcf.ptc_flags = mnl_attr_get_payload + (mnl_nlmsg_get_payload_tail(nlh)); mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ? 0 : TCA_CLS_FLAGS_SKIP_SW); mnl_attr_nest_end(nlh, na_flower); @@ -3730,10 +3737,6 @@ override_na_vlan_priority: * @param nlh * Message to send. This function always raises the NLM_F_ACK flag before * sending. - * @param[in] msglen - * Message length. Message buffer may contain multiple commands and - * nlmsg_len field not always corresponds to actual message length. - * If 0 specified the nlmsg_len field in header is used as message length. * @param[in] cb * Callback handler for received message. * @param[in] arg @@ -3745,144 +3748,1624 @@ override_na_vlan_priority: static int flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf, struct nlmsghdr *nlh, - uint32_t msglen, mnl_cb_t cb, void *arg) { unsigned int portid = mnl_socket_get_portid(tcf->nl); uint32_t seq = tcf->seq++; - int err, ret; + int ret, err = 0; assert(tcf->nl); assert(tcf->buf); - if (!seq) + if (!seq) { /* seq 0 is reserved for kernel event-driven notifications. */ seq = tcf->seq++; + } nlh->nlmsg_seq = seq; - if (!msglen) { - msglen = nlh->nlmsg_len; - nlh->nlmsg_flags |= NLM_F_ACK; + nlh->nlmsg_flags |= NLM_F_ACK; + ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len); + if (ret <= 0) { + /* Message send error occurres. */ + rte_errno = errno; + return -rte_errno; } - ret = mnl_socket_sendto(tcf->nl, nlh, msglen); - err = (ret <= 0) ? errno : 0; nlh = (struct nlmsghdr *)(tcf->buf); /* * The following loop postpones non-fatal errors until multipart * messages are complete. */ - if (ret > 0) - while (true) { - ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, - tcf->buf_size); + while (true) { + ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size); + if (ret < 0) { + err = errno; + /* + * In case of overflow Will receive till + * end of multipart message. We may lost part + * of reply messages but mark and return an error. + */ + if (err != ENOSPC || + !(nlh->nlmsg_flags & NLM_F_MULTI) || + nlh->nlmsg_type == NLMSG_DONE) + break; + } else { + ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg); + if (!ret) { + /* + * libmnl returns 0 if DONE or + * success ACK message found. + */ + break; + } if (ret < 0) { + /* + * ACK message with error found + * or some error occurred. + */ err = errno; - if (err != ENOSPC) - break; - } - if (!err) { - ret = mnl_cb_run(nlh, ret, seq, portid, - cb, arg); - if (ret < 0) { - err = errno; - break; - } - } - /* Will receive till end of multipart message */ - if (!(nlh->nlmsg_flags & NLM_F_MULTI) || - nlh->nlmsg_type == NLMSG_DONE) break; + } + /* We should continue receiving. */ } + } if (!err) return 0; rte_errno = err; return -err; } +#define MNL_BUF_EXTRA_SPACE 16 +#define MNL_REQUEST_SIZE_MIN 256 +#define MNL_REQUEST_SIZE_MAX 2048 +#define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \ + MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX) + +/* Data structures used by flow_tcf_xxx_cb() routines. */ +struct tcf_nlcb_buf { + LIST_ENTRY(tcf_nlcb_buf) next; + uint32_t size; + alignas(struct nlmsghdr) + uint8_t msg[]; /**< Netlink message data. */ +}; + +struct tcf_nlcb_context { + unsigned int ifindex; /**< Base interface index. */ + uint32_t bufsize; + LIST_HEAD(, tcf_nlcb_buf) nlbuf; +}; + /** - * Apply flow to E-Switch by sending Netlink message. + * Allocate space for netlink command in buffer list * - * @param[in] dev - * Pointer to Ethernet device. - * @param[in, out] flow - * Pointer to the sub flow. + * @param[in, out] ctx + * Pointer to callback context with command buffers list. + * @param[in] size + * Required size of data buffer to be allocated. + * + * @return + * Pointer to allocated memory, aligned as message header. + * NULL if some error occurred. + */ +static struct nlmsghdr * +flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size) +{ + struct tcf_nlcb_buf *buf; + struct nlmsghdr *nlh; + + size = NLMSG_ALIGN(size); + buf = LIST_FIRST(&ctx->nlbuf); + if (buf && (buf->size + size) <= ctx->bufsize) { + nlh = (struct nlmsghdr *)&buf->msg[buf->size]; + buf->size += size; + return nlh; + } + if (size > ctx->bufsize) { + DRV_LOG(WARNING, "netlink: too long command buffer requested"); + return NULL; + } + buf = rte_malloc(__func__, + ctx->bufsize + sizeof(struct tcf_nlcb_buf), + alignof(struct tcf_nlcb_buf)); + if (!buf) { + DRV_LOG(WARNING, "netlink: no memory for command buffer"); + return NULL; + } + LIST_INSERT_HEAD(&ctx->nlbuf, buf, next); + buf->size = size; + nlh = (struct nlmsghdr *)&buf->msg[0]; + return nlh; +} + +/** + * Send the buffers with prepared netlink commands. Scans the list and + * sends all found buffers. Buffers are sent and freed anyway in order + * to prevent memory leakage if some every message in received packet. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in, out] ctx + * Pointer to callback context with command buffers list. + * + * @return + * Zero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf, + struct tcf_nlcb_context *ctx) +{ + struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf); + int ret = 0; + + while (bc) { + struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next); + struct nlmsghdr *nlh; + uint32_t msg = 0; + int rc; + + while (msg < bc->size) { + /* + * Send Netlink commands from buffer in one by one + * fashion. If we send multiple rule deletion commands + * in one Netlink message and some error occurs it may + * cause multiple ACK error messages and break sequence + * numbers of Netlink communication, because we expect + * the only one ACK reply. + */ + assert((bc->size - msg) >= sizeof(struct nlmsghdr)); + nlh = (struct nlmsghdr *)&bc->msg[msg]; + assert((bc->size - msg) >= nlh->nlmsg_len); + msg += nlh->nlmsg_len; + rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL); + if (rc) { + DRV_LOG(WARNING, + "netlink: cleanup error %d", rc); + if (!ret) + ret = rc; + } + } + rte_free(bc); + bc = bn; + } + LIST_INIT(&ctx->nlbuf); + return ret; +} + +/** + * Collect local IP address rules with scope link attribute on specified + * network device. This is callback routine called by libmnl mnl_cb_run() + * in loop for every message in received packet. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Opaque data pointer for this callback. + * + * @return + * A positive, nonzero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_context *ctx = arg; + struct nlmsghdr *cmd; + struct ifaddrmsg *ifa; + struct nlattr *na; + struct nlattr *na_local = NULL; + struct nlattr *na_peer = NULL; + unsigned char family; + uint32_t size; + + if (nlh->nlmsg_type != RTM_NEWADDR) { + rte_errno = EINVAL; + return -rte_errno; + } + ifa = mnl_nlmsg_get_payload(nlh); + family = ifa->ifa_family; + if (ifa->ifa_index != ctx->ifindex || + ifa->ifa_scope != RT_SCOPE_LINK || + !(ifa->ifa_flags & IFA_F_PERMANENT) || + (family != AF_INET && family != AF_INET6)) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*ifa)) { + switch (mnl_attr_get_type(na)) { + case IFA_LOCAL: + na_local = na; + break; + case IFA_ADDRESS: + na_peer = na; + break; + } + if (na_local && na_peer) + break; + } + if (!na_local || !na_peer) + return 1; + /* Local rule found with scope link, permanent and assigned peer. */ + size = MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct ifaddrmsg)) + + (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) + : 2 * SZ_NLATTR_TYPE_OF(uint32_t)); + cmd = flow_tcf_alloc_nlcmd(ctx, size); + if (!cmd) { + rte_errno = ENOMEM; + return -rte_errno; + } + cmd = mnl_nlmsg_put_header(cmd); + cmd->nlmsg_type = RTM_DELADDR; + cmd->nlmsg_flags = NLM_F_REQUEST; + ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa)); + ifa->ifa_flags = IFA_F_PERMANENT; + ifa->ifa_scope = RT_SCOPE_LINK; + ifa->ifa_index = ctx->ifindex; + if (family == AF_INET) { + ifa->ifa_family = AF_INET; + ifa->ifa_prefixlen = 32; + mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local)); + mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer)); + } else { + ifa->ifa_family = AF_INET6; + ifa->ifa_prefixlen = 128; + mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN, + mnl_attr_get_payload(na_local)); + mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN, + mnl_attr_get_payload(na_peer)); + } + assert(size == cmd->nlmsg_len); + return 1; +} + +/** + * Cleanup the local IP addresses on outer interface. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifindex + * Network inferface index to perform cleanup. + */ +static void +flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf, + unsigned int ifindex) +{ + struct nlmsghdr *nlh; + struct ifaddrmsg *ifa; + struct tcf_nlcb_context ctx = { + .ifindex = ifindex, + .bufsize = MNL_REQUEST_SIZE, + .nlbuf = LIST_HEAD_INITIALIZER(), + }; + int ret; + + assert(ifindex); + /* + * Seek and destroy leftovers of local IP addresses with + * matching properties "scope link". + */ + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETADDR; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa)); + ifa->ifa_family = AF_UNSPEC; + ifa->ifa_index = ifindex; + ifa->ifa_scope = RT_SCOPE_LINK; + ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: query device list error %d", ret); + ret = flow_tcf_send_nlcmd(tcf, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: device delete error %d", ret); +} + +/** + * Collect neigh permament rules on specified network device. + * This is callback routine called by libmnl mnl_cb_run() in loop for + * every message in received packet. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Opaque data pointer for this callback. + * + * @return + * A positive, nonzero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_context *ctx = arg; + struct nlmsghdr *cmd; + struct ndmsg *ndm; + struct nlattr *na; + struct nlattr *na_ip = NULL; + struct nlattr *na_mac = NULL; + unsigned char family; + uint32_t size; + + if (nlh->nlmsg_type != RTM_NEWNEIGH) { + rte_errno = EINVAL; + return -rte_errno; + } + ndm = mnl_nlmsg_get_payload(nlh); + family = ndm->ndm_family; + if (ndm->ndm_ifindex != (int)ctx->ifindex || + !(ndm->ndm_state & NUD_PERMANENT) || + (family != AF_INET && family != AF_INET6)) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*ndm)) { + switch (mnl_attr_get_type(na)) { + case NDA_DST: + na_ip = na; + break; + case NDA_LLADDR: + na_mac = na; + break; + } + if (na_mac && na_ip) + break; + } + if (!na_mac || !na_ip) + return 1; + /* Neigh rule with permenent attribute found. */ + size = MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct ndmsg)) + + SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) + + (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) + : SZ_NLATTR_TYPE_OF(uint32_t)); + cmd = flow_tcf_alloc_nlcmd(ctx, size); + if (!cmd) { + rte_errno = ENOMEM; + return -rte_errno; + } + cmd = mnl_nlmsg_put_header(cmd); + cmd->nlmsg_type = RTM_DELNEIGH; + cmd->nlmsg_flags = NLM_F_REQUEST; + ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm)); + ndm->ndm_ifindex = ctx->ifindex; + ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + if (family == AF_INET) { + ndm->ndm_family = AF_INET; + mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip)); + } else { + ndm->ndm_family = AF_INET6; + mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN, + mnl_attr_get_payload(na_ip)); + } + mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN, + mnl_attr_get_payload(na_mac)); + assert(size == cmd->nlmsg_len); + return 1; +} + +/** + * Cleanup the neigh rules on outer interface. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifindex + * Network inferface index to perform cleanup. + */ +static void +flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf, + unsigned int ifindex) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + struct tcf_nlcb_context ctx = { + .ifindex = ifindex, + .bufsize = MNL_REQUEST_SIZE, + .nlbuf = LIST_HEAD_INITIALIZER(), + }; + int ret; + + assert(ifindex); + /* Seek and destroy leftovers of neigh rules. */ + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETNEIGH; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm)); + ndm->ndm_family = AF_UNSPEC; + ndm->ndm_ifindex = ifindex; + ndm->ndm_state = NUD_PERMANENT; + ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: query device list error %d", ret); + ret = flow_tcf_send_nlcmd(tcf, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: device delete error %d", ret); +} + +/** + * Collect indices of VXLAN encap/decap interfaces associated with device. + * This is callback routine called by libmnl mnl_cb_run() in loop for + * every message in received packet. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Opaque data pointer for this callback. + * + * @return + * A positive, nonzero value on success, negative errno value otherwise + * and rte_errno is set. + */ +static int +flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_context *ctx = arg; + struct nlmsghdr *cmd; + struct ifinfomsg *ifm; + struct nlattr *na; + struct nlattr *na_info = NULL; + struct nlattr *na_vxlan = NULL; + bool found = false; + unsigned int vxindex; + uint32_t size; + + if (nlh->nlmsg_type != RTM_NEWLINK) { + rte_errno = EINVAL; + return -rte_errno; + } + ifm = mnl_nlmsg_get_payload(nlh); + if (!ifm->ifi_index) { + rte_errno = EINVAL; + return -rte_errno; + } + mnl_attr_for_each(na, nlh, sizeof(*ifm)) + if (mnl_attr_get_type(na) == IFLA_LINKINFO) { + na_info = na; + break; + } + if (!na_info) + return 1; + mnl_attr_for_each_nested(na, na_info) { + switch (mnl_attr_get_type(na)) { + case IFLA_INFO_KIND: + if (!strncmp("vxlan", mnl_attr_get_str(na), + mnl_attr_get_len(na))) + found = true; + break; + case IFLA_INFO_DATA: + na_vxlan = na; + break; + } + if (found && na_vxlan) + break; + } + if (!found || !na_vxlan) + return 1; + found = false; + mnl_attr_for_each_nested(na, na_vxlan) { + if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK && + mnl_attr_get_u32(na) == ctx->ifindex) { + found = true; + break; + } + } + if (!found) + return 1; + /* Attached VXLAN device found, store the command to delete. */ + vxindex = ifm->ifi_index; + size = MNL_ALIGN(sizeof(struct nlmsghdr)) + + MNL_ALIGN(sizeof(struct ifinfomsg)); + cmd = flow_tcf_alloc_nlcmd(ctx, size); + if (!cmd) { + rte_errno = ENOMEM; + return -rte_errno; + } + cmd = mnl_nlmsg_put_header(cmd); + cmd->nlmsg_type = RTM_DELLINK; + cmd->nlmsg_flags = NLM_F_REQUEST; + ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_index = vxindex; + assert(size == cmd->nlmsg_len); + return 1; +} + +/** + * Cleanup the outer interface. Removes all found vxlan devices + * attached to specified index, flushes the meigh and local IP + * datavase. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifindex + * Network inferface index to perform cleanup. + */ +static void +flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf, + unsigned int ifindex) +{ + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + struct tcf_nlcb_context ctx = { + .ifindex = ifindex, + .bufsize = MNL_REQUEST_SIZE, + .nlbuf = LIST_HEAD_INITIALIZER(), + }; + int ret; + + assert(ifindex); + /* + * Seek and destroy leftover VXLAN encap/decap interfaces with + * matching properties. + */ + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: query device list error %d", ret); + ret = flow_tcf_send_nlcmd(tcf, &ctx); + if (ret) + DRV_LOG(WARNING, "netlink: device delete error %d", ret); +} + +/** + * Emit Netlink message to add/remove local address to the outer device. + * The address being added is visible within the link only (scope link). + * + * Note that an implicit route is maintained by the kernel due to the + * presence of a peer address (IFA_ADDRESS). + * + * These rules are used for encapsultion only and allow to assign + * the outer tunnel source IP address. + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] encap + * Encapsulation properties (source address and its peer). + * @param[in] ifindex + * Network interface to apply rule. + * @param[in] enable + * Toggle between add and remove. * @param[out] error - * Pointer to the error structure. + * Perform verbose error reporting if not NULL. * * @return - * 0 on success, a negative errno value otherwise and rte_ernno is set. + * 0 on success, a negative errno value otherwise and rte_errno is set. */ static int -flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, - struct rte_flow_error *error) +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf, + const struct flow_tcf_vxlan_encap *encap, + unsigned int ifindex, + bool enable, + struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - struct mlx5_flow_tcf_context *ctx = priv->tcf_context; - struct mlx5_flow *dev_flow; struct nlmsghdr *nlh; + struct ifaddrmsg *ifa; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)]; - dev_flow = LIST_FIRST(&flow->dev_flows); - /* E-Switch flow can't be expanded. */ - assert(!LIST_NEXT(dev_flow, next)); - nlh = dev_flow->tcf.nlh; - nlh->nlmsg_type = RTM_NEWTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; - if (!flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR; + nlh->nlmsg_flags = + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0); + nlh->nlmsg_seq = 0; + ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa)); + ifa->ifa_flags = IFA_F_PERMANENT; + ifa->ifa_scope = RT_SCOPE_LINK; + ifa->ifa_index = ifindex; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { + ifa->ifa_family = AF_INET; + ifa->ifa_prefixlen = 32; + mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src); + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) + mnl_attr_put_u32(nlh, IFA_ADDRESS, + encap->ipv4.dst); + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); + ifa->ifa_family = AF_INET6; + ifa->ifa_prefixlen = 128; + mnl_attr_put(nlh, IFA_LOCAL, + sizeof(encap->ipv6.src), + &encap->ipv6.src); + if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST) + mnl_attr_put(nlh, IFA_ADDRESS, + sizeof(encap->ipv6.dst), + &encap->ipv6.dst); + } + if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL)) return 0; return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, - "netlink: failed to create TC flow rule"); + "netlink: cannot complete IFA request" + " (ip addr add)"); } /** - * Remove flow from E-Switch by sending Netlink message. + * Emit Netlink message to add/remove neighbor. * - * @param[in] dev - * Pointer to Ethernet device. - * @param[in, out] flow - * Pointer to the sub flow. + * @param[in] tcf + * Libmnl socket context object. + * @param[in] encap + * Encapsulation properties (destination address). + * @param[in] ifindex + * Network interface. + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static void -flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +static int +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf, + const struct flow_tcf_vxlan_encap *encap, + unsigned int ifindex, + bool enable, + struct rte_flow_error *error) { - struct priv *priv = dev->data->dev_private; - struct mlx5_flow_tcf_context *ctx = priv->tcf_context; - struct mlx5_flow *dev_flow; struct nlmsghdr *nlh; + struct ndmsg *ndm; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)]; - if (!flow) - return; - if (flow->counter) { - if (--flow->counter->ref_cnt == 0) { - rte_free(flow->counter); - flow->counter = NULL; - } + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH; + nlh->nlmsg_flags = + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0); + nlh->nlmsg_seq = 0; + ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm)); + ndm->ndm_ifindex = ifindex; + ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { + ndm->ndm_family = AF_INET; + mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst); + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); + ndm->ndm_family = AF_INET6; + mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst), + &encap->ipv6.dst); } - dev_flow = LIST_FIRST(&flow->dev_flows); - if (!dev_flow) - return; - /* E-Switch flow can't be expanded. */ - assert(!LIST_NEXT(dev_flow, next)); - nlh = dev_flow->tcf.nlh; - nlh->nlmsg_type = RTM_DELTFILTER; - nlh->nlmsg_flags = NLM_F_REQUEST; - flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL); + if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable) + DRV_LOG(WARNING, + "outer ethernet source address cannot be " + "forced for VXLAN encapsulation"); + if (encap->mask & FLOW_TCF_ENCAP_ETH_DST) + mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst), + &encap->eth.dst); + if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL)) + return 0; + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: cannot complete ND request" + " (ip neigh)"); } /** - * Remove flow from E-Switch and release resources of the device flow. + * Manage the local IP addresses and their peers IP addresses on the + * outer interface for encapsulation purposes. The kernel searches the + * appropriate device for tunnel egress traffic using the outer source + * IP, this IP should be assigned to the outer network device, otherwise + * kernel rejects the rule. * - * @param[in] dev - * Pointer to Ethernet device. - * @param[in, out] flow - * Pointer to the sub flow. + * Adds or removes the addresses using the Netlink command like this: + * ip addr add peer scope link dev + * + * The addresses are local to the netdev ("scope link"), this reduces + * the risk of conflicts. Note that an implicit route is maintained by + * the kernel due to the presence of a peer address (IFA_ADDRESS). + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] vtep + * VTEP object, contains rule database and ifouter index. + * @param[in] dev_flow + * Flow object, contains the tunnel parameters (for encap only). + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. */ -static void -flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +static int +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep, + struct mlx5_flow *dev_flow, + bool enable, + struct rte_flow_error *error) { - struct mlx5_flow *dev_flow; + const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap; + struct tcf_local_rule *rule; + bool found = false; + int ret; - if (!flow) - return; - flow_tcf_remove(dev, flow); + assert(encap); + assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP); + if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { + assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST); + LIST_FOREACH(rule, &vtep->local, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC && + encap->ipv4.src == rule->ipv4.src && + encap->ipv4.dst == rule->ipv4.dst) { + found = true; + break; + } + } + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); + LIST_FOREACH(rule, &vtep->local, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC && + !memcmp(&encap->ipv6.src, &rule->ipv6.src, + sizeof(encap->ipv6.src)) && + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, + sizeof(encap->ipv6.dst))) { + found = true; + break; + } + } + } + if (found) { + if (enable) { + rule->refcnt++; + return 0; + } + if (!rule->refcnt || !--rule->refcnt) { + LIST_REMOVE(rule, next); + return flow_tcf_rule_local(tcf, encap, + vtep->ifouter, false, error); + } + return 0; + } + if (!enable) { + DRV_LOG(WARNING, "disabling not existing local rule"); + rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "disabling not existing local rule"); + return -ENOENT; + } + rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule), + alignof(struct tcf_local_rule)); + if (!rule) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for local rule"); + return -rte_errno; + } + *rule = (struct tcf_local_rule){.refcnt = 0, + .mask = 0, + }; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) { + rule->mask = FLOW_TCF_ENCAP_IPV4_SRC + | FLOW_TCF_ENCAP_IPV4_DST; + rule->ipv4.src = encap->ipv4.src; + rule->ipv4.dst = encap->ipv4.dst; + } else { + rule->mask = FLOW_TCF_ENCAP_IPV6_SRC + | FLOW_TCF_ENCAP_IPV6_DST; + memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN); + memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN); + } + ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error); + if (ret) { + rte_free(rule); + return ret; + } + rule->refcnt++; + LIST_INSERT_HEAD(&vtep->local, rule, next); + return 0; +} + +/** + * Manage the destination MAC/IP addresses neigh database, kernel uses + * this one to determine the destination MAC address within encapsulation + * header. Adds or removes the entries using the Netlink command like this: + * ip neigh add dev lladdr to nud permanent + * + * @param[in] tcf + * Libmnl socket context object. + * @param[in] vtep + * VTEP object, contains rule database and ifouter index. + * @param[in] dev_flow + * Flow object, contains the tunnel parameters (for encap only). + * @param[in] enable + * Toggle between add and remove. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ +static int +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep, + struct mlx5_flow *dev_flow, + bool enable, + struct rte_flow_error *error) +{ + const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap; + struct tcf_neigh_rule *rule; + bool found = false; + int ret; + + assert(encap); + assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP); + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { + assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC); + LIST_FOREACH(rule, &vtep->neigh, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST && + encap->ipv4.dst == rule->ipv4.dst) { + found = true; + break; + } + } + } else { + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC); + assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST); + LIST_FOREACH(rule, &vtep->neigh, next) { + if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST && + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, + sizeof(encap->ipv6.dst))) { + found = true; + break; + } + } + } + if (found) { + if (memcmp(&encap->eth.dst, &rule->eth, + sizeof(encap->eth.dst))) { + DRV_LOG(WARNING, "Destination MAC differs" + " in neigh rule"); + rte_flow_error_set(error, EEXIST, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, "Different MAC address" + " neigh rule for the same" + " destination IP"); + return -EEXIST; + } + if (enable) { + rule->refcnt++; + return 0; + } + if (!rule->refcnt || !--rule->refcnt) { + LIST_REMOVE(rule, next); + return flow_tcf_rule_neigh(tcf, encap, + vtep->ifouter, + false, error); + } + return 0; + } + if (!enable) { + DRV_LOG(WARNING, "Disabling not existing neigh rule"); + rte_flow_error_set(error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for neigh rule"); + return -ENOENT; + } + rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule), + alignof(struct tcf_neigh_rule)); + if (!rule) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for neigh rule"); + return -rte_errno; + } + *rule = (struct tcf_neigh_rule){.refcnt = 0, + .mask = 0, + }; + if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) { + rule->mask = FLOW_TCF_ENCAP_IPV4_DST; + rule->ipv4.dst = encap->ipv4.dst; + } else { + rule->mask = FLOW_TCF_ENCAP_IPV6_DST; + memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN); + } + memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth)); + ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error); + if (ret) { + rte_free(rule); + return ret; + } + rule->refcnt++; + LIST_INSERT_HEAD(&vtep->neigh, rule, next); + return 0; +} + +/* VTEP device list is shared between PMD port instances. */ +static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER(); +static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER; + +/** + * Deletes VTEP network device. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] vtep + * Object represinting the network device to delete. Memory + * allocated for this object is freed by routine. + */ +static void +flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep) +{ + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) + + MNL_BUF_EXTRA_SPACE]; + int ret; + + assert(!vtep->refcnt); + /* Delete only ifaces those we actually created. */ + if (vtep->created && vtep->ifindex) { + DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex); + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_DELLINK; + nlh->nlmsg_flags = NLM_F_REQUEST; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_index = vtep->ifindex; + assert(sizeof(buf) >= nlh->nlmsg_len); + ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL); + if (ret) + DRV_LOG(WARNING, "netlink: error deleting vxlan" + " encap/decap ifindex %u", + ifm->ifi_index); + } + rte_free(vtep); +} + +/** + * Creates VTEP network device. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Outer interface to attach new-created VXLAN device + * If zero the VXLAN device will not be attached to any device. + * These VTEPs are used for decapsulation and can be precreated + * and shared between processes. + * @param[in] port + * UDP port of created VTEP device. + * @param[out] error + * Perform verbose error reporting if not NULL. + * + * @return + * Pointer to created device structure on success, + * NULL otherwise and rte_errno is set. + */ +#ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA +static struct tcf_vtep* +flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + uint16_t port, struct rte_flow_error *error) +{ + struct tcf_vtep *vtep; + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24]; + alignas(struct nlmsghdr) + uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) + + SZ_NLATTR_DATA_OF(sizeof(name)) + + SZ_NLATTR_NEST * 2 + + SZ_NLATTR_STRZ_OF("vxlan") + + SZ_NLATTR_DATA_OF(sizeof(uint32_t)) + + SZ_NLATTR_DATA_OF(sizeof(uint16_t)) + + SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 + + MNL_BUF_EXTRA_SPACE]; + struct nlattr *na_info; + struct nlattr *na_vxlan; + rte_be16_t vxlan_port = rte_cpu_to_be_16(port); + int ret; + + vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep)); + if (!vtep) { + rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unable to allocate memory for VTEP"); + return NULL; + } + *vtep = (struct tcf_vtep){ + .port = port, + .local = LIST_HEAD_INITIALIZER(), + .neigh = LIST_HEAD_INITIALIZER(), + }; + memset(buf, 0, sizeof(buf)); + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_type = 0; + ifm->ifi_index = 0; + ifm->ifi_flags = IFF_UP; + ifm->ifi_change = 0xffffffff; + snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port); + mnl_attr_put_strz(nlh, IFLA_IFNAME, name); + na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO); + assert(na_info); + mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan"); + na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA); + if (ifouter) + mnl_attr_put_u32(nlh, IFLA_VXLAN_LINK, ifouter); + assert(na_vxlan); + mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1); + mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1); + mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0); + mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port); + mnl_attr_nest_end(nlh, na_vxlan); + mnl_attr_nest_end(nlh, na_info); + assert(sizeof(buf) >= nlh->nlmsg_len); + ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL); + if (ret) { + DRV_LOG(WARNING, + "netlink: VTEP %s create failure (%d)", + name, rte_errno); + if (rte_errno != EEXIST || ifouter) + /* + * Some unhandled error occurred or device is + * for encapsulation and cannot be shared. + */ + goto error; + } else { + /* + * Mark device we actually created. + * We should explicitly delete + * when we do not need it anymore. + */ + vtep->created = 1; + } + /* Try to get ifindex of created of pre-existing device. */ + ret = if_nametoindex(name); + if (!ret) { + DRV_LOG(WARNING, + "VTEP %s failed to get index (%d)", name, errno); + rte_flow_error_set + (error, -errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to retrieve VTEP ifindex"); + goto error; + } + vtep->ifindex = ret; + vtep->ifouter = ifouter; + memset(buf, 0, sizeof(buf)); + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST; + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); + ifm->ifi_family = AF_UNSPEC; + ifm->ifi_type = 0; + ifm->ifi_index = vtep->ifindex; + ifm->ifi_flags = IFF_UP; + ifm->ifi_change = IFF_UP; + ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL); + if (ret) { + rte_flow_error_set(error, -errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to set VTEP link up"); + DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)", + name, rte_errno); + goto clean; + } + ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error); + if (ret) { + DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno); + goto clean; + } + DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex); + vtep->refcnt = 1; + return vtep; +clean: + flow_tcf_vtep_delete(tcf, vtep); + return NULL; +error: + rte_free(vtep); + return NULL; +} +#else +static struct tcf_vtep* +flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf __rte_unused, + unsigned int ifouter __rte_unused, + uint16_t port __rte_unused, + struct rte_flow_error *error) +{ + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create VTEP, " + "vxlan metadata are not supported by kernel"); + return NULL; +} +#endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */ + +/** + * Acquire target interface index for VXLAN tunneling decapsulation. + * In order to share the UDP port within the other interfaces the + * VXLAN device created as not attached to any interface (if created). + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Interface descriptor pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_vtep* +flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct tcf_vtep *vtep; + uint16_t port = dev_flow->tcf.vxlan_decap->udp_port; + + LIST_FOREACH(vtep, &vtep_list_vxlan, next) { + if (vtep->port == port) + break; + } + if (vtep && vtep->ifouter) { + rte_flow_error_set(error, -errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "Failed to create decap VTEP with specified" + " UDP port, atatched device exists"); + return NULL; + } + if (vtep) { + /* Device exists, just increment the reference counter. */ + vtep->refcnt++; + assert(vtep->ifindex); + return vtep; + } + /* No decapsulation device exists, try to create the new one. */ + vtep = flow_tcf_vtep_create(tcf, 0, port, error); + if (vtep) + LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next); + return vtep; +} + +/** + * Aqcuire target interface index for VXLAN tunneling encapsulation. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Network interface index to attach VXLAN encap device to. + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Interface descriptor pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_vtep* +flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + struct mlx5_flow *dev_flow __rte_unused, + struct rte_flow_error *error) +{ + static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1; + struct tcf_vtep *vtep; + int ret; + + assert(ifouter); + /* Look whether the attached VTEP for encap is created. */ + LIST_FOREACH(vtep, &vtep_list_vxlan, next) { + if (vtep->ifouter == ifouter) + break; + } + if (vtep) { + /* VTEP already exists, just increment the reference. */ + vtep->refcnt++; + } else { + uint16_t pcnt; + + /* Not found, we should create the new attached VTEP. */ + flow_tcf_encap_iface_cleanup(tcf, ifouter); + flow_tcf_encap_local_cleanup(tcf, ifouter); + flow_tcf_encap_neigh_cleanup(tcf, ifouter); + for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_MAX + - MLX5_VXLAN_PORT_MIN); pcnt++) { + encap_port++; + /* Wraparound the UDP port index. */ + if (encap_port < MLX5_VXLAN_PORT_MIN || + encap_port > MLX5_VXLAN_PORT_MAX) + encap_port = MLX5_VXLAN_PORT_MIN; + /* Check whether UDP port is in already in use. */ + LIST_FOREACH(vtep, &vtep_list_vxlan, next) { + if (vtep->port == encap_port) + break; + } + if (vtep) { + /* Port is in use, try the next one. */ + vtep = NULL; + continue; + } + vtep = flow_tcf_vtep_create(tcf, ifouter, + encap_port, error); + if (vtep) { + LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next); + break; + } + if (rte_errno != EEXIST) + break; + } + if (!vtep) + return NULL; + } + assert(vtep->ifouter == ifouter); + assert(vtep->ifindex); + /* Create local ipaddr with peer to specify the outer IPs. */ + ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error); + if (!ret) { + /* Create neigh rule to specify outer destination MAC. */ + ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error); + if (ret) + flow_tcf_encap_local(tcf, vtep, + dev_flow, false, error); + } + if (ret) { + if (--vtep->refcnt == 0) + flow_tcf_vtep_delete(tcf, vtep); + return NULL; + } + return vtep; +} + +/** + * Acquires target interface index for tunneling of any type. + * Creates the new VTEP if needed. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] ifouter + * Network interface index to attach VXLAN encap device to. + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + * @param[out] error + * Perform verbose error reporting if not NULL. + * @return + * Interface descriptor pointer on success, + * NULL otherwise and rte_errno is set. + */ +static struct tcf_vtep* +flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf, + unsigned int ifouter, + struct mlx5_flow *dev_flow, + struct rte_flow_error *error) +{ + struct tcf_vtep *vtep = NULL; + + assert(dev_flow->tcf.tunnel); + pthread_mutex_lock(&vtep_list_mutex); + switch (dev_flow->tcf.tunnel->type) { + case FLOW_TCF_TUNACT_VXLAN_ENCAP: + vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter, + dev_flow, error); + break; + case FLOW_TCF_TUNACT_VXLAN_DECAP: + vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error); + break; + default: + rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "unsupported tunnel type"); + break; + } + pthread_mutex_unlock(&vtep_list_mutex); + return vtep; +} + +/** + * Release tunneling interface by ifindex. Decrements reference + * counter and actually removes the device if counter is zero. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] vtep + * VTEP device descriptor structure. + * @param[in] dev_flow + * Flow tcf object with tunnel structure pointer set. + */ +static void +flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf, + struct tcf_vtep *vtep, + struct mlx5_flow *dev_flow) +{ + assert(dev_flow->tcf.tunnel); + pthread_mutex_lock(&vtep_list_mutex); + switch (dev_flow->tcf.tunnel->type) { + case FLOW_TCF_TUNACT_VXLAN_DECAP: + break; + case FLOW_TCF_TUNACT_VXLAN_ENCAP: + /* Remove the encap ancillary rules first. */ + flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL); + flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL); + break; + default: + assert(false); + DRV_LOG(WARNING, "Unsupported tunnel type"); + break; + } + assert(vtep->refcnt); + if (--vtep->refcnt == 0) { + LIST_REMOVE(vtep, next); + flow_tcf_vtep_delete(tcf, vtep); + } + pthread_mutex_unlock(&vtep_list_mutex); +} + +struct tcf_nlcb_query { + uint32_t handle; + uint32_t tc_flags; + uint32_t flags_valid:1; +}; + +/** + * Collect queried rule attributes. This is callback routine called by + * libmnl mnl_cb_run() in loop for every message in received packet. + * Current implementation collects the flower flags only. + * + * @param[in] nlh + * Pointer to reply header. + * @param[in, out] arg + * Context pointer for this callback. + * + * @return + * A positive, nonzero value on success (required by libmnl + * to continue messages processing). + */ +static int +flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg) +{ + struct tcf_nlcb_query *query = arg; + struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh); + struct nlattr *na, *na_opt; + bool flower = false; + + if (nlh->nlmsg_type != RTM_NEWTFILTER || + tcm->tcm_handle != query->handle) + return 1; + mnl_attr_for_each(na, nlh, sizeof(*tcm)) { + switch (mnl_attr_get_type(na)) { + case TCA_KIND: + if (strcmp(mnl_attr_get_payload(na), "flower")) { + /* Not flower filter, drop entire message. */ + return 1; + } + flower = true; + break; + case TCA_OPTIONS: + if (!flower) { + /* Not flower options, drop entire message. */ + return 1; + } + /* Check nested flower options. */ + mnl_attr_for_each_nested(na_opt, na) { + switch (mnl_attr_get_type(na_opt)) { + case TCA_FLOWER_FLAGS: + query->flags_valid = 1; + query->tc_flags = + mnl_attr_get_u32(na_opt); + break; + } + } + break; + } + } + return 1; +} + +/** + * Query a TC flower rule flags via netlink. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] dev_flow + * Pointer to the flow. + * @param[out] pflags + * pointer to the data retrieved by the query. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf, + struct mlx5_flow *dev_flow, + uint32_t *pflags) +{ + struct nlmsghdr *nlh; + struct tcmsg *tcm; + struct tcf_nlcb_query query = { + .handle = dev_flow->tcf.tcm->tcm_handle, + }; + + nlh = mnl_nlmsg_put_header(tcf->buf); + nlh->nlmsg_type = RTM_GETTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); + memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm)); + /* + * Ignore Netlink error for filter query operations. + * The reply length is sent by kernel as errno. + * Just check we got the flags option. + */ + flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query); + if (!query.flags_valid) { + *pflags = 0; + return -ENOENT; + } + *pflags = query.tc_flags; + return 0; +} + +/** + * Query and check the in_hw set for specified rule. + * + * @param[in] tcf + * Context object initialized by mlx5_flow_tcf_context_create(). + * @param[in] dev_flow + * Pointer to the flow to check. + * + * @return + * 0 on success, a negative errno value otherwise. + */ +static int +flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf, + struct mlx5_flow *dev_flow) +{ + uint32_t flags; + int ret; + + ret = flow_tcf_query_flags(tcf, dev_flow, &flags); + if (ret) + return ret; + return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT; +} + +/** + * Remove flow from E-Switch by sending Netlink message. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + */ +static void +flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_tcf_context *ctx = priv->tcf_context; + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + + if (!flow) + return; + dev_flow = LIST_FIRST(&flow->dev_flows); + if (!dev_flow) + return; + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + if (dev_flow->tcf.applied) { + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_DELTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST; + flow_tcf_nl_ack(ctx, nlh, NULL, NULL); + if (dev_flow->tcf.tunnel) { + assert(dev_flow->tcf.tunnel->vtep); + flow_tcf_vtep_release(ctx, + dev_flow->tcf.tunnel->vtep, + dev_flow); + dev_flow->tcf.tunnel->vtep = NULL; + } + dev_flow->tcf.applied = 0; + } +} + +/** + * Apply flow to E-Switch by sending Netlink message. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + * @param[out] error + * Pointer to the error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_ernno is set. + */ +static int +flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct priv *priv = dev->data->dev_private; + struct mlx5_flow_tcf_context *ctx = priv->tcf_context; + struct mlx5_flow *dev_flow; + struct nlmsghdr *nlh; + + dev_flow = LIST_FIRST(&flow->dev_flows); + /* E-Switch flow can't be expanded. */ + assert(!LIST_NEXT(dev_flow, next)); + if (dev_flow->tcf.applied) + return 0; + nlh = dev_flow->tcf.nlh; + nlh->nlmsg_type = RTM_NEWTFILTER; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + if (dev_flow->tcf.tunnel) { + /* + * Replace the interface index, target for + * encapsulation, source for decapsulation. + */ + assert(!dev_flow->tcf.tunnel->vtep); + assert(dev_flow->tcf.tunnel->ifindex_ptr); + /* Acquire actual VTEP device when rule is being applied. */ + dev_flow->tcf.tunnel->vtep = + flow_tcf_vtep_acquire(ctx, + dev_flow->tcf.tunnel->ifindex_org, + dev_flow, error); + if (!dev_flow->tcf.tunnel->vtep) + return -rte_errno; + DRV_LOG(INFO, "Replace ifindex: %d->%d", + dev_flow->tcf.tunnel->vtep->ifindex, + dev_flow->tcf.tunnel->ifindex_org); + *dev_flow->tcf.tunnel->ifindex_ptr = + dev_flow->tcf.tunnel->vtep->ifindex; + } + if (!flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) { + dev_flow->tcf.applied = 1; + if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW) + return 0; + /* + * Rule was applied without skip_sw flag set. + * We should check whether the rule was acctually + * accepted by hardware (have look at in_hw flag). + */ + if (flow_tcf_check_inhw(ctx, dev_flow)) { + flow_tcf_remove(dev, flow); + return rte_flow_error_set + (error, ENOENT, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: rule has no in_hw flag set"); + } + return 0; + } + if (dev_flow->tcf.tunnel) { + /* Rollback the VTEP configuration if rule apply failed. */ + assert(dev_flow->tcf.tunnel->vtep); + flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep, + dev_flow); + dev_flow->tcf.tunnel->vtep = NULL; + } + return rte_flow_error_set(error, rte_errno, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "netlink: failed to create TC flow rule"); +} + +/** + * Remove flow from E-Switch and release resources of the device flow. + * + * @param[in] dev + * Pointer to Ethernet device. + * @param[in, out] flow + * Pointer to the sub flow. + */ +static void +flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) +{ + struct mlx5_flow *dev_flow; + + if (!flow) + return; + flow_tcf_remove(dev, flow); + if (flow->counter) { + if (--flow->counter->ref_cnt == 0) { + rte_free(flow->counter); + flow->counter = NULL; + } + } dev_flow = LIST_FIRST(&flow->dev_flows); if (!dev_flow) return; @@ -4180,7 +5663,7 @@ flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh, * Message received from Netlink. * @param[out] data * Pointer to data area to be filled by the parsing routine. - * assumed to be a pinter to struct flow_tcf_stats_basic. + * assumed to be a pointer to struct flow_tcf_stats_basic. * * @return * MNL_CB_OK value. @@ -4228,7 +5711,7 @@ flow_tcf_query_count(struct rte_eth_dev *dev, void *data, struct rte_flow_error *error) { - struct flow_tcf_stats_basic sb_data = { 0 }; + struct flow_tcf_stats_basic sb_data; struct rte_flow_query_count *qc = data; struct priv *priv = dev->data->dev_private; struct mlx5_flow_tcf_context *ctx = priv->tcf_context; @@ -4239,6 +5722,7 @@ flow_tcf_query_count(struct rte_eth_dev *dev, ssize_t ret; assert(qc); + memset(&sb_data, 0, sizeof(sb_data)); dev_flow = LIST_FIRST(&flow->dev_flows); /* E-Switch flow can't be expanded. */ assert(!LIST_NEXT(dev_flow, next)); @@ -4385,7 +5869,9 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, struct nlmsghdr *nlh; struct tcmsg *tcm; alignas(struct nlmsghdr) - uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)]; + uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) + + SZ_NLATTR_STRZ_OF("ingress") + + MNL_BUF_EXTRA_SPACE]; /* Destroy existing ingress qdisc and everything attached to it. */ nlh = mnl_nlmsg_put_header(buf); @@ -4396,8 +5882,9 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, tcm->tcm_ifindex = ifindex; tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); tcm->tcm_parent = TC_H_INGRESS; + assert(sizeof(buf) >= nlh->nlmsg_len); /* Ignore errors when qdisc is already absent. */ - if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL) && + if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) && rte_errno != EINVAL && rte_errno != ENOENT) return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, @@ -4413,7 +5900,8 @@ mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx, tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); tcm->tcm_parent = TC_H_INGRESS; mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); - if (flow_tcf_nl_ack(ctx, nlh, 0, NULL, NULL)) + assert(sizeof(buf) >= nlh->nlmsg_len); + if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL)) return rte_flow_error_set(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, "netlink: failed to create ingress"