net/mlx5: replace flow list with indexed pool
[dpdk.git] / drivers / net / mlx5 / mlx5_flow.c
index cac4e6a..2bbb5f5 100644 (file)
@@ -131,6 +131,7 @@ mlx5_flow_is_rss_expandable_item(const struct rte_flow_item *item)
        case RTE_FLOW_ITEM_TYPE_NVGRE:
        case RTE_FLOW_ITEM_TYPE_GRE:
        case RTE_FLOW_ITEM_TYPE_GENEVE:
+       case RTE_FLOW_ITEM_TYPE_MPLS:
                return true;
        default:
                break;
@@ -264,6 +265,7 @@ mlx5_flow_expand_rss_item_complete(const struct rte_flow_item *item)
  *   set, the following errors are defined:
  *
  *   -E2BIG: graph-depth @p graph is too deep.
+ *   -EINVAL: @p size is too small to hold the expanded pattern.
  */
 static int
 mlx5_flow_expand_rss(struct mlx5_flow_expand_rss *buf, size_t size,
@@ -290,12 +292,12 @@ mlx5_flow_expand_rss(struct mlx5_flow_expand_rss *buf, size_t size,
        memset(&missed_item, 0, sizeof(missed_item));
        lsize = offsetof(struct mlx5_flow_expand_rss, entry) +
                MLX5_RSS_EXP_ELT_N * sizeof(buf->entry[0]);
-       if (lsize <= size) {
-               buf->entry[0].priority = 0;
-               buf->entry[0].pattern = (void *)&buf->entry[MLX5_RSS_EXP_ELT_N];
-               buf->entries = 0;
-               addr = buf->entry[0].pattern;
-       }
+       if (lsize > size)
+               return -EINVAL;
+       buf->entry[0].priority = 0;
+       buf->entry[0].pattern = (void *)&buf->entry[MLX5_RSS_EXP_ELT_N];
+       buf->entries = 0;
+       addr = buf->entry[0].pattern;
        for (item = pattern; item->type != RTE_FLOW_ITEM_TYPE_END; item++) {
                if (!mlx5_flow_is_rss_expandable_item(item)) {
                        user_pattern_size += sizeof(*item);
@@ -313,12 +315,12 @@ mlx5_flow_expand_rss(struct mlx5_flow_expand_rss *buf, size_t size,
        }
        user_pattern_size += sizeof(*item); /* Handle END item. */
        lsize += user_pattern_size;
+       if (lsize > size)
+               return -EINVAL;
        /* Copy the user pattern in the first entry of the buffer. */
-       if (lsize <= size) {
-               rte_memcpy(addr, pattern, user_pattern_size);
-               addr = (void *)(((uintptr_t)addr) + user_pattern_size);
-               buf->entries = 1;
-       }
+       rte_memcpy(addr, pattern, user_pattern_size);
+       addr = (void *)(((uintptr_t)addr) + user_pattern_size);
+       buf->entries = 1;
        /* Start expanding. */
        memset(flow_items, 0, sizeof(flow_items));
        user_pattern_size -= sizeof(*item);
@@ -348,7 +350,9 @@ mlx5_flow_expand_rss(struct mlx5_flow_expand_rss *buf, size_t size,
                elt = 2; /* missed item + item end. */
                node = next;
                lsize += elt * sizeof(*item) + user_pattern_size;
-               if ((node->rss_types & types) && lsize <= size) {
+               if (lsize > size)
+                       return -EINVAL;
+               if (node->rss_types & types) {
                        buf->entry[buf->entries].priority = 1;
                        buf->entry[buf->entries].pattern = addr;
                        buf->entries++;
@@ -367,6 +371,7 @@ mlx5_flow_expand_rss(struct mlx5_flow_expand_rss *buf, size_t size,
        while (node) {
                flow_items[stack_pos].type = node->type;
                if (node->rss_types & types) {
+                       size_t n;
                        /*
                         * compute the number of items to copy from the
                         * expansion and copy it.
@@ -376,24 +381,23 @@ mlx5_flow_expand_rss(struct mlx5_flow_expand_rss *buf, size_t size,
                        elt = stack_pos + 2;
                        flow_items[stack_pos + 1].type = RTE_FLOW_ITEM_TYPE_END;
                        lsize += elt * sizeof(*item) + user_pattern_size;
-                       if (lsize <= size) {
-                               size_t n = elt * sizeof(*item);
-
-                               buf->entry[buf->entries].priority =
-                                       stack_pos + 1 + missed;
-                               buf->entry[buf->entries].pattern = addr;
-                               buf->entries++;
-                               rte_memcpy(addr, buf->entry[0].pattern,
-                                          user_pattern_size);
-                               addr = (void *)(((uintptr_t)addr) +
-                                               user_pattern_size);
-                               rte_memcpy(addr, &missed_item,
-                                          missed * sizeof(*item));
-                               addr = (void *)(((uintptr_t)addr) +
-                                       missed * sizeof(*item));
-                               rte_memcpy(addr, flow_items, n);
-                               addr = (void *)(((uintptr_t)addr) + n);
-                       }
+                       if (lsize > size)
+                               return -EINVAL;
+                       n = elt * sizeof(*item);
+                       buf->entry[buf->entries].priority =
+                               stack_pos + 1 + missed;
+                       buf->entry[buf->entries].pattern = addr;
+                       buf->entries++;
+                       rte_memcpy(addr, buf->entry[0].pattern,
+                                  user_pattern_size);
+                       addr = (void *)(((uintptr_t)addr) +
+                                       user_pattern_size);
+                       rte_memcpy(addr, &missed_item,
+                                  missed * sizeof(*item));
+                       addr = (void *)(((uintptr_t)addr) +
+                               missed * sizeof(*item));
+                       rte_memcpy(addr, flow_items, n);
+                       addr = (void *)(((uintptr_t)addr) + n);
                }
                /* Go deeper. */
                if (!node->optional && node->next) {
@@ -474,8 +478,7 @@ static const struct mlx5_flow_expand_node mlx5_support_expansion[] = {
        },
        [MLX5_EXPANSION_OUTER_ETH] = {
                .next = MLX5_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_OUTER_IPV4,
-                                                 MLX5_EXPANSION_OUTER_IPV6,
-                                                 MLX5_EXPANSION_MPLS),
+                                                 MLX5_EXPANSION_OUTER_IPV6),
                .type = RTE_FLOW_ITEM_TYPE_ETH,
                .rss_types = 0,
        },
@@ -503,7 +506,8 @@ static const struct mlx5_flow_expand_node mlx5_support_expansion[] = {
        },
        [MLX5_EXPANSION_OUTER_IPV4_UDP] = {
                .next = MLX5_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN,
-                                                 MLX5_EXPANSION_VXLAN_GPE),
+                                                 MLX5_EXPANSION_VXLAN_GPE,
+                                                 MLX5_EXPANSION_MPLS),
                .type = RTE_FLOW_ITEM_TYPE_UDP,
                .rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
        },
@@ -525,7 +529,8 @@ static const struct mlx5_flow_expand_node mlx5_support_expansion[] = {
        },
        [MLX5_EXPANSION_OUTER_IPV6_UDP] = {
                .next = MLX5_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_VXLAN,
-                                                 MLX5_EXPANSION_VXLAN_GPE),
+                                                 MLX5_EXPANSION_VXLAN_GPE,
+                                                 MLX5_EXPANSION_MPLS),
                .type = RTE_FLOW_ITEM_TYPE_UDP,
                .rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
        },
@@ -548,12 +553,14 @@ static const struct mlx5_flow_expand_node mlx5_support_expansion[] = {
        [MLX5_EXPANSION_GRE] = {
                .next = MLX5_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
                                                  MLX5_EXPANSION_IPV6,
-                                                 MLX5_EXPANSION_GRE_KEY),
+                                                 MLX5_EXPANSION_GRE_KEY,
+                                                 MLX5_EXPANSION_MPLS),
                .type = RTE_FLOW_ITEM_TYPE_GRE,
        },
        [MLX5_EXPANSION_GRE_KEY] = {
                .next = MLX5_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
-                                                 MLX5_EXPANSION_IPV6),
+                                                 MLX5_EXPANSION_IPV6,
+                                                 MLX5_EXPANSION_MPLS),
                .type = RTE_FLOW_ITEM_TYPE_GRE_KEY,
                .optional = 1,
        },
@@ -563,7 +570,8 @@ static const struct mlx5_flow_expand_node mlx5_support_expansion[] = {
        },
        [MLX5_EXPANSION_MPLS] = {
                .next = MLX5_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_IPV4,
-                                                 MLX5_EXPANSION_IPV6),
+                                                 MLX5_EXPANSION_IPV6,
+                                                 MLX5_EXPANSION_ETH),
                .type = RTE_FLOW_ITEM_TYPE_MPLS,
        },
        [MLX5_EXPANSION_ETH] = {
@@ -1666,6 +1674,13 @@ mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
                                          "RSS on eCPRI is not supported now");
        }
+       if ((item_flags & MLX5_FLOW_LAYER_MPLS) &&
+           !(item_flags &
+             (MLX5_FLOW_LAYER_INNER_L2 | MLX5_FLOW_LAYER_INNER_L3)) &&
+           rss->level > 1)
+               return rte_flow_error_set(error, EINVAL,
+                                         RTE_FLOW_ERROR_TYPE_ITEM, NULL,
+                                         "MPLS inner RSS needs to specify inner L2/L3 items after MPLS in pattern");
        return 0;
 }
 
@@ -2395,12 +2410,14 @@ mlx5_flow_validate_item_tcp(const struct rte_flow_item *item,
 /**
  * Validate VXLAN item.
  *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
  * @param[in] item
  *   Item specification.
  * @param[in] item_flags
  *   Bit-fields that holds the items detected until now.
- * @param[in] target_protocol
- *   The next protocol in the previous item.
+ * @param[in] attr
+ *   Flow rule attributes.
  * @param[out] error
  *   Pointer to error structure.
  *
@@ -2408,24 +2425,32 @@ mlx5_flow_validate_item_tcp(const struct rte_flow_item *item,
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
+mlx5_flow_validate_item_vxlan(struct rte_eth_dev *dev,
+                             const struct rte_flow_item *item,
                              uint64_t item_flags,
+                             const struct rte_flow_attr *attr,
                              struct rte_flow_error *error)
 {
        const struct rte_flow_item_vxlan *spec = item->spec;
        const struct rte_flow_item_vxlan *mask = item->mask;
        int ret;
+       struct mlx5_priv *priv = dev->data->dev_private;
        union vni {
                uint32_t vlan_id;
                uint8_t vni[4];
        } id = { .vlan_id = 0, };
-
+       const struct rte_flow_item_vxlan nic_mask = {
+               .vni = "\xff\xff\xff",
+               .rsvd1 = 0xff,
+       };
+       const struct rte_flow_item_vxlan *valid_mask;
 
        if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
                return rte_flow_error_set(error, ENOTSUP,
                                          RTE_FLOW_ERROR_TYPE_ITEM, item,
                                          "multiple tunnel layers not"
                                          " supported");
+       valid_mask = &rte_flow_item_vxlan_mask;
        /*
         * Verify only UDPv4 is present as defined in
         * https://tools.ietf.org/html/rfc7348
@@ -2436,9 +2461,15 @@ mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
                                          "no outer UDP layer found");
        if (!mask)
                mask = &rte_flow_item_vxlan_mask;
+       /* FDB domain & NIC domain non-zero group */
+       if ((attr->transfer || attr->group) && priv->sh->misc5_cap)
+               valid_mask = &nic_mask;
+       /* Group zero in NIC domain */
+       if (!attr->group && !attr->transfer && priv->sh->tunnel_header_0_1)
+               valid_mask = &nic_mask;
        ret = mlx5_flow_item_acceptable
                (item, (const uint8_t *)mask,
-                (const uint8_t *)&rte_flow_item_vxlan_mask,
+                (const uint8_t *)valid_mask,
                 sizeof(struct rte_flow_item_vxlan),
                 MLX5_ITEM_RANGE_NOT_ACCEPTED, error);
        if (ret < 0)
@@ -2920,9 +2951,8 @@ mlx5_flow_validate_item_mpls(struct rte_eth_dev *dev __rte_unused,
                                          "MPLS not supported or"
                                          " disabled in firmware"
                                          " configuration.");
-       /* MPLS over IP, UDP, GRE is allowed */
-       if (!(prev_layer & (MLX5_FLOW_LAYER_OUTER_L3 |
-                           MLX5_FLOW_LAYER_OUTER_L4_UDP |
+       /* MPLS is allowed only directly over UDP, GRE or GRE key. */
+       if (!(prev_layer & (MLX5_FLOW_LAYER_OUTER_L4_UDP |
                            MLX5_FLOW_LAYER_GRE |
                            MLX5_FLOW_LAYER_GRE_KEY)))
                return rte_flow_error_set(error, EINVAL,
@@ -3095,31 +3125,6 @@ mlx5_flow_validate_item_ecpri(const struct rte_flow_item *item,
                                         MLX5_ITEM_RANGE_NOT_ACCEPTED, error);
 }
 
-/**
- * Release resource related QUEUE/RSS action split.
- *
- * @param dev
- *   Pointer to Ethernet device.
- * @param flow
- *   Flow to release id's from.
- */
-static void
-flow_mreg_split_qrss_release(struct rte_eth_dev *dev,
-                            struct rte_flow *flow)
-{
-       struct mlx5_priv *priv = dev->data->dev_private;
-       uint32_t handle_idx;
-       struct mlx5_flow_handle *dev_handle;
-
-       SILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_MLX5_FLOW], flow->dev_handles,
-                      handle_idx, dev_handle, next)
-               if (dev_handle->split_flow_id &&
-                   !dev_handle->is_meter_flow_id)
-                       mlx5_ipool_free(priv->sh->ipool
-                                       [MLX5_IPOOL_RSS_EXPANTION_FLOW_ID],
-                                       dev_handle->split_flow_id);
-}
-
 static int
 flow_null_validate(struct rte_eth_dev *dev __rte_unused,
                   const struct rte_flow_attr *attr __rte_unused,
@@ -3415,7 +3420,6 @@ flow_drv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
        const struct mlx5_flow_driver_ops *fops;
        enum mlx5_flow_drv_type type = flow->drv_type;
 
-       flow_mreg_split_qrss_release(dev, flow);
        MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
        fops = flow_get_drv_ops(type);
        fops->destroy(dev, flow);
@@ -3449,6 +3453,41 @@ flow_drv_meter_sub_policy_rss_prepare(struct rte_eth_dev *dev,
        return fops->meter_sub_policy_rss_prepare(dev, policy, rss_desc);
 }
 
+/**
+ * Dispatch creation of the meter-hierarchy color tag rule to the flow
+ * driver selected by the parent flow's drv_type. Returns the value of
+ * the driver's meter_hierarchy_rule_create callback unchanged.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[in] flow
+ *   Parent flow; only its drv_type is read here.
+ * @param[in] fm
+ *   Pointer to flow meter structure.
+ * @param[in] src_port
+ *   The src port this extra rule should use.
+ * @param[in] item
+ *   The src port id match item.
+ * @param[out] error
+ *   Pointer to error structure.
+ */
+static int
+flow_drv_mtr_hierarchy_rule_create(struct rte_eth_dev *dev,
+               struct rte_flow *flow,
+               struct mlx5_flow_meter_info *fm,
+               int32_t src_port,
+               const struct rte_flow_item *item,
+               struct rte_flow_error *error)
+{
+       const struct mlx5_flow_driver_ops *fops;
+       enum mlx5_flow_drv_type type = flow->drv_type;
+
+       MLX5_ASSERT(type > MLX5_FLOW_TYPE_MIN && type < MLX5_FLOW_TYPE_MAX);
+       fops = flow_get_drv_ops(type);
+       return fops->meter_hierarchy_rule_create(dev, fm,
+                                               src_port, item, error);
+}
+
 /**
  * Get RSS action from the action list.
  *
@@ -3492,10 +3531,18 @@ flow_get_rss_action(struct rte_eth_dev *dev,
                        const struct rte_flow_action_meter *mtr = actions->conf;
 
                        fm = mlx5_flow_meter_find(priv, mtr->mtr_id, &mtr_idx);
-                       if (fm) {
+                       if (fm && !fm->def_policy) {
                                policy = mlx5_flow_meter_policy_find(dev,
                                                fm->policy_id, NULL);
-                               if (policy && policy->is_rss)
+                               MLX5_ASSERT(policy);
+                               if (policy->is_hierarchy) {
+                                       policy =
+                               mlx5_flow_meter_hierarchy_get_final_policy(dev,
+                                                                       policy);
+                                       if (!policy)
+                                               return NULL;
+                               }
+                               if (policy->is_rss)
                                        rss =
                                policy->act_cnt[RTE_COLOR_GREEN].rss->conf;
                        }
@@ -3998,14 +4045,14 @@ flow_check_hairpin_split(struct rte_eth_dev *dev,
 
 /* Declare flow create/destroy prototype in advance. */
 static uint32_t
-flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
+flow_list_create(struct rte_eth_dev *dev, enum mlx5_flow_type type,
                 const struct rte_flow_attr *attr,
                 const struct rte_flow_item items[],
                 const struct rte_flow_action actions[],
                 bool external, struct rte_flow_error *error);
 
 static void
-flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
+flow_list_destroy(struct rte_eth_dev *dev, enum mlx5_flow_type type,
                  uint32_t flow_idx);
 
 int
@@ -4127,8 +4174,8 @@ flow_dv_mreg_create_cb(struct mlx5_hlist *list, uint64_t key,
         * be applied, removed, deleted in ardbitrary order
         * by list traversing.
         */
-       mcp_res->rix_flow = flow_list_create(dev, NULL, &attr, items,
-                                        actions, false, error);
+       mcp_res->rix_flow = flow_list_create(dev, MLX5_FLOW_TYPE_MCP,
+                                       &attr, items, actions, false, error);
        if (!mcp_res->rix_flow) {
                mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], idx);
                return NULL;
@@ -4190,7 +4237,7 @@ flow_dv_mreg_remove_cb(struct mlx5_hlist *list, struct mlx5_hlist_entry *entry)
        struct mlx5_priv *priv = dev->data->dev_private;
 
        MLX5_ASSERT(mcp_res->rix_flow);
-       flow_list_destroy(dev, NULL, mcp_res->rix_flow);
+       flow_list_destroy(dev, MLX5_FLOW_TYPE_MCP, mcp_res->rix_flow);
        mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx);
 }
 
@@ -4564,8 +4611,8 @@ flow_create_split_inner(struct rte_eth_dev *dev,
  *   Pointer to Ethernet device.
  * @param[in] flow
  *   Parent flow structure pointer.
- * @param[in] policy_id;
- *   Meter Policy id.
+ * @param wks
+ *   Pointer to thread flow workspace; the meter policy is read from it.
  * @param[in] attr
  *   Flow rule attributes.
  * @param[in] items
@@ -4579,31 +4626,22 @@ flow_create_split_inner(struct rte_eth_dev *dev,
 static struct mlx5_flow_meter_sub_policy *
 get_meter_sub_policy(struct rte_eth_dev *dev,
                     struct rte_flow *flow,
-                    uint32_t policy_id,
+                    struct mlx5_flow_workspace *wks,
                     const struct rte_flow_attr *attr,
                     const struct rte_flow_item items[],
                     struct rte_flow_error *error)
 {
        struct mlx5_flow_meter_policy *policy;
+       struct mlx5_flow_meter_policy *final_policy;
        struct mlx5_flow_meter_sub_policy *sub_policy = NULL;
 
-       policy = mlx5_flow_meter_policy_find(dev, policy_id, NULL);
-       if (!policy) {
-               rte_flow_error_set(error, EINVAL,
-                                  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
-                                  "Failed to find Meter Policy.");
-               goto exit;
-       }
-       if (policy->is_rss ||
-               (policy->is_queue &&
-       !policy->sub_policys[MLX5_MTR_DOMAIN_INGRESS][0]->rix_hrxq[0])) {
-               struct mlx5_flow_workspace *wks =
-                               mlx5_flow_get_thread_workspace();
+       policy = wks->policy;
+       final_policy = policy->is_hierarchy ? wks->final_policy : policy;
+       if (final_policy->is_rss || final_policy->is_queue) {
                struct mlx5_flow_rss_desc rss_desc_v[MLX5_MTR_RTE_COLORS];
                struct mlx5_flow_rss_desc *rss_desc[MLX5_MTR_RTE_COLORS] = {0};
                uint32_t i;
 
-               MLX5_ASSERT(wks);
                /**
                 * This is a tmp dev_flow,
                 * no need to register any matcher for it in translate.
@@ -4613,9 +4651,9 @@ get_meter_sub_policy(struct rte_eth_dev *dev,
                        struct mlx5_flow dev_flow = {0};
                        struct mlx5_flow_handle dev_handle = { {0} };
 
-                       if (policy->is_rss) {
+                       if (final_policy->is_rss) {
                                const void *rss_act =
-                                       policy->act_cnt[i].rss->conf;
+                                       final_policy->act_cnt[i].rss->conf;
                                struct rte_flow_action rss_actions[2] = {
                                        [0] = {
                                        .type = RTE_FLOW_ACTION_TYPE_RSS,
@@ -4656,7 +4694,7 @@ get_meter_sub_policy(struct rte_eth_dev *dev,
                                rss_desc_v[i].key_len = 0;
                                rss_desc_v[i].hash_fields = 0;
                                rss_desc_v[i].queue =
-                                       &policy->act_cnt[i].queue;
+                                       &final_policy->act_cnt[i].queue;
                                rss_desc_v[i].queue_num = 1;
                        }
                        rss_desc[i] = &rss_desc_v[i];
@@ -4696,8 +4734,8 @@ exit:
  *   Pointer to Ethernet device.
  * @param[in] flow
  *   Parent flow structure pointer.
- * @param[in] fm
- *   Pointer to flow meter structure.
+ * @param wks
+ *   Pointer to thread flow workspace; supplies the meter (fm) and policy.
  * @param[in] attr
  *   Flow rule attributes.
  * @param[in] items
@@ -4721,7 +4759,7 @@ exit:
 static int
 flow_meter_split_prep(struct rte_eth_dev *dev,
                      struct rte_flow *flow,
-                     struct mlx5_flow_meter_info *fm,
+                     struct mlx5_flow_workspace *wks,
                      const struct rte_flow_attr *attr,
                      const struct rte_flow_item items[],
                      struct rte_flow_item sfx_items[],
@@ -4732,6 +4770,7 @@ flow_meter_split_prep(struct rte_eth_dev *dev,
                      struct rte_flow_error *error)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_flow_meter_info *fm = wks->fm;
        struct rte_flow_action *tag_action = NULL;
        struct rte_flow_item *tag_item;
        struct mlx5_rte_flow_action_set_tag *set_tag;
@@ -4773,6 +4812,15 @@ flow_meter_split_prep(struct rte_eth_dev *dev,
                                                pid_v,
                                                "Failed to get port info.");
                        flow_src_port = port_priv->representor_id;
+                       if (!fm->def_policy && wks->policy->is_hierarchy &&
+                           flow_src_port != priv->representor_id) {
+                               if (flow_drv_mtr_hierarchy_rule_create(dev,
+                                                               flow, fm,
+                                                               flow_src_port,
+                                                               items,
+                                                               error))
+                                       return -rte_errno;
+                       }
                        memcpy(sfx_items, items, sizeof(*sfx_items));
                        sfx_items++;
                        break;
@@ -4856,9 +4904,8 @@ flow_meter_split_prep(struct rte_eth_dev *dev,
                struct mlx5_flow_tbl_data_entry *tbl_data;
 
                if (!fm->def_policy) {
-                       sub_policy = get_meter_sub_policy(dev, flow,
-                                                         fm->policy_id, attr,
-                                                         items, error);
+                       sub_policy = get_meter_sub_policy(dev, flow, wks,
+                                                         attr, items, error);
                        if (!sub_policy)
                                return -rte_errno;
                } else {
@@ -5714,6 +5761,7 @@ flow_create_split_meter(struct rte_eth_dev *dev,
        bool has_mtr = false;
        bool has_modify = false;
        bool set_mtr_reg = true;
+       bool is_mtr_hierarchy = false;
        uint32_t meter_id = 0;
        uint32_t mtr_idx = 0;
        uint32_t mtr_flow_id = 0;
@@ -5746,14 +5794,33 @@ flow_create_split_meter(struct rte_eth_dev *dev,
                }
                MLX5_ASSERT(wks);
                wks->fm = fm;
+               if (!fm->def_policy) {
+                       wks->policy = mlx5_flow_meter_policy_find(dev,
+                                                                 fm->policy_id,
+                                                                 NULL);
+                       MLX5_ASSERT(wks->policy);
+                       if (wks->policy->is_hierarchy) {
+                               wks->final_policy =
+                               mlx5_flow_meter_hierarchy_get_final_policy(dev,
+                                                               wks->policy);
+                               if (!wks->final_policy)
+                                       return rte_flow_error_set(error,
+                                       EINVAL,
+                                       RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+                               "Failed to find terminal policy of hierarchy.");
+                               is_mtr_hierarchy = true;
+                       }
+               }
                /*
                 * If it isn't default-policy Meter, and
                 * 1. There's no action in flow to change
                 *    packet (modify/encap/decap etc.), OR
                 * 2. No drop count needed for this meter.
-                * no need to use regC to save meter id anymore.
+                * 3. AND, in addition to 1 or 2, it's not a meter hierarchy.
+                * Then there is no need to use regC to save the meter id.
                 */
-               if (!fm->def_policy && (!has_modify || !fm->drop_cnt))
+               if (!fm->def_policy && !is_mtr_hierarchy &&
+                   (!has_modify || !fm->drop_cnt))
                        set_mtr_reg = false;
                /* Prefix actions: meter, decap, encap, tag, jump, end. */
                act_size = sizeof(struct rte_flow_action) * (actions_n + 6) +
@@ -5776,7 +5843,7 @@ flow_create_split_meter(struct rte_eth_dev *dev,
                        pre_actions = sfx_actions + 1;
                else
                        pre_actions = sfx_actions + actions_n;
-               ret = flow_meter_split_prep(dev, flow, fm, &sfx_attr,
+               ret = flow_meter_split_prep(dev, flow, wks, &sfx_attr,
                                            items, sfx_items, actions,
                                            sfx_actions, pre_actions,
                                            (set_mtr_reg ? &mtr_flow_id : NULL),
@@ -6110,7 +6177,7 @@ flow_rss_workspace_adjust(struct mlx5_flow_workspace *wks,
  *   A flow index on success, 0 otherwise and rte_errno is set.
  */
 static uint32_t
-flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
+flow_list_create(struct rte_eth_dev *dev, enum mlx5_flow_type type,
                 const struct rte_flow_attr *attr,
                 const struct rte_flow_item items[],
                 const struct rte_flow_action original_actions[],
@@ -6178,7 +6245,7 @@ flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
                                external, hairpin_flow, error);
        if (ret < 0)
                goto error_before_hairpin_split;
-       flow = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], &idx);
+       flow = mlx5_ipool_zmalloc(priv->flows[type], &idx);
        if (!flow) {
                rte_errno = ENOMEM;
                goto error_before_hairpin_split;
@@ -6308,12 +6375,7 @@ flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
                if (ret < 0)
                        goto error;
        }
-       if (list) {
-               rte_spinlock_lock(&priv->flow_list_lock);
-               ILIST_INSERT(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list, idx,
-                            flow, next);
-               rte_spinlock_unlock(&priv->flow_list_lock);
-       }
+       flow->type = type;
        flow_rxq_flags_set(dev, flow);
        rte_free(translated_actions);
        tunnel = flow_tunnel_from_rule(wks->flows);
@@ -6335,7 +6397,7 @@ error:
                        mlx5_ipool_get
                        (priv->sh->ipool[MLX5_IPOOL_RSS_SHARED_ACTIONS],
                        rss_desc->shared_rss))->refcnt, 1, __ATOMIC_RELAXED);
-       mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], idx);
+       mlx5_ipool_free(priv->flows[type], idx);
        rte_errno = ret; /* Restore rte_errno. */
        ret = rte_errno;
        rte_errno = ret;
@@ -6387,10 +6449,9 @@ mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev)
                        .type = RTE_FLOW_ACTION_TYPE_END,
                },
        };
-       struct mlx5_priv *priv = dev->data->dev_private;
        struct rte_flow_error error;
 
-       return (void *)(uintptr_t)flow_list_create(dev, &priv->ctrl_flows,
+       return (void *)(uintptr_t)flow_list_create(dev, MLX5_FLOW_TYPE_CTL,
                                                   &attr, &pattern,
                                                   actions, false, &error);
 }
@@ -6442,8 +6503,6 @@ mlx5_flow_create(struct rte_eth_dev *dev,
                 const struct rte_flow_action actions[],
                 struct rte_flow_error *error)
 {
-       struct mlx5_priv *priv = dev->data->dev_private;
-
        /*
         * If the device is not started yet, it is not allowed to created a
         * flow from application. PMD default flows and traffic control flows
@@ -6459,8 +6518,9 @@ mlx5_flow_create(struct rte_eth_dev *dev,
                return NULL;
        }
 
-       return (void *)(uintptr_t)flow_list_create(dev, &priv->flows,
-                                 attr, items, actions, true, error);
+       return (void *)(uintptr_t)flow_list_create(dev, MLX5_FLOW_TYPE_GEN,
+                                                  attr, items, actions,
+                                                  true, error);
 }
 
 /**
@@ -6468,24 +6528,19 @@ mlx5_flow_create(struct rte_eth_dev *dev,
  *
  * @param dev
  *   Pointer to Ethernet device.
- * @param list
- *   Pointer to the Indexed flow list. If this parameter NULL,
- *   there is no flow removal from the list. Be noted that as
- *   flow is add to the indexed list, memory of the indexed
- *   list points to maybe changed as flow destroyed.
  * @param[in] flow_idx
  *   Index of flow to destroy.
  */
 static void
-flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
+flow_list_destroy(struct rte_eth_dev *dev, enum mlx5_flow_type type,
                  uint32_t flow_idx)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
-       struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool
-                                              [MLX5_IPOOL_RTE_FLOW], flow_idx);
+       struct rte_flow *flow = mlx5_ipool_get(priv->flows[type], flow_idx);
 
        if (!flow)
                return;
+       MLX5_ASSERT(flow->type == type);
        /*
         * Update RX queue flags only if port is started, otherwise it is
         * already clean.
@@ -6493,12 +6548,6 @@ flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
        if (dev->data->dev_started)
                flow_rxq_flags_trim(dev, flow);
        flow_drv_destroy(dev, flow);
-       if (list) {
-               rte_spinlock_lock(&priv->flow_list_lock);
-               ILIST_REMOVE(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], list,
-                            flow_idx, flow, next);
-               rte_spinlock_unlock(&priv->flow_list_lock);
-       }
        if (flow->tunnel) {
                struct mlx5_flow_tunnel *tunnel;
 
@@ -6508,7 +6557,7 @@ flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
                        mlx5_flow_tunnel_free(dev, tunnel);
        }
        flow_mreg_del_copy_action(dev, flow);
-       mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx);
+       mlx5_ipool_free(priv->flows[type], flow_idx);
 }
 
 /**
@@ -6516,18 +6565,21 @@ flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
  *
  * @param dev
  *   Pointer to Ethernet device.
- * @param list
- *   Pointer to the Indexed flow list.
+ * @param type
+ *   Flow type to be flushed.
  * @param active
  *   If flushing is called actively.
  */
 void
-mlx5_flow_list_flush(struct rte_eth_dev *dev, uint32_t *list, bool active)
+mlx5_flow_list_flush(struct rte_eth_dev *dev, enum mlx5_flow_type type,
+                    bool active)
 {
-       uint32_t num_flushed = 0;
+       struct mlx5_priv *priv = dev->data->dev_private;
+       uint32_t num_flushed = 0, fidx = 1;
+       struct rte_flow *flow;
 
-       while (*list) {
-               flow_list_destroy(dev, list, *list);
+       MLX5_IPOOL_FOREACH(priv->flows[type], fidx, flow) {
+               flow_list_destroy(dev, type, fidx);
                num_flushed++;
        }
        if (active) {
@@ -6699,18 +6751,19 @@ mlx5_flow_pop_thread_workspace(void)
  * @return the number of flows not released.
  */
 int
-mlx5_flow_verify(struct rte_eth_dev *dev)
+mlx5_flow_verify(struct rte_eth_dev *dev __rte_unused)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
        struct rte_flow *flow;
-       uint32_t idx;
-       int ret = 0;
+       uint32_t idx = 0;
+       int ret = 0, i;
 
-       ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], priv->flows, idx,
-                     flow, next) {
-               DRV_LOG(DEBUG, "port %u flow %p still referenced",
-                       dev->data->port_id, (void *)flow);
-               ++ret;
+       for (i = 0; i < MLX5_FLOW_TYPE_MAXI; i++) {
+               MLX5_IPOOL_FOREACH(priv->flows[i], idx, flow) {
+                       DRV_LOG(DEBUG, "port %u flow %p still referenced",
+                               dev->data->port_id, (void *)flow);
+                       ret++;
+               }
        }
        return ret;
 }
@@ -6730,7 +6783,6 @@ int
 mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev,
                            uint32_t queue)
 {
-       struct mlx5_priv *priv = dev->data->dev_private;
        const struct rte_flow_attr attr = {
                .egress = 1,
                .priority = 0,
@@ -6763,8 +6815,8 @@ mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev,
        actions[0].type = RTE_FLOW_ACTION_TYPE_JUMP;
        actions[0].conf = &jump;
        actions[1].type = RTE_FLOW_ACTION_TYPE_END;
-       flow_idx = flow_list_create(dev, &priv->ctrl_flows,
-                               &attr, items, actions, false, &error);
+       flow_idx = flow_list_create(dev, MLX5_FLOW_TYPE_CTL,
+                                   &attr, items, actions, false, &error);
        if (!flow_idx) {
                DRV_LOG(DEBUG,
                        "Failed to create ctrl flow: rte_errno(%d),"
@@ -6853,8 +6905,8 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
                action_rss.types = 0;
        for (i = 0; i != priv->reta_idx_n; ++i)
                queue[i] = (*priv->reta_idx)[i];
-       flow_idx = flow_list_create(dev, &priv->ctrl_flows,
-                               &attr, items, actions, false, &error);
+       flow_idx = flow_list_create(dev, MLX5_FLOW_TYPE_CTL,
+                                   &attr, items, actions, false, &error);
        if (!flow_idx)
                return -rte_errno;
        return 0;
@@ -6895,7 +6947,6 @@ mlx5_ctrl_flow(struct rte_eth_dev *dev,
 int
 mlx5_flow_lacp_miss(struct rte_eth_dev *dev)
 {
-       struct mlx5_priv *priv = dev->data->dev_private;
        /*
         * The LACP matching is done by only using ether type since using
         * a multicast dst mac causes kernel to give low priority to this flow.
@@ -6929,8 +6980,9 @@ mlx5_flow_lacp_miss(struct rte_eth_dev *dev)
                },
        };
        struct rte_flow_error error;
-       uint32_t flow_idx = flow_list_create(dev, &priv->ctrl_flows,
-                               &attr, items, actions, false, &error);
+       uint32_t flow_idx = flow_list_create(dev, MLX5_FLOW_TYPE_CTL,
+                                       &attr, items, actions,
+                                       false, &error);
 
        if (!flow_idx)
                return -rte_errno;
@@ -6948,9 +7000,8 @@ mlx5_flow_destroy(struct rte_eth_dev *dev,
                  struct rte_flow *flow,
                  struct rte_flow_error *error __rte_unused)
 {
-       struct mlx5_priv *priv = dev->data->dev_private;
-
-       flow_list_destroy(dev, &priv->flows, (uintptr_t)(void *)flow);
+       flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN,
+                               (uintptr_t)(void *)flow);
        return 0;
 }
 
@@ -6964,9 +7015,7 @@ int
 mlx5_flow_flush(struct rte_eth_dev *dev,
                struct rte_flow_error *error __rte_unused)
 {
-       struct mlx5_priv *priv = dev->data->dev_private;
-
-       mlx5_flow_list_flush(dev, &priv->flows, false);
+       mlx5_flow_list_flush(dev, MLX5_FLOW_TYPE_GEN, false);
        return 0;
 }
 
@@ -7017,8 +7066,7 @@ flow_drv_query(struct rte_eth_dev *dev,
 {
        struct mlx5_priv *priv = dev->data->dev_private;
        const struct mlx5_flow_driver_ops *fops;
-       struct rte_flow *flow = mlx5_ipool_get(priv->sh->ipool
-                                              [MLX5_IPOOL_RTE_FLOW],
+       struct rte_flow *flow = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
                                               flow_idx);
        enum mlx5_flow_drv_type ftype;
 
@@ -7884,14 +7932,14 @@ mlx5_flow_discover_mreg_c(struct rte_eth_dev *dev)
                if (!config->dv_flow_en)
                        break;
                /* Create internal flow, validation skips copy action. */
-               flow_idx = flow_list_create(dev, NULL, &attr, items,
-                                           actions, false, &error);
-               flow = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW],
+               flow_idx = flow_list_create(dev, MLX5_FLOW_TYPE_GEN, &attr,
+                                       items, actions, false, &error);
+               flow = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
                                      flow_idx);
                if (!flow)
                        continue;
                config->flow_mreg_c[n++] = idx;
-               flow_list_destroy(dev, NULL, flow_idx);
+               flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN, flow_idx);
        }
        for (; n < MLX5_MREG_C_NUM; ++n)
                config->flow_mreg_c[n] = REG_NON;
@@ -8086,8 +8134,7 @@ mlx5_flow_dev_dump(struct rte_eth_dev *dev, struct rte_flow *flow_idx,
        /* dump all */
        if (!flow_idx) {
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
-               ILIST_FOREACH(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW],
-                                               priv->flows, idx, flow, next)
+               MLX5_IPOOL_FOREACH(priv->flows[MLX5_FLOW_TYPE_GEN], idx, flow)
                        mlx5_flow_dev_dump_ipool(dev, flow, file, error);
 #endif
                return mlx5_devx_cmd_flow_dump(sh->fdb_domain,
@@ -8095,8 +8142,8 @@ mlx5_flow_dev_dump(struct rte_eth_dev *dev, struct rte_flow *flow_idx,
                                        sh->tx_domain, file);
        }
        /* dump one */
-       flow = mlx5_ipool_get(priv->sh->ipool
-                       [MLX5_IPOOL_RTE_FLOW], (uintptr_t)(void *)flow_idx);
+       flow = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
+                       (uintptr_t)(void *)flow_idx);
        if (!flow)
                return -ENOENT;