regex/mlx5: add data path scattered mbuf process
authorSuanming Mou <suanmingm@nvidia.com>
Wed, 7 Apr 2021 07:21:39 +0000 (10:21 +0300)
committerThomas Monjalon <thomas@monjalon.net>
Thu, 8 Apr 2021 20:52:55 +0000 (22:52 +0200)
UMR (User-Mode Memory Registration) WQE can present data buffers
scattered within multiple mbufs with single indirect mkey. Take
advantage of the UMR WQE, scattered mbuf in one operation can be
presented to an indirect mkey. The RegEx which only accepts one
mkey can now process the whole scattered mbuf in one operation.

The maximum scattered mbuf can be supported in one UMR WQE is now
defined as 64. The mbufs from multiple operations can be combined
into one UMR WQE as well if there is enough space in the KLM array,
since the operations can address their own mbuf's content by the
mkey's address and length. However, one operation's scattered mbuf's
can't be placed in two different UMR WQE's KLM array, if the UMR
WQE's KLM does not has enough free space for one operation, the
extra UMR WQE will be engaged.

In case the UMR WQE's indirect mkey will be over wrapped by the SQ's
WQE move, the mkey's index used by the UMR WQE should be the index
of last the RegEX WQE in the operations. As one operation consumes
one WQE set, build the RegEx WQE by reverse helps address the mkey
more efficiently. Once the operations in one burst consumes multiple
mkeys, when the mkey KLM array is full, the reverse WQE set index
will always be the last of the new mkey's for the new UMR WQE.

In GGA mode, the SQ WQE's memory layout becomes UMR/NOP and RegEx
WQE by interleave. The UMR and RegEx WQE can be called as WQE set.
The SQ's pi and ci will also be increased as WQE set not as WQE.

For operations don't have scattered mbuf, uses the mbuf's mkey directly,
the WQE set combination is NOP + RegEx.
For operations have scattered mbuf but share the UMR WQE with others,
the WQE set combination is NOP + RegEx.
For operations complete the UMR WQE, the WQE set combination is UMR +
RegEx.

Signed-off-by: John Hurley <jhurley@nvidia.com>
Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Ori Kam <orika@nvidia.com>
doc/guides/regexdevs/mlx5.rst
doc/guides/rel_notes/release_21_05.rst
drivers/regex/mlx5/mlx5_regex.c
drivers/regex/mlx5/mlx5_regex.h
drivers/regex/mlx5/mlx5_regex_control.c
drivers/regex/mlx5/mlx5_regex_fastpath.c

index faaa6ac..45a0b96 100644 (file)
@@ -35,6 +35,11 @@ be specified as device parameter. The RegEx device can be probed and used with
 other Mellanox devices, by adding more options in the class.
 For example: ``class=net:regex`` will probe both the net PMD and the RegEx PMD.
 
+Features
+--------
+
+- Multi segments mbuf support.
+
 Supported NICs
 --------------
 
index 326f3f9..6f5858c 100644 (file)
@@ -119,6 +119,10 @@ New Features
 
   * Added support for preferred busy polling.
 
+* **Updated Mellanox RegEx PMD.**
+
+  * Added support for multi-segments mbuf.
+
 * **Updated testpmd.**
 
   * Added a command line option to configure forced speed for Ethernet port.
index ac5b205..82c485e 100644 (file)
@@ -199,6 +199,13 @@ mlx5_regex_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
        }
        priv->regexdev->dev_ops = &mlx5_regexdev_ops;
        priv->regexdev->enqueue = mlx5_regexdev_enqueue;
+#ifdef HAVE_MLX5_UMR_IMKEY
+       if (!attr.umr_indirect_mkey_disabled &&
+           !attr.umr_modify_entity_size_disabled)
+               priv->has_umr = 1;
+       if (priv->has_umr)
+               priv->regexdev->enqueue = mlx5_regexdev_enqueue_gga;
+#endif
        priv->regexdev->dequeue = mlx5_regexdev_dequeue;
        priv->regexdev->device = (struct rte_device *)pci_dev;
        priv->regexdev->data->dev_private = priv;
@@ -213,6 +220,8 @@ mlx5_regex_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
            rte_errno = ENOMEM;
                goto error;
        }
+       DRV_LOG(INFO, "RegEx GGA is %s.",
+               priv->has_umr ? "supported" : "unsupported");
        return 0;
 
 error:
index a2b3f0d..51a2101 100644 (file)
@@ -15,6 +15,7 @@
 #include <mlx5_common_devx.h>
 
 #include "mlx5_rxp.h"
+#include "mlx5_regex_utils.h"
 
 struct mlx5_regex_sq {
        uint16_t log_nb_desc; /* Log 2 number of desc for this object. */
@@ -40,6 +41,7 @@ struct mlx5_regex_qp {
        struct mlx5_regex_job *jobs;
        struct ibv_mr *metadata;
        struct ibv_mr *outputs;
+       struct ibv_mr *imkey_addr; /* Indirect mkey array region. */
        size_t ci, pi;
        struct mlx5_mr_ctrl mr_ctrl;
 };
@@ -71,8 +73,29 @@ struct mlx5_regex_priv {
        struct mlx5_mr_share_cache mr_scache; /* Global shared MR cache. */
        uint8_t is_bf2; /* The device is BF2 device. */
        uint8_t sq_ts_format; /* Whether SQ supports timestamp formats. */
+       uint8_t has_umr; /* The device supports UMR. */
 };
 
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+static inline int
+regex_get_pdn(void *pd, uint32_t *pdn)
+{
+       struct mlx5dv_obj obj;
+       struct mlx5dv_pd pd_info;
+       int ret = 0;
+
+       obj.pd.in = pd;
+       obj.pd.out = &pd_info;
+       ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+       if (ret) {
+               DRV_LOG(DEBUG, "Fail to get PD object info");
+               return ret;
+       }
+       *pdn = pd_info.pdn;
+       return 0;
+}
+#endif
+
 /* mlx5_regex.c */
 int mlx5_regex_start(struct rte_regexdev *dev);
 int mlx5_regex_stop(struct rte_regexdev *dev);
@@ -108,5 +131,6 @@ uint16_t mlx5_regexdev_enqueue(struct rte_regexdev *dev, uint16_t qp_id,
                       struct rte_regex_ops **ops, uint16_t nb_ops);
 uint16_t mlx5_regexdev_dequeue(struct rte_regexdev *dev, uint16_t qp_id,
                       struct rte_regex_ops **ops, uint16_t nb_ops);
-
+uint16_t mlx5_regexdev_enqueue_gga(struct rte_regexdev *dev, uint16_t qp_id,
+                      struct rte_regex_ops **ops, uint16_t nb_ops);
 #endif /* MLX5_REGEX_H */
index 55fbb41..eef0fe5 100644 (file)
@@ -27,6 +27,9 @@
 
 #define MLX5_REGEX_NUM_WQE_PER_PAGE (4096/64)
 
+#define MLX5_REGEX_WQE_LOG_NUM(has_umr, log_desc) \
+               ((has_umr) ? ((log_desc) + 2) : (log_desc))
+
 /**
  * Returns the number of qp obj to be created.
  *
@@ -91,26 +94,6 @@ regex_ctrl_create_cq(struct mlx5_regex_priv *priv, struct mlx5_regex_cq *cq)
        return 0;
 }
 
-#ifdef HAVE_IBV_FLOW_DV_SUPPORT
-static int
-regex_get_pdn(void *pd, uint32_t *pdn)
-{
-       struct mlx5dv_obj obj;
-       struct mlx5dv_pd pd_info;
-       int ret = 0;
-
-       obj.pd.in = pd;
-       obj.pd.out = &pd_info;
-       ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
-       if (ret) {
-               DRV_LOG(DEBUG, "Fail to get PD object info");
-               return ret;
-       }
-       *pdn = pd_info.pdn;
-       return 0;
-}
-#endif
-
 /**
  * Destroy the SQ object.
  *
@@ -168,14 +151,16 @@ regex_ctrl_create_sq(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
        int ret;
 
        sq->log_nb_desc = log_nb_desc;
+       sq->sqn = q_ind;
        sq->ci = 0;
        sq->pi = 0;
        ret = regex_get_pdn(priv->pd, &pd_num);
        if (ret)
                return ret;
        attr.wq_attr.pd = pd_num;
-       ret = mlx5_devx_sq_create(priv->ctx, &sq->sq_obj, log_nb_desc, &attr,
-                                 SOCKET_ID_ANY);
+       ret = mlx5_devx_sq_create(priv->ctx, &sq->sq_obj,
+                       MLX5_REGEX_WQE_LOG_NUM(priv->has_umr, log_nb_desc),
+                       &attr, SOCKET_ID_ANY);
        if (ret) {
                DRV_LOG(ERR, "Can't create SQ object.");
                rte_errno = ENOMEM;
@@ -225,10 +210,18 @@ mlx5_regex_qp_setup(struct rte_regexdev *dev, uint16_t qp_ind,
 
        qp = &priv->qps[qp_ind];
        qp->flags = cfg->qp_conf_flags;
-       qp->cq.log_nb_desc = rte_log2_u32(cfg->nb_desc);
-       qp->nb_desc = 1 << qp->cq.log_nb_desc;
+       log_desc = rte_log2_u32(cfg->nb_desc);
+       /*
+        * UMR mode requires two WQEs(UMR and RegEx WQE) for one descriptor.
+        * For CQ, expand the CQE number multiple with 2.
+        * For SQ, the UMR and RegEx WQE for one descriptor consumes 4 WQEBBS,
+        * expand the WQE number multiple with 4.
+        */
+       qp->cq.log_nb_desc = log_desc + (!!priv->has_umr);
+       qp->nb_desc = 1 << log_desc;
        if (qp->flags & RTE_REGEX_QUEUE_PAIR_CFG_OOS_F)
-               qp->nb_obj = regex_ctrl_get_nb_obj(qp->nb_desc);
+               qp->nb_obj = regex_ctrl_get_nb_obj
+                       (1 << MLX5_REGEX_WQE_LOG_NUM(priv->has_umr, log_desc));
        else
                qp->nb_obj = 1;
        qp->sqs = rte_malloc(NULL,
index beaea7b..b57e7d7 100644 (file)
 #define MLX5_REGEX_WQE_GATHER_OFFSET 32
 #define MLX5_REGEX_WQE_SCATTER_OFFSET 48
 #define MLX5_REGEX_METADATA_OFF 32
+#define MLX5_REGEX_UMR_WQE_SIZE 192
+/* The maximum KLMs can be added to one UMR indirect mkey. */
+#define MLX5_REGEX_MAX_KLM_NUM 128
+/* The KLM array size for one job. */
+#define MLX5_REGEX_KLMS_SIZE \
+       ((MLX5_REGEX_MAX_KLM_NUM) * sizeof(struct mlx5_klm))
+/* In WQE set mode, the pi should be quarter of the MLX5_REGEX_MAX_WQE_INDEX. */
+#define MLX5_REGEX_UMR_SQ_PI_IDX(pi, ops) \
+       (((pi) + (ops)) & (MLX5_REGEX_MAX_WQE_INDEX >> 2))
 
 static inline uint32_t
 sq_size_get(struct mlx5_regex_sq *sq)
@@ -49,6 +58,8 @@ struct mlx5_regex_job {
        uint64_t user_id;
        volatile uint8_t *output;
        volatile uint8_t *metadata;
+       struct mlx5_klm *imkey_array; /* Indirect mkey's KLM array. */
+       struct mlx5_devx_obj *imkey; /* UMR WQE's indirect meky. */
 } __rte_cached_aligned;
 
 static inline void
@@ -99,12 +110,13 @@ set_wqe_ctrl_seg(struct mlx5_wqe_ctrl_seg *seg, uint16_t pi, uint8_t opcode,
 }
 
 static inline void
-prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
-        struct mlx5_regex_sq *sq, struct rte_regex_ops *op,
-        struct mlx5_regex_job *job)
+__prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_sq *sq,
+          struct rte_regex_ops *op, struct mlx5_regex_job *job,
+          size_t pi, struct mlx5_klm *klm)
 {
-       size_t wqe_offset = (sq->pi & (sq_size_get(sq) - 1)) * MLX5_SEND_WQE_BB;
-       uint32_t lkey;
+       size_t wqe_offset = (pi & (sq_size_get(sq) - 1)) *
+                           (MLX5_SEND_WQE_BB << (priv->has_umr ? 2 : 0)) +
+                           (priv->has_umr ? MLX5_REGEX_UMR_WQE_SIZE : 0);
        uint16_t group0 = op->req_flags & RTE_REGEX_OPS_REQ_GROUP_ID0_VALID_F ?
                                op->group_id0 : 0;
        uint16_t group1 = op->req_flags & RTE_REGEX_OPS_REQ_GROUP_ID1_VALID_F ?
@@ -122,14 +134,11 @@ prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
                               RTE_REGEX_OPS_REQ_GROUP_ID2_VALID_F |
                               RTE_REGEX_OPS_REQ_GROUP_ID3_VALID_F)))
                group0 = op->group_id0;
-       lkey = mlx5_mr_addr2mr_bh(priv->pd, 0,
-                                 &priv->mr_scache, &qp->mr_ctrl,
-                                 rte_pktmbuf_mtod(op->mbuf, uintptr_t),
-                                 !!(op->mbuf->ol_flags & EXT_ATTACHED_MBUF));
        uint8_t *wqe = (uint8_t *)(uintptr_t)sq->sq_obj.wqes + wqe_offset;
        int ds = 4; /*  ctrl + meta + input + output */
 
-       set_wqe_ctrl_seg((struct mlx5_wqe_ctrl_seg *)wqe, sq->pi,
+       set_wqe_ctrl_seg((struct mlx5_wqe_ctrl_seg *)wqe,
+                        (priv->has_umr ? (pi * 4 + 3) : pi),
                         MLX5_OPCODE_MMO, MLX5_OPC_MOD_MMO_REGEX,
                         sq->sq_obj.sq->id, 0, ds, 0, 0);
        set_regex_ctrl_seg(wqe + 12, 0, group0, group1, group2, group3,
@@ -137,36 +146,56 @@ prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
        struct mlx5_wqe_data_seg *input_seg =
                (struct mlx5_wqe_data_seg *)(wqe +
                                             MLX5_REGEX_WQE_GATHER_OFFSET);
-       input_seg->byte_count =
-               rte_cpu_to_be_32(rte_pktmbuf_data_len(op->mbuf));
-       input_seg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(op->mbuf,
-                                                           uintptr_t));
-       input_seg->lkey = lkey;
+       input_seg->byte_count = rte_cpu_to_be_32(klm->byte_count);
+       input_seg->addr = rte_cpu_to_be_64(klm->address);
+       input_seg->lkey = klm->mkey;
        job->user_id = op->user_id;
+}
+
+static inline void
+prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
+        struct mlx5_regex_sq *sq, struct rte_regex_ops *op,
+        struct mlx5_regex_job *job)
+{
+       struct mlx5_klm klm;
+
+       klm.byte_count = rte_pktmbuf_data_len(op->mbuf);
+       klm.mkey = mlx5_mr_addr2mr_bh(priv->pd, 0,
+                                 &priv->mr_scache, &qp->mr_ctrl,
+                                 rte_pktmbuf_mtod(op->mbuf, uintptr_t),
+                                 !!(op->mbuf->ol_flags & EXT_ATTACHED_MBUF));
+       klm.address = rte_pktmbuf_mtod(op->mbuf, uintptr_t);
+       __prep_one(priv, sq, op, job, sq->pi, &klm);
        sq->db_pi = sq->pi;
        sq->pi = (sq->pi + 1) & MLX5_REGEX_MAX_WQE_INDEX;
 }
 
 static inline void
-send_doorbell(struct mlx5dv_devx_uar *uar, struct mlx5_regex_sq *sq)
+send_doorbell(struct mlx5_regex_priv *priv, struct mlx5_regex_sq *sq)
 {
+       struct mlx5dv_devx_uar *uar = priv->uar;
        size_t wqe_offset = (sq->db_pi & (sq_size_get(sq) - 1)) *
-               MLX5_SEND_WQE_BB;
+               (MLX5_SEND_WQE_BB << (priv->has_umr ? 2 : 0)) +
+               (priv->has_umr ? MLX5_REGEX_UMR_WQE_SIZE : 0);
        uint8_t *wqe = (uint8_t *)(uintptr_t)sq->sq_obj.wqes + wqe_offset;
-       ((struct mlx5_wqe_ctrl_seg *)wqe)->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+       /* Or the fm_ce_se instead of set, avoid the fence be cleared. */
+       ((struct mlx5_wqe_ctrl_seg *)wqe)->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE;
        uint64_t *doorbell_addr =
                (uint64_t *)((uint8_t *)uar->base_addr + 0x800);
        rte_io_wmb();
-       sq->sq_obj.db_rec[MLX5_SND_DBR] = rte_cpu_to_be_32((sq->db_pi + 1) &
-                                                MLX5_REGEX_MAX_WQE_INDEX);
+       sq->sq_obj.db_rec[MLX5_SND_DBR] = rte_cpu_to_be_32((priv->has_umr ?
+                                       (sq->db_pi * 4 + 3) : sq->db_pi) &
+                                       MLX5_REGEX_MAX_WQE_INDEX);
        rte_wmb();
        *doorbell_addr = *(volatile uint64_t *)wqe;
        rte_wmb();
 }
 
 static inline int
-can_send(struct mlx5_regex_sq *sq) {
-       return ((uint16_t)(sq->pi - sq->ci) < sq_size_get(sq));
+get_free(struct mlx5_regex_sq *sq, uint8_t has_umr) {
+       return (sq_size_get(sq) - ((sq->pi - sq->ci) &
+                       (has_umr ? (MLX5_REGEX_MAX_WQE_INDEX >> 2) :
+                       MLX5_REGEX_MAX_WQE_INDEX)));
 }
 
 static inline uint32_t
@@ -174,6 +203,211 @@ job_id_get(uint32_t qid, size_t sq_size, size_t index) {
        return qid * sq_size + (index & (sq_size - 1));
 }
 
+#ifdef HAVE_MLX5_UMR_IMKEY
+static inline int
+mkey_klm_available(struct mlx5_klm *klm, uint32_t pos, uint32_t new)
+{
+       return (klm && ((pos + new) <= MLX5_REGEX_MAX_KLM_NUM));
+}
+
+static inline void
+complete_umr_wqe(struct mlx5_regex_qp *qp, struct mlx5_regex_sq *sq,
+                struct mlx5_regex_job *mkey_job,
+                size_t umr_index, uint32_t klm_size, uint32_t total_len)
+{
+       size_t wqe_offset = (umr_index & (sq_size_get(sq) - 1)) *
+               (MLX5_SEND_WQE_BB * 4);
+       struct mlx5_wqe_ctrl_seg *wqe = (struct mlx5_wqe_ctrl_seg *)((uint8_t *)
+                                  (uintptr_t)sq->sq_obj.wqes + wqe_offset);
+       struct mlx5_wqe_umr_ctrl_seg *ucseg =
+                               (struct mlx5_wqe_umr_ctrl_seg *)(wqe + 1);
+       struct mlx5_wqe_mkey_context_seg *mkc =
+                               (struct mlx5_wqe_mkey_context_seg *)(ucseg + 1);
+       struct mlx5_klm *iklm = (struct mlx5_klm *)(mkc + 1);
+       uint16_t klm_align = RTE_ALIGN(klm_size, 4);
+
+       memset(wqe, 0, MLX5_REGEX_UMR_WQE_SIZE);
+       /* Set WQE control seg. Non-inline KLM UMR WQE size must be 9 WQE_DS. */
+       set_wqe_ctrl_seg(wqe, (umr_index * 4), MLX5_OPCODE_UMR,
+                        0, sq->sq_obj.sq->id, 0, 9, 0,
+                        rte_cpu_to_be_32(mkey_job->imkey->id));
+       /* Set UMR WQE control seg. */
+       ucseg->mkey_mask |= rte_cpu_to_be_64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN |
+                               MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET |
+                               MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE);
+       ucseg->klm_octowords = rte_cpu_to_be_16(klm_align);
+       /* Set mkey context seg. */
+       mkc->len = rte_cpu_to_be_64(total_len);
+       mkc->qpn_mkey = rte_cpu_to_be_32(0xffffff00 |
+                                       (mkey_job->imkey->id & 0xff));
+       /* Set UMR pointer to data seg. */
+       iklm->address = rte_cpu_to_be_64
+                               ((uintptr_t)((char *)mkey_job->imkey_array));
+       iklm->mkey = rte_cpu_to_be_32(qp->imkey_addr->lkey);
+       iklm->byte_count = rte_cpu_to_be_32(klm_align);
+       /* Clear the padding memory. */
+       memset((uint8_t *)&mkey_job->imkey_array[klm_size], 0,
+              sizeof(struct mlx5_klm) * (klm_align - klm_size));
+
+       /* Add the following RegEx WQE with fence. */
+       wqe = (struct mlx5_wqe_ctrl_seg *)
+                               (((uint8_t *)wqe) + MLX5_REGEX_UMR_WQE_SIZE);
+       wqe->fm_ce_se |= MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
+}
+
+static inline void
+prep_nop_regex_wqe_set(struct mlx5_regex_priv *priv, struct mlx5_regex_sq *sq,
+                      struct rte_regex_ops *op, struct mlx5_regex_job *job,
+                      size_t pi, struct mlx5_klm *klm)
+{
+       size_t wqe_offset = (pi & (sq_size_get(sq) - 1)) *
+                           (MLX5_SEND_WQE_BB << 2);
+       struct mlx5_wqe_ctrl_seg *wqe = (struct mlx5_wqe_ctrl_seg *)((uint8_t *)
+                                  (uintptr_t)sq->sq_obj.wqes + wqe_offset);
+
+       /* Clear the WQE memory used as UMR WQE previously. */
+       if ((rte_be_to_cpu_32(wqe->opmod_idx_opcode) & 0xff) != MLX5_OPCODE_NOP)
+               memset(wqe, 0, MLX5_REGEX_UMR_WQE_SIZE);
+       /* UMR WQE size is 9 DS, align nop WQE to 3 WQEBBS(12 DS). */
+       set_wqe_ctrl_seg(wqe, pi * 4, MLX5_OPCODE_NOP, 0, sq->sq_obj.sq->id,
+                        0, 12, 0, 0);
+       __prep_one(priv, sq, op, job, pi, klm);
+}
+
+static inline void
+prep_regex_umr_wqe_set(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
+        struct mlx5_regex_sq *sq, struct rte_regex_ops **op, size_t nb_ops)
+{
+       struct mlx5_regex_job *job = NULL;
+       size_t sqid = sq->sqn, mkey_job_id = 0;
+       size_t left_ops = nb_ops;
+       uint32_t klm_num = 0, len;
+       struct mlx5_klm *mkey_klm = NULL;
+       struct mlx5_klm klm;
+
+       sqid = sq->sqn;
+       while (left_ops--)
+               rte_prefetch0(op[left_ops]);
+       left_ops = nb_ops;
+       /*
+        * Build the WQE set by reverse. In case the burst may consume
+        * multiple mkeys, build the WQE set as normal will hard to
+        * address the last mkey index, since we will only know the last
+        * RegEx WQE's index when finishes building.
+        */
+       while (left_ops--) {
+               struct rte_mbuf *mbuf = op[left_ops]->mbuf;
+               size_t pi = MLX5_REGEX_UMR_SQ_PI_IDX(sq->pi, left_ops);
+
+               if (mbuf->nb_segs > 1) {
+                       size_t scatter_size = 0;
+
+                       if (!mkey_klm_available(mkey_klm, klm_num,
+                                               mbuf->nb_segs)) {
+                               /*
+                                * The mkey's KLM is full, create the UMR
+                                * WQE in the next WQE set.
+                                */
+                               if (mkey_klm)
+                                       complete_umr_wqe(qp, sq,
+                                               &qp->jobs[mkey_job_id],
+                                               MLX5_REGEX_UMR_SQ_PI_IDX(pi, 1),
+                                               klm_num, len);
+                               /*
+                                * Get the indircet mkey and KLM array index
+                                * from the last WQE set.
+                                */
+                               mkey_job_id = job_id_get(sqid,
+                                                        sq_size_get(sq), pi);
+                               mkey_klm = qp->jobs[mkey_job_id].imkey_array;
+                               klm_num = 0;
+                               len = 0;
+                       }
+                       /* Build RegEx WQE's data segment KLM. */
+                       klm.address = len;
+                       klm.mkey = rte_cpu_to_be_32
+                                       (qp->jobs[mkey_job_id].imkey->id);
+                       while (mbuf) {
+                               /* Build indirect mkey seg's KLM. */
+                               mkey_klm->mkey = mlx5_mr_addr2mr_bh(priv->pd,
+                                       NULL, &priv->mr_scache, &qp->mr_ctrl,
+                                       rte_pktmbuf_mtod(mbuf, uintptr_t),
+                                       !!(mbuf->ol_flags & EXT_ATTACHED_MBUF));
+                               mkey_klm->address = rte_cpu_to_be_64
+                                       (rte_pktmbuf_mtod(mbuf, uintptr_t));
+                               mkey_klm->byte_count = rte_cpu_to_be_32
+                                               (rte_pktmbuf_data_len(mbuf));
+                               /*
+                                * Save the mbuf's total size for RegEx data
+                                * segment.
+                                */
+                               scatter_size += rte_pktmbuf_data_len(mbuf);
+                               mkey_klm++;
+                               klm_num++;
+                               mbuf = mbuf->next;
+                       }
+                       len += scatter_size;
+                       klm.byte_count = scatter_size;
+               } else {
+                       /* The single mubf case. Build the KLM directly. */
+                       klm.mkey = mlx5_mr_addr2mr_bh(priv->pd, NULL,
+                                       &priv->mr_scache, &qp->mr_ctrl,
+                                       rte_pktmbuf_mtod(mbuf, uintptr_t),
+                                       !!(mbuf->ol_flags & EXT_ATTACHED_MBUF));
+                       klm.address = rte_pktmbuf_mtod(mbuf, uintptr_t);
+                       klm.byte_count = rte_pktmbuf_data_len(mbuf);
+               }
+               job = &qp->jobs[job_id_get(sqid, sq_size_get(sq), pi)];
+               /*
+                * Build the nop + RegEx WQE set by default. The fist nop WQE
+                * will be updated later as UMR WQE if scattered mubf exist.
+                */
+               prep_nop_regex_wqe_set(priv, sq, op[left_ops], job, pi, &klm);
+       }
+       /*
+        * Scattered mbuf have been added to the KLM array. Complete the build
+        * of UMR WQE, update the first nop WQE as UMR WQE.
+        */
+       if (mkey_klm)
+               complete_umr_wqe(qp, sq, &qp->jobs[mkey_job_id], sq->pi,
+                                klm_num, len);
+       sq->db_pi = MLX5_REGEX_UMR_SQ_PI_IDX(sq->pi, nb_ops - 1);
+       sq->pi = MLX5_REGEX_UMR_SQ_PI_IDX(sq->pi, nb_ops);
+}
+
+uint16_t
+mlx5_regexdev_enqueue_gga(struct rte_regexdev *dev, uint16_t qp_id,
+                         struct rte_regex_ops **ops, uint16_t nb_ops)
+{
+       struct mlx5_regex_priv *priv = dev->data->dev_private;
+       struct mlx5_regex_qp *queue = &priv->qps[qp_id];
+       struct mlx5_regex_sq *sq;
+       size_t sqid, nb_left = nb_ops, nb_desc;
+
+       while ((sqid = ffs(queue->free_sqs))) {
+               sqid--; /* ffs returns 1 for bit 0 */
+               sq = &queue->sqs[sqid];
+               nb_desc = get_free(sq, priv->has_umr);
+               if (nb_desc) {
+                       /* The ops be handled can't exceed nb_ops. */
+                       if (nb_desc > nb_left)
+                               nb_desc = nb_left;
+                       else
+                               queue->free_sqs &= ~(1 << sqid);
+                       prep_regex_umr_wqe_set(priv, queue, sq, ops, nb_desc);
+                       send_doorbell(priv, sq);
+                       nb_left -= nb_desc;
+               }
+               if (!nb_left)
+                       break;
+               ops += nb_desc;
+       }
+       nb_ops -= nb_left;
+       queue->pi += nb_ops;
+       return nb_ops;
+}
+#endif
+
 uint16_t
 mlx5_regexdev_enqueue(struct rte_regexdev *dev, uint16_t qp_id,
                      struct rte_regex_ops **ops, uint16_t nb_ops)
@@ -186,17 +420,17 @@ mlx5_regexdev_enqueue(struct rte_regexdev *dev, uint16_t qp_id,
        while ((sqid = ffs(queue->free_sqs))) {
                sqid--; /* ffs returns 1 for bit 0 */
                sq = &queue->sqs[sqid];
-               while (can_send(sq)) {
+               while (get_free(sq, priv->has_umr)) {
                        job_id = job_id_get(sqid, sq_size_get(sq), sq->pi);
                        prep_one(priv, queue, sq, ops[i], &queue->jobs[job_id]);
                        i++;
                        if (unlikely(i == nb_ops)) {
-                               send_doorbell(priv->uar, sq);
+                               send_doorbell(priv, sq);
                                goto out;
                        }
                }
                queue->free_sqs &= ~(1 << sqid);
-               send_doorbell(priv->uar, sq);
+               send_doorbell(priv, sq);
        }
 
 out:
@@ -308,6 +542,10 @@ mlx5_regexdev_dequeue(struct rte_regexdev *dev, uint16_t qp_id,
                          MLX5_REGEX_MAX_WQE_INDEX;
                size_t sqid = cqe->rsvd3[2];
                struct mlx5_regex_sq *sq = &queue->sqs[sqid];
+
+               /* UMR mode WQE counter move as WQE set(4 WQEBBS).*/
+               if (priv->has_umr)
+                       wq_counter >>= 2;
                while (sq->ci != wq_counter) {
                        if (unlikely(i == nb_ops)) {
                                /* Return without updating cq->ci */
@@ -316,7 +554,9 @@ mlx5_regexdev_dequeue(struct rte_regexdev *dev, uint16_t qp_id,
                        uint32_t job_id = job_id_get(sqid, sq_size_get(sq),
                                                     sq->ci);
                        extract_result(ops[i], &queue->jobs[job_id]);
-                       sq->ci = (sq->ci + 1) & MLX5_REGEX_MAX_WQE_INDEX;
+                       sq->ci = (sq->ci + 1) & (priv->has_umr ?
+                                (MLX5_REGEX_MAX_WQE_INDEX >> 2) :
+                                 MLX5_REGEX_MAX_WQE_INDEX);
                        i++;
                }
                cq->ci = (cq->ci + 1) & 0xffffff;
@@ -331,7 +571,7 @@ out:
 }
 
 static void
-setup_sqs(struct mlx5_regex_qp *queue)
+setup_sqs(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *queue)
 {
        size_t sqid, entry;
        uint32_t job_id;
@@ -342,6 +582,14 @@ setup_sqs(struct mlx5_regex_qp *queue)
                        job_id = sqid * sq_size_get(sq) + entry;
                        struct mlx5_regex_job *job = &queue->jobs[job_id];
 
+                       /* Fill UMR WQE with NOP in advanced. */
+                       if (priv->has_umr) {
+                               set_wqe_ctrl_seg
+                                       ((struct mlx5_wqe_ctrl_seg *)wqe,
+                                        entry * 2, MLX5_OPCODE_NOP, 0,
+                                        sq->sq_obj.sq->id, 0, 12, 0, 0);
+                               wqe += MLX5_REGEX_UMR_WQE_SIZE;
+                       }
                        set_metadata_seg((struct mlx5_wqe_metadata_seg *)
                                         (wqe + MLX5_REGEX_WQE_METADATA_OFFSET),
                                         0, queue->metadata->lkey,
@@ -358,8 +606,9 @@ setup_sqs(struct mlx5_regex_qp *queue)
 }
 
 static int
-setup_buffers(struct mlx5_regex_qp *qp, struct ibv_pd *pd)
+setup_buffers(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp)
 {
+       struct ibv_pd *pd = priv->pd;
        uint32_t i;
        int err;
 
@@ -395,6 +644,24 @@ setup_buffers(struct mlx5_regex_qp *qp, struct ibv_pd *pd)
                goto err_output;
        }
 
+       if (priv->has_umr) {
+               ptr = rte_calloc(__func__, qp->nb_desc, MLX5_REGEX_KLMS_SIZE,
+                                MLX5_REGEX_KLMS_SIZE);
+               if (!ptr) {
+                       err = -ENOMEM;
+                       goto err_imkey;
+               }
+               qp->imkey_addr = mlx5_glue->reg_mr(pd, ptr,
+                                       MLX5_REGEX_KLMS_SIZE * qp->nb_desc,
+                                       IBV_ACCESS_LOCAL_WRITE);
+               if (!qp->imkey_addr) {
+                       rte_free(ptr);
+                       DRV_LOG(ERR, "Failed to register output");
+                       err = -EINVAL;
+                       goto err_imkey;
+               }
+       }
+
        /* distribute buffers to jobs */
        for (i = 0; i < qp->nb_desc; i++) {
                qp->jobs[i].output =
@@ -403,9 +670,18 @@ setup_buffers(struct mlx5_regex_qp *qp, struct ibv_pd *pd)
                qp->jobs[i].metadata =
                        (uint8_t *)qp->metadata->addr +
                        (i % qp->nb_desc) * MLX5_REGEX_METADATA_SIZE;
+               if (qp->imkey_addr)
+                       qp->jobs[i].imkey_array = (struct mlx5_klm *)
+                               qp->imkey_addr->addr +
+                               (i % qp->nb_desc) * MLX5_REGEX_MAX_KLM_NUM;
        }
+
        return 0;
 
+err_imkey:
+       ptr = qp->outputs->addr;
+       rte_free(ptr);
+       mlx5_glue->dereg_mr(qp->outputs);
 err_output:
        ptr = qp->metadata->addr;
        rte_free(ptr);
@@ -417,23 +693,57 @@ int
 mlx5_regexdev_setup_fastpath(struct mlx5_regex_priv *priv, uint32_t qp_id)
 {
        struct mlx5_regex_qp *qp = &priv->qps[qp_id];
-       int err;
+       struct mlx5_klm klm = { 0 };
+       struct mlx5_devx_mkey_attr attr = {
+               .klm_array = &klm,
+               .klm_num = 1,
+               .umr_en = 1,
+       };
+       uint32_t i;
+       int err = 0;
 
        qp->jobs = rte_calloc(__func__, qp->nb_desc, sizeof(*qp->jobs), 64);
        if (!qp->jobs)
                return -ENOMEM;
-       err = setup_buffers(qp, priv->pd);
+       err = setup_buffers(priv, qp);
        if (err) {
                rte_free(qp->jobs);
                return err;
        }
-       setup_sqs(qp);
-       return 0;
+
+       setup_sqs(priv, qp);
+
+       if (priv->has_umr) {
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+               if (regex_get_pdn(priv->pd, &attr.pd)) {
+                       err = -rte_errno;
+                       DRV_LOG(ERR, "Failed to get pdn.");
+                       mlx5_regexdev_teardown_fastpath(priv, qp_id);
+                       return err;
+               }
+#endif
+               for (i = 0; i < qp->nb_desc; i++) {
+                       attr.klm_num = MLX5_REGEX_MAX_KLM_NUM;
+                       attr.klm_array = qp->jobs[i].imkey_array;
+                       qp->jobs[i].imkey = mlx5_devx_cmd_mkey_create(priv->ctx,
+                                                                     &attr);
+                       if (!qp->jobs[i].imkey) {
+                               err = -rte_errno;
+                               DRV_LOG(ERR, "Failed to allocate imkey.");
+                               mlx5_regexdev_teardown_fastpath(priv, qp_id);
+                       }
+               }
+       }
+       return err;
 }
 
 static void
 free_buffers(struct mlx5_regex_qp *qp)
 {
+       if (qp->imkey_addr) {
+               mlx5_glue->dereg_mr(qp->imkey_addr);
+               rte_free(qp->imkey_addr->addr);
+       }
        if (qp->metadata) {
                mlx5_glue->dereg_mr(qp->metadata);
                rte_free(qp->metadata->addr);
@@ -448,8 +758,14 @@ void
 mlx5_regexdev_teardown_fastpath(struct mlx5_regex_priv *priv, uint32_t qp_id)
 {
        struct mlx5_regex_qp *qp = &priv->qps[qp_id];
+       uint32_t i;
 
        if (qp) {
+               for (i = 0; i < qp->nb_desc; i++) {
+                       if (qp->jobs[i].imkey)
+                               claim_zero(mlx5_devx_cmd_destroy
+                                                       (qp->jobs[i].imkey));
+               }
                free_buffers(qp);
                if (qp->jobs)
                        rte_free(qp->jobs);