net/mlx5: fix Rx descriptor status

[dpdk.git] / drivers / net / mlx5 / mlx5_rxtx.c
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c

index d8f6671..2ecf901 100644 (file)
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -3,22 +3,10 @@
   * Copyright 2015-2019 Mellanox Technologies, Ltd
   */
  
-#include <assert.h>
  #include <stdint.h>
  #include <string.h>
  #include <stdlib.h>
  
-/* Verbs header. */
-/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <infiniband/verbs.h>
-#include <infiniband/mlx5dv.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
-
  #include <rte_mbuf.h>
  #include <rte_mempool.h>
  #include <rte_prefetch.h>
@@ -28,14 +16,15 @@
  #include <rte_cycles.h>
  #include <rte_flow.h>
  
-#include <mlx5_devx_cmds.h>
  #include <mlx5_prm.h>
+#include <mlx5_common.h>
  
+#include "mlx5_autoconf.h"
  #include "mlx5_defs.h"
  #include "mlx5.h"
+#include "mlx5_mr.h"
  #include "mlx5_utils.h"
  #include "mlx5_rxtx.h"
-#include "mlx5_autoconf.h"
  
  /* TX burst subroutines return codes. */
  enum mlx5_txcmp_code {
@@ -65,6 +54,7 @@ enum mlx5_txcmp_code {
  #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
  #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
  #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/
+#define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp.*/
  
  /* The most common offloads groups. */
  #define MLX5_TXOFF_CONFIG_NONE 0
@@ -90,7 +80,8 @@ static uint16_t mlx5_tx_burst_##func(void *txq, \
  #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
  
  static __rte_always_inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+                                  volatile struct mlx5_mini_cqe8 *mcqe);
  
  static __rte_always_inline int
  mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
@@ -101,25 +92,23 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
  
  static __rte_always_inline void
  rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-              volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
-
-static __rte_always_inline void
-mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
-                const unsigned int strd_n);
+              volatile struct mlx5_cqe *cqe,
+              volatile struct mlx5_mini_cqe8 *mcqe);
  
  static int
  mlx5_queue_state_modify(struct rte_eth_dev *dev,
                         struct mlx5_mp_arg_queue_state_modify *sm);
  
  static inline void
-mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
-                       volatile struct mlx5_cqe *restrict cqe,
-                       uint32_t phcsum);
+mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
+                       volatile struct mlx5_cqe *__rte_restrict cqe,
+                       uint32_t phcsum, uint8_t l4_type);
  
  static inline void
-mlx5_lro_update_hdr(uint8_t *restrict padd,
-                   volatile struct mlx5_cqe *restrict cqe,
-                   uint32_t len);
+mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
+                   volatile struct mlx5_cqe *__rte_restrict cqe,
+                       volatile struct mlx5_mini_cqe8 *mcqe,
+                   struct mlx5_rxq_data *rxq, uint32_t len);
  
  uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
         [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
@@ -128,6 +117,9 @@ uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
  uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
  uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
  
+uint64_t rte_net_mlx5_dynf_inline_mask;
+#define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
+
  /**
   * Build a table to translate Rx completion flags to packet type.
   *
@@ -370,7 +362,7 @@ mlx5_set_swp_types_table(void)
   *   Software Parser flags are set by pointer.
   */
  static __rte_always_inline uint32_t
-txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc,
+txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc,
                 uint8_t *swp_flags,
                 unsigned int olx)
  {
@@ -471,13 +463,15 @@ rx_queue_count(struct mlx5_rxq_data *rxq)
         struct rxq_zip *zip = &rxq->zip;
         volatile struct mlx5_cqe *cqe;
         const unsigned int cqe_n = (1 << rxq->cqe_n);
+       const unsigned int sges_n = (1 << rxq->sges_n);
+       const unsigned int elts_n = (1 << rxq->elts_n);
+       const unsigned int strd_n = (1 << rxq->strd_num_n);
         const unsigned int cqe_cnt = cqe_n - 1;
-       unsigned int cq_ci;
-       unsigned int used;
+       unsigned int cq_ci, used;
  
         /* if we are processing a compressed cqe */
         if (zip->ai) {
-               used = zip->cqe_cnt - zip->ca;
+               used = zip->cqe_cnt - zip->ai;
                 cq_ci = zip->cq_ci;
         } else {
                 used = 0;
@@ -497,7 +491,7 @@ rx_queue_count(struct mlx5_rxq_data *rxq)
                 used += n;
                 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
         }
-       used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
+       used = RTE_MIN(used * sges_n, elts_n * strd_n);
         return used;
  }
  
@@ -520,11 +514,12 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
                         container_of(rxq, struct mlx5_rxq_ctrl, rxq);
         struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
  
-       if (dev->rx_pkt_burst != mlx5_rx_burst) {
+       if (dev->rx_pkt_burst == NULL ||
+           dev->rx_pkt_burst == removed_rx_burst) {
                 rte_errno = ENOTSUP;
                 return -rte_errno;
         }
-       if (offset >= (1 << rxq->elts_n)) {
+       if (offset >= (1 << rxq->cqe_n)) {
                 rte_errno = EINVAL;
                 return -rte_errno;
         }
@@ -533,6 +528,108 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
         return RTE_ETH_RX_DESC_AVAIL;
  }
  
+/**
+ * DPDK callback to get the Rx queue information.
+ *
+ * @param dev
+ *   Pointer to the device structure.
+ * @param rx_queue_id
+ *   Rx queue identifier, used as index into the private Rx queue array.
+ * @param qinfo
+ *   Pointer to the Rx queue information structure to fill; it is left
+ *   untouched when no queue data is set for rx_queue_id.
+ *
+ * @return
+ *   None.
+ */
+
+void
+mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+                 struct rte_eth_rxq_info *qinfo)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
+       struct mlx5_rxq_ctrl *rxq_ctrl =
+               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+       if (!rxq)
+               return;
+       qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ?
+                                       rxq->mprq_mp : rxq->mp;
+       qinfo->conf.rx_thresh.pthresh = 0;
+       qinfo->conf.rx_thresh.hthresh = 0;
+       qinfo->conf.rx_thresh.wthresh = 0;
+       qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh;
+       qinfo->conf.rx_drop_en = 1;
+       /* NOTE(review): rxq_ctrl comes from container_of(); can it be NULL? */
+       qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1;
+       qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+       qinfo->scattered_rx = dev->data->scattered_rx;
+       qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ?
+               (1 << rxq->elts_n) * (1 << rxq->strd_num_n) :
+               (1 << rxq->elts_n);
+}
+
+/**
+ * DPDK callback to get the Rx packet burst mode information.
+ *
+ * @param dev
+ *   Pointer to the device structure.
+ * @param rx_queue_id
+ *   Rx queue identifier.
+ * @param mode
+ *   Pointer to the burst mode information; mode->info receives the name.
+ *
+ * @return
+ *   0 on success, -EINVAL otherwise and rte_errno is set.
+ */
+
+int
+mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+                      struct rte_eth_burst_mode *mode)
+{
+       eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
+
+       if (!rxq) {
+               rte_errno = EINVAL;
+               return -rte_errno;
+       }
+       if (pkt_burst == mlx5_rx_burst) {
+               snprintf(mode->info, sizeof(mode->info), "%s", "Scalar");
+       } else if (pkt_burst == mlx5_rx_burst_mprq) {
+               snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ");
+       } else if (pkt_burst == mlx5_rx_burst_vec) {
+#if defined RTE_ARCH_X86_64
+               snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE");
+#elif defined RTE_ARCH_ARM64
+               snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+               snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
+#else
+               rte_errno = EINVAL;
+               return -rte_errno;
+#endif
+       } else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
+#if defined RTE_ARCH_X86_64
+               snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE");
+#elif defined RTE_ARCH_ARM64
+               snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+               snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
+#else
+               rte_errno = EINVAL;
+               return -rte_errno;
+#endif
+       } else {
+               /* Burst routine is none of the Rx paths named above. */
+               rte_errno = EINVAL;
+               return -rte_errno;
+       }
+       return 0;
+}
+
  /**
   * DPDK callback to get the number of used descriptors in a RX queue
   *
@@ -552,7 +649,8 @@ mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
         struct mlx5_priv *priv = dev->data->dev_private;
         struct mlx5_rxq_data *rxq;
  
-       if (dev->rx_pkt_burst != mlx5_rx_burst) {
+       if (dev->rx_pkt_burst == NULL ||
+           dev->rx_pkt_burst == removed_rx_burst) {
                 rte_errno = ENOTSUP;
                 return -rte_errno;
         }
@@ -660,7 +758,7 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
   *   the error completion entry is handled successfully.
   */
  static int
-mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
+mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
                          volatile struct mlx5_err_cqe *err_cqe)
  {
         if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
@@ -727,12 +825,19 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
   *   Packet type for struct rte_mbuf.
   */
  static inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+                                  volatile struct mlx5_mini_cqe8 *mcqe)
  {
         uint8_t idx;
-       uint8_t pinfo = cqe->pkt_info;
-       uint16_t ptype = cqe->hdr_type_etc;
+       uint8_t ptype;
+       uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
  
+       /* Get L3/L4 header from mini-CQE in case of L3/L4 format. */
+       if (mcqe == NULL ||
+           rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+               ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
+       else
+               ptype = mcqe->hdr_type >> 2;
         /*
          * The index to the array should have:
          * bit[1:0] = l3_hdr_type
@@ -741,7 +846,7 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
          * bit[6] = tunneled
          * bit[7] = outer_l3_type
          */
-       idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+       idx = pinfo | ptype;
         return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
  }
  
@@ -780,7 +885,7 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
                         byte_count = DATA_LEN(buf);
                 }
                 /* scat->addr must be able to store a pointer. */
-               assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+               MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
                 *scat = (struct mlx5_wqe_data_seg){
                         .addr = rte_cpu_to_be_64(addr),
                         .byte_count = rte_cpu_to_be_32(byte_count),
@@ -793,9 +898,11 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
         rxq->zip = (struct rxq_zip){
                 .ai = 0,
         };
+       rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ?
+               (wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0;
         /* Update doorbell counter. */
         rxq->rq_ci = wqe_n >> rxq->sges_n;
-       rte_cio_wmb();
+       rte_io_wmb();
         *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
  }
  
@@ -823,30 +930,7 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
                 struct mlx5_rxq_ctrl *rxq_ctrl =
                         container_of(rxq, struct mlx5_rxq_ctrl, rxq);
  
-               if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
-                       struct ibv_wq_attr mod = {
-                               .attr_mask = IBV_WQ_ATTR_STATE,
-                               .wq_state = sm->state,
-                       };
-
-                       ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod);
-               } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. */
-                       struct mlx5_devx_modify_rq_attr rq_attr;
-
-                       memset(&rq_attr, 0, sizeof(rq_attr));
-                       if (sm->state == IBV_WQS_RESET) {
-                               rq_attr.rq_state = MLX5_RQC_STATE_ERR;
-                               rq_attr.state = MLX5_RQC_STATE_RST;
-                       } else if (sm->state == IBV_WQS_RDY) {
-                               rq_attr.rq_state = MLX5_RQC_STATE_RST;
-                               rq_attr.state = MLX5_RQC_STATE_RDY;
-                       } else if (sm->state == IBV_WQS_ERR) {
-                               rq_attr.rq_state = MLX5_RQC_STATE_RDY;
-                               rq_attr.state = MLX5_RQC_STATE_ERR;
-                       }
-                       ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq,
-                                                     &rq_attr);
-               }
+               ret = priv->obj_ops.rxq_obj_modify(rxq_ctrl->obj, sm->state);
                 if (ret) {
                         DRV_LOG(ERR, "Cannot change Rx WQ state to %u  - %s",
                                         sm->state, strerror(errno));
@@ -857,44 +941,12 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
                 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
                 struct mlx5_txq_ctrl *txq_ctrl =
                         container_of(txq, struct mlx5_txq_ctrl, txq);
-               struct ibv_qp_attr mod = {
-                       .qp_state = IBV_QPS_RESET,
-                       .port_num = (uint8_t)priv->ibv_port,
-               };
-               struct ibv_qp *qp = txq_ctrl->obj->qp;
  
-               ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
-               if (ret) {
-                       DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
-                               "%s", strerror(errno));
-                       rte_errno = errno;
-                       return ret;
-               }
-               mod.qp_state = IBV_QPS_INIT;
-               ret = mlx5_glue->modify_qp(qp, &mod,
-                                          (IBV_QP_STATE | IBV_QP_PORT));
-               if (ret) {
-                       DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s",
-                               strerror(errno));
-                       rte_errno = errno;
-                       return ret;
-               }
-               mod.qp_state = IBV_QPS_RTR;
-               ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
-               if (ret) {
-                       DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s",
-                               strerror(errno));
-                       rte_errno = errno;
+               ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj,
+                                                  MLX5_TXQ_MOD_ERR2RDY,
+                                                  (uint8_t)priv->dev_port);
+               if (ret)
                         return ret;
-               }
-               mod.qp_state = IBV_QPS_RTS;
-               ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
-               if (ret) {
-                       DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s",
-                               strerror(errno));
-                       rte_errno = errno;
-                       return ret;
-               }
         }
         return 0;
  }
@@ -914,6 +966,7 @@ static int
  mlx5_queue_state_modify(struct rte_eth_dev *dev,
                         struct mlx5_mp_arg_queue_state_modify *sm)
  {
+       struct mlx5_priv *priv = dev->data->dev_private;
         int ret = 0;
  
         switch (rte_eal_process_type()) {
@@ -921,7 +974,7 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev,
                 ret = mlx5_queue_state_modify_primary(dev, sm);
                 break;
         case RTE_PROC_SECONDARY:
-               ret = mlx5_mp_req_queue_state_modify(dev, sm);
+               ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm);
                 break;
         default:
                 break;
@@ -950,7 +1003,8 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
  {
         const uint16_t cqe_n = 1 << rxq->cqe_n;
         const uint16_t cqe_mask = cqe_n - 1;
-       const unsigned int wqe_n = 1 << rxq->elts_n;
+       const uint16_t wqe_n = 1 << rxq->elts_n;
+       const uint16_t strd_n = 1 << rxq->strd_num_n;
         struct mlx5_rxq_ctrl *rxq_ctrl =
                         container_of(rxq, struct mlx5_rxq_ctrl, rxq);
         union {
@@ -998,15 +1052,15 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
         case MLX5_RXQ_ERR_STATE_NEED_READY:
                 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
                 if (ret == MLX5_CQE_STATUS_HW_OWN) {
-                       rte_cio_wmb();
+                       rte_io_wmb();
                         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
-                       rte_cio_wmb();
+                       rte_io_wmb();
                         /*
                          * The RQ consumer index must be zeroed while moving
                          * from RESET state to RDY state.
                          */
                         *rxq->rq_db = rte_cpu_to_be_32(0);
-                       rte_cio_wmb();
+                       rte_io_wmb();
                         sm.is_wq = 1;
                         sm.queue_id = rxq->idx;
                         sm.state = IBV_WQS_RDY;
@@ -1014,21 +1068,27 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
                                                     &sm))
                                 return -1;
                         if (vec) {
-                               const uint16_t q_mask = wqe_n - 1;
-                               uint16_t elt_idx;
+                               const uint32_t elts_n =
+                                       mlx5_rxq_mprq_enabled(rxq) ?
+                                       wqe_n * strd_n : wqe_n;
+                               const uint32_t e_mask = elts_n - 1;
+                               uint32_t elts_ci =
+                                       mlx5_rxq_mprq_enabled(rxq) ?
+                                       rxq->elts_ci : rxq->rq_ci;
+                               uint32_t elt_idx;
                                 struct rte_mbuf **elt;
                                 int i;
-                               unsigned int n = wqe_n - (rxq->rq_ci -
+                               unsigned int n = elts_n - (elts_ci -
                                                           rxq->rq_pi);
  
                                 for (i = 0; i < (int)n; ++i) {
-                                       elt_idx = (rxq->rq_ci + i) & q_mask;
+                                       elt_idx = (elts_ci + i) & e_mask;
                                         elt = &(*rxq->elts)[elt_idx];
                                         *elt = rte_mbuf_raw_alloc(rxq->mp);
                                         if (!*elt) {
                                                 for (i--; i >= 0; --i) {
-                                                       elt_idx = (rxq->rq_ci +
-                                                                  i) & q_mask;
+                                                       elt_idx = (elts_ci +
+                                                                  i) & e_mask;
                                                         elt = &(*rxq->elts)
                                                                 [elt_idx];
                                                         rte_pktmbuf_free_seg
@@ -1037,7 +1097,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
                                                 return -1;
                                         }
                                 }
-                               for (i = 0; i < (int)wqe_n; ++i) {
+                               for (i = 0; i < (int)elts_n; ++i) {
                                         elt = &(*rxq->elts)[i];
                                         DATA_LEN(*elt) =
                                                 (uint16_t)((*elt)->buf_len -
@@ -1045,7 +1105,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
                                 }
                                 /* Padding with a fake mbuf for vec Rx. */
                                 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-                                       (*rxq->elts)[wqe_n + i] =
+                                       (*rxq->elts)[elts_n + i] =
                                                                 &rxq->fake_mbuf;
                         }
                         mlx5_rxq_initialize(rxq);
@@ -1090,8 +1150,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
                                 (volatile struct mlx5_mini_cqe8 (*)[8])
                                 (uintptr_t)(&(*rxq->cqes)[zip->ca &
                                                           cqe_cnt].pkt_info);
-
-                       len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+                       len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
+                                              rxq->byte_mask);
                         *mcqe = &(*mc)[zip->ai & 7];
                         if ((++zip->ai & 7) == 0) {
                                 /* Invalidate consumed CQEs */
@@ -1130,6 +1190,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
                 } else {
                         int ret;
                         int8_t op_own;
+                       uint32_t cq_ci;
  
                         ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
                         if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
@@ -1143,14 +1204,19 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
                                         return 0;
                                 }
                         }
-                       ++rxq->cq_ci;
+                       /*
+                        * Introduce the local variable to have queue cq_ci
+                        * index in queue structure always consistent with
+                        * actual CQE boundary (not pointing to the middle
+                        * of compressed CQE session).
+                        */
+                       cq_ci = rxq->cq_ci + 1;
                         op_own = cqe->op_own;
                         if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
                                 volatile struct mlx5_mini_cqe8 (*mc)[8] =
                                         (volatile struct mlx5_mini_cqe8 (*)[8])
                                         (uintptr_t)(&(*rxq->cqes)
-                                               [rxq->cq_ci &
-                                                cqe_cnt].pkt_info);
+                                               [cq_ci & cqe_cnt].pkt_info);
  
                                 /* Fix endianness. */
                                 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
@@ -1163,13 +1229,13 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
                                  * 7 CQEs after the initial CQE instead of 8
                                  * for subsequent ones.
                                  */
-                               zip->ca = rxq->cq_ci;
+                               zip->ca = cq_ci;
                                 zip->na = zip->ca + 7;
                                 /* Compute the next non compressed CQE. */
-                               --rxq->cq_ci;
                                 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
                                 /* Get packet size to return. */
-                               len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+                               len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
+                                                      rxq->byte_mask);
                                 *mcqe = &(*mc)[0];
                                 zip->ai = 1;
                                 /* Prefetch all to be invalidated */
@@ -1181,6 +1247,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
                                         ++idx;
                                 }
                         } else {
+                               rxq->cq_ci = cq_ci;
                                 len = rte_be_to_cpu_32(cqe->byte_cnt);
                         }
                 }
@@ -1233,38 +1300,73 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
   */
  static inline void
  rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-              volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
+              volatile struct mlx5_cqe *cqe,
+              volatile struct mlx5_mini_cqe8 *mcqe)
  {
         /* Update packet information. */
-       pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
-       if (rss_hash_res && rxq->rss_hash) {
-               pkt->hash.rss = rss_hash_res;
-               pkt->ol_flags |= PKT_RX_RSS_HASH;
+       pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
+
+       if (rxq->rss_hash) {
+               uint32_t rss_hash_res = 0;
+
+               /* If compressed, take hash result from mini-CQE. */
+               if (mcqe == NULL ||
+                   rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
+                       rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+               else
+                       rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
+               if (rss_hash_res) {
+                       pkt->hash.rss = rss_hash_res;
+                       pkt->ol_flags |= PKT_RX_RSS_HASH;
+               }
         }
-       if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
-               pkt->ol_flags |= PKT_RX_FDIR;
-               if (cqe->sop_drop_qpn !=
-                   rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
-                       uint32_t mark = cqe->sop_drop_qpn;
-
-                       pkt->ol_flags |= PKT_RX_FDIR_ID;
-                       pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+       if (rxq->mark) {
+               uint32_t mark = 0;
+
+               /* If compressed, take flow tag from mini-CQE. */
+               if (mcqe == NULL ||
+                   rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+                       mark = cqe->sop_drop_qpn;
+               else
+                       mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
+                               (mcqe->flow_tag_high << 16);
+               if (MLX5_FLOW_MARK_IS_VALID(mark)) {
+                       pkt->ol_flags |= PKT_RX_FDIR;
+                       if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) {
+                               pkt->ol_flags |= PKT_RX_FDIR_ID;
+                               pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
+                       }
                 }
         }
-       if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) {
-               pkt->ol_flags |= PKT_RX_DYNF_METADATA;
-               *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata;
+       if (rxq->dynf_meta && cqe->flow_table_metadata) {
+               pkt->ol_flags |= rxq->flow_meta_mask;
+               *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) =
+                       cqe->flow_table_metadata;
         }
         if (rxq->csum)
                 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
-       if (rxq->vlan_strip &&
-           (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
-               pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
-               pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+       if (rxq->vlan_strip) {
+               bool vlan_strip;
+
+               if (mcqe == NULL ||
+                   rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+                       vlan_strip = cqe->hdr_type_etc &
+                                    RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
+               else
+                       vlan_strip = mcqe->hdr_type &
+                                    RTE_BE16(MLX5_CQE_VLAN_STRIPPED);
+               if (vlan_strip) {
+                       pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+                       pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+               }
         }
         if (rxq->hw_timestamp) {
-               pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp);
-               pkt->ol_flags |= PKT_RX_TIMESTAMP;
+               uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
+
+               if (rxq->rt_timestamp)
+                       ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts);
+               mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts);
+               pkt->ol_flags |= rxq->timestamp_rx_flag;
         }
  }
  
@@ -1302,7 +1404,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                         &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
                 struct rte_mbuf *rep = (*rxq->elts)[idx];
                 volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-               uint32_t rss_hash_res;
  
                 if (pkt)
                         NEXT(seg) = rep;
@@ -1310,7 +1411,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                 rte_prefetch0(seg);
                 rte_prefetch0(cqe);
                 rte_prefetch0(wqe);
-               rep = rte_mbuf_raw_alloc(rxq->mp);
+               /* Allocate the buf from the same pool. */
+               rep = rte_mbuf_raw_alloc(seg->pool);
                 if (unlikely(rep == NULL)) {
                         ++rxq->stats.rx_nombuf;
                         if (!pkt) {
@@ -1321,7 +1423,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                                 break;
                         }
                         while (pkt != seg) {
-                               assert(pkt != (*rxq->elts)[idx]);
+                               MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
                                 rep = NEXT(pkt);
                                 NEXT(pkt) = NULL;
                                 NB_SEGS(pkt) = 1;
@@ -1338,20 +1440,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                                 break;
                         }
                         pkt = seg;
-                       assert(len >= (rxq->crc_present << 2));
+                       MLX5_ASSERT(len >= (rxq->crc_present << 2));
                         pkt->ol_flags &= EXT_ATTACHED_MBUF;
-                       /* If compressed, take hash result from mini-CQE. */
-                       rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
-                                                       cqe->rx_hash_res :
-                                                       mcqe->rx_hash_result);
-                       rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+                       rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
                         if (rxq->crc_present)
                                 len -= RTE_ETHER_CRC_LEN;
                         PKT_LEN(pkt) = len;
                         if (cqe->lro_num_seg > 1) {
                                 mlx5_lro_update_hdr
                                         (rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
-                                        len);
+                                        mcqe, rxq, len);
                                 pkt->ol_flags |= PKT_RX_LRO;
                                 pkt->tso_segsz = len / cqe->lro_num_seg;
                         }
@@ -1395,9 +1493,9 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                 return 0;
         /* Update the consumer index. */
         rxq->rq_ci = rq_ci >> sges_n;
-       rte_cio_wmb();
+       rte_io_wmb();
         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
-       rte_cio_wmb();
+       rte_io_wmb();
         *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
  #ifdef MLX5_PMD_SOFT_COUNTERS
         /* Increment packets counter. */
@@ -1419,12 +1517,10 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
   *   The L3 pseudo-header checksum.
   */
  static inline void
-mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
-                       volatile struct mlx5_cqe *restrict cqe,
-                       uint32_t phcsum)
+mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
+                       volatile struct mlx5_cqe *__rte_restrict cqe,
+                       uint32_t phcsum, uint8_t l4_type)
  {
-       uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
-                          MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
         /*
          * The HW calculates only the TCP payload checksum, need to complete
          * the TCP header checksum and the L3 pseudo-header checksum.
@@ -1440,7 +1536,7 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
         if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
                 tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
         tcp->cksum = 0;
-       csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4);
+       csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
         csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
         csum = (~csum) & 0xffff;
         if (csum == 0)
@@ -1461,9 +1557,10 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
   *   The packet length.
   */
  static inline void
-mlx5_lro_update_hdr(uint8_t *restrict padd,
-                   volatile struct mlx5_cqe *restrict cqe,
-                   uint32_t len)
+mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
+                   volatile struct mlx5_cqe *__rte_restrict cqe,
+                   volatile struct mlx5_mini_cqe8 *mcqe,
+                   struct mlx5_rxq_data *rxq, uint32_t len)
  {
         union {
                 struct rte_ether_hdr *eth;
@@ -1477,6 +1574,7 @@ mlx5_lro_update_hdr(uint8_t *restrict padd,
         };
         uint16_t proto = h.eth->ether_type;
         uint32_t phcsum;
+       uint8_t l4_type;
  
         h.eth++;
         while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
@@ -1498,7 +1596,14 @@ mlx5_lro_update_hdr(uint8_t *restrict padd,
                 phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
                 h.ipv6++;
         }
-       mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+       if (mcqe == NULL ||
+           rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+               l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+                          MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+       else
+               l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
+                          MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+       mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
  }
  
  void
@@ -1506,10 +1611,11 @@ mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
  {
         struct mlx5_mprq_buf *buf = opaque;
  
-       if (rte_atomic16_read(&buf->refcnt) == 1) {
+       if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
                 rte_mempool_put(buf->mp, buf);
-       } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
-               rte_atomic16_set(&buf->refcnt, 1);
+       } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
+                                              __ATOMIC_RELAXED) == 0)) {
+               __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
                 rte_mempool_put(buf->mp, buf);
         }
  }
@@ -1520,31 +1626,6 @@ mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
         mlx5_mprq_buf_free_cb(NULL, buf);
  }
  
-static inline void
-mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
-                const unsigned int strd_n)
-{
-       struct mlx5_mprq_buf *rep = rxq->mprq_repl;
-       volatile struct mlx5_wqe_data_seg *wqe =
-               &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
-       void *addr;
-
-       assert(rep != NULL);
-       /* Replace MPRQ buf. */
-       (*rxq->mprq_bufs)[rq_idx] = rep;
-       /* Replace WQE. */
-       addr = mlx5_mprq_buf_addr(rep, strd_n);
-       wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
-       /* If there's only one MR, no need to replace LKey in WQE. */
-       if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-               wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
-       /* Stash a mbuf for next replacement. */
-       if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
-               rxq->mprq_repl = rep;
-       else
-               rxq->mprq_repl = NULL;
-}
-
  /**
   * DPDK callback for RX with Multi-Packet RQ support.
   *
@@ -1562,51 +1643,29 @@ uint16_t
  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
  {
         struct mlx5_rxq_data *rxq = dpdk_rxq;
-       const unsigned int strd_n = 1 << rxq->strd_num_n;
-       const unsigned int strd_sz = 1 << rxq->strd_sz_n;
-       const unsigned int strd_shift =
-               MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
-       const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
-       const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
+       const uint32_t strd_n = 1 << rxq->strd_num_n;
+       const uint32_t strd_sz = 1 << rxq->strd_sz_n;
+       const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
+       const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
         volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
         unsigned int i = 0;
         uint32_t rq_ci = rxq->rq_ci;
         uint16_t consumed_strd = rxq->consumed_strd;
-       uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM;
         struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
  
         while (i < pkts_n) {
                 struct rte_mbuf *pkt;
-               void *addr;
                 int ret;
-               unsigned int len;
+               uint32_t len;
                 uint16_t strd_cnt;
                 uint16_t strd_idx;
-               uint32_t offset;
                 uint32_t byte_cnt;
                 volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-               uint32_t rss_hash_res = 0;
-               uint8_t lro_num_seg;
+               enum mlx5_rqx_code rxq_code;
  
                 if (consumed_strd == strd_n) {
-                       /* Replace WQE only if the buffer is still in use. */
-                       if (rte_atomic16_read(&buf->refcnt) > 1) {
-                               mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
-                               /* Release the old buffer. */
-                               mlx5_mprq_buf_free(buf);
-                       } else if (unlikely(rxq->mprq_repl == NULL)) {
-                               struct mlx5_mprq_buf *rep;
-
-                               /*
-                                * Currently, the MPRQ mempool is out of buffer
-                                * and doing memcpy regardless of the size of Rx
-                                * packet. Retry allocation to get back to
-                                * normal.
-                                */
-                               if (!rte_mempool_get(rxq->mprq_mp,
-                                                    (void **)&rep))
-                                       rxq->mprq_repl = rep;
-                       }
+                       /* Replace WQE if the buffer is still in use. */
+                       mprq_buf_replace(rxq, rq_ci & wq_mask);
                         /* Advance to the next WQE. */
                         consumed_strd = 0;
                         ++rq_ci;
@@ -1617,127 +1676,54 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                 if (!ret)
                         break;
                 byte_cnt = ret;
-               strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
-                          MLX5_MPRQ_STRIDE_NUM_SHIFT;
-               assert(strd_cnt);
+               len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+               MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
+               if (rxq->crc_present)
+                       len -= RTE_ETHER_CRC_LEN;
+               if (mcqe &&
+                   rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+                       strd_cnt = (len / strd_sz) + !!(len % strd_sz);
+               else
+                       strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+                                  MLX5_MPRQ_STRIDE_NUM_SHIFT;
+               MLX5_ASSERT(strd_cnt);
                 consumed_strd += strd_cnt;
                 if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
                         continue;
-               if (mcqe == NULL) {
-                       rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
-                       strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
-               } else {
-                       /* mini-CQE for MPRQ doesn't have hash result. */
-                       strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
-               }
-               assert(strd_idx < strd_n);
-               assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask));
-               lro_num_seg = cqe->lro_num_seg;
-               /*
-                * Currently configured to receive a packet per a stride. But if
-                * MTU is adjusted through kernel interface, device could
-                * consume multiple strides without raising an error. In this
-                * case, the packet should be dropped because it is bigger than
-                * the max_rx_pkt_len.
-                */
-               if (unlikely(!lro_num_seg && strd_cnt > 1)) {
-                       ++rxq->stats.idropped;
-                       continue;
-               }
+               strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
+                                       cqe->wqe_counter :
+                                       mcqe->stride_idx);
+               MLX5_ASSERT(strd_idx < strd_n);
+               MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
+                           wq_mask));
                 pkt = rte_pktmbuf_alloc(rxq->mp);
                 if (unlikely(pkt == NULL)) {
                         ++rxq->stats.rx_nombuf;
                         break;
                 }
                 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
-               assert((int)len >= (rxq->crc_present << 2));
+               MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
                 if (rxq->crc_present)
                         len -= RTE_ETHER_CRC_LEN;
-               offset = strd_idx * strd_sz + strd_shift;
-               addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
-               /*
-                * Memcpy packets to the target mbuf if:
-                * - The size of packet is smaller than mprq_max_memcpy_len.
-                * - Out of buffer in the Mempool for Multi-Packet RQ.
-                */
-               if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
-                       /*
-                        * When memcpy'ing packet due to out-of-buffer, the
-                        * packet must be smaller than the target mbuf.
-                        */
-                       if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
-                               rte_pktmbuf_free_seg(pkt);
+               rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf,
+                                          strd_idx, strd_cnt);
+               if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) {
+                       rte_pktmbuf_free_seg(pkt);
+                       if (rxq_code == MLX5_RXQ_CODE_DROPPED) {
                                 ++rxq->stats.idropped;
                                 continue;
                         }
-                       rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
-                       DATA_LEN(pkt) = len;
-               } else {
-                       rte_iova_t buf_iova;
-                       struct rte_mbuf_ext_shared_info *shinfo;
-                       uint16_t buf_len = strd_cnt * strd_sz;
-                       void *buf_addr;
-
-                       /* Increment the refcnt of the whole chunk. */
-                       rte_atomic16_add_return(&buf->refcnt, 1);
-                       assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
-                              strd_n + 1);
-                       buf_addr = RTE_PTR_SUB(addr, headroom_sz);
-                       /*
-                        * MLX5 device doesn't use iova but it is necessary in a
-                        * case where the Rx packet is transmitted via a
-                        * different PMD.
-                        */
-                       buf_iova = rte_mempool_virt2iova(buf) +
-                                  RTE_PTR_DIFF(buf_addr, buf);
-                       shinfo = &buf->shinfos[strd_idx];
-                       rte_mbuf_ext_refcnt_set(shinfo, 1);
-                       /*
-                        * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
-                        * attaching the stride to mbuf and more offload flags
-                        * will be added below by calling rxq_cq_to_mbuf().
-                        * Other fields will be overwritten.
-                        */
-                       rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
-                                                 buf_len, shinfo);
-                       /* Set mbuf head-room. */
-                       pkt->data_off = headroom_sz;
-                       assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
-                       /*
-                        * Prevent potential overflow due to MTU change through
-                        * kernel interface.
-                        */
-                       if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
-                               rte_pktmbuf_free_seg(pkt);
-                               ++rxq->stats.idropped;
-                               continue;
-                       }
-                       DATA_LEN(pkt) = len;
-                       /*
-                        * LRO packet may consume all the stride memory, in this
-                        * case packet head-room space is not guaranteed so must
-                        * to add an empty mbuf for the head-room.
-                        */
-                       if (!rxq->strd_headroom_en) {
-                               struct rte_mbuf *headroom_mbuf =
-                                               rte_pktmbuf_alloc(rxq->mp);
-
-                               if (unlikely(headroom_mbuf == NULL)) {
-                                       rte_pktmbuf_free_seg(pkt);
-                                       ++rxq->stats.rx_nombuf;
-                                       break;
-                               }
-                               PORT(pkt) = rxq->port_id;
-                               NEXT(headroom_mbuf) = pkt;
-                               pkt = headroom_mbuf;
-                               NB_SEGS(pkt) = 2;
+                       if (rxq_code == MLX5_RXQ_CODE_NOMBUF) {
+                               ++rxq->stats.rx_nombuf;
+                               break;
                         }
                 }
-               rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
-               if (lro_num_seg > 1) {
-                       mlx5_lro_update_hdr(addr, cqe, len);
+               rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
+               if (cqe->lro_num_seg > 1) {
+                       mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
+                                           cqe, mcqe, rxq, len);
                         pkt->ol_flags |= PKT_RX_LRO;
-                       pkt->tso_segsz = strd_sz;
+                       pkt->tso_segsz = len / cqe->lro_num_seg;
                 }
                 PKT_LEN(pkt) = len;
                 PORT(pkt) = rxq->port_id;
@@ -1751,11 +1737,11 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
         }
         /* Update the consumer indexes. */
         rxq->consumed_strd = consumed_strd;
-       rte_cio_wmb();
+       rte_io_wmb();
         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
         if (rq_ci != rxq->rq_ci) {
                 rxq->rq_ci = rq_ci;
-               rte_cio_wmb();
+               rte_io_wmb();
                 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
         }
  #ifdef MLX5_PMD_SOFT_COUNTERS
@@ -1830,6 +1816,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
         return 0;
  }
  
+__rte_weak uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,
+                      struct rte_mbuf **pkts __rte_unused,
+                      uint16_t pkts_n __rte_unused)
+{
+       return 0;
+}
+
  __rte_weak int
  mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
  {
@@ -1854,7 +1848,7 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
+mlx5_tx_free_mbuf(struct rte_mbuf **__rte_restrict pkts,
                   unsigned int pkts_n,
                   unsigned int olx __rte_unused)
  {
@@ -1868,8 +1862,8 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
          * copying pointers to temporary array
          * for rte_mempool_put_bulk() calls.
          */
-       assert(pkts);
-       assert(pkts_n);
+       MLX5_ASSERT(pkts);
+       MLX5_ASSERT(pkts_n);
         for (;;) {
                 for (;;) {
                         /*
@@ -1878,7 +1872,7 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
                          */
                         mbuf = rte_pktmbuf_prefree_seg(*pkts);
                         if (likely(mbuf != NULL)) {
-                               assert(mbuf == *pkts);
+                               MLX5_ASSERT(mbuf == *pkts);
                                 if (likely(n_free != 0)) {
                                         if (unlikely(pool != mbuf->pool))
                                                 /* From different pool. */
@@ -1915,9 +1909,9 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
                          * This loop is implemented to avoid multiple
                          * inlining of rte_mempool_put_bulk().
                          */
-                       assert(pool);
-                       assert(p_free);
-                       assert(n_free);
+                       MLX5_ASSERT(pool);
+                       MLX5_ASSERT(p_free);
+                       MLX5_ASSERT(n_free);
                         /*
                          * Free the array of pre-freed mbufs
                          * belonging to the same memory pool.
@@ -1959,14 +1953,14 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
+mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq,
                   uint16_t tail,
                   unsigned int olx __rte_unused)
  {
         uint16_t n_elts = tail - txq->elts_tail;
  
-       assert(n_elts);
-       assert(n_elts <= txq->elts_s);
+       MLX5_ASSERT(n_elts);
+       MLX5_ASSERT(n_elts <= txq->elts_s);
         /*
          * Implement a loop to support ring buffer wraparound
          * with single inlining of mlx5_tx_free_mbuf().
@@ -1976,8 +1970,8 @@ mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
  
                 part = txq->elts_s - (txq->elts_tail & txq->elts_m);
                 part = RTE_MIN(part, n_elts);
-               assert(part);
-               assert(part <= txq->elts_s);
+               MLX5_ASSERT(part);
+               MLX5_ASSERT(part <= txq->elts_s);
                 mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
                                   part, olx);
                 txq->elts_tail += part;
@@ -2000,19 +1994,19 @@ mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
-                 struct rte_mbuf **restrict pkts,
+mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq,
+                 struct rte_mbuf **__rte_restrict pkts,
                   unsigned int pkts_n,
                   unsigned int olx __rte_unused)
  {
         unsigned int part;
         struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
  
-       assert(pkts);
-       assert(pkts_n);
+       MLX5_ASSERT(pkts);
+       MLX5_ASSERT(pkts_n);
         part = txq->elts_s - (txq->elts_head & txq->elts_m);
-       assert(part);
-       assert(part <= txq->elts_s);
+       MLX5_ASSERT(part);
+       MLX5_ASSERT(part <= txq->elts_s);
         /* This code is a good candidate for vectorizing with SIMD. */
         rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
                    (void *)pkts,
@@ -2037,7 +2031,7 @@ mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
+mlx5_tx_comp_flush(struct mlx5_txq_data *__rte_restrict txq,
                    volatile struct mlx5_cqe *last_cqe,
                    unsigned int olx __rte_unused)
  {
@@ -2048,7 +2042,7 @@ mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
                 tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m];
                 if (likely(tail != txq->elts_tail)) {
                         mlx5_tx_free_elts(txq, tail, olx);
-                       assert(tail == txq->elts_tail);
+                       MLX5_ASSERT(tail == txq->elts_tail);
                 }
         }
  }
@@ -2068,12 +2062,12 @@ mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
   * routine smaller, simple and faster - from experiments.
   */
  static void
-mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
+mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
                           unsigned int olx __rte_unused)
  {
         unsigned int count = MLX5_TX_COMP_MAX_CQE;
         volatile struct mlx5_cqe *last_cqe = NULL;
-       uint16_t ci = txq->cq_ci;
+       bool ring_doorbell = false;
         int ret;
  
         static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value");
@@ -2081,12 +2075,12 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
         do {
                 volatile struct mlx5_cqe *cqe;
  
-               cqe = &txq->cqes[ci & txq->cqe_m];
-               ret = check_cqe(cqe, txq->cqe_s, ci);
+               cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+               ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
                 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
                         if (likely(ret != MLX5_CQE_STATUS_ERR)) {
                                 /* No new CQEs in completion queue. */
-                               assert(ret == MLX5_CQE_STATUS_HW_OWN);
+                               MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
                                 break;
                         }
                         /*
@@ -2096,7 +2090,6 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
                          * here, before we might perform SQ reset.
                          */
                         rte_wmb();
-                       txq->cq_ci = ci;
                         ret = mlx5_tx_error_cqe_handle
                                 (txq, (volatile struct mlx5_err_cqe *)cqe);
                         if (unlikely(ret < 0)) {
@@ -2112,15 +2105,18 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
                          * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status.
                          * The send queue is supposed to be empty.
                          */
-                       ++ci;
-                       txq->cq_pi = ci;
+                       ring_doorbell = true;
+                       ++txq->cq_ci;
+                       txq->cq_pi = txq->cq_ci;
                         last_cqe = NULL;
                         continue;
                 }
                 /* Normal transmit completion. */
-               assert(ci != txq->cq_pi);
-               assert((txq->fcqs[ci & txq->cqe_m] >> 16) == cqe->wqe_counter);
-               ++ci;
+               MLX5_ASSERT(txq->cq_ci != txq->cq_pi);
+               MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) ==
+                           cqe->wqe_counter);
+               ring_doorbell = true;
+               ++txq->cq_ci;
                 last_cqe = cqe;
                 /*
                  * We have to restrict the amount of processed CQEs
@@ -2133,14 +2129,10 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
                 if (likely(--count == 0))
                         break;
         } while (true);
-       if (likely(ci != txq->cq_ci)) {
-               /*
-                * Update completion queue consuming index
-                * and ring doorbell to notify hardware.
-                */
+       if (likely(ring_doorbell)) {
+               /* Ring doorbell to notify hardware. */
                 rte_compiler_barrier();
-               txq->cq_ci = ci;
-               *txq->cq_db = rte_cpu_to_be_32(ci);
+               *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
                 mlx5_tx_comp_flush(txq, last_cqe, olx);
         }
  }
@@ -2159,8 +2151,8 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
-                          struct mlx5_txq_local *restrict loc,
+mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
+                          struct mlx5_txq_local *__rte_restrict loc,
                            unsigned int olx)
  {
         uint16_t head = txq->elts_head;
@@ -2174,6 +2166,7 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
              (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
                 volatile struct mlx5_wqe *last = loc->wqe_last;
  
+               MLX5_ASSERT(last);
                 txq->elts_comp = head;
                 if (MLX5_TXOFF_CONFIG(INLINE))
                         txq->wqe_comp = txq->wqe_ci;
@@ -2181,14 +2174,14 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
                 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
                                             MLX5_COMP_MODE_OFFSET);
                 /* Save elts_head in dedicated free on completion queue. */
-#ifdef NDEBUG
-               txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
-#else
+#ifdef RTE_LIBRTE_MLX5_DEBUG
                 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
-                                       (last->cseg.opcode >> 8) << 16;
+                         (last->cseg.opcode >> 8) << 16;
+#else
+               txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
  #endif
                 /* A CQE slot must always be available. */
-               assert((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
+               MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
         }
  }
  
@@ -2206,7 +2199,7 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
  int
  mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
  {
-       struct mlx5_txq_data *restrict txq = tx_queue;
+       struct mlx5_txq_data *__rte_restrict txq = tx_queue;
         uint16_t used;
  
         mlx5_tx_handle_completion(txq, 0);
@@ -2237,14 +2230,14 @@ mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
-                 struct mlx5_txq_local *restrict loc __rte_unused,
-                 struct mlx5_wqe *restrict wqe,
+mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
+                 struct mlx5_wqe *__rte_restrict wqe,
                   unsigned int ds,
                   unsigned int opcode,
                   unsigned int olx __rte_unused)
  {
-       struct mlx5_wqe_cseg *restrict cs = &wqe->cseg;
+       struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg;
  
         /* For legacy MPW replace the EMPW by TSO with modifier. */
         if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
@@ -2256,6 +2249,37 @@ mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
         cs->misc = RTE_BE32(0);
  }
  
+/**
+ * Build the Synchronize Queue Segment with specified completion index.
+ *
+ * @param txq
+ *   Pointer to TX queue structure (supplies the Clock Queue CQ id).
+ * @param loc
+ *   Pointer to burst routine local context (unused here).
+ * @param wqe
+ *   Pointer to WQE, Queue Segment is built at MLX5_WSEG_SIZE offset.
+ * @param wci
+ *   Completion index in Clock Queue to wait.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_wseg_init(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
+                 struct mlx5_wqe *__rte_restrict wqe,
+                 unsigned int wci,
+                 unsigned int olx __rte_unused)
+{
+       struct mlx5_wqe_qseg *qs;
+
+       qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
+       qs->max_index = rte_cpu_to_be_32(wci);
+       qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq->id);
+       qs->reserved0 = RTE_BE32(0);
+       qs->reserved1 = RTE_BE32(0);
+}
+
  /**
   * Build the Ethernet Segment without inlined data.
   * Supports Software Parser, Checksums and VLAN
@@ -2272,12 +2296,12 @@ mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
-                 struct mlx5_txq_local *restrict loc,
-                 struct mlx5_wqe *restrict wqe,
+mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
+                 struct mlx5_txq_local *__rte_restrict loc,
+                 struct mlx5_wqe *__rte_restrict wqe,
                   unsigned int olx)
  {
-       struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+       struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
         uint32_t csum;
  
         /*
@@ -2302,7 +2326,7 @@ mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
                  * We should get here only if device support
                  * this feature correctly.
                  */
-               assert(txq->vlan_en);
+               MLX5_ASSERT(txq->vlan_en);
                 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
                                                   loc->mbuf->vlan_tci);
         } else {
@@ -2330,13 +2354,13 @@ mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
-                 struct mlx5_txq_local *restrict loc,
-                 struct mlx5_wqe *restrict wqe,
+mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
+                 struct mlx5_txq_local *__rte_restrict loc,
+                 struct mlx5_wqe *__rte_restrict wqe,
                   unsigned int vlan,
                   unsigned int olx)
  {
-       struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+       struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
         uint32_t csum;
         uint8_t *psrc, *pdst;
  
@@ -2380,7 +2404,7 @@ mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
                                                  loc->mbuf->vlan_tci);
                 pdst += sizeof(struct rte_vlan_hdr);
                 /* Copy the rest two bytes from packet data. */
-               assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+               MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
                 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
         } else {
                 /* Fill the gap in the title WQEBB with inline data. */
@@ -2414,15 +2438,15 @@ mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused,
   *   Pointer to the next Data Segment (aligned and wrapped around).
   */
  static __rte_always_inline struct mlx5_wqe_dseg *
-mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
-                 struct mlx5_txq_local *restrict loc,
-                 struct mlx5_wqe *restrict wqe,
+mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc,
+                 struct mlx5_wqe *__rte_restrict wqe,
                   unsigned int vlan,
                   unsigned int inlen,
                   unsigned int tso,
                   unsigned int olx)
  {
-       struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+       struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
         uint32_t csum;
         uint8_t *psrc, *pdst;
         unsigned int part;
@@ -2473,7 +2497,7 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
                                                  loc->mbuf->vlan_tci);
                 pdst += sizeof(struct rte_vlan_hdr);
                 /* Copy the rest two bytes from packet data. */
-               assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
+               MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
                 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
                 psrc += sizeof(uint16_t);
         } else {
@@ -2482,11 +2506,11 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
                 psrc += sizeof(rte_v128u32_t);
         }
         pdst = (uint8_t *)(es + 2);
-       assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
-       assert(pdst < (uint8_t *)txq->wqes_end);
+       MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
+       MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
         inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
         if (!inlen) {
-               assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+               MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
                 return (struct mlx5_wqe_dseg *)pdst;
         }
         /*
@@ -2527,21 +2551,30 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq,
   *   Pointer to burst routine local context.
   * @param len
   *   Length of data to be copied.
+ * @param must
+ *   Length of data to be copied ignoring no inline hint.
   * @param olx
   *   Configured Tx offloads mask. It is fully defined at
   *   compile time and may be used for optimization.
+ *
+ * @return
+ *   Number of bytes actually copied. This is always greater than or
+ *   equal to the must parameter and might be less than len if the no
+ *   inline hint flag is encountered.
   */
-static __rte_always_inline void
+static __rte_always_inline unsigned int
  mlx5_tx_mseg_memcpy(uint8_t *pdst,
-                   struct mlx5_txq_local *restrict loc,
+                   struct mlx5_txq_local *__rte_restrict loc,
                     unsigned int len,
+                   unsigned int must,
                     unsigned int olx __rte_unused)
  {
         struct rte_mbuf *mbuf;
-       unsigned int part, dlen;
+       unsigned int part, dlen, copy = 0;
         uint8_t *psrc;
  
-       assert(len);
+       MLX5_ASSERT(len);
+       MLX5_ASSERT(must <= len);
         do {
                 /* Allow zero length packets, must check first. */
                 dlen = rte_pktmbuf_data_len(loc->mbuf);
@@ -2551,9 +2584,28 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst,
                         loc->mbuf = mbuf->next;
                         rte_pktmbuf_free_seg(mbuf);
                         loc->mbuf_off = 0;
-                       assert(loc->mbuf_nseg > 1);
-                       assert(loc->mbuf);
+                       MLX5_ASSERT(loc->mbuf_nseg > 1);
+                       MLX5_ASSERT(loc->mbuf);
                         --loc->mbuf_nseg;
+                       if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
+                               unsigned int diff;
+
+                               if (copy >= must) {
+                                       /*
+                                        * We already copied the minimal
+                                        * requested amount of data.
+                                        */
+                                       return copy;
+                               }
+                               diff = must - copy;
+                               if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
+                                       /*
+                                        * Copy only the minimal required
+                                        * part of the data buffer.
+                                        */
+                                       len = diff;
+                               }
+                       }
                         continue;
                 }
                 dlen -= loc->mbuf_off;
@@ -2561,6 +2613,7 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst,
                                                loc->mbuf_off);
                 part = RTE_MIN(len, dlen);
                 rte_memcpy(pdst, psrc, part);
+               copy += part;
                 loc->mbuf_off += part;
                 len -= part;
                 if (!len) {
@@ -2571,10 +2624,10 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst,
                                 loc->mbuf = mbuf->next;
                                 rte_pktmbuf_free_seg(mbuf);
                                 loc->mbuf_off = 0;
-                               assert(loc->mbuf_nseg >= 1);
+                               MLX5_ASSERT(loc->mbuf_nseg >= 1);
                                 --loc->mbuf_nseg;
                         }
-                       return;
+                       return copy;
                 }
                 pdst += part;
         } while (true);
@@ -2608,18 +2661,18 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst,
   *   wrapping check on its own).
   */
  static __rte_always_inline struct mlx5_wqe_dseg *
-mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
-                 struct mlx5_txq_local *restrict loc,
-                 struct mlx5_wqe *restrict wqe,
+mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc,
+                 struct mlx5_wqe *__rte_restrict wqe,
                   unsigned int vlan,
                   unsigned int inlen,
                   unsigned int tso,
                   unsigned int olx)
  {
-       struct mlx5_wqe_eseg *restrict es = &wqe->eseg;
+       struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
         uint32_t csum;
         uint8_t *pdst;
-       unsigned int part;
+       unsigned int part, tlen = 0;
  
         /*
          * Calculate and set check sum flags first, uint32_t field
@@ -2651,37 +2704,46 @@ mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
                                  sizeof(struct rte_vlan_hdr) +
                                  2 * RTE_ETHER_ADDR_LEN),
                       "invalid Ethernet Segment data size");
-       assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
-       es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
+       MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
         pdst = (uint8_t *)&es->inline_data;
         if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
                 /* Implement VLAN tag insertion as part inline data. */
-               mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx);
+               mlx5_tx_mseg_memcpy(pdst, loc,
+                                   2 * RTE_ETHER_ADDR_LEN,
+                                   2 * RTE_ETHER_ADDR_LEN, olx);
                 pdst += 2 * RTE_ETHER_ADDR_LEN;
                 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
                                                 ((RTE_ETHER_TYPE_VLAN << 16) |
                                                  loc->mbuf->vlan_tci);
                 pdst += sizeof(struct rte_vlan_hdr);
-               inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
+               tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
         }
-       assert(pdst < (uint8_t *)txq->wqes_end);
+       MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
         /*
          * The WQEBB space availability is checked by caller.
          * Here we should be aware of WQE ring buffer wraparound only.
          */
         part = (uint8_t *)txq->wqes_end - pdst;
-       part = RTE_MIN(part, inlen);
-       assert(part);
+       part = RTE_MIN(part, inlen - tlen);
+       MLX5_ASSERT(part);
         do {
-               mlx5_tx_mseg_memcpy(pdst, loc, part, olx);
-               inlen -= part;
-               if (likely(!inlen)) {
-                       pdst += part;
+               unsigned int copy;
+
+               /*
+                * Copying may be interrupted inside the routine
+                * if run into no inline hint flag.
+                */
+               copy = tlen >= txq->inlen_mode ? 0 : (txq->inlen_mode - tlen);
+               copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
+               tlen += copy;
+               if (likely(inlen <= tlen) || copy < part) {
+                       es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
+                       pdst += copy;
                         pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
                         return (struct mlx5_wqe_dseg *)pdst;
                 }
                 pdst = (uint8_t *)txq->wqes;
-               part = inlen;
+               part = inlen - tlen;
         } while (true);
  }
  
@@ -2703,15 +2765,15 @@ mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
-                struct mlx5_txq_local *restrict loc,
-                struct mlx5_wqe_dseg *restrict dseg,
+mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq,
+                struct mlx5_txq_local *__rte_restrict loc,
+                struct mlx5_wqe_dseg *__rte_restrict dseg,
                  uint8_t *buf,
                  unsigned int len,
                  unsigned int olx __rte_unused)
  
  {
-       assert(len);
+       MLX5_ASSERT(len);
         dseg->bcount = rte_cpu_to_be_32(len);
         dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
         dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
@@ -2737,9 +2799,9 @@ mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq,
   *   compile time and may be used for optimization.
   */
  static __rte_always_inline void
-mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
-                 struct mlx5_txq_local *restrict loc,
-                 struct mlx5_wqe_dseg *restrict dseg,
+mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc,
+                 struct mlx5_wqe_dseg *__rte_restrict dseg,
                   uint8_t *buf,
                   unsigned int len,
                   unsigned int olx __rte_unused)
@@ -2747,7 +2809,7 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
  {
         uintptr_t dst, src;
  
-       assert(len);
+       MLX5_ASSERT(len);
         if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
                 dseg->bcount = rte_cpu_to_be_32(len);
                 dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf);
@@ -2761,7 +2823,7 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
         src = (uintptr_t)buf;
         if (len & 0x08) {
  #ifdef RTE_ARCH_STRICT_ALIGN
-               assert(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
+               MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
                 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
                 dst += sizeof(uint32_t);
                 src += sizeof(uint32_t);
@@ -2813,9 +2875,9 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq,
   *   last packet in the eMPW session.
   */
  static __rte_always_inline struct mlx5_wqe_dseg *
-mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
-                 struct mlx5_txq_local *restrict loc __rte_unused,
-                 struct mlx5_wqe_dseg *restrict dseg,
+mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
+                 struct mlx5_wqe_dseg *__rte_restrict dseg,
                   uint8_t *buf,
                   unsigned int len,
                   unsigned int olx __rte_unused)
@@ -2823,8 +2885,14 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
         unsigned int part;
         uint8_t *pdst;
  
-       dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
-       pdst = &dseg->inline_data[0];
+       if (!MLX5_TXOFF_CONFIG(MPW)) {
+               /* Store the descriptor byte counter for eMPW sessions. */
+               dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+               pdst = &dseg->inline_data[0];
+       } else {
+               /* The entire legacy MPW session counter is stored on close. */
+               pdst = (uint8_t *)dseg;
+       }
         /*
          * The WQEBB space availability is checked by caller.
          * Here we should be aware of WQE ring buffer wraparound only.
@@ -2836,7 +2904,8 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
                 len -= part;
                 if (likely(!len)) {
                         pdst += part;
-                       pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+                       if (!MLX5_TXOFF_CONFIG(MPW))
+                               pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
                         /* Note: no final wraparound check here. */
                         return (struct mlx5_wqe_dseg *)pdst;
                 }
@@ -2869,9 +2938,9 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
   *   Ring buffer wraparound check is needed.
   */
  static __rte_always_inline struct mlx5_wqe_dseg *
-mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
-                 struct mlx5_txq_local *restrict loc __rte_unused,
-                 struct mlx5_wqe_dseg *restrict dseg,
+mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
+                 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
+                 struct mlx5_wqe_dseg *__rte_restrict dseg,
                   uint8_t *buf,
                   unsigned int len,
                   unsigned int olx __rte_unused)
@@ -2880,19 +2949,26 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
         unsigned int part;
         uint8_t *pdst;
  
-       assert(len > MLX5_ESEG_MIN_INLINE_SIZE);
+       MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
         static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
                                  (2 * RTE_ETHER_ADDR_LEN),
                       "invalid Data Segment data size");
-       dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) |
-                                       MLX5_ETH_WQE_DATA_INLINE);
-       pdst = &dseg->inline_data[0];
+       if (!MLX5_TXOFF_CONFIG(MPW)) {
+               /* Store the descriptor byte counter for eMPW sessions. */
+               dseg->bcount = rte_cpu_to_be_32
+                               ((len + sizeof(struct rte_vlan_hdr)) |
+                                MLX5_ETH_WQE_DATA_INLINE);
+               pdst = &dseg->inline_data[0];
+       } else {
+               /* The entire legacy MPW session counter is stored on close. */
+               pdst = (uint8_t *)dseg;
+       }
         memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
         buf += MLX5_DSEG_MIN_INLINE_SIZE;
         pdst += MLX5_DSEG_MIN_INLINE_SIZE;
         len -= MLX5_DSEG_MIN_INLINE_SIZE;
         /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
-       assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
+       MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
         if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
                 pdst = (uint8_t *)txq->wqes;
         *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
@@ -2909,7 +2985,8 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
                 len -= part;
                 if (likely(!len)) {
                         pdst += part;
-                       pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+                       if (!MLX5_TXOFF_CONFIG(MPW))
+                               pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
                         /* Note: no final wraparound check here. */
                         return (struct mlx5_wqe_dseg *)pdst;
                 }
@@ -2949,18 +3026,18 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
   *   Actual size of built WQE in segments.
   */
  static __rte_always_inline unsigned int
-mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
-                  struct mlx5_txq_local *restrict loc,
-                  struct mlx5_wqe *restrict wqe,
+mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq,
+                  struct mlx5_txq_local *__rte_restrict loc,
+                  struct mlx5_wqe *__rte_restrict wqe,
                    unsigned int vlan,
                    unsigned int inlen,
                    unsigned int tso,
                    unsigned int olx __rte_unused)
  {
-       struct mlx5_wqe_dseg *restrict dseg;
+       struct mlx5_wqe_dseg *__rte_restrict dseg;
         unsigned int ds;
  
-       assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
+       MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
         loc->mbuf_nseg = NB_SEGS(loc->mbuf);
         loc->mbuf_off = 0;
  
@@ -2981,8 +3058,8 @@ mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
                  * Non-zero offset means there are some data
                  * remained in the packet.
                  */
-               assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
-               assert(rte_pktmbuf_data_len(loc->mbuf));
+               MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
+               MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
                 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
                                                loc->mbuf_off);
                 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
@@ -2994,7 +3071,7 @@ mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
                         dseg = (struct mlx5_wqe_dseg *)txq->wqes;
                 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
                 /* Store the mbuf to be freed on completion. */
-               assert(loc->elts_free);
+               MLX5_ASSERT(loc->elts_free);
                 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
                 --loc->elts_free;
                 ++dseg;
@@ -3020,7 +3097,7 @@ mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
                                 (txq, loc, dseg,
                                  rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
                                  rte_pktmbuf_data_len(loc->mbuf), olx);
-                       assert(loc->elts_free);
+                       MLX5_ASSERT(loc->elts_free);
                         txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
                         --loc->elts_free;
                         ++dseg;
@@ -3040,6 +3117,59 @@ dseg_done:
         return ds;
  }
  
+/**
+ * The routine checks the timestamp flag in the current packet
+ * and pushes a WAIT WQE into the queue if scheduling is required.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - not enough free WQEs or elts, stop sending.
+ *   MLX5_TXCMP_CODE_SINGLE - no WAIT inserted, continue with the packet.
+ *   MLX5_TXCMP_CODE_MULTI - the WAIT inserted, continue processing.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_schedule_send(struct mlx5_txq_data *__rte_restrict txq,
+                     struct mlx5_txq_local *__rte_restrict loc,
+                     unsigned int olx)
+{
+       if (MLX5_TXOFF_CONFIG(TXPP) &&
+           (loc->mbuf->ol_flags & txq->ts_mask)) {
+               struct mlx5_wqe *wqe;
+               uint64_t ts;
+               int32_t wci;
+
+               /*
+                * Estimate the required space quickly and roughly.
+                * We would like to ensure the packet can be pushed
+                * to the queue and we won't get an orphan WAIT WQE.
+                */
+               if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
+                   loc->elts_free < NB_SEGS(loc->mbuf))
+                       return MLX5_TXCMP_CODE_EXIT;
+               /* Convert the timestamp into completion to wait. */
+               ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
+               wci = mlx5_txpp_convert_tx_ts(txq->sh, ts);
+               if (unlikely(wci < 0))
+                       return MLX5_TXCMP_CODE_SINGLE;
+               /* Build the WAIT WQE with specified completion. */
+               wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+               mlx5_tx_cseg_init(txq, loc, wqe, 2, MLX5_OPCODE_WAIT, olx);
+               mlx5_tx_wseg_init(txq, loc, wqe, wci, olx);
+               ++txq->wqe_ci;
+               --loc->wqe_free;
+               return MLX5_TXCMP_CODE_MULTI;
+       }
+       return MLX5_TXCMP_CODE_SINGLE;
+}
+
  /**
   * Tx one packet function for multi-segment TSO. Supports all
   * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
@@ -3062,13 +3192,23 @@ dseg_done:
   * Local context variables partially updated.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
-                       struct mlx5_txq_local *restrict loc,
+mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
+                       struct mlx5_txq_local *__rte_restrict loc,
                         unsigned int olx)
  {
-       struct mlx5_wqe *restrict wqe;
+       struct mlx5_wqe *__rte_restrict wqe;
         unsigned int ds, dlen, inlen, ntcp, vlan = 0;
  
+       if (MLX5_TXOFF_CONFIG(TXPP)) {
+               enum mlx5_txcmp_code wret;
+
+               /* Generate WAIT for scheduling if requested. */
+               wret = mlx5_tx_schedule_send(txq, loc, olx);
+               if (wret == MLX5_TXCMP_CODE_EXIT)
+                       return MLX5_TXCMP_CODE_EXIT;
+               if (wret == MLX5_TXCMP_CODE_ERROR)
+                       return MLX5_TXCMP_CODE_ERROR;
+       }
         /*
          * Calculate data length to be inlined to estimate
          * the required space in WQE ring buffer.
@@ -3087,7 +3227,7 @@ mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
                      inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
                      inlen > (dlen + vlan)))
                 return MLX5_TXCMP_CODE_ERROR;
-       assert(inlen >= txq->inlen_mode);
+       MLX5_ASSERT(inlen >= txq->inlen_mode);
         /*
          * Check whether there are enough free WQEBBs:
          * - Control Segment
@@ -3151,15 +3291,25 @@ mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
   * Local context variables partially updated.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
-                         struct mlx5_txq_local *restrict loc,
+mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq,
+                         struct mlx5_txq_local *__rte_restrict loc,
                           unsigned int olx)
  {
-       struct mlx5_wqe_dseg *restrict dseg;
-       struct mlx5_wqe *restrict wqe;
+       struct mlx5_wqe_dseg *__rte_restrict dseg;
+       struct mlx5_wqe *__rte_restrict wqe;
         unsigned int ds, nseg;
  
-       assert(NB_SEGS(loc->mbuf) > 1);
+       MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+       if (MLX5_TXOFF_CONFIG(TXPP)) {
+               enum mlx5_txcmp_code wret;
+
+               /* Generate WAIT for scheduling if requested. */
+               wret = mlx5_tx_schedule_send(txq, loc, olx);
+               if (wret == MLX5_TXCMP_CODE_EXIT)
+                       return MLX5_TXCMP_CODE_EXIT;
+               if (wret == MLX5_TXCMP_CODE_ERROR)
+                       return MLX5_TXCMP_CODE_ERROR;
+       }
         /*
          * No inline at all, it means the CPU cycles saving
          * is prioritized at configuration, we should not
@@ -3259,15 +3409,25 @@ mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq,
   * Local context variables partially updated.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
-                           struct mlx5_txq_local *restrict loc,
+mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq,
+                           struct mlx5_txq_local *__rte_restrict loc,
                             unsigned int olx)
  {
-       struct mlx5_wqe *restrict wqe;
+       struct mlx5_wqe *__rte_restrict wqe;
         unsigned int ds, inlen, dlen, vlan = 0;
  
-       assert(MLX5_TXOFF_CONFIG(INLINE));
-       assert(NB_SEGS(loc->mbuf) > 1);
+       MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+       MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+       if (MLX5_TXOFF_CONFIG(TXPP)) {
+               enum mlx5_txcmp_code wret;
+
+               /* Generate WAIT for scheduling if requested. */
+               wret = mlx5_tx_schedule_send(txq, loc, olx);
+               if (wret == MLX5_TXCMP_CODE_EXIT)
+                       return MLX5_TXCMP_CODE_EXIT;
+               if (wret == MLX5_TXCMP_CODE_ERROR)
+                       return MLX5_TXCMP_CODE_ERROR;
+       }
         /*
          * First calculate data length to be inlined
          * to estimate the required space for WQE.
@@ -3279,8 +3439,9 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
         /* Check against minimal length. */
         if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
                 return MLX5_TXCMP_CODE_ERROR;
-       assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
-       if (inlen > txq->inlen_send) {
+       MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+       if (inlen > txq->inlen_send ||
+           loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
                 struct rte_mbuf *mbuf;
                 unsigned int nxlen;
                 uintptr_t start;
@@ -3291,11 +3452,13 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
                  * inlining is required.
                  */
                 if (txq->inlen_mode) {
-                       assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE);
-                       assert(txq->inlen_mode <= txq->inlen_send);
+                       MLX5_ASSERT(txq->inlen_mode >=
+                                   MLX5_ESEG_MIN_INLINE_SIZE);
+                       MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
                         inlen = txq->inlen_mode;
                 } else {
-                       if (!vlan || txq->vlan_en) {
+                       if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE ||
+                           !vlan || txq->vlan_en) {
                                 /*
                                  * VLAN insertion will be done inside by HW.
                                  * It is not utmost effective - VLAN flag is
@@ -3324,7 +3487,7 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
                                 do {
                                         smlen = nxlen;
                                         mbuf = NEXT(mbuf);
-                                       assert(mbuf);
+                                       MLX5_ASSERT(mbuf);
                                         nxlen = rte_pktmbuf_data_len(mbuf);
                                         nxlen += smlen;
                                 } while (unlikely(nxlen < inlen));
@@ -3340,7 +3503,7 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
                                 inlen = nxlen;
                                 mbuf = NEXT(mbuf);
                                 /* There should be not end of packet. */
-                               assert(mbuf);
+                               MLX5_ASSERT(mbuf);
                                 nxlen = inlen + rte_pktmbuf_data_len(mbuf);
                         } while (unlikely(nxlen < txq->inlen_send));
                 }
@@ -3368,7 +3531,7 @@ do_align:
          * Estimate the number of Data Segments conservatively,
          * supposing no any mbufs is being freed during inlining.
          */
-       assert(inlen <= txq->inlen_send);
+       MLX5_ASSERT(inlen <= txq->inlen_send);
         ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
                                        MLX5_ESEG_MIN_INLINE_SIZE +
                                        MLX5_WSEG_SIZE +
@@ -3421,20 +3584,20 @@ do_align:
   * Local context variables updated.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
-                  struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
+                  struct rte_mbuf **__rte_restrict pkts,
                    unsigned int pkts_n,
-                  struct mlx5_txq_local *restrict loc,
+                  struct mlx5_txq_local *__rte_restrict loc,
                    unsigned int olx)
  {
-       assert(loc->elts_free && loc->wqe_free);
-       assert(pkts_n > loc->pkts_sent);
+       MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+       MLX5_ASSERT(pkts_n > loc->pkts_sent);
         pkts += loc->pkts_sent + 1;
         pkts_n -= loc->pkts_sent;
         for (;;) {
                 enum mlx5_txcmp_code ret;
  
-               assert(NB_SEGS(loc->mbuf) > 1);
+               MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
                 /*
                  * Estimate the number of free elts quickly but
                  * conservatively. Some segment may be fully inlined
@@ -3474,7 +3637,7 @@ mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
                         return MLX5_TXCMP_CODE_TSO;
                 return MLX5_TXCMP_CODE_SINGLE;
         }
-       assert(false);
+       MLX5_ASSERT(false);
  }
  
  /**
@@ -3510,23 +3673,33 @@ mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
   * Local context variables updated.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
-                 struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
+                 struct rte_mbuf **__rte_restrict pkts,
                   unsigned int pkts_n,
-                 struct mlx5_txq_local *restrict loc,
+                 struct mlx5_txq_local *__rte_restrict loc,
                   unsigned int olx)
  {
-       assert(loc->elts_free && loc->wqe_free);
-       assert(pkts_n > loc->pkts_sent);
+       MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+       MLX5_ASSERT(pkts_n > loc->pkts_sent);
         pkts += loc->pkts_sent + 1;
         pkts_n -= loc->pkts_sent;
         for (;;) {
-               struct mlx5_wqe_dseg *restrict dseg;
-               struct mlx5_wqe *restrict wqe;
+               struct mlx5_wqe_dseg *__rte_restrict dseg;
+               struct mlx5_wqe *__rte_restrict wqe;
                 unsigned int ds, dlen, hlen, ntcp, vlan = 0;
                 uint8_t *dptr;
  
-               assert(NB_SEGS(loc->mbuf) == 1);
+               MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+               if (MLX5_TXOFF_CONFIG(TXPP)) {
+                       enum mlx5_txcmp_code wret;
+
+                       /* Generate WAIT for scheduling if requested. */
+                       wret = mlx5_tx_schedule_send(txq, loc, olx);
+                       if (wret == MLX5_TXCMP_CODE_EXIT)
+                               return MLX5_TXCMP_CODE_EXIT;
+                       if (wret == MLX5_TXCMP_CODE_ERROR)
+                               return MLX5_TXCMP_CODE_ERROR;
+               }
                 dlen = rte_pktmbuf_data_len(loc->mbuf);
                 if (MLX5_TXOFF_CONFIG(VLAN) &&
                     loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
@@ -3610,7 +3783,7 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
                         return MLX5_TXCMP_CODE_SINGLE;
                 /* Continue with the next TSO packet. */
         }
-       assert(false);
+       MLX5_ASSERT(false);
  }
  
  /**
@@ -3634,8 +3807,8 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
   *  MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
-                    struct mlx5_txq_local *restrict loc,
+mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
+                    struct mlx5_txq_local *__rte_restrict loc,
                      unsigned int olx,
                      bool newp)
  {
@@ -3689,9 +3862,9 @@ mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
   *  false - no match, eMPW should be restarted.
   */
  static __rte_always_inline bool
-mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
-                  struct mlx5_wqe_eseg *restrict es,
-                  struct mlx5_txq_local *restrict loc,
+mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
+                  struct mlx5_wqe_eseg *__rte_restrict es,
+                  struct mlx5_txq_local *__rte_restrict loc,
                    uint32_t dlen,
                    unsigned int olx)
  {
@@ -3717,7 +3890,11 @@ mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
                 return false;
         /* There must be no VLAN packets in eMPW loop. */
         if (MLX5_TXOFF_CONFIG(VLAN))
-               assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+               MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+       /* Check if the scheduling is requested. */
+       if (MLX5_TXOFF_CONFIG(TXPP) &&
+           loc->mbuf->ol_flags & txq->ts_mask)
+               return false;
         return true;
  }
  
@@ -3743,13 +3920,13 @@ mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
   *  false - no match, eMPW should be restarted.
   */
  static __rte_always_inline void
-mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
-                  struct mlx5_txq_local *restrict loc,
+mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
+                  struct mlx5_txq_local *__rte_restrict loc,
                    unsigned int ds,
                    unsigned int slen,
                    unsigned int olx __rte_unused)
  {
-       assert(!MLX5_TXOFF_CONFIG(INLINE));
+       MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
  #ifdef MLX5_PMD_SOFT_COUNTERS
         /* Update sent data bytes counter. */
          txq->stats.obytes += slen;
@@ -3777,6 +3954,8 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
   *   Total size of descriptor/data in bytes.
   * @param slen
   *   Accumulated statistics, data bytes sent.
+ * @param wqem
+ *   The base WQE for the eMPW/MPW descriptor.
   * @param olx
   *   Configured Tx offloads mask. It is fully defined at
   *   compile time and may be used for optimization.
@@ -3786,24 +3965,44 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
   *  false - no match, eMPW should be restarted.
   */
  static __rte_always_inline void
-mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
-                  struct mlx5_txq_local *restrict loc,
+mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
+                  struct mlx5_txq_local *__rte_restrict loc,
                    unsigned int len,
                    unsigned int slen,
+                  struct mlx5_wqe *__rte_restrict wqem,
                    unsigned int olx __rte_unused)
  {
-       assert(MLX5_TXOFF_CONFIG(INLINE));
-       assert((len % MLX5_WSEG_SIZE) == 0);
+       struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
+
+       MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
  #ifdef MLX5_PMD_SOFT_COUNTERS
         /* Update sent data bytes counter. */
          txq->stats.obytes += slen;
  #else
         (void)slen;
  #endif
-       len = len / MLX5_WSEG_SIZE + 2;
-       loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+       if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
+               /*
+                * If the legacy MPW session contains inline packets,
+                * set the length of its single inline data segment
+                * and align the total length to the segment size.
+                */
+               MLX5_ASSERT(len > sizeof(dseg->bcount));
+               dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
+                                               MLX5_ETH_WQE_DATA_INLINE);
+               len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
+       } else {
+               /*
+                * The session is not legacy MPW, or it contains
+                * data buffer pointer segments.
+                */
+               MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
+               len = len / MLX5_WSEG_SIZE + 2;
+       }
+       wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
         txq->wqe_ci += (len + 3) / 4;
         loc->wqe_free -= (len + 3) / 4;
+       loc->wqe_last = wqem;
  }
  
  /**
@@ -3854,10 +4053,10 @@ mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
   * No VLAN insertion is supported.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
-                         struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
+                         struct rte_mbuf **__rte_restrict pkts,
                           unsigned int pkts_n,
-                         struct mlx5_txq_local *restrict loc,
+                         struct mlx5_txq_local *__rte_restrict loc,
                           unsigned int olx)
  {
         /*
@@ -3865,22 +4064,32 @@ mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
          * and sends single-segment packet with eMPW opcode
          * without data inlining.
          */
-       assert(!MLX5_TXOFF_CONFIG(INLINE));
-       assert(MLX5_TXOFF_CONFIG(EMPW));
-       assert(loc->elts_free && loc->wqe_free);
-       assert(pkts_n > loc->pkts_sent);
+       MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+       MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
+       MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+       MLX5_ASSERT(pkts_n > loc->pkts_sent);
         static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
         pkts += loc->pkts_sent + 1;
         pkts_n -= loc->pkts_sent;
         for (;;) {
-               struct mlx5_wqe_dseg *restrict dseg;
-               struct mlx5_wqe_eseg *restrict eseg;
+               struct mlx5_wqe_dseg *__rte_restrict dseg;
+               struct mlx5_wqe_eseg *__rte_restrict eseg;
                 enum mlx5_txcmp_code ret;
                 unsigned int part, loop;
                 unsigned int slen = 0;
  
  next_empw:
-               assert(NB_SEGS(loc->mbuf) == 1);
+               MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+               if (MLX5_TXOFF_CONFIG(TXPP)) {
+                       enum mlx5_txcmp_code wret;
+
+                       /* Generate WAIT for scheduling if requested. */
+                       wret = mlx5_tx_schedule_send(txq, loc, olx);
+                       if (wret == MLX5_TXCMP_CODE_EXIT)
+                               return MLX5_TXCMP_CODE_EXIT;
+                       if (wret == MLX5_TXCMP_CODE_ERROR)
+                               return MLX5_TXCMP_CODE_ERROR;
+               }
                 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
                                        MLX5_MPW_MAX_PACKETS :
                                        MLX5_EMPW_MAX_PACKETS);
@@ -3946,7 +4155,7 @@ next_empw:
                                         return MLX5_TXCMP_CODE_EXIT;
                                 return MLX5_TXCMP_CODE_MULTI;
                         }
-                       assert(NB_SEGS(loc->mbuf) == 1);
+                       MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
                         if (ret == MLX5_TXCMP_CODE_TSO) {
                                 part -= loop;
                                 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
@@ -3964,7 +4173,7 @@ next_empw:
                                 return MLX5_TXCMP_CODE_SINGLE;
                         }
                         if (ret != MLX5_TXCMP_CODE_EMPW) {
-                               assert(false);
+                               MLX5_ASSERT(false);
                                 part -= loop;
                                 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
                                 return MLX5_TXCMP_CODE_ERROR;
@@ -3976,9 +4185,10 @@ next_empw:
                          * - metadata value
                          * - software parser settings
                          * - packets length (legacy MPW only)
+                        * - scheduling is not required
                          */
                         if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
-                               assert(loop);
+                               MLX5_ASSERT(loop);
                                 part -= loop;
                                 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
                                 if (unlikely(!loc->elts_free ||
@@ -3993,8 +4203,8 @@ next_empw:
                                 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
                 }
                 /* eMPW is built successfully, update loop parameters. */
-               assert(!loop);
-               assert(pkts_n >= part);
+               MLX5_ASSERT(!loop);
+               MLX5_ASSERT(pkts_n >= part);
  #ifdef MLX5_PMD_SOFT_COUNTERS
                 /* Update sent data bytes counter. */
                 txq->stats.obytes += slen;
@@ -4012,7 +4222,7 @@ next_empw:
                         return ret;
                 /* Continue sending eMPW batches. */
         }
-       assert(false);
+       MLX5_ASSERT(false);
  }
  
  /**
@@ -4020,10 +4230,10 @@ next_empw:
   * with inlining, optionally supports VLAN insertion.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
-                         struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq,
+                         struct rte_mbuf **__rte_restrict pkts,
                           unsigned int pkts_n,
-                         struct mlx5_txq_local *restrict loc,
+                         struct mlx5_txq_local *__rte_restrict loc,
                           unsigned int olx)
  {
         /*
@@ -4031,21 +4241,31 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
          * and sends single-segment packet with eMPW opcode
          * with data inlining.
          */
-       assert(MLX5_TXOFF_CONFIG(INLINE));
-       assert(MLX5_TXOFF_CONFIG(EMPW));
-       assert(loc->elts_free && loc->wqe_free);
-       assert(pkts_n > loc->pkts_sent);
+       MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
+       MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
+       MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+       MLX5_ASSERT(pkts_n > loc->pkts_sent);
         static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
         pkts += loc->pkts_sent + 1;
         pkts_n -= loc->pkts_sent;
         for (;;) {
-               struct mlx5_wqe_dseg *restrict dseg;
-               struct mlx5_wqe_eseg *restrict eseg;
+               struct mlx5_wqe_dseg *__rte_restrict dseg;
+               struct mlx5_wqe *__rte_restrict wqem;
                 enum mlx5_txcmp_code ret;
                 unsigned int room, part, nlim;
                 unsigned int slen = 0;
  
-               assert(NB_SEGS(loc->mbuf) == 1);
+               MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+               if (MLX5_TXOFF_CONFIG(TXPP)) {
+                       enum mlx5_txcmp_code wret;
+
+                       /* Generate WAIT for scheduling if requested. */
+                       wret = mlx5_tx_schedule_send(txq, loc, olx);
+                       if (wret == MLX5_TXCMP_CODE_EXIT)
+                               return MLX5_TXCMP_CODE_EXIT;
+                       if (wret == MLX5_TXCMP_CODE_ERROR)
+                               return MLX5_TXCMP_CODE_ERROR;
+               }
                 /*
                  * Limits the amount of packets in one WQE
                  * to improve CQE latency generation.
@@ -4059,26 +4279,34 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
                         return MLX5_TXCMP_CODE_EXIT;
                 if (likely(pkts_n > 1))
                         rte_prefetch0(*pkts);
-               loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+               wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
                 /*
                  * Build eMPW title WQEBB:
                  * - Control Segment, eMPW opcode, zero DS
                  * - Ethernet Segment, no inline
                  */
-               mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0,
+               mlx5_tx_cseg_init(txq, loc, wqem, 0,
                                   MLX5_OPCODE_ENHANCED_MPSW, olx);
-               mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+               mlx5_tx_eseg_none(txq, loc, wqem,
                                   olx & ~MLX5_TXOFF_CONFIG_VLAN);
-               eseg = &loc->wqe_last->eseg;
-               dseg = &loc->wqe_last->dseg[0];
+               dseg = &wqem->dseg[0];
                 /* Store the packet length for legacy MPW. */
                 if (MLX5_TXOFF_CONFIG(MPW))
-                       eseg->mss = rte_cpu_to_be_16
-                                       (rte_pktmbuf_data_len(loc->mbuf));
+                       wqem->eseg.mss = rte_cpu_to_be_16
+                                        (rte_pktmbuf_data_len(loc->mbuf));
                 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
                                loc->wqe_free) * MLX5_WQE_SIZE -
                                         MLX5_WQE_CSEG_SIZE -
                                         MLX5_WQE_ESEG_SIZE;
+               /* Limit the room for legacy MPW sessions for performance. */
+               if (MLX5_TXOFF_CONFIG(MPW))
+                       room = RTE_MIN(room,
+                                      RTE_MAX(txq->inlen_empw +
+                                              sizeof(dseg->bcount) +
+                                              (MLX5_TXOFF_CONFIG(VLAN) ?
+                                              sizeof(struct rte_vlan_hdr) : 0),
+                                              MLX5_MPW_INLINE_MAX_PACKETS *
+                                              MLX5_WQE_DSEG_SIZE));
                 /* Build WQE till we have space, packets and resources. */
                 part = room;
                 for (;;) {
@@ -4086,9 +4314,9 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
                         uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
                         unsigned int tlen;
  
-                       assert(room >= MLX5_WQE_DSEG_SIZE);
-                       assert((room % MLX5_WQE_DSEG_SIZE) == 0);
-                       assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
+                       MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
+                       MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
+                       MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
                         /*
                          * Some Tx offloads may cause an error if
                          * packet is not long enough, check against
@@ -4102,14 +4330,36 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
                                  * We have some successfully built
                                  * packet Data Segments to send.
                                  */
-                               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+                               mlx5_tx_idone_empw(txq, loc, part,
+                                                  slen, wqem, olx);
                                 return MLX5_TXCMP_CODE_ERROR;
                         }
                         /* Inline or not inline - that's the Question. */
-                       if (dlen > txq->inlen_empw)
+                       if (dlen > txq->inlen_empw ||
+                           loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE)
                                 goto pointer_empw;
+                       if (MLX5_TXOFF_CONFIG(MPW)) {
+                               if (dlen > txq->inlen_send)
+                                       goto pointer_empw;
+                               tlen = dlen;
+                               if (part == room) {
+                                       /* Open new inline MPW session. */
+                                       tlen += sizeof(dseg->bcount);
+                                       dseg->bcount = RTE_BE32(0);
+                                       dseg = RTE_PTR_ADD
+                                               (dseg, sizeof(dseg->bcount));
+                               } else {
+                                       /*
+                                        * No pointer and inline descriptor
+                                        * intermix for legacy MPW sessions.
+                                        */
+                                       if (wqem->dseg[0].bcount)
+                                               break;
+                               }
+                       } else {
+                               tlen = sizeof(dseg->bcount) + dlen;
+                       }
                         /* Inline entire packet, optional VLAN insertion. */
-                       tlen = sizeof(dseg->bcount) + dlen;
                         if (MLX5_TXOFF_CONFIG(VLAN) &&
                             loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
                                 /*
@@ -4117,8 +4367,9 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
                                  * mlx5_tx_able_to_empw() and packet
                                  * fits into inline length guaranteed.
                                  */
-                               assert((dlen + sizeof(struct rte_vlan_hdr)) <=
-                                       txq->inlen_empw);
+                               MLX5_ASSERT((dlen +
+                                            sizeof(struct rte_vlan_hdr)) <=
+                                           txq->inlen_empw);
                                 tlen += sizeof(struct rte_vlan_hdr);
                                 if (room < tlen)
                                         break;
@@ -4134,8 +4385,9 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
                                 dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
                                                          dptr, dlen, olx);
                         }
-                       tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
-                       assert(room >= tlen);
+                       if (!MLX5_TXOFF_CONFIG(MPW))
+                               tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+                       MLX5_ASSERT(room >= tlen);
                         room -= tlen;
                         /*
                          * Packet data are completely inlined,
@@ -4144,14 +4396,22 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
                         rte_pktmbuf_free_seg(loc->mbuf);
                         goto next_mbuf;
  pointer_empw:
+                       /*
+                        * No pointer and inline descriptor
+                        * intermix for legacy MPW sessions.
+                        */
+                       if (MLX5_TXOFF_CONFIG(MPW) &&
+                           part != room &&
+                           wqem->dseg[0].bcount == RTE_BE32(0))
+                               break;
                         /*
                          * Not inlinable VLAN packets are
                          * proceeded outside of this routine.
                          */
-                       assert(room >= MLX5_WQE_DSEG_SIZE);
+                       MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
                         if (MLX5_TXOFF_CONFIG(VLAN))
-                               assert(!(loc->mbuf->ol_flags &
-                                        PKT_TX_VLAN_PKT));
+                               MLX5_ASSERT(!(loc->mbuf->ol_flags &
+                                           PKT_TX_VLAN_PKT));
                         mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
                         /* We have to store mbuf in elts.*/
                         txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
@@ -4172,7 +4432,8 @@ next_mbuf:
                                  * continue build descriptors.
                                  */
                                 part -= room;
-                               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+                               mlx5_tx_idone_empw(txq, loc, part,
+                                                  slen, wqem, olx);
                                 return MLX5_TXCMP_CODE_EXIT;
                         }
                         loc->mbuf = *pkts++;
@@ -4186,16 +4447,18 @@ next_mbuf:
                          */
                         if (ret == MLX5_TXCMP_CODE_MULTI) {
                                 part -= room;
-                               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+                               mlx5_tx_idone_empw(txq, loc, part,
+                                                  slen, wqem, olx);
                                 if (unlikely(!loc->elts_free ||
                                              !loc->wqe_free))
                                         return MLX5_TXCMP_CODE_EXIT;
                                 return MLX5_TXCMP_CODE_MULTI;
                         }
-                       assert(NB_SEGS(loc->mbuf) == 1);
+                       MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
                         if (ret == MLX5_TXCMP_CODE_TSO) {
                                 part -= room;
-                               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+                               mlx5_tx_idone_empw(txq, loc, part,
+                                                  slen, wqem, olx);
                                 if (unlikely(!loc->elts_free ||
                                              !loc->wqe_free))
                                         return MLX5_TXCMP_CODE_EXIT;
@@ -4203,16 +4466,18 @@ next_mbuf:
                         }
                         if (ret == MLX5_TXCMP_CODE_SINGLE) {
                                 part -= room;
-                               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+                               mlx5_tx_idone_empw(txq, loc, part,
+                                                  slen, wqem, olx);
                                 if (unlikely(!loc->elts_free ||
                                              !loc->wqe_free))
                                         return MLX5_TXCMP_CODE_EXIT;
                                 return MLX5_TXCMP_CODE_SINGLE;
                         }
                         if (ret != MLX5_TXCMP_CODE_EMPW) {
-                               assert(false);
+                               MLX5_ASSERT(false);
                                 part -= room;
-                               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+                               mlx5_tx_idone_empw(txq, loc, part,
+                                                  slen, wqem, olx);
                                 return MLX5_TXCMP_CODE_ERROR;
                         }
                         /* Check if we have minimal room left. */
@@ -4226,8 +4491,10 @@ next_mbuf:
                          * - metadata value
                          * - software parser settings
                          * - packets length (legacy MPW only)
+                        * - scheduling is not required
                          */
-                       if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx))
+                       if (!mlx5_tx_match_empw(txq, &wqem->eseg,
+                                               loc, dlen, olx))
                                 break;
                         /* Packet attributes match, continue the same eMPW. */
                         if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
@@ -4237,17 +4504,17 @@ next_mbuf:
                  * We get here to close an existing eMPW
                  * session and start the new one.
                  */
-               assert(pkts_n);
+               MLX5_ASSERT(pkts_n);
                 part -= room;
                 if (unlikely(!part))
                         return MLX5_TXCMP_CODE_EXIT;
-               mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+               mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
                 if (unlikely(!loc->elts_free ||
                              !loc->wqe_free))
                         return MLX5_TXCMP_CODE_EXIT;
                 /* Continue the loop with new eMPW session. */
         }
-       assert(false);
+       MLX5_ASSERT(false);
  }
  
  /**
@@ -4255,25 +4522,35 @@ next_mbuf:
   * Data inlining and VLAN insertion are supported.
   */
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
-                         struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq,
+                         struct rte_mbuf **__rte_restrict pkts,
                           unsigned int pkts_n,
-                         struct mlx5_txq_local *restrict loc,
+                         struct mlx5_txq_local *__rte_restrict loc,
                           unsigned int olx)
  {
         /*
          * Subroutine is the part of mlx5_tx_burst_single()
          * and sends single-segment packet with SEND opcode.
          */
-       assert(loc->elts_free && loc->wqe_free);
-       assert(pkts_n > loc->pkts_sent);
+       MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+       MLX5_ASSERT(pkts_n > loc->pkts_sent);
         pkts += loc->pkts_sent + 1;
         pkts_n -= loc->pkts_sent;
         for (;;) {
-               struct mlx5_wqe *restrict wqe;
+               struct mlx5_wqe *__rte_restrict wqe;
                 enum mlx5_txcmp_code ret;
  
-               assert(NB_SEGS(loc->mbuf) == 1);
+               MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+               if (MLX5_TXOFF_CONFIG(TXPP)) {
+                       enum mlx5_txcmp_code wret;
+
+                       /* Generate WAIT for scheduling if requested. */
+                       wret = mlx5_tx_schedule_send(txq, loc, olx);
+                       if (wret == MLX5_TXCMP_CODE_EXIT)
+                               return MLX5_TXCMP_CODE_EXIT;
+                       if (wret == MLX5_TXCMP_CODE_ERROR)
+                               return MLX5_TXCMP_CODE_ERROR;
+               }
                 if (MLX5_TXOFF_CONFIG(INLINE)) {
                         unsigned int inlen, vlan = 0;
  
@@ -4293,7 +4570,8 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                          * Otherwise we would do extra check for data
                          * size to avoid crashes due to length overflow.
                          */
-                       assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+                       MLX5_ASSERT(txq->inlen_send >=
+                                   MLX5_ESEG_MIN_INLINE_SIZE);
                         if (inlen <= txq->inlen_send) {
                                 unsigned int seg_n, wqe_n;
  
@@ -4302,6 +4580,36 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                                 /* Check against minimal length. */
                                 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
                                         return MLX5_TXCMP_CODE_ERROR;
+                               if (loc->mbuf->ol_flags &
+                                   PKT_TX_DYNF_NOINLINE) {
+                                       /*
+                                        * The hint flag not to inline packet
+                                        * data is set. Check whether we can
+                                        * follow the hint.
+                                        */
+                                       if ((!MLX5_TXOFF_CONFIG(EMPW) &&
+                                             txq->inlen_mode) ||
+                                           (MLX5_TXOFF_CONFIG(MPW) &&
+                                            txq->inlen_mode)) {
+                                               if (inlen <= txq->inlen_send)
+                                                       goto single_inline;
+                                               /*
+                                                * The hardware requires the
+                                                * minimal inline data header.
+                                                */
+                                               goto single_min_inline;
+                                       }
+                                       if (MLX5_TXOFF_CONFIG(VLAN) &&
+                                           vlan && !txq->vlan_en) {
+                                               /*
+                                                * We must insert VLAN tag
+                                                * by software means.
+                                                */
+                                               goto single_part_inline;
+                                       }
+                                       goto single_no_inline;
+                               }
+single_inline:
                                 /*
                                  * Completely inlined packet data WQE:
                                  * - Control Segment, SEND opcode
@@ -4340,7 +4648,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                                  * not contain inlined data for eMPW due to
                                  * segment shared for all packets.
                                  */
-                               struct mlx5_wqe_dseg *restrict dseg;
+                               struct mlx5_wqe_dseg *__rte_restrict dseg;
                                 unsigned int ds;
                                 uint8_t *dptr;
  
@@ -4351,10 +4659,11 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                                  * We should check the free space in
                                  * WQE ring buffer to inline partially.
                                  */
-                               assert(txq->inlen_send >= txq->inlen_mode);
-                               assert(inlen > txq->inlen_mode);
-                               assert(txq->inlen_mode >=
-                                               MLX5_ESEG_MIN_INLINE_SIZE);
+single_min_inline:
+                               MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
+                               MLX5_ASSERT(inlen > txq->inlen_mode);
+                               MLX5_ASSERT(txq->inlen_mode >=
+                                           MLX5_ESEG_MIN_INLINE_SIZE);
                                 /*
                                  * Check whether there are enough free WQEBBs:
                                  * - Control Segment
@@ -4397,7 +4706,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                                 txq->wqe_ci += (ds + 3) / 4;
                                 loc->wqe_free -= (ds + 3) / 4;
                                 /* We have to store mbuf in elts.*/
-                               assert(MLX5_TXOFF_CONFIG(INLINE));
+                               MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
                                 txq->elts[txq->elts_head++ & txq->elts_m] =
                                                 loc->mbuf;
                                 --loc->elts_free;
@@ -4418,6 +4727,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                                  * We also get here if VLAN insertion is not
                                  * supported by HW, the inline is enabled.
                                  */
+single_part_inline:
                                 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
                                 loc->wqe_last = wqe;
                                 mlx5_tx_cseg_init(txq, loc, wqe, 4,
@@ -4430,14 +4740,14 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                                  * comparing with txq->inlen_send. We should
                                  * not get overflow here.
                                  */
-                               assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
+                               MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
                                 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
                                 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
                                                  dptr, dlen, olx);
                                 ++txq->wqe_ci;
                                 --loc->wqe_free;
                                 /* We have to store mbuf in elts.*/
-                               assert(MLX5_TXOFF_CONFIG(INLINE));
+                               MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
                                 txq->elts[txq->elts_head++ & txq->elts_m] =
                                                 loc->mbuf;
                                 --loc->elts_free;
@@ -4458,6 +4768,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                          * - Ethernet Segment, optional VLAN, no inline
                          * - Data Segment, pointer type
                          */
+single_no_inline:
                         wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
                         loc->wqe_last = wqe;
                         mlx5_tx_cseg_init(txq, loc, wqe, 3,
@@ -4474,7 +4785,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                          * if no inlining is configured, this is done
                          * by calling routine in a batch copy.
                          */
-                       assert(!MLX5_TXOFF_CONFIG(INLINE));
+                       MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
                         --loc->elts_free;
  #ifdef MLX5_PMD_SOFT_COUNTERS
                         /* Update sent data bytes counter. */
@@ -4496,14 +4807,14 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
                 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
                         return ret;
         }
-       assert(false);
+       MLX5_ASSERT(false);
  }
  
  static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
-                    struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
+                    struct rte_mbuf **__rte_restrict pkts,
                      unsigned int pkts_n,
-                    struct mlx5_txq_local *restrict loc,
+                    struct mlx5_txq_local *__rte_restrict loc,
                      unsigned int olx)
  {
         enum mlx5_txcmp_code ret;
@@ -4511,7 +4822,7 @@ mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
         ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
         if (ret == MLX5_TXCMP_CODE_SINGLE)
                 goto ordinary_send;
-       assert(ret == MLX5_TXCMP_CODE_EMPW);
+       MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
         for (;;) {
                 /* Optimize for inline/no inline eMPW send. */
                 ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
@@ -4522,14 +4833,14 @@ mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
                 if (ret != MLX5_TXCMP_CODE_SINGLE)
                         return ret;
                 /* The resources to send one packet should remain. */
-               assert(loc->elts_free && loc->wqe_free);
+               MLX5_ASSERT(loc->elts_free && loc->wqe_free);
  ordinary_send:
                 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
-               assert(ret != MLX5_TXCMP_CODE_SINGLE);
+               MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
                 if (ret != MLX5_TXCMP_CODE_EMPW)
                         return ret;
                 /* The resources to send one packet should remain. */
-               assert(loc->elts_free && loc->wqe_free);
+               MLX5_ASSERT(loc->elts_free && loc->wqe_free);
         }
  }
  
@@ -4554,8 +4865,8 @@ ordinary_send:
   *   Number of packets successfully transmitted (<= pkts_n).
   */
  static __rte_always_inline uint16_t
-mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
-                  struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
+                  struct rte_mbuf **__rte_restrict pkts,
                    uint16_t pkts_n,
                    unsigned int olx)
  {
@@ -4563,8 +4874,8 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
         enum mlx5_txcmp_code ret;
         unsigned int part;
  
-       assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
-       assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+       MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+       MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
         if (unlikely(!pkts_n))
                 return 0;
         loc.pkts_sent = 0;
@@ -4585,15 +4896,15 @@ send_loop:
         /*
          * Calculate the number of available resources - elts and WQEs.
          * There are two possible different scenarios:
-        * - no data inlining into WQEs, one WQEBB may contains upto
+        * - no data inlining into WQEs, one WQEBB may contains up to
          *   four packets, in this case elts become scarce resource
          * - data inlining into WQEs, one packet may require multiple
          *   WQEBBs, the WQEs become the limiting factor.
          */
-       assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+       MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
         loc.elts_free = txq->elts_s -
                                 (uint16_t)(txq->elts_head - txq->elts_tail);
-       assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+       MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
         loc.wqe_free = txq->wqe_s -
                                 (uint16_t)(txq->wqe_ci - txq->wqe_pi);
         if (unlikely(!loc.elts_free || !loc.wqe_free))
@@ -4615,7 +4926,7 @@ send_loop:
                          * per WQE, do it in dedicated routine.
                          */
  enter_send_multi:
-                       assert(loc.pkts_sent >= loc.pkts_copy);
+                       MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
                         part = loc.pkts_sent - loc.pkts_copy;
                         if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
                                 /*
@@ -4629,7 +4940,7 @@ enter_send_multi:
                                                   part, olx);
                                 loc.pkts_copy = loc.pkts_sent;
                         }
-                       assert(pkts_n > loc.pkts_sent);
+                       MLX5_ASSERT(pkts_n > loc.pkts_sent);
                         ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
                         if (!MLX5_TXOFF_CONFIG(INLINE))
                                 loc.pkts_copy = loc.pkts_sent;
@@ -4671,7 +4982,7 @@ enter_send_multi:
                                 goto enter_send_tso;
                         }
                         /* We must not get here. Something is going wrong. */
-                       assert(false);
+                       MLX5_ASSERT(false);
                         txq->stats.oerrors++;
                         break;
                 }
@@ -4685,8 +4996,8 @@ enter_send_multi:
                          * in dedicated branch.
                          */
  enter_send_tso:
-                       assert(NB_SEGS(loc.mbuf) == 1);
-                       assert(pkts_n > loc.pkts_sent);
+                       MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
+                       MLX5_ASSERT(pkts_n > loc.pkts_sent);
                         ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
                         /*
                          * These returned code checks are supposed
@@ -4709,7 +5020,7 @@ enter_send_tso:
                                 goto enter_send_multi;
                         }
                         /* We must not get here. Something is going wrong. */
-                       assert(false);
+                       MLX5_ASSERT(false);
                         txq->stats.oerrors++;
                         break;
                 }
@@ -4722,7 +5033,7 @@ enter_send_tso:
                  * offloads are requested at SQ configuration time).
                  */
  enter_send_single:
-               assert(pkts_n > loc.pkts_sent);
+               MLX5_ASSERT(pkts_n > loc.pkts_sent);
                 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
                 /*
                  * These returned code checks are supposed
@@ -4751,7 +5062,7 @@ enter_send_single:
                         goto enter_send_tso;
                 }
                 /* We must not get here. Something is going wrong. */
-               assert(false);
+               MLX5_ASSERT(false);
                 txq->stats.oerrors++;
                 break;
         }
@@ -4761,7 +5072,8 @@ enter_send_single:
          * - doorbell the hardware
          * - copy the rest of mbufs to elts (if any)
          */
-       assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
+       MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
+                   loc.pkts_sent >= loc.pkts_copy);
         /* Take a shortcut if nothing is sent. */
         if (unlikely(loc.pkts_sent == loc.pkts_loop))
                 goto burst_exit;
@@ -4814,8 +5126,8 @@ enter_send_single:
                 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
                 loc.pkts_copy = loc.pkts_sent;
         }
-       assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
-       assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+       MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+       MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
         if (pkts_n > loc.pkts_sent) {
                 /*
                  * If burst size is large there might be no enough CQE
@@ -4966,9 +5278,48 @@ MLX5_TXOFF_DECL(iv,
                 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
                 MLX5_TXOFF_CONFIG_METADATA)
  
+/* Generate routines with timestamp scheduling (TXPP set in every variant). */
+MLX5_TXOFF_DECL(full_ts_nompw, /* FULL group plus TXPP */
+               MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_DECL(full_ts_nompwi, /* explicit list: no INLINE, no EMPW */
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+               MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_DECL(full_ts, /* FULL group plus TXPP and EMPW */
+               MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
+               MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(full_ts_noi, /* same list as full_ts_nompwi plus EMPW */
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+               MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_ts, /* NONE (0) plus TXPP and EMPW */
+               MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
+               MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mdi_ts, /* INLINE and METADATA, plus TXPP and EMPW */
+               MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_ts, /* mdi_ts flags plus MULTI and TSO */
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_ts, /* mti_ts flags plus VLAN */
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+               MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
+               MLX5_TXOFF_CONFIG_EMPW)
+
  /*
   * Generate routines with Legacy Multi-Packet Write support.
- * This mode is supported by ConnectX-4LX only and imposes
+ * This mode is supported by ConnectX-4 Lx only and imposes
   * offload limitations, not supported:
   *   - ACL/Flows (metadata are becoming meaningless)
   *   - WQE Inline headers
@@ -5070,6 +5421,44 @@ MLX5_TXOFF_INFO(iv_empw,
                 MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
                 MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
  
+MLX5_TXOFF_INFO(full_ts_nompw,
+               MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_INFO(full_ts_nompwi,
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+               MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_INFO(full_ts,
+               MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
+               MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(full_ts_noi,
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+               MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(none_ts,
+               MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
+               MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mdi_ts,
+               MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mti_ts,
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+               MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtiv_ts,
+               MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+               MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+               MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
+               MLX5_TXOFF_CONFIG_EMPW)
+
  MLX5_TXOFF_INFO(full,
                 MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
                 MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
@@ -5186,7 +5575,7 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
                       "invalid WQE Data Segment size");
         static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE,
                       "invalid WQE size");
-       assert(priv);
+       MLX5_ASSERT(priv);
         if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) {
                 /* We should support Multi-Segment Packets. */
                 olx |= MLX5_TXOFF_CONFIG_MULTI;
@@ -5216,6 +5605,14 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
                 /* We should support VLAN insertion. */
                 olx |= MLX5_TXOFF_CONFIG_VLAN;
         }
+       if (tx_offloads & DEV_TX_OFFLOAD_SEND_ON_TIMESTAMP &&
+           rte_mbuf_dynflag_lookup
+                       (RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL) >= 0 &&
+           rte_mbuf_dynfield_lookup
+                       (RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL) >= 0) {
+               /* Offload configured, dynamic entities registered. */
+               olx |= MLX5_TXOFF_CONFIG_TXPP;
+       }
         if (priv->txqs_n && (*priv->txqs)[0]) {
                 struct mlx5_txq_data *txd = (*priv->txqs)[0];
  
@@ -5276,12 +5673,18 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
                         /* Does not meet requested offloads at all. */
                         continue;
                 }
+               if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_MPW)
+                       /* Do not enable legacy MPW if not configured. */
+                       continue;
                 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
                         /* Do not enable eMPW if not configured. */
                         continue;
                 if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
                         /* Do not enable inlining if not configured. */
                         continue;
+               if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_TXPP)
+                       /* Do not enable scheduling if not configured. */
+                       continue;
                 /*
                  * Some routine meets the requirements.
                  * Check whether it has minimal amount
@@ -5326,6 +5729,8 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
                 DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
         if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
                 DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
+       if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TXPP)
+               DRV_LOG(DEBUG, "\tMETAD (tx Scheduling)");
         if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
                 if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
                         DRV_LOG(DEBUG, "\tMPW   (Legacy MPW)");
@@ -5334,3 +5739,94 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
         }
         return txoff_func[m].func;
  }
+
+/**
+ * DPDK callback to get the Tx queue information.
+ *
+ * @param dev
+ *   Pointer to the device structure.
+ *
+ * @param tx_queue_id
+ *   Tx queue identifier.
+ *
+ * @param qinfo
+ *   Pointer to the Tx queue information structure to fill. It is left
+ *   untouched when the queue slot selected by tx_queue_id is empty.
+ *
+ * @return
+ *   None.
+ */
+void
+mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+                 struct rte_eth_txq_info *qinfo)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
+
+       if (!txq)
+               return;
+       qinfo->nb_desc = txq->elts_s;
+       qinfo->conf.tx_thresh.pthresh = 0;
+       qinfo->conf.tx_thresh.hthresh = 0;
+       qinfo->conf.tx_thresh.wthresh = 0;
+       qinfo->conf.tx_rs_thresh = 0;
+       qinfo->conf.tx_free_thresh = 0;
+       /* NOTE(review): formerly "txq_ctrl ? 0 : 1" on a container_of()
+        * result that cannot be NULL for a non-NULL txq, hence always 0. */
+       qinfo->conf.tx_deferred_start = 0;
+       qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+}
+
+/**
+ * DPDK callback to get the Tx packet burst mode information.
+ *
+ * @param dev
+ *   Pointer to the device structure.
+ *
+ * @param tx_queue_id
+ *   Tx queue identifier (unused).
+ *
+ * @param mode
+ *   Pointer to the burst mode information; mode->info receives the text.
+ *
+ * @return
+ *   0 on success, -EINVAL if the active burst routine is not in txoff_func.
+ */
+
+int
+mlx5_tx_burst_mode_get(struct rte_eth_dev *dev,
+                      uint16_t tx_queue_id __rte_unused,
+                      struct rte_eth_burst_mode *mode)
+{
+       eth_tx_burst_t pkt_burst = dev->tx_pkt_burst;
+       unsigned int i, olx;
+
+       for (i = 0; i < RTE_DIM(txoff_func); i++) {
+               if (pkt_burst != txoff_func[i].func)
+                       continue;
+               olx = txoff_func[i].olx;
+               snprintf(mode->info, sizeof(mode->info),
+                        "%s%s%s%s%s%s%s%s%s",
+                        (olx & MLX5_TXOFF_CONFIG_EMPW) ?
+                        ((olx & MLX5_TXOFF_CONFIG_MPW) ?
+                        "Legacy MPW" : "Enhanced MPW") : "No MPW",
+                        (olx & MLX5_TXOFF_CONFIG_MULTI) ?
+                        " + MULTI" : "",
+                        (olx & MLX5_TXOFF_CONFIG_TSO) ?
+                        " + TSO" : "",
+                        (olx & MLX5_TXOFF_CONFIG_SWP) ?
+                        " + SWP" : "",
+                        (olx & MLX5_TXOFF_CONFIG_CSUM) ?
+                        " + CSUM" : "",
+                        (olx & MLX5_TXOFF_CONFIG_INLINE) ?
+                        " + INLINE" : "",
+                        (olx & MLX5_TXOFF_CONFIG_VLAN) ?
+                        " + VLAN" : "",
+                        (olx & MLX5_TXOFF_CONFIG_METADATA) ?
+                        " + METADATA" : "",
+                        (olx & MLX5_TXOFF_CONFIG_TXPP) ?
+                        " + TXPP" : "");
+               return 0;
+       }
+       return -EINVAL;
+}