X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_rxtx.c;h=a21201038ca0273362e14e4ed29633ee87cfe203;hb=611faa5f46cc67449f272e14450fc6a0a275767d;hp=5eea932d44000ec8cda94c6d92c6b6001d5affc0;hpb=26f1bae837eb11c86dcb058d8ab6cc8e44989b1c;p=dpdk.git

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 5eea932d44..a21201038c 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -33,6 +33,7 @@
 
 #include "mlx5_defs.h"
 #include "mlx5.h"
+#include "mlx5_mr.h"
 #include "mlx5_utils.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
@@ -1000,6 +1001,7 @@ static int
 mlx5_queue_state_modify(struct rte_eth_dev *dev,
 			struct mlx5_mp_arg_queue_state_modify *sm)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	int ret = 0;
 
 	switch (rte_eal_process_type()) {
@@ -1007,7 +1009,7 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev,
 		ret = mlx5_queue_state_modify_primary(dev, sm);
 		break;
 	case RTE_PROC_SECONDARY:
-		ret = mlx5_mp_req_queue_state_modify(dev, sm);
+		ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm);
 		break;
 	default:
 		break;
@@ -1337,9 +1339,10 @@ rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
 			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
 		}
 	}
-	if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) {
-		pkt->ol_flags |= PKT_RX_DYNF_METADATA;
-		*RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata;
+	if (rxq->dynf_meta && cqe->flow_table_metadata) {
+		pkt->ol_flags |= rxq->flow_meta_mask;
+		*RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) =
+			cqe->flow_table_metadata;
 	}
 	if (rxq->csum)
 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
@@ -1658,21 +1661,20 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i = 0;
 	uint32_t rq_ci = rxq->rq_ci;
 	uint16_t consumed_strd = rxq->consumed_strd;
-	uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM;
 	struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
 
 	while (i < pkts_n) {
 		struct rte_mbuf *pkt;
 		void *addr;
 		int ret;
-		unsigned int len;
+		uint32_t len;
 		uint16_t strd_cnt;
 		uint16_t strd_idx;
 		uint32_t offset;
 		uint32_t byte_cnt;
+		int32_t hdrm_overlap;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 		uint32_t rss_hash_res = 0;
-		uint8_t lro_num_seg;
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
@@ -1719,18 +1721,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		MLX5_ASSERT(strd_idx < strd_n);
 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
 			    wq_mask));
-		lro_num_seg = cqe->lro_num_seg;
-		/*
-		 * Currently configured to receive a packet per a stride. But if
-		 * MTU is adjusted through kernel interface, device could
-		 * consume multiple strides without raising an error. In this
-		 * case, the packet should be dropped because it is bigger than
-		 * the max_rx_pkt_len.
-		 */
-		if (unlikely(!lro_num_seg && strd_cnt > 1)) {
-			++rxq->stats.idropped;
-			continue;
-		}
 		pkt = rte_pktmbuf_alloc(rxq->mp);
 		if (unlikely(pkt == NULL)) {
 			++rxq->stats.rx_nombuf;
@@ -1742,23 +1732,57 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			len -= RTE_ETHER_CRC_LEN;
 		offset = strd_idx * strd_sz + strd_shift;
 		addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+		hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz;
 		/*
 		 * Memcpy packets to the target mbuf if:
 		 * - The size of packet is smaller than mprq_max_memcpy_len.
 		 * - Out of buffer in the Mempool for Multi-Packet RQ.
+		 * - The packet's stride overlaps a headroom and scatter is off.
 		 */
-		if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
-			/*
-			 * When memcpy'ing packet due to out-of-buffer, the
-			 * packet must be smaller than the target mbuf.
-			 */
-			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
+		if (len <= rxq->mprq_max_memcpy_len ||
+		    rxq->mprq_repl == NULL ||
+		    (hdrm_overlap > 0 && !rxq->strd_scatter_en)) {
+			if (likely(rte_pktmbuf_tailroom(pkt) >= len)) {
+				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+					   addr, len);
+				DATA_LEN(pkt) = len;
+			} else if (rxq->strd_scatter_en) {
+				struct rte_mbuf *prev = pkt;
+				uint32_t seg_len =
+					RTE_MIN(rte_pktmbuf_tailroom(pkt), len);
+				uint32_t rem_len = len - seg_len;
+
+				rte_memcpy(rte_pktmbuf_mtod(pkt, void *),
+					   addr, seg_len);
+				DATA_LEN(pkt) = seg_len;
+				while (rem_len) {
+					struct rte_mbuf *next =
+						rte_pktmbuf_alloc(rxq->mp);
+
+					if (unlikely(next == NULL)) {
+						rte_pktmbuf_free(pkt);
+						++rxq->stats.rx_nombuf;
+						goto out;
+					}
+					NEXT(prev) = next;
+					SET_DATA_OFF(next, 0);
+					addr = RTE_PTR_ADD(addr, seg_len);
+					seg_len = RTE_MIN
+						(rte_pktmbuf_tailroom(next),
+						 rem_len);
+					rte_memcpy
+						(rte_pktmbuf_mtod(next, void *),
+						 addr, seg_len);
+					DATA_LEN(next) = seg_len;
+					rem_len -= seg_len;
+					prev = next;
+					++NB_SEGS(pkt);
+				}
+			} else {
 				rte_pktmbuf_free_seg(pkt);
 				++rxq->stats.idropped;
 				continue;
 			}
-			rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
-			DATA_LEN(pkt) = len;
 		} else {
 			rte_iova_t buf_iova;
 			struct rte_mbuf_ext_shared_info *shinfo;
@@ -1769,7 +1793,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			rte_atomic16_add_return(&buf->refcnt, 1);
 			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
 				    strd_n + 1);
-			buf_addr = RTE_PTR_SUB(addr, headroom_sz);
+			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
 			/*
 			 * MLX5 device doesn't use iova but it is necessary in a
 			 * case where the Rx packet is transmitted via a
@@ -1788,43 +1812,42 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
 						  buf_len, shinfo);
 			/* Set mbuf head-room. */
-			pkt->data_off = headroom_sz;
+			SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM);
 			MLX5_ASSERT(pkt->ol_flags == EXT_ATTACHED_MBUF);
-			/*
-			 * Prevent potential overflow due to MTU change through
-			 * kernel interface.
-			 */
-			if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
-				rte_pktmbuf_free_seg(pkt);
-				++rxq->stats.idropped;
-				continue;
-			}
+			MLX5_ASSERT(rte_pktmbuf_tailroom(pkt) <
+				len - (hdrm_overlap > 0 ? hdrm_overlap : 0));
 			DATA_LEN(pkt) = len;
 			/*
-			 * LRO packet may consume all the stride memory, in this
-			 * case packet head-room space is not guaranteed so must
-			 * to add an empty mbuf for the head-room.
+			 * Copy the last fragment of a packet (up to headroom
+			 * size bytes) in case there is a stride overlap with
+			 * a next packet's headroom. Allocate a separate mbuf
+			 * to store this fragment and link it. Scatter is on.
 			 */
-			if (!rxq->strd_headroom_en) {
-				struct rte_mbuf *headroom_mbuf =
-						rte_pktmbuf_alloc(rxq->mp);
+			if (hdrm_overlap > 0) {
+				MLX5_ASSERT(rxq->strd_scatter_en);
+				struct rte_mbuf *seg =
+					rte_pktmbuf_alloc(rxq->mp);
 
-				if (unlikely(headroom_mbuf == NULL)) {
+				if (unlikely(seg == NULL)) {
 					rte_pktmbuf_free_seg(pkt);
 					++rxq->stats.rx_nombuf;
 					break;
 				}
-				PORT(pkt) = rxq->port_id;
-				NEXT(headroom_mbuf) = pkt;
-				pkt = headroom_mbuf;
+				SET_DATA_OFF(seg, 0);
+				rte_memcpy(rte_pktmbuf_mtod(seg, void *),
+					RTE_PTR_ADD(addr, len - hdrm_overlap),
+					hdrm_overlap);
+				DATA_LEN(seg) = hdrm_overlap;
+				DATA_LEN(pkt) = len - hdrm_overlap;
+				NEXT(pkt) = seg;
 				NB_SEGS(pkt) = 2;
 			}
 		}
 		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
-		if (lro_num_seg > 1) {
+		if (cqe->lro_num_seg > 1) {
 			mlx5_lro_update_hdr(addr, cqe, len);
 			pkt->ol_flags |= PKT_RX_LRO;
-			pkt->tso_segsz = strd_sz;
+			pkt->tso_segsz = len / cqe->lro_num_seg;
 		}
 		PKT_LEN(pkt) = len;
 		PORT(pkt) = rxq->port_id;
@@ -1836,6 +1859,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		*(pkts++) = pkt;
 		++i;
 	}
+out:
 	/* Update the consumer indexes. */
 	rxq->consumed_strd = consumed_strd;
 	rte_cio_wmb();
@@ -2160,7 +2184,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
 {
 	unsigned int count = MLX5_TX_COMP_MAX_CQE;
 	volatile struct mlx5_cqe *last_cqe = NULL;
-	uint16_t ci = txq->cq_ci;
+	bool ring_doorbell = false;
 	int ret;
 
 	static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value");
@@ -2168,8 +2192,8 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
 	do {
 		volatile struct mlx5_cqe *cqe;
 
-		cqe = &txq->cqes[ci & txq->cqe_m];
-		ret = check_cqe(cqe, txq->cqe_s, ci);
+		cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
+		ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
 		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
 			if (likely(ret != MLX5_CQE_STATUS_ERR)) {
 				/* No new CQEs in completion queue. */
@@ -2183,7 +2207,6 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
 			 * here, before we might perform SQ reset.
 			 */
 			rte_wmb();
-			txq->cq_ci = ci;
 			ret = mlx5_tx_error_cqe_handle
 				(txq, (volatile struct mlx5_err_cqe *)cqe);
 			if (unlikely(ret < 0)) {
@@ -2199,16 +2222,18 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
 			 * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status.
 			 * The send queue is supposed to be empty.
 			 */
-			++ci;
-			txq->cq_pi = ci;
+			ring_doorbell = true;
+			++txq->cq_ci;
+			txq->cq_pi = txq->cq_ci;
 			last_cqe = NULL;
 			continue;
 		}
 		/* Normal transmit completion. */
-		MLX5_ASSERT(ci != txq->cq_pi);
-		MLX5_ASSERT((txq->fcqs[ci & txq->cqe_m] >> 16) ==
+		MLX5_ASSERT(txq->cq_ci != txq->cq_pi);
+		MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) ==
 			    cqe->wqe_counter);
-		++ci;
+		ring_doorbell = true;
+		++txq->cq_ci;
 		last_cqe = cqe;
 		/*
 		 * We have to restrict the amount of processed CQEs
@@ -2221,14 +2246,10 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
 		if (likely(--count == 0))
 			break;
 	} while (true);
-	if (likely(ci != txq->cq_ci)) {
-		/*
-		 * Update completion queue consuming index
-		 * and ring doorbell to notify hardware.
-		 */
+	if (likely(ring_doorbell)) {
+		/* Ring doorbell to notify hardware. */
 		rte_compiler_barrier();
-		txq->cq_ci = ci;
-		*txq->cq_db = rte_cpu_to_be_32(ci);
+		*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
 		mlx5_tx_comp_flush(txq, last_cqe, olx);
 	}
 }
@@ -2262,6 +2283,7 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
 	     (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
 		volatile struct mlx5_wqe *last = loc->wqe_last;
 
+		MLX5_ASSERT(last);
 		txq->elts_comp = head;
 		if (MLX5_TXOFF_CONFIG(INLINE))
 			txq->wqe_comp = txq->wqe_ci;
@@ -2949,8 +2971,14 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
 	unsigned int part;
 	uint8_t *pdst;
 
-	dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
-	pdst = &dseg->inline_data[0];
+	if (!MLX5_TXOFF_CONFIG(MPW)) {
+		/* Store the descriptor byte counter for eMPW sessions. */
+		dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
+		pdst = &dseg->inline_data[0];
+	} else {
+		/* The entire legacy MPW session counter is stored on close. */
+		pdst = (uint8_t *)dseg;
+	}
 	/*
 	 * The WQEBB space availability is checked by caller.
 	 * Here we should be aware of WQE ring buffer wraparound only.
@@ -2962,7 +2990,8 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq,
 		len -= part;
 		if (likely(!len)) {
 			pdst += part;
-			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (!MLX5_TXOFF_CONFIG(MPW))
+				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
 			/* Note: no final wraparound check here. */
 			return (struct mlx5_wqe_dseg *)pdst;
 		}
@@ -3010,9 +3039,16 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
 	static_assert(MLX5_DSEG_MIN_INLINE_SIZE ==
 				 (2 * RTE_ETHER_ADDR_LEN),
 		      "invalid Data Segment data size");
-	dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) |
-					MLX5_ETH_WQE_DATA_INLINE);
-	pdst = &dseg->inline_data[0];
+	if (!MLX5_TXOFF_CONFIG(MPW)) {
+		/* Store the descriptor byte counter for eMPW sessions. */
+		dseg->bcount = rte_cpu_to_be_32
+				((len + sizeof(struct rte_vlan_hdr)) |
+				 MLX5_ETH_WQE_DATA_INLINE);
+		pdst = &dseg->inline_data[0];
+	} else {
+		/* The entire legacy MPW session counter is stored on close. */
+		pdst = (uint8_t *)dseg;
+	}
 	memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
 	buf += MLX5_DSEG_MIN_INLINE_SIZE;
 	pdst += MLX5_DSEG_MIN_INLINE_SIZE;
@@ -3035,7 +3071,8 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq,
 		len -= part;
 		if (likely(!len)) {
 			pdst += part;
-			pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
+			if (!MLX5_TXOFF_CONFIG(MPW))
+				pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
 			/* Note: no final wraparound check here. */
 			return (struct mlx5_wqe_dseg *)pdst;
 		}
@@ -3906,6 +3943,8 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
  *   Total size of descriptor/data in bytes.
  * @param slen
  *   Accumulated statistics, data bytes sent.
+ * @param wqem
+ *   The base WQE for the eMPW/MPW descriptor.
  * @param olx
  *   Configured Tx offloads mask. It is fully defined at
  *   compile time and may be used for optimization.
@@ -3919,20 +3958,40 @@ mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
 		   struct mlx5_txq_local *restrict loc,
 		   unsigned int len,
 		   unsigned int slen,
+		   struct mlx5_wqe *restrict wqem,
 		   unsigned int olx __rte_unused)
 {
+	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
+
 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
-	MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Update sent data bytes counter. */
 	 txq->stats.obytes += slen;
 #else
 	(void)slen;
 #endif
-	len = len / MLX5_WSEG_SIZE + 2;
-	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
+		/*
+		 * If the legacy MPW session contains the inline packets
+		 * we should set the only inline data segment length
+		 * and align the total length to the segment size.
+		 */
+		MLX5_ASSERT(len > sizeof(dseg->bcount));
+		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
+						MLX5_ETH_WQE_DATA_INLINE);
+		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
+	} else {
+		/*
+		 * The session is not legacy MPW or contains the
+		 * data buffer pointer segments.
+		 */
+		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
+		len = len / MLX5_WSEG_SIZE + 2;
+	}
+	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
 	txq->wqe_ci += (len + 3) / 4;
 	loc->wqe_free -= (len + 3) / 4;
+	loc->wqe_last = wqem;
 }
 
 /**
@@ -4169,7 +4228,7 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 	pkts_n -= loc->pkts_sent;
 	for (;;) {
 		struct mlx5_wqe_dseg *restrict dseg;
-		struct mlx5_wqe_eseg *restrict eseg;
+		struct mlx5_wqe *restrict wqem;
 		enum mlx5_txcmp_code ret;
 		unsigned int room, part, nlim;
 		unsigned int slen = 0;
@@ -4188,26 +4247,34 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 			return MLX5_TXCMP_CODE_EXIT;
 		if (likely(pkts_n > 1))
 			rte_prefetch0(*pkts);
-		loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
 		/*
 		 * Build eMPW title WQEBB:
 		 * - Control Segment, eMPW opcode, zero DS
 		 * - Ethernet Segment, no inline
 		 */
-		mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0,
+		mlx5_tx_cseg_init(txq, loc, wqem, 0,
 				  MLX5_OPCODE_ENHANCED_MPSW, olx);
-		mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
+		mlx5_tx_eseg_none(txq, loc, wqem,
 				  olx & ~MLX5_TXOFF_CONFIG_VLAN);
-		eseg = &loc->wqe_last->eseg;
-		dseg = &loc->wqe_last->dseg[0];
+		dseg = &wqem->dseg[0];
 		/* Store the packet length for legacy MPW. */
 		if (MLX5_TXOFF_CONFIG(MPW))
-			eseg->mss = rte_cpu_to_be_16
-					(rte_pktmbuf_data_len(loc->mbuf));
+			wqem->eseg.mss = rte_cpu_to_be_16
+					 (rte_pktmbuf_data_len(loc->mbuf));
 		room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
 			       loc->wqe_free) * MLX5_WQE_SIZE -
 					MLX5_WQE_CSEG_SIZE -
 					MLX5_WQE_ESEG_SIZE;
+		/* Limit the room for legacy MPW sessions for performance. */
+		if (MLX5_TXOFF_CONFIG(MPW))
+			room = RTE_MIN(room,
+				       RTE_MAX(txq->inlen_empw +
+					       sizeof(dseg->bcount) +
+					       (MLX5_TXOFF_CONFIG(VLAN) ?
+					       sizeof(struct rte_vlan_hdr) : 0),
+					       MLX5_MPW_INLINE_MAX_PACKETS *
+					       MLX5_WQE_DSEG_SIZE));
 		/* Build WQE till we have space, packets and resources. */
 		part = room;
 		for (;;) {
@@ -4231,15 +4298,36 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 				 * We have some successfully built
 				 * packet Data Segments to send.
 				 */
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				return MLX5_TXCMP_CODE_ERROR;
 			}
 			/* Inline or not inline - that's the Question. */
 			if (dlen > txq->inlen_empw ||
 			    loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE)
 				goto pointer_empw;
+			if (MLX5_TXOFF_CONFIG(MPW)) {
+				if (dlen > txq->inlen_send)
+					goto pointer_empw;
+				tlen = dlen;
+				if (part == room) {
+					/* Open new inline MPW session. */
+					tlen += sizeof(dseg->bcount);
+					dseg->bcount = RTE_BE32(0);
+					dseg = RTE_PTR_ADD
+						(dseg, sizeof(dseg->bcount));
+				} else {
+					/*
+					 * No pointer and inline descriptor
+					 * intermix for legacy MPW sessions.
+					 */
+					if (wqem->dseg[0].bcount)
+						break;
+				}
+			} else {
+				tlen = sizeof(dseg->bcount) + dlen;
+			}
 			/* Inline entire packet, optional VLAN insertion. */
-			tlen = sizeof(dseg->bcount) + dlen;
 			if (MLX5_TXOFF_CONFIG(VLAN) &&
 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
 				/*
@@ -4265,7 +4353,8 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
 							 dptr, dlen, olx);
 			}
-			tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+			if (!MLX5_TXOFF_CONFIG(MPW))
+				tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
 			MLX5_ASSERT(room >= tlen);
 			room -= tlen;
 			/*
@@ -4275,6 +4364,14 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 			rte_pktmbuf_free_seg(loc->mbuf);
 			goto next_mbuf;
 pointer_empw:
+			/*
+			 * No pointer and inline descriptor
+			 * intermix for legacy MPW sessions.
+			 */
+			if (MLX5_TXOFF_CONFIG(MPW) &&
+			    part != room &&
+			    wqem->dseg[0].bcount == RTE_BE32(0))
+				break;
 			/*
 			 * Not inlinable VLAN packets are
 			 * proceeded outside of this routine.
@@ -4303,7 +4400,8 @@ next_mbuf:
 				 * continue build descriptors.
 				 */
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				return MLX5_TXCMP_CODE_EXIT;
 			}
 			loc->mbuf = *pkts++;
@@ -4317,7 +4415,8 @@ next_mbuf:
 			 */
 			if (ret == MLX5_TXCMP_CODE_MULTI) {
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				if (unlikely(!loc->elts_free ||
 					     !loc->wqe_free))
 					return MLX5_TXCMP_CODE_EXIT;
@@ -4326,7 +4425,8 @@ next_mbuf:
 			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
 			if (ret == MLX5_TXCMP_CODE_TSO) {
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				if (unlikely(!loc->elts_free ||
 					     !loc->wqe_free))
 					return MLX5_TXCMP_CODE_EXIT;
@@ -4334,7 +4434,8 @@ next_mbuf:
 			}
 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				if (unlikely(!loc->elts_free ||
 					     !loc->wqe_free))
 					return MLX5_TXCMP_CODE_EXIT;
@@ -4343,7 +4444,8 @@ next_mbuf:
 			if (ret != MLX5_TXCMP_CODE_EMPW) {
 				MLX5_ASSERT(false);
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				return MLX5_TXCMP_CODE_ERROR;
 			}
 			/* Check if we have minimal room left. */
@@ -4358,7 +4460,8 @@ next_mbuf:
 			 * - software parser settings
 			 * - packets length (legacy MPW only)
 			 */
-			if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx))
+			if (!mlx5_tx_match_empw(txq, &wqem->eseg,
+						loc, dlen, olx))
 				break;
 			/* Packet attributes match, continue the same eMPW. */
 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
@@ -4372,7 +4475,7 @@ next_mbuf:
 		part -= room;
 		if (unlikely(!part))
 			return MLX5_TXCMP_CODE_EXIT;
-		mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+		mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
 		if (unlikely(!loc->elts_free ||
 			     !loc->wqe_free))
 			return MLX5_TXCMP_CODE_EXIT;
@@ -4747,7 +4850,7 @@ send_loop:
 	/*
 	 * Calculate the number of available resources - elts and WQEs.
 	 * There are two possible different scenarios:
-	 * - no data inlining into WQEs, one WQEBB may contains upto
+	 * - no data inlining into WQEs, one WQEBB may contains up to
 	 *   four packets, in this case elts become scarce resource
 	 * - data inlining into WQEs, one packet may require multiple
 	 *   WQEBBs, the WQEs become the limiting factor.
@@ -5131,7 +5234,7 @@ MLX5_TXOFF_DECL(iv,
 
 /*
  * Generate routines with Legacy Multi-Packet Write support.
- * This mode is supported by ConnectX-4LX only and imposes
+ * This mode is supported by ConnectX-4 Lx only and imposes
  * offload limitations, not supported:
  *   - ACL/Flows (metadata are becoming meaningless)
  *   - WQE Inline headers