From: Wenzhuo Lu
Date: Wed, 14 Apr 2021 07:34:07 +0000 (+0800)
Subject: net/iavf: add offload path for Tx AVX512
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=059f18ae2aec;p=dpdk.git

net/iavf: add offload path for Tx AVX512

Add a specific Tx path for AVX512 that supports HW offload features
such as checksum insertion and VLAN insertion. The path is chosen
automatically according to the queue offload configuration. The common
transmit routine is marked 'inline', so the compiler generates a
separate specialized copy of the code for each path.

Signed-off-by: Wenzhuo Lu
Acked-by: Qi Zhang
---

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bd0b7eea83..099ede7736 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -160,7 +160,7 @@ check_rx_vec_allow(struct iavf_rx_queue *rxq)
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
- if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+ if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 #ifdef RTE_ARCH_X86
 struct iavf_tx_queue *txq;
 int i;
+ int check_ret;
+ bool use_sse = false;
 bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 bool use_avx512 = false;
-#endif
- if (!iavf_tx_vec_dev_check(dev) &&
- rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
- if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
- rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
- rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
- use_avx2 = true;
+ check_ret = iavf_tx_vec_dev_check(dev);
+
+ if (check_ret >= 0 &&
+ rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+ /* SSE and AVX2 not support offload path yet. */
+ if (check_ret == IAVF_VECTOR_PATH) {
+ use_sse = true;
+ if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+ rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+ rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+ use_avx2 = true;
+ }
 #ifdef CC_AVX512_SUPPORT
 if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 use_avx512 = true;
 #endif
- PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
- use_avx2 ? "avx2 " : "",
- dev->data->port_id);
- dev->tx_pkt_burst = use_avx2 ?
- iavf_xmit_pkts_vec_avx2 :
- iavf_xmit_pkts_vec;
+ if (!use_sse && !use_avx2 && !use_avx512)
+ goto normal;
+
+ if (!use_avx512) {
+ PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+ use_avx2 ? "avx2 " : "",
+ dev->data->port_id);
+ dev->tx_pkt_burst = use_avx2 ?
+ iavf_xmit_pkts_vec_avx2 : + iavf_xmit_pkts_vec; + } #ifdef CC_AVX512_SUPPORT - if (use_avx512) - dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512; + if (use_avx512) { + if (check_ret == IAVF_VECTOR_PATH) { + dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512; + PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).", + dev->data->port_id); + } else { + dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload; + PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).", + dev->data->port_id); + } + } #endif dev->tx_pkt_prepare = NULL; @@ -2544,8 +2564,9 @@ iavf_set_tx_function(struct rte_eth_dev *dev) return; } -#endif +normal: +#endif PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).", dev->data->port_id); dev->tx_pkt_burst = iavf_xmit_pkts; diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h index f56dd742da..bead119264 100644 --- a/drivers/net/iavf/iavf_rxtx.h +++ b/drivers/net/iavf/iavf_rxtx.h @@ -23,14 +23,21 @@ #define IAVF_VPMD_DESCS_PER_LOOP 4 #define IAVF_VPMD_TX_MAX_FREE_BUF 64 -#define IAVF_NO_VECTOR_FLAGS ( \ +#define IAVF_TX_NO_VECTOR_FLAGS ( \ DEV_TX_OFFLOAD_MULTI_SEGS | \ + DEV_TX_OFFLOAD_TCP_TSO) + +#define IAVF_TX_VECTOR_OFFLOAD ( \ DEV_TX_OFFLOAD_VLAN_INSERT | \ + DEV_TX_OFFLOAD_QINQ_INSERT | \ + DEV_TX_OFFLOAD_IPV4_CKSUM | \ DEV_TX_OFFLOAD_SCTP_CKSUM | \ DEV_TX_OFFLOAD_UDP_CKSUM | \ - DEV_TX_OFFLOAD_TCP_TSO | \ DEV_TX_OFFLOAD_TCP_CKSUM) +#define IAVF_VECTOR_PATH 0 +#define IAVF_VECTOR_OFFLOAD_PATH 1 + #define DEFAULT_TX_RS_THRESH 32 #define DEFAULT_TX_FREE_THRESH 32 @@ -488,6 +495,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue, uint16_t nb_pkts); uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, + struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq); uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type); diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c index 385f44ec47..f4dd2228fc 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c +++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c @@ -1518,14 +1518,16 @@ tx_backlog_entry_avx512(struct iavf_tx_vec_entry *txep, txep[i].mbuf = tx_pkts[i]; } -static inline void +static __rte_always_inline void iavf_vtx1(volatile struct iavf_tx_desc *txdp, - struct rte_mbuf *pkt, uint64_t flags) + struct rte_mbuf *pkt, uint64_t flags, bool offload) { uint64_t high_qw = (IAVF_TX_DESC_DTYPE_DATA | ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) | ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)); + if (offload) + iavf_txd_enable_offload(pkt, &high_qw); __m128i descriptor = _mm_set_epi64x(high_qw, pkt->buf_iova + pkt->data_off); @@ -1534,62 +1536,70 @@ iavf_vtx1(volatile struct iavf_tx_desc *txdp, #define IAVF_TX_LEN_MASK 0xAA #define IAVF_TX_OFF_MASK 0x55 -static inline void +static __rte_always_inline void iavf_vtx(volatile struct iavf_tx_desc *txdp, - struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) + struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, + bool offload) { const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA | ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT)); /* if unaligned on 32-bit boundary, do one to align */ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { - iavf_vtx1(txdp, *pkt, flags); + iavf_vtx1(txdp, *pkt, flags, offload); nb_pkts--, txdp++, pkt++; } /* do 4 at a time while possible, in bursts */ for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { - 
__m512i desc4 = - _mm512_set_epi64 - ((uint64_t)pkt[3]->data_len, - pkt[3]->buf_iova, - (uint64_t)pkt[2]->data_len, - pkt[2]->buf_iova, - (uint64_t)pkt[1]->data_len, - pkt[1]->buf_iova, - (uint64_t)pkt[0]->data_len, - pkt[0]->buf_iova); - __m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl); - __m512i data_off_4 = + uint64_t hi_qw3 = + hi_qw_tmpl | + ((uint64_t)pkt[3]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[3], &hi_qw3); + uint64_t hi_qw2 = + hi_qw_tmpl | + ((uint64_t)pkt[2]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[2], &hi_qw2); + uint64_t hi_qw1 = + hi_qw_tmpl | + ((uint64_t)pkt[1]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[1], &hi_qw1); + uint64_t hi_qw0 = + hi_qw_tmpl | + ((uint64_t)pkt[0]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[0], &hi_qw0); + + __m512i desc0_3 = _mm512_set_epi64 - (0, - pkt[3]->data_off, - 0, - pkt[2]->data_off, - 0, - pkt[1]->data_off, - 0, - pkt[0]->data_off); - - desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4, - IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); - desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4, - hi_qw_tmpl_4); - desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4, - data_off_4); - _mm512_storeu_si512((void *)txdp, desc4); + (hi_qw3, + pkt[3]->buf_iova + pkt[3]->data_off, + hi_qw2, + pkt[2]->buf_iova + pkt[2]->data_off, + hi_qw1, + pkt[1]->buf_iova + pkt[1]->data_off, + hi_qw0, + pkt[0]->buf_iova + pkt[0]->data_off); + _mm512_storeu_si512((void *)txdp, desc0_3); } /* do any last ones */ while (nb_pkts) { - iavf_vtx1(txdp, *pkt, flags); + iavf_vtx1(txdp, *pkt, flags, offload); txdp++, pkt++, nb_pkts--; } } -static inline uint16_t +static __rte_always_inline uint16_t iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) + uint16_t nb_pkts, bool offload) { struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue; volatile struct iavf_tx_desc *txdp; @@ -1620,11 +1630,11 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, if (nb_commit >= n) { tx_backlog_entry_avx512(txep, tx_pkts, n); - iavf_vtx(txdp, tx_pkts, n - 1, flags); + iavf_vtx(txdp, tx_pkts, n - 1, flags, offload); tx_pkts += (n - 1); txdp += (n - 1); - iavf_vtx1(txdp, *tx_pkts++, rs); + iavf_vtx1(txdp, *tx_pkts++, rs, offload); nb_commit = (uint16_t)(nb_commit - n); @@ -1639,7 +1649,7 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); - iavf_vtx(txdp, tx_pkts, nb_commit, flags); + iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload); tx_id = (uint16_t)(tx_id + nb_commit); if (tx_id > txq->next_rs) { @@ -1657,9 +1667,9 @@ iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, return nb_pkts; } -uint16_t -iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) +static __rte_always_inline uint16_t +iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts, bool offload) { uint16_t nb_tx = 0; struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue; @@ -1669,7 +1679,7 @@ iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx], - num); + num, offload); nb_tx += ret; nb_pkts -= ret; if (ret < num) @@ -1679,6 +1689,13 @@ 
iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, return nb_tx; } +uint16_t +iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false); +} + static inline void iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq) { @@ -1709,3 +1726,10 @@ iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq) txq->ops = &avx512_vec_txq_ops; return 0; } + +uint16_t +iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true); +} diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h index 816e16a937..62a333f0d9 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_common.h +++ b/drivers/net/iavf/iavf_rxtx_vec_common.h @@ -240,14 +240,17 @@ iavf_tx_vec_queue_default(struct iavf_tx_queue *txq) if (!txq) return -1; - if (txq->offloads & IAVF_NO_VECTOR_FLAGS) - return -1; - if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST || txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF) return -1; - return 0; + if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) + return -1; + + if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD) + return IAVF_VECTOR_OFFLOAD_PATH; + + return IAVF_VECTOR_PATH; } static inline int @@ -270,14 +273,97 @@ iavf_tx_vec_dev_check_default(struct rte_eth_dev *dev) { int i; struct iavf_tx_queue *txq; + int ret; + int result = 0; for (i = 0; i < dev->data->nb_tx_queues; i++) { txq = dev->data->tx_queues[i]; - if (iavf_tx_vec_queue_default(txq)) + ret = iavf_tx_vec_queue_default(txq); + + if (ret < 0) return -1; + if (ret > result) + result = ret; } - return 0; + return result; +} + +/****************************************************************************** + * If user knows a specific offload is not enabled by APP, + * the macro can be commented to save the effort of fast path. 
+ * Currently below 2 features are supported in TX path, + * 1, checksum offload + * 2, VLAN/QINQ insertion + ******************************************************************************/ +#define IAVF_TX_CSUM_OFFLOAD +#define IAVF_TX_VLAN_QINQ_OFFLOAD + +static __rte_always_inline void +iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt, + uint64_t *txd_hi) +{ +#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD) + uint64_t ol_flags = tx_pkt->ol_flags; +#endif + uint32_t td_cmd = 0; +#ifdef IAVF_TX_CSUM_OFFLOAD + uint32_t td_offset = 0; +#endif + +#ifdef IAVF_TX_CSUM_OFFLOAD + /* Set MACLEN */ + td_offset |= (tx_pkt->l2_len >> 1) << + IAVF_TX_DESC_LENGTH_MACLEN_SHIFT; + + /* Enable L3 checksum offloads */ + if (ol_flags & PKT_TX_IP_CKSUM) { + td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM; + td_offset |= (tx_pkt->l3_len >> 2) << + IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; + } else if (ol_flags & PKT_TX_IPV4) { + td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4; + td_offset |= (tx_pkt->l3_len >> 2) << + IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; + } else if (ol_flags & PKT_TX_IPV6) { + td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6; + td_offset |= (tx_pkt->l3_len >> 2) << + IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; + } + + /* Enable L4 checksum offloads */ + switch (ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_TCP_CKSUM: + td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP; + td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) << + IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + case PKT_TX_SCTP_CKSUM: + td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP; + td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) << + IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + case PKT_TX_UDP_CKSUM: + td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP; + td_offset |= (sizeof(struct rte_udp_hdr) >> 2) << + IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + default: + break; + } + + *txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT; +#endif + +#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD + if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) { + td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1; + *txd_hi |= ((uint64_t)tx_pkt->vlan_tci << + IAVF_TXD_QW1_L2TAG1_SHIFT); + } +#endif + + *txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT; } #ifdef CC_AVX2_SUPPORT
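
Note on the approach (illustrative, not part of the patch): instead of checking the
Tx offload flags per packet at run time, the patch passes a constant 'offload' flag
into an always-inlined common routine (iavf_xmit_pkts_vec_avx512_cmn), so the compiler
emits two specialized copies of the hot path, one behind iavf_xmit_pkts_vec_avx512 and
one behind iavf_xmit_pkts_vec_avx512_offload. A minimal standalone sketch of that
pattern follows; the demo_* names are hypothetical, and the plain GCC always_inline
attribute stands in for __rte_always_inline so the sketch builds without DPDK headers.

/*
 * Standalone sketch of the compile-time specialization used above.
 * demo_* names are hypothetical; only the pattern matches the patch.
 */
#include <stdbool.h>
#include <stdint.h>

static inline __attribute__((always_inline)) uint16_t
demo_xmit_common(void **pkts, uint16_t nb_pkts, bool offload)
{
	uint16_t i;

	for (i = 0; i < nb_pkts; i++) {
		/* ...build the plain data descriptor for pkts[i]... */
		(void)pkts[i];
		if (offload) {
			/* ...additionally fill the checksum/VLAN fields, as
			 * iavf_txd_enable_offload() does in the patch. With
			 * 'offload' a compile-time constant, this branch is
			 * either removed or folded into straight-line code.
			 */
		}
	}
	return nb_pkts;
}

/* Exported wrapper without offloads: the offload branch is compiled out. */
uint16_t
demo_xmit(void **pkts, uint16_t nb_pkts)
{
	return demo_xmit_common(pkts, nb_pkts, false);
}

/* Exported wrapper with offloads enabled. */
uint16_t
demo_xmit_offload(void **pkts, uint16_t nb_pkts)
{
	return demo_xmit_common(pkts, nb_pkts, true);
}

The same trick is what lets iavf_vtx()/iavf_vtx1() take the extra 'offload' argument
in this patch without slowing down the existing no-offload AVX512 Tx path.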