X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fiavf%2Fiavf_rxtx_vec_common.h;h=457d6339e1d7d0dbb943fe7076286a62254fb401;hb=39e4a2577fd05199f53182b7c8509aeed40dc07f;hp=7ad1e0f68a229ef853d15a3374cade2a49e8d487;hpb=12b435bf8f2f0cd13bc7c02f2cafe96e949e409b;p=dpdk.git

diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 7ad1e0f68a..457d6339e1 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -5,13 +5,17 @@
 #ifndef _IAVF_RXTX_VEC_COMMON_H_
 #define _IAVF_RXTX_VEC_COMMON_H_
 #include <stdint.h>
-#include <rte_ethdev_driver.h>
+#include <ethdev_driver.h>
 #include <rte_malloc.h>
 
 #include "iavf.h"
 #include "iavf_rxtx.h"
 
-static inline uint16_t
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static __rte_always_inline uint16_t
 reassemble_packets(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_bufs,
 		   uint16_t nb_bufs, uint8_t *split_flags)
 {
@@ -227,7 +231,10 @@ iavf_rx_vec_queue_default(struct iavf_rx_queue *rxq)
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -236,14 +243,17 @@ iavf_tx_vec_queue_default(struct iavf_tx_queue *txq)
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -251,14 +261,20 @@ iavf_rx_vec_dev_check_default(struct rte_eth_dev *dev)
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int
@@ -266,14 +282,296 @@ iavf_tx_vec_dev_check_default(struct rte_eth_dev *dev)
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
+/******************************************************************************
+ * If user knows a specific offload is not enabled by APP,
+ * the macro can be commented to save the effort of fast path.
+ * Currently below 2 features are supported in TX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static __rte_always_inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
+}
+
+#ifdef CC_AVX2_SUPPORT
+static __rte_always_inline void
+iavf_rxq_rearm_common(struct iavf_rx_queue *rxq, __rte_unused bool avx512)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union iavf_rx_desc *rxdp;
+	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mp,
+				 (void *)rxp,
+				 IAVF_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			__m128i dma_addr0;
+
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
+				rxp[i] = &rxq->fake_mbuf;
+				_mm_store_si128((__m128i *)&rxdp[i].read,
+						dma_addr0);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			IAVF_RXQ_REARM_THRESH;
+		return;
+	}
+
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+	struct rte_mbuf *mb0, *mb1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+					  RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < IAVF_RXQ_REARM_THRESH; i += 2, rxp += 2) {
+		__m128i vaddr0, vaddr1;
+
+		mb0 = rxp[0];
+		mb1 = rxp[1];
+
+		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				 offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* flush desc with pa dma_addr */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+#else
+#ifdef CC_AVX512_SUPPORT
+	if (avx512) {
+		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+		struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+		__m512i dma_addr0_3, dma_addr4_7;
+		__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+		/* Initialize the mbufs in vector, process 8 mbufs in one loop */
+		for (i = 0; i < IAVF_RXQ_REARM_THRESH;
+				i += 8, rxp += 8, rxdp += 8) {
+			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
+			__m128i vaddr4, vaddr5, vaddr6, vaddr7;
+			__m256i vaddr0_1, vaddr2_3;
+			__m256i vaddr4_5, vaddr6_7;
+			__m512i vaddr0_3, vaddr4_7;
+
+			mb0 = rxp[0];
+			mb1 = rxp[1];
+			mb2 = rxp[2];
+			mb3 = rxp[3];
+			mb4 = rxp[4];
+			mb5 = rxp[5];
+			mb6 = rxp[6];
+			mb7 = rxp[7];
+
+			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+					 offsetof(struct rte_mbuf, buf_addr) + 8);
+			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+			vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+			vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+			vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
+			vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
+
+			/**
+			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+			 * into the high lanes. Similarly for 2 & 3, and so on.
+ */ + vaddr0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), + vaddr1, 1); + vaddr2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), + vaddr3, 1); + vaddr4_5 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4), + vaddr5, 1); + vaddr6_7 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6), + vaddr7, 1); + vaddr0_3 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1), + vaddr2_3, 1); + vaddr4_7 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5), + vaddr6_7, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3); + dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7); + + /* add headroom to pa values */ + dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room); + dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room); + + /* flush desc with pa dma_addr */ + _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3); + _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7); + } + } else +#endif + { + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + __m256i dma_addr0_1, dma_addr2_3; + __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 4 mbufs in one loop */ + for (i = 0; i < IAVF_RXQ_REARM_THRESH; + i += 4, rxp += 4, rxdp += 4) { + __m128i vaddr0, vaddr1, vaddr2, vaddr3; + __m256i vaddr0_1, vaddr2_3; + + mb0 = rxp[0]; + mb1 = rxp[1]; + mb2 = rxp[2]; + mb3 = rxp[3]; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); + + /** + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 + * into the high lanes. Similarly for 2 & 3 + */ + vaddr0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), + vaddr1, 1); + vaddr2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), + vaddr3, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1); + dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3); + + /* add headroom to pa values */ + dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room); + dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room); + + /* flush desc with pa dma_addr */ + _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1); + _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3); + } + } + +#endif + + rxq->rxrearm_start += IAVF_RXQ_REARM_THRESH; + if (rxq->rxrearm_start >= rxq->nb_rx_desc) + rxq->rxrearm_start = 0; + + rxq->rxrearm_nb -= IAVF_RXQ_REARM_THRESH; + + rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? + (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + + /* Update the tail pointer on the NIC */ + IAVF_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); +} +#endif + #endif
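
The queue-check helpers in this patch no longer return a plain pass/fail: a negative value still means the vector paths cannot be used, while IAVF_VECTOR_PATH and IAVF_VECTOR_OFFLOAD_PATH distinguish the basic vector path from the one that also handles offloads, and the per-device checks keep the widest requirement found across all queues. Below is a minimal standalone sketch of how a caller might dispatch on such a tri-state result; the enum values, helper, and burst functions are illustrative stand-ins for this note, not the driver's actual symbols.

#include <stdio.h>

/*
 * Stand-ins mirroring the tri-state contract of the dev-check helpers:
 * negative = no vector path, 0 = basic vector path, 1 = vector path with
 * offload support. Numeric values here are assumptions for illustration.
 */
enum { VEC_PATH = 0, VEC_OFFLOAD_PATH = 1 };

typedef void (*burst_fn)(void);

static void scalar_burst(void)         { puts("scalar burst"); }
static void vector_burst(void)         { puts("vector burst"); }
static void vector_offload_burst(void) { puts("vector offload burst"); }

/* Same reduction as the dev-check helpers: any queue that cannot use the
 * vector path disables it for the whole port; otherwise the widest
 * per-queue requirement wins. */
static int dev_check(const int *queue_path, int nb_queues)
{
	int i, result = 0;

	for (i = 0; i < nb_queues; i++) {
		if (queue_path[i] < 0)
			return -1;
		if (queue_path[i] > result)
			result = queue_path[i];
	}
	return result;
}

int main(void)
{
	int paths[3] = { VEC_PATH, VEC_OFFLOAD_PATH, VEC_PATH };
	burst_fn burst;

	switch (dev_check(paths, 3)) {
	case VEC_OFFLOAD_PATH:
		burst = vector_offload_burst;
		break;
	case VEC_PATH:
		burst = vector_burst;
		break;
	default:
		burst = scalar_burst;
		break;
	}

	burst(); /* selects "vector offload burst" for this example */
	return 0;
}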