X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fice%2Fice_rxtx_vec_common.h;h=2d8ef7dc8a93d5951f2666afa9f85c28fd030df5;hb=90bbd7d9545f88ac11f8b89449ad7fd799adbfba;hp=080ca4175aa4c23f32f81cc9b289da87e19696d9;hpb=7e124ff12c85a438010e49f0cee7eb10593b0ed6;p=dpdk.git diff --git a/drivers/net/ice/ice_rxtx_vec_common.h b/drivers/net/ice/ice_rxtx_vec_common.h index 080ca4175a..2d8ef7dc8a 100644 --- a/drivers/net/ice/ice_rxtx_vec_common.h +++ b/drivers/net/ice/ice_rxtx_vec_common.h @@ -7,6 +7,10 @@ #include "ice_rxtx.h" +#ifndef __INTEL_COMPILER +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif + static inline uint16_t ice_rx_reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs, uint16_t nb_bufs, uint8_t *split_flags) @@ -29,6 +33,7 @@ ice_rx_reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs, if (!split_flags[buf_idx]) { /* it's the last packet of the set */ start->hash = end->hash; + start->vlan_tci = end->vlan_tci; start->ol_flags = end->ol_flags; /* we need to strip crc for the whole packet */ start->pkt_len -= rxq->crc_len; @@ -72,7 +77,7 @@ ice_rx_reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs, } static __rte_always_inline int -ice_tx_free_bufs(struct ice_tx_queue *txq) +ice_tx_free_bufs_vec(struct ice_tx_queue *txq) { struct ice_tx_entry *txep; uint32_t n; @@ -188,16 +193,39 @@ _ice_tx_queue_release_mbufs_vec(struct ice_tx_queue *txq) * so need to free remains more carefully. */ i = txq->tx_next_dd - txq->tx_rs_thresh + 1; - if (txq->tx_tail < i) { - for (; i < txq->nb_tx_desc; i++) { + +#ifdef CC_AVX512_SUPPORT + struct rte_eth_dev *dev = &rte_eth_devices[txq->vsi->adapter->pf.dev_data->port_id]; + + if (dev->tx_pkt_burst == ice_xmit_pkts_vec_avx512 || + dev->tx_pkt_burst == ice_xmit_pkts_vec_avx512_offload) { + struct ice_vec_tx_entry *swr = (void *)txq->sw_ring; + + if (txq->tx_tail < i) { + for (; i < txq->nb_tx_desc; i++) { + rte_pktmbuf_free_seg(swr[i].mbuf); + swr[i].mbuf = NULL; + } + i = 0; + } + for (; i < txq->tx_tail; i++) { + rte_pktmbuf_free_seg(swr[i].mbuf); + swr[i].mbuf = NULL; + } + } else +#endif + { + if (txq->tx_tail < i) { + for (; i < txq->nb_tx_desc; i++) { + rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf); + txq->sw_ring[i].mbuf = NULL; + } + i = 0; + } + for (; i < txq->tx_tail; i++) { rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf); txq->sw_ring[i].mbuf = NULL; } - i = 0; - } - for (; i < txq->tx_tail; i++) { - rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf); - txq->sw_ring[i].mbuf = NULL; } } @@ -219,6 +247,28 @@ ice_rxq_vec_setup_default(struct ice_rx_queue *rxq) return 0; } +#define ICE_TX_NO_VECTOR_FLAGS ( \ + DEV_TX_OFFLOAD_MULTI_SEGS | \ + DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM | \ + DEV_TX_OFFLOAD_TCP_TSO) + +#define ICE_TX_VECTOR_OFFLOAD ( \ + DEV_TX_OFFLOAD_VLAN_INSERT | \ + DEV_TX_OFFLOAD_QINQ_INSERT | \ + DEV_TX_OFFLOAD_IPV4_CKSUM | \ + DEV_TX_OFFLOAD_SCTP_CKSUM | \ + DEV_TX_OFFLOAD_UDP_CKSUM | \ + DEV_TX_OFFLOAD_TCP_CKSUM) + +#define ICE_RX_VECTOR_OFFLOAD ( \ + DEV_RX_OFFLOAD_CHECKSUM | \ + DEV_RX_OFFLOAD_SCTP_CKSUM | \ + DEV_RX_OFFLOAD_VLAN | \ + DEV_RX_OFFLOAD_RSS_HASH) + +#define ICE_VECTOR_PATH 0 +#define ICE_VECTOR_OFFLOAD_PATH 1 + static inline int ice_rx_vec_queue_default(struct ice_rx_queue *rxq) { @@ -237,15 +287,11 @@ ice_rx_vec_queue_default(struct ice_rx_queue *rxq) if (rxq->proto_xtr != PROTO_XTR_NONE) return -1; - return 0; -} + if (rxq->offloads & ICE_RX_VECTOR_OFFLOAD) + return ICE_VECTOR_OFFLOAD_PATH; -#define ICE_NO_VECTOR_FLAGS ( \ - DEV_TX_OFFLOAD_MULTI_SEGS | \ - DEV_TX_OFFLOAD_VLAN_INSERT | \ - DEV_TX_OFFLOAD_SCTP_CKSUM | \ - DEV_TX_OFFLOAD_UDP_CKSUM | \ - DEV_TX_OFFLOAD_TCP_CKSUM) + return ICE_VECTOR_PATH; +} static inline int ice_tx_vec_queue_default(struct ice_tx_queue *txq) @@ -253,14 +299,17 @@ ice_tx_vec_queue_default(struct ice_tx_queue *txq) if (!txq) return -1; - if (txq->offloads & ICE_NO_VECTOR_FLAGS) - return -1; - if (txq->tx_rs_thresh < ICE_VPMD_TX_BURST || txq->tx_rs_thresh > ICE_TX_MAX_FREE_BUF_SZ) return -1; - return 0; + if (txq->offloads & ICE_TX_NO_VECTOR_FLAGS) + return -1; + + if (txq->offloads & ICE_TX_VECTOR_OFFLOAD) + return ICE_VECTOR_OFFLOAD_PATH; + + return ICE_VECTOR_PATH; } static inline int @@ -268,14 +317,19 @@ ice_rx_vec_dev_check_default(struct rte_eth_dev *dev) { int i; struct ice_rx_queue *rxq; + int ret = 0; + int result = 0; for (i = 0; i < dev->data->nb_rx_queues; i++) { rxq = dev->data->rx_queues[i]; - if (ice_rx_vec_queue_default(rxq)) + ret = (ice_rx_vec_queue_default(rxq)); + if (ret < 0) return -1; + if (ret == ICE_VECTOR_OFFLOAD_PATH) + result = ret; } - return 0; + return result; } static inline int @@ -283,14 +337,278 @@ ice_tx_vec_dev_check_default(struct rte_eth_dev *dev) { int i; struct ice_tx_queue *txq; + int ret = 0; + int result = 0; for (i = 0; i < dev->data->nb_tx_queues; i++) { txq = dev->data->tx_queues[i]; - if (ice_tx_vec_queue_default(txq)) + ret = ice_tx_vec_queue_default(txq); + if (ret < 0) return -1; + if (ret == ICE_VECTOR_OFFLOAD_PATH) + result = ret; } - return 0; + return result; +} + +#ifdef CC_AVX2_SUPPORT +static __rte_always_inline void +ice_rxq_rearm_common(struct ice_rx_queue *rxq, __rte_unused bool avx512) +{ + int i; + uint16_t rx_id; + volatile union ice_rx_flex_desc *rxdp; + struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start]; + + rxdp = rxq->rx_ring + rxq->rxrearm_start; + + /* Pull 'n' more MBUFs into the software ring */ + if (rte_mempool_get_bulk(rxq->mp, + (void *)rxep, + ICE_RXQ_REARM_THRESH) < 0) { + if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >= + rxq->nb_rx_desc) { + __m128i dma_addr0; + + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < ICE_DESCS_PER_LOOP; i++) { + rxep[i].mbuf = &rxq->fake_mbuf; + _mm_store_si128((__m128i *)&rxdp[i].read, + dma_addr0); + } + } + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + ICE_RXQ_REARM_THRESH; + return; + } + +#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC + struct rte_mbuf *mb0, *mb1; + __m128i dma_addr0, dma_addr1; + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, + RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ + for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) { + __m128i vaddr0, vaddr1; + + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + + /* convert pa to dma_addr hdr/data */ + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); + + /* add headroom to pa values */ + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); + + /* flush desc with pa dma_addr */ + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); + } +#else +#ifdef CC_AVX512_SUPPORT + if (avx512) { + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + struct rte_mbuf *mb4, *mb5, *mb6, *mb7; + __m512i dma_addr0_3, dma_addr4_7; + __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 8 mbufs in one loop */ + for (i = 0; i < ICE_RXQ_REARM_THRESH; + i += 8, rxep += 8, rxdp += 8) { + __m128i vaddr0, vaddr1, vaddr2, vaddr3; + __m128i vaddr4, vaddr5, vaddr6, vaddr7; + __m256i vaddr0_1, vaddr2_3; + __m256i vaddr4_5, vaddr6_7; + __m512i vaddr0_3, vaddr4_7; + + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + mb2 = rxep[2].mbuf; + mb3 = rxep[3].mbuf; + mb4 = rxep[4].mbuf; + mb5 = rxep[5].mbuf; + mb6 = rxep[6].mbuf; + mb7 = rxep[7].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); + vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr); + vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr); + vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr); + vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr); + + /** + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 + * into the high lanes. Similarly for 2 & 3, and so on. + */ + vaddr0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), + vaddr1, 1); + vaddr2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), + vaddr3, 1); + vaddr4_5 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4), + vaddr5, 1); + vaddr6_7 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6), + vaddr7, 1); + vaddr0_3 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1), + vaddr2_3, 1); + vaddr4_7 = + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5), + vaddr6_7, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3); + dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7); + + /* add headroom to pa values */ + dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room); + dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room); + + /* flush desc with pa dma_addr */ + _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3); + _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7); + } + } else +#endif + { + struct rte_mbuf *mb0, *mb1, *mb2, *mb3; + __m256i dma_addr0_1, dma_addr2_3; + __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM); + /* Initialize the mbufs in vector, process 4 mbufs in one loop */ + for (i = 0; i < ICE_RXQ_REARM_THRESH; + i += 4, rxep += 4, rxdp += 4) { + __m128i vaddr0, vaddr1, vaddr2, vaddr3; + __m256i vaddr0_1, vaddr2_3; + + mb0 = rxep[0].mbuf; + mb1 = rxep[1].mbuf; + mb2 = rxep[2].mbuf; + mb3 = rxep[3].mbuf; + + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr); + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr); + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr); + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr); + + /** + * merge 0 & 1, by casting 0 to 256-bit and inserting 1 + * into the high lanes. Similarly for 2 & 3 + */ + vaddr0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0), + vaddr1, 1); + vaddr2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2), + vaddr3, 1); + + /* convert pa to dma_addr hdr/data */ + dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1); + dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3); + + /* add headroom to pa values */ + dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room); + dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room); + + /* flush desc with pa dma_addr */ + _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1); + _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3); + } + } + +#endif + + rxq->rxrearm_start += ICE_RXQ_REARM_THRESH; + if (rxq->rxrearm_start >= rxq->nb_rx_desc) + rxq->rxrearm_start = 0; + + rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH; + + rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? + (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + + /* Update the tail pointer on the NIC */ + ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); } +#endif +static inline void +ice_txd_enable_offload(struct rte_mbuf *tx_pkt, + uint64_t *txd_hi) +{ + uint64_t ol_flags = tx_pkt->ol_flags; + uint32_t td_cmd = 0; + uint32_t td_offset = 0; + + /* Tx Checksum Offload */ + /* SET MACLEN */ + td_offset |= (tx_pkt->l2_len >> 1) << + ICE_TX_DESC_LEN_MACLEN_S; + + /* Enable L3 checksum offload */ + if (ol_flags & PKT_TX_IP_CKSUM) { + td_cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM; + td_offset |= (tx_pkt->l3_len >> 2) << + ICE_TX_DESC_LEN_IPLEN_S; + } else if (ol_flags & PKT_TX_IPV4) { + td_cmd |= ICE_TX_DESC_CMD_IIPT_IPV4; + td_offset |= (tx_pkt->l3_len >> 2) << + ICE_TX_DESC_LEN_IPLEN_S; + } else if (ol_flags & PKT_TX_IPV6) { + td_cmd |= ICE_TX_DESC_CMD_IIPT_IPV6; + td_offset |= (tx_pkt->l3_len >> 2) << + ICE_TX_DESC_LEN_IPLEN_S; + } + + /* Enable L4 checksum offloads */ + switch (ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_TCP_CKSUM: + td_cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP; + td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) << + ICE_TX_DESC_LEN_L4_LEN_S; + break; + case PKT_TX_SCTP_CKSUM: + td_cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP; + td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) << + ICE_TX_DESC_LEN_L4_LEN_S; + break; + case PKT_TX_UDP_CKSUM: + td_cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP; + td_offset |= (sizeof(struct rte_udp_hdr) >> 2) << + ICE_TX_DESC_LEN_L4_LEN_S; + break; + default: + break; + } + + *txd_hi |= ((uint64_t)td_offset) << ICE_TXD_QW1_OFFSET_S; + + /* Tx VLAN/QINQ insertion Offload */ + if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) { + td_cmd |= ICE_TX_DESC_CMD_IL2TAG1; + *txd_hi |= ((uint64_t)tx_pkt->vlan_tci << + ICE_TXD_QW1_L2TAG1_S); + } + + *txd_hi |= ((uint64_t)td_cmd) << ICE_TXD_QW1_CMD_S; +} #endif