From c4e4c18963b0c686d990e067787b9944a6330a9c Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Mon, 24 May 2021 14:59:51 -0400 Subject: [PATCH] net/bnxt: add AVX2 RX/Tx Implement AVX2 vector PMD. Signed-off-by: Lance Richardson Reviewed-by: Ajit Khaparde --- doc/guides/nics/bnxt.rst | 57 ++- drivers/net/bnxt/bnxt_ethdev.c | 119 +++-- drivers/net/bnxt/bnxt_rxr.c | 4 +- drivers/net/bnxt/bnxt_rxr.h | 11 +- drivers/net/bnxt/bnxt_rxtx_vec_avx2.c | 597 ++++++++++++++++++++++++++ drivers/net/bnxt/bnxt_rxtx_vec_neon.c | 25 +- drivers/net/bnxt/bnxt_rxtx_vec_sse.c | 31 +- drivers/net/bnxt/bnxt_txr.h | 7 + drivers/net/bnxt/meson.build | 17 + 9 files changed, 780 insertions(+), 88 deletions(-) create mode 100644 drivers/net/bnxt/bnxt_rxtx_vec_avx2.c diff --git a/doc/guides/nics/bnxt.rst b/doc/guides/nics/bnxt.rst index 0fb2032447..feb0c6a765 100644 --- a/doc/guides/nics/bnxt.rst +++ b/doc/guides/nics/bnxt.rst @@ -853,23 +853,36 @@ DPDK implements a light-weight library to allow PMDs to be bonded together and p Vector Processing ----------------- +The BNXT PMD provides vectorized burst transmit/receive function implementations +on x86-based platforms using SSE (Streaming SIMD Extensions) and AVX2 (Advanced +Vector Extensions 2) instructions, and on Arm-based platforms using Arm Neon +Advanced SIMD instructions. Vector processing support is currently implemented +only for Intel/AMD and Arm CPU architectures. + Vector processing provides significantly improved performance over scalar -processing (see Vector Processor, here). +processing. This improved performance is derived from a number of optimizations: + +* Using SIMD instructions to operate on multiple packets in parallel. +* Using SIMD instructions to do more work per instruction than is possible + with scalar instructions, for example by leveraging 128-bit and 256-bi + load/store instructions or by using SIMD shuffle and permute operations. +* Batching -The BNXT PMD supports the vector processing using SSE (Streaming SIMD -Extensions) instructions on x86 platforms. It also supports NEON intrinsics for -vector processing on ARM CPUs. The BNXT vPMD (vector mode PMD) is available for -Intel/AMD and ARM CPU architectures. +  * TX: transmit completions are processed in bulk. +  * RX: bulk allocation of mbufs is used when allocating rxq buffers. -This improved performance comes from several optimizations: +* Simplifications enabled by not supporting chained mbufs in vector mode. +* Simplifications enabled by not supporting some stateless offloads in vector + mode: -* Batching -  * TX: processing completions in bulk -  * RX: allocating mbufs in bulk -* Chained mbufs are *not* supported, i.e. 
a packet should fit a single mbuf -* Some stateless offloads are *not* supported with vector processing -  * TX: no offloads will be supported -  * RX: reduced RX offloads (listed below) will be supported:: +  * TX: only the following reduced set of transmit offloads is supported in + vector mode:: + +   DEV_TX_OFFLOAD_MBUF_FAST_FREE + +  * RX: only the following reduced set of receive offloads is supported in + vector mode (note that jumbo MTU is allowed only when the MTU setting + does not require `DEV_RX_OFFLOAD_SCATTER` to be enabled)::   DEV_RX_OFFLOAD_VLAN_STRIP   DEV_RX_OFFLOAD_KEEP_CRC @@ -878,23 +891,21 @@ This improved performance comes from several optimizations:   DEV_RX_OFFLOAD_UDP_CKSUM   DEV_RX_OFFLOAD_TCP_CKSUM   DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM +   DEV_RX_OFFLOAD_OUTER_UDP_CKSUM   DEV_RX_OFFLOAD_RSS_HASH   DEV_RX_OFFLOAD_VLAN_FILTER -The BNXT Vector PMD is enabled in DPDK builds by default. - -However, a decision to enable vector mode will be made when the port transitions -from stopped to started. Any TX offloads or some RX offloads (other than listed -above) will disable the vector mode. -Offload configuration changes that impact vector mode must be made when the port -is stopped. +The BNXT Vector PMD is enabled in DPDK builds by default. The decision to enable +vector processing is made at run-time when the port is started; if no transmit +offloads outside the set supported for vector mode are enabled then vector mode +transmit will be enabled, and if no receive offloads outside the set supported +for vector mode are enabled then vector mode receive will be enabled. Offload +configuration changes that impact the decision to enable vector mode are allowed +only when the port is stopped. Note that TX (or RX) vector mode can be enabled independently from RX (or TX) vector mode. -Also vector mode is allowed when jumbo is enabled -as long as the MTU setting does not require scattered Rx. - Appendix -------- diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c index 3778e28cca..96d2befd0f 100644 --- a/drivers/net/bnxt/bnxt_ethdev.c +++ b/drivers/net/bnxt/bnxt_ethdev.c @@ -1174,32 +1174,57 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev) return bnxt_recv_pkts; } -#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) -#ifndef RTE_LIBRTE_IEEE1588 +#if (defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)) && \ + !defined(RTE_LIBRTE_IEEE1588) + + /* Vector mode receive cannot be enabled if scattered rx is in use. */ + if (eth_dev->data->scattered_rx) + goto use_scalar_rx; + /* - * Vector mode receive can be enabled only if scatter rx is not - * in use and rx offloads are limited to VLAN stripping and - * CRC stripping. + * Vector mode receive cannot be enabled if Truflow is enabled or if + * asynchronous completions and receive completions can be placed in + * the same completion ring. 
*/ - if (!eth_dev->data->scattered_rx && - !(eth_dev->data->dev_conf.rxmode.offloads & - ~(DEV_RX_OFFLOAD_VLAN_STRIP | - DEV_RX_OFFLOAD_KEEP_CRC | - DEV_RX_OFFLOAD_JUMBO_FRAME | - DEV_RX_OFFLOAD_IPV4_CKSUM | - DEV_RX_OFFLOAD_UDP_CKSUM | - DEV_RX_OFFLOAD_TCP_CKSUM | - DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM | - DEV_RX_OFFLOAD_OUTER_UDP_CKSUM | - DEV_RX_OFFLOAD_RSS_HASH | - DEV_RX_OFFLOAD_VLAN_FILTER)) && - !BNXT_TRUFLOW_EN(bp) && BNXT_NUM_ASYNC_CPR(bp) && - rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { - PMD_DRV_LOG(INFO, "Using vector mode receive for port %d\n", + if (BNXT_TRUFLOW_EN(bp) || !BNXT_NUM_ASYNC_CPR(bp)) + goto use_scalar_rx; + + /* + * Vector mode receive cannot be enabled if any receive offloads outside + * a limited subset have been enabled. + */ + if (eth_dev->data->dev_conf.rxmode.offloads & + ~(DEV_RX_OFFLOAD_VLAN_STRIP | + DEV_RX_OFFLOAD_KEEP_CRC | + DEV_RX_OFFLOAD_JUMBO_FRAME | + DEV_RX_OFFLOAD_IPV4_CKSUM | + DEV_RX_OFFLOAD_UDP_CKSUM | + DEV_RX_OFFLOAD_TCP_CKSUM | + DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM | + DEV_RX_OFFLOAD_OUTER_UDP_CKSUM | + DEV_RX_OFFLOAD_RSS_HASH | + DEV_RX_OFFLOAD_VLAN_FILTER)) + goto use_scalar_rx; + +#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT) + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1) { + PMD_DRV_LOG(INFO, + "Using AVX2 vector mode receive for port %d\n", + eth_dev->data->port_id); + bp->flags |= BNXT_FLAG_RX_VECTOR_PKT_MODE; + return bnxt_recv_pkts_vec_avx2; + } + #endif + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + PMD_DRV_LOG(INFO, + "Using SSE vector mode receive for port %d\n", eth_dev->data->port_id); bp->flags |= BNXT_FLAG_RX_VECTOR_PKT_MODE; return bnxt_recv_pkts_vec; } + +use_scalar_rx: PMD_DRV_LOG(INFO, "Vector mode receive disabled for port %d\n", eth_dev->data->port_id); PMD_DRV_LOG(INFO, @@ -1207,7 +1232,6 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev) eth_dev->data->port_id, eth_dev->data->scattered_rx, eth_dev->data->dev_conf.rxmode.offloads); -#endif #endif bp->flags &= ~BNXT_FLAG_RX_VECTOR_PKT_MODE; return bnxt_recv_pkts; @@ -1222,22 +1246,36 @@ bnxt_transmit_function(struct rte_eth_dev *eth_dev) if (BNXT_CHIP_SR2(bp)) return bnxt_xmit_pkts; -#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) -#ifndef RTE_LIBRTE_IEEE1588 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) && \ + !defined(RTE_LIBRTE_IEEE1588) uint64_t offloads = eth_dev->data->dev_conf.txmode.offloads; /* * Vector mode transmit can be enabled only if not using scatter rx * or tx offloads. 
*/ - if (!eth_dev->data->scattered_rx && - !(offloads & ~DEV_TX_OFFLOAD_MBUF_FAST_FREE) && - !BNXT_TRUFLOW_EN(bp) && - rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { - PMD_DRV_LOG(INFO, "Using vector mode transmit for port %d\n", + if (eth_dev->data->scattered_rx || + (offloads & ~DEV_TX_OFFLOAD_MBUF_FAST_FREE) || + BNXT_TRUFLOW_EN(bp)) + goto use_scalar_tx; + +#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT) + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 && + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1) { + PMD_DRV_LOG(INFO, + "Using AVX2 vector mode transmit for port %d\n", + eth_dev->data->port_id); + return bnxt_xmit_pkts_vec_avx2; + } +#endif + if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + PMD_DRV_LOG(INFO, + "Using SSE vector mode transmit for port %d\n", eth_dev->data->port_id); return bnxt_xmit_pkts_vec; } + +use_scalar_tx: PMD_DRV_LOG(INFO, "Vector mode transmit disabled for port %d\n", eth_dev->data->port_id); PMD_DRV_LOG(INFO, @@ -1245,7 +1283,6 @@ bnxt_transmit_function(struct rte_eth_dev *eth_dev) eth_dev->data->port_id, eth_dev->data->scattered_rx, offloads); -#endif #endif return bnxt_xmit_pkts; } @@ -2855,11 +2892,15 @@ static const struct { eth_rx_burst_t pkt_burst; const char *info; } bnxt_rx_burst_info[] = { - {bnxt_recv_pkts, "Scalar"}, + {bnxt_recv_pkts, "Scalar"}, #if defined(RTE_ARCH_X86) - {bnxt_recv_pkts_vec, "Vector SSE"}, -#elif defined(RTE_ARCH_ARM64) - {bnxt_recv_pkts_vec, "Vector Neon"}, + {bnxt_recv_pkts_vec, "Vector SSE"}, +#endif +#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT) + {bnxt_recv_pkts_vec_avx2, "Vector AVX2"}, +#endif +#if defined(RTE_ARCH_ARM64) + {bnxt_recv_pkts_vec, "Vector Neon"}, #endif }; @@ -2885,11 +2926,15 @@ static const struct { eth_tx_burst_t pkt_burst; const char *info; } bnxt_tx_burst_info[] = { - {bnxt_xmit_pkts, "Scalar"}, + {bnxt_xmit_pkts, "Scalar"}, #if defined(RTE_ARCH_X86) - {bnxt_xmit_pkts_vec, "Vector SSE"}, -#elif defined(RTE_ARCH_ARM64) - {bnxt_xmit_pkts_vec, "Vector Neon"}, + {bnxt_xmit_pkts_vec, "Vector SSE"}, +#endif +#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT) + {bnxt_xmit_pkts_vec_avx2, "Vector AVX2"}, +#endif +#if defined(RTE_ARCH_ARM64) + {bnxt_xmit_pkts_vec, "Vector Neon"}, #endif }; diff --git a/drivers/net/bnxt/bnxt_rxr.c b/drivers/net/bnxt/bnxt_rxr.c index a6a8fb213b..4eef75f6be 100644 --- a/drivers/net/bnxt/bnxt_rxr.c +++ b/drivers/net/bnxt/bnxt_rxr.c @@ -1147,7 +1147,7 @@ int bnxt_init_rx_ring_struct(struct bnxt_rx_queue *rxq, unsigned int socket_id) /* Allocate extra rx ring entries for vector rx. */ ring->vmem_size = sizeof(struct rte_mbuf *) * - (ring->ring_size + RTE_BNXT_DESCS_PER_LOOP); + (ring->ring_size + BNXT_RX_EXTRA_MBUF_ENTRIES); ring->vmem = (void **)&rxr->rx_buf_ring; ring->fw_ring_id = INVALID_HW_RING_ID; @@ -1251,7 +1251,7 @@ int bnxt_init_one_rx_ring(struct bnxt_rx_queue *rxq) /* Initialize dummy mbuf pointers for vector mode rx. */ for (i = ring->ring_size; - i < ring->ring_size + RTE_BNXT_DESCS_PER_LOOP; i++) { + i < ring->ring_size + BNXT_RX_EXTRA_MBUF_ENTRIES; i++) { rxr->rx_buf_ring[i] = &rxq->fake_mbuf; } diff --git a/drivers/net/bnxt/bnxt_rxr.h b/drivers/net/bnxt/bnxt_rxr.h index 79f1458698..955bf3e99e 100644 --- a/drivers/net/bnxt/bnxt_rxr.h +++ b/drivers/net/bnxt/bnxt_rxr.h @@ -42,7 +42,12 @@ static inline uint16_t bnxt_tpa_start_agg_id(struct bnxt *bp, RX_PKT_CMPL_AGG_BUFS_SFT) /* Number of descriptors to process per inner loop in vector mode. 
*/ -#define RTE_BNXT_DESCS_PER_LOOP 4U +#define BNXT_RX_DESCS_PER_LOOP_VEC128 4U /* SSE, Neon */ +#define BNXT_RX_DESCS_PER_LOOP_VEC256 8U /* AVX2 */ + +/* Number of extra Rx mbuf ring entries to allocate for vector mode. */ +#define BNXT_RX_EXTRA_MBUF_ENTRIES \ + RTE_MAX(BNXT_RX_DESCS_PER_LOOP_VEC128, BNXT_RX_DESCS_PER_LOOP_VEC256) #define BNXT_OL_FLAGS_TBL_DIM 64 #define BNXT_OL_FLAGS_ERR_TBL_DIM 32 @@ -106,6 +111,10 @@ uint16_t bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, int bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq); #endif +#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT) +uint16_t bnxt_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); +#endif void bnxt_set_mark_in_mbuf(struct bnxt *bp, struct rx_pkt_cmpl_hi *rxcmp1, struct rte_mbuf *mbuf); diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c b/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c new file mode 100644 index 0000000000..a06dfec90e --- /dev/null +++ b/drivers/net/bnxt/bnxt_rxtx_vec_avx2.c @@ -0,0 +1,597 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright(c) 2019-2021 Broadcom All rights reserved. */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "bnxt.h" +#include "bnxt_cpr.h" +#include "bnxt_ring.h" + +#include "bnxt_txq.h" +#include "bnxt_txr.h" +#include "bnxt_rxtx_vec_common.h" +#include + +static uint16_t +recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct bnxt_rx_queue *rxq = rx_queue; + const __m256i mbuf_init = + _mm256_set_epi64x(0, 0, 0, rxq->mbuf_initializer); + struct bnxt_cp_ring_info *cpr = rxq->cp_ring; + struct bnxt_rx_ring_info *rxr = rxq->rx_ring; + uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size; + uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size; + struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; + uint64_t valid, desc_valid_mask = ~0ULL; + const __m256i info3_v_mask = _mm256_set1_epi32(CMPL_BASE_V); + uint32_t raw_cons = cpr->cp_raw_cons; + uint32_t cons, mbcons; + int nb_rx_pkts = 0; + int i; + const __m256i valid_target = + _mm256_set1_epi32(!!(raw_cons & cp_ring_size)); + const __m256i dsc_shuf_msk = + _mm256_set_epi8(0xff, 0xff, 0xff, 0xff, /* Zeroes. */ + 7, 6, /* metadata type */ + 9, 8, /* flags2 low 16 */ + 5, 4, /* vlan_tci */ + 1, 0, /* errors_v2 */ + 0xff, 0xff, 0xff, 0xff, /* Zeroes. */ + 0xff, 0xff, 0xff, 0xff, /* Zeroes. */ + 7, 6, /* metadata type */ + 9, 8, /* flags2 low 16 */ + 5, 4, /* vlan_tci */ + 1, 0, /* errors_v2 */ + 0xff, 0xff, 0xff, 0xff); /* Zeroes. */ + const __m256i shuf_msk = + _mm256_set_epi8(15, 14, 13, 12, /* rss */ + 7, 6, /* vlan_tci */ + 3, 2, /* data_len */ + 0xFF, 0xFF, 3, 2, /* pkt_len */ + 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type (zeroes) */ + 15, 14, 13, 12, /* rss */ + 7, 6, /* vlan_tci */ + 3, 2, /* data_len */ + 0xFF, 0xFF, 3, 2, /* pkt_len */ + 0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */ + const __m256i flags_type_mask = + _mm256_set1_epi32(RX_PKT_CMPL_FLAGS_ITYPE_MASK); + const __m256i flags2_mask1 = + _mm256_set1_epi32(CMPL_FLAGS2_VLAN_TUN_MSK); + const __m256i flags2_mask2 = + _mm256_set1_epi32(RX_PKT_CMPL_FLAGS2_IP_TYPE); + const __m256i rss_mask = + _mm256_set1_epi32(RX_PKT_CMPL_FLAGS_RSS_VALID); + __m256i t0, t1, flags_type, flags2, index, errors; + __m256i ptype_idx, ptypes, is_tunnel; + __m256i mbuf01, mbuf23, mbuf45, mbuf67; + __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5, rearm6, rearm7; + __m256i ol_flags, ol_flags_hi; + __m256i rss_flags; + + /* Validate ptype table indexing at build time. 
*/ + bnxt_check_ptype_constants(); + + /* If Rx Q was stopped return */ + if (unlikely(!rxq->rx_started)) + return 0; + + if (rxq->rxrearm_nb >= rxq->rx_free_thresh) + bnxt_rxq_rearm(rxq, rxr); + + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256); + + cons = raw_cons & (cp_ring_size - 1); + mbcons = (raw_cons / 2) & (rx_ring_size - 1); + + /* Prefetch first four descriptor pairs. */ + rte_prefetch0(&cp_desc_ring[cons + 0]); + rte_prefetch0(&cp_desc_ring[cons + 4]); + rte_prefetch0(&cp_desc_ring[cons + 8]); + rte_prefetch0(&cp_desc_ring[cons + 12]); + + /* Ensure that we do not go past the ends of the rings. */ + nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons, + (cp_ring_size - cons) / 2)); + /* + * If we are at the end of the ring, ensure that descriptors after the + * last valid entry are not treated as valid. Otherwise, force the + * maximum number of packets to receive to be a multiple of the per- + * loop count. + */ + if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC256) { + desc_valid_mask >>= + CHAR_BIT * (BNXT_RX_DESCS_PER_LOOP_VEC256 - nb_pkts); + } else { + nb_pkts = + RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC256); + } + + /* Handle RX burst request */ + for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC256, + cons += BNXT_RX_DESCS_PER_LOOP_VEC256 * 2, + mbcons += BNXT_RX_DESCS_PER_LOOP_VEC256) { + __m256i desc0, desc1, desc2, desc3, desc4, desc5, desc6, desc7; + __m256i rxcmp0_1, rxcmp2_3, rxcmp4_5, rxcmp6_7, info3_v; + __m256i errors_v2; + uint32_t num_valid; + + /* Copy eight mbuf pointers to output array. */ + t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons]); + _mm256_storeu_si256((void *)&rx_pkts[i], t0); +#ifdef RTE_ARCH_X86_64 + t0 = _mm256_loadu_si256((void *)&rxr->rx_buf_ring[mbcons + 4]); + _mm256_storeu_si256((void *)&rx_pkts[i + 4], t0); +#endif + + /* Prefetch eight descriptor pairs for next iteration. */ + if (i + BNXT_RX_DESCS_PER_LOOP_VEC256 < nb_pkts) { + rte_prefetch0(&cp_desc_ring[cons + 16]); + rte_prefetch0(&cp_desc_ring[cons + 20]); + rte_prefetch0(&cp_desc_ring[cons + 24]); + rte_prefetch0(&cp_desc_ring[cons + 28]); + } + + /* + * Load eight receive completion descriptors into 256-bit + * registers. Loads are issued in reverse order in order to + * ensure consistent state. + */ + desc7 = _mm256_load_si256((void *)&cp_desc_ring[cons + 14]); + rte_compiler_barrier(); + desc6 = _mm256_load_si256((void *)&cp_desc_ring[cons + 12]); + rte_compiler_barrier(); + desc5 = _mm256_load_si256((void *)&cp_desc_ring[cons + 10]); + rte_compiler_barrier(); + desc4 = _mm256_load_si256((void *)&cp_desc_ring[cons + 8]); + rte_compiler_barrier(); + desc3 = _mm256_load_si256((void *)&cp_desc_ring[cons + 6]); + rte_compiler_barrier(); + desc2 = _mm256_load_si256((void *)&cp_desc_ring[cons + 4]); + rte_compiler_barrier(); + desc1 = _mm256_load_si256((void *)&cp_desc_ring[cons + 2]); + rte_compiler_barrier(); + desc0 = _mm256_load_si256((void *)&cp_desc_ring[cons + 0]); + + /* + * Pack needed fields from each descriptor into a compressed + * 128-bit layout and pair two compressed descriptors into + * 256-bit registers. The 128-bit compressed layout is as + * follows: + * Bits 0-15: flags_type field from low completion record. + * Bits 16-31: len field from low completion record. + * Bits 32-47: flags2 (low 16 bits) from high completion. + * Bits 48-79: metadata from high completion record. + * Bits 80-95: errors_v2 from high completion record. + * Bits 96-127: rss hash from low completion record. 
+ */ + t0 = _mm256_permute2f128_si256(desc6, desc7, 0x20); + t1 = _mm256_permute2f128_si256(desc6, desc7, 0x31); + t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); + rxcmp6_7 = _mm256_blend_epi32(t0, t1, 0x66); + + t0 = _mm256_permute2f128_si256(desc4, desc5, 0x20); + t1 = _mm256_permute2f128_si256(desc4, desc5, 0x31); + t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); + rxcmp4_5 = _mm256_blend_epi32(t0, t1, 0x66); + + t0 = _mm256_permute2f128_si256(desc2, desc3, 0x20); + t1 = _mm256_permute2f128_si256(desc2, desc3, 0x31); + t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); + rxcmp2_3 = _mm256_blend_epi32(t0, t1, 0x66); + + t0 = _mm256_permute2f128_si256(desc0, desc1, 0x20); + t1 = _mm256_permute2f128_si256(desc0, desc1, 0x31); + t1 = _mm256_shuffle_epi8(t1, dsc_shuf_msk); + rxcmp0_1 = _mm256_blend_epi32(t0, t1, 0x66); + + /* Compute packet type table indices for eight packets. */ + t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); + flags_type = _mm256_unpacklo_epi64(t0, t1); + ptype_idx = _mm256_and_si256(flags_type, flags_type_mask); + ptype_idx = _mm256_srli_epi32(ptype_idx, + RX_PKT_CMPL_FLAGS_ITYPE_SFT - + BNXT_PTYPE_TBL_TYPE_SFT); + + t0 = _mm256_unpacklo_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpacklo_epi32(rxcmp4_5, rxcmp6_7); + flags2 = _mm256_unpackhi_epi64(t0, t1); + + t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask1), + RX_PKT_CMPL_FLAGS2_META_FORMAT_SFT - + BNXT_PTYPE_TBL_VLAN_SFT); + ptype_idx = _mm256_or_si256(ptype_idx, t0); + + t0 = _mm256_srli_epi32(_mm256_and_si256(flags2, flags2_mask2), + RX_PKT_CMPL_FLAGS2_IP_TYPE_SFT - + BNXT_PTYPE_TBL_IP_VER_SFT); + ptype_idx = _mm256_or_si256(ptype_idx, t0); + + /* + * Load ptypes for eight packets using gather. Gather operations + * have extremely high latency (~19 cycles), execution and use + * of result should be separated as much as possible. + */ + ptypes = _mm256_i32gather_epi32((int *)bnxt_ptype_table, + ptype_idx, sizeof(uint32_t)); + /* + * Compute ol_flags and checksum error table indices for eight + * packets. + */ + is_tunnel = _mm256_and_si256(flags2, _mm256_set1_epi32(4)); + is_tunnel = _mm256_slli_epi32(is_tunnel, 3); + flags2 = _mm256_and_si256(flags2, _mm256_set1_epi32(0x1F)); + + /* Extract errors_v2 fields for eight packets. */ + t0 = _mm256_unpackhi_epi32(rxcmp0_1, rxcmp2_3); + t1 = _mm256_unpackhi_epi32(rxcmp4_5, rxcmp6_7); + errors_v2 = _mm256_unpacklo_epi64(t0, t1); + + errors = _mm256_srli_epi32(errors_v2, 4); + errors = _mm256_and_si256(errors, _mm256_set1_epi32(0xF)); + errors = _mm256_and_si256(errors, flags2); + + index = _mm256_andnot_si256(errors, flags2); + errors = _mm256_or_si256(errors, + _mm256_srli_epi32(is_tunnel, 1)); + index = _mm256_or_si256(index, is_tunnel); + + /* + * Load ol_flags for eight packets using gather. Gather + * operations have extremely high latency (~19 cycles), + * execution and use of result should be separated as much + * as possible. + */ + ol_flags = _mm256_i32gather_epi32((int *)rxr->ol_flags_table, + index, sizeof(uint32_t)); + errors = _mm256_i32gather_epi32((int *)rxr->ol_flags_err_table, + errors, sizeof(uint32_t)); + + /* + * Pack the 128-bit array of valid descriptor flags into 64 + * bits and count the number of set bits in order to determine + * the number of valid descriptors. 
+ */ + const __m256i perm_msk = + _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + info3_v = _mm256_permutevar8x32_epi32(errors_v2, perm_msk); + info3_v = _mm256_and_si256(errors_v2, info3_v_mask); + info3_v = _mm256_xor_si256(info3_v, valid_target); + + info3_v = _mm256_packs_epi32(info3_v, _mm256_setzero_si256()); + valid = _mm_cvtsi128_si64(_mm256_extracti128_si256(info3_v, 1)); + valid = (valid << CHAR_BIT) | + _mm_cvtsi128_si64(_mm256_castsi256_si128(info3_v)); + num_valid = __builtin_popcountll(valid & desc_valid_mask); + + if (num_valid == 0) + break; + + /* Update mbuf rearm_data for eight packets. */ + mbuf01 = _mm256_shuffle_epi8(rxcmp0_1, shuf_msk); + mbuf23 = _mm256_shuffle_epi8(rxcmp2_3, shuf_msk); + mbuf45 = _mm256_shuffle_epi8(rxcmp4_5, shuf_msk); + mbuf67 = _mm256_shuffle_epi8(rxcmp6_7, shuf_msk); + + /* Blend in ptype field for two mbufs at a time. */ + mbuf01 = _mm256_blend_epi32(mbuf01, ptypes, 0x11); + mbuf23 = _mm256_blend_epi32(mbuf23, + _mm256_srli_si256(ptypes, 4), 0x11); + mbuf45 = _mm256_blend_epi32(mbuf45, + _mm256_srli_si256(ptypes, 8), 0x11); + mbuf67 = _mm256_blend_epi32(mbuf67, + _mm256_srli_si256(ptypes, 12), 0x11); + + /* Unpack rearm data, set fixed fields for first four mbufs. */ + rearm0 = _mm256_permute2f128_si256(mbuf_init, mbuf01, 0x20); + rearm1 = _mm256_blend_epi32(mbuf_init, mbuf01, 0xF0); + rearm2 = _mm256_permute2f128_si256(mbuf_init, mbuf23, 0x20); + rearm3 = _mm256_blend_epi32(mbuf_init, mbuf23, 0xF0); + + /* Compute final ol_flags values for eight packets. */ + rss_flags = _mm256_and_si256(flags_type, rss_mask); + rss_flags = _mm256_srli_epi32(rss_flags, 9); + ol_flags = _mm256_or_si256(ol_flags, errors); + ol_flags = _mm256_or_si256(ol_flags, rss_flags); + ol_flags_hi = _mm256_permute2f128_si256(ol_flags, + ol_flags, 0x11); + + /* Set ol_flags fields for first four packets. */ + rearm0 = _mm256_blend_epi32(rearm0, + _mm256_slli_si256(ol_flags, 8), + 0x04); + rearm1 = _mm256_blend_epi32(rearm1, + _mm256_slli_si256(ol_flags_hi, 8), + 0x04); + rearm2 = _mm256_blend_epi32(rearm2, + _mm256_slli_si256(ol_flags, 4), + 0x04); + rearm3 = _mm256_blend_epi32(rearm3, + _mm256_slli_si256(ol_flags_hi, 4), + 0x04); + + /* Store all mbuf fields for first four packets. */ + _mm256_storeu_si256((void *)&rx_pkts[i + 0]->rearm_data, + rearm0); + _mm256_storeu_si256((void *)&rx_pkts[i + 1]->rearm_data, + rearm1); + _mm256_storeu_si256((void *)&rx_pkts[i + 2]->rearm_data, + rearm2); + _mm256_storeu_si256((void *)&rx_pkts[i + 3]->rearm_data, + rearm3); + + /* Unpack rearm data, set fixed fields for final four mbufs. */ + rearm4 = _mm256_permute2f128_si256(mbuf_init, mbuf45, 0x20); + rearm5 = _mm256_blend_epi32(mbuf_init, mbuf45, 0xF0); + rearm6 = _mm256_permute2f128_si256(mbuf_init, mbuf67, 0x20); + rearm7 = _mm256_blend_epi32(mbuf_init, mbuf67, 0xF0); + + /* Set ol_flags fields for final four packets. */ + rearm4 = _mm256_blend_epi32(rearm4, ol_flags, 0x04); + rearm5 = _mm256_blend_epi32(rearm5, ol_flags_hi, 0x04); + rearm6 = _mm256_blend_epi32(rearm6, + _mm256_srli_si256(ol_flags, 4), + 0x04); + rearm7 = _mm256_blend_epi32(rearm7, + _mm256_srli_si256(ol_flags_hi, 4), + 0x04); + + /* Store all mbuf fields for final four packets. 
*/ + _mm256_storeu_si256((void *)&rx_pkts[i + 4]->rearm_data, + rearm4); + _mm256_storeu_si256((void *)&rx_pkts[i + 5]->rearm_data, + rearm5); + _mm256_storeu_si256((void *)&rx_pkts[i + 6]->rearm_data, + rearm6); + _mm256_storeu_si256((void *)&rx_pkts[i + 7]->rearm_data, + rearm7); + + nb_rx_pkts += num_valid; + if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC256) + break; + } + + if (nb_rx_pkts) { + rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts); + + rxq->rxrearm_nb += nb_rx_pkts; + cpr->cp_raw_cons += 2 * nb_rx_pkts; + bnxt_db_cq(cpr); + } + + return nb_rx_pkts; +} + +uint16_t +bnxt_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + uint16_t cnt = 0; + + while (nb_pkts > RTE_BNXT_MAX_RX_BURST) { + uint16_t burst; + + burst = recv_burst_vec_avx2(rx_queue, rx_pkts + cnt, + RTE_BNXT_MAX_RX_BURST); + + cnt += burst; + nb_pkts -= burst; + + if (burst < RTE_BNXT_MAX_RX_BURST) + return cnt; + } + return cnt + recv_burst_vec_avx2(rx_queue, rx_pkts + cnt, nb_pkts); +} + +static void +bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq) +{ + struct bnxt_cp_ring_info *cpr = txq->cp_ring; + uint32_t raw_cons = cpr->cp_raw_cons; + uint32_t cons; + uint32_t nb_tx_pkts = 0; + struct tx_cmpl *txcmp; + struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; + struct bnxt_ring *cp_ring_struct = cpr->cp_ring_struct; + uint32_t ring_mask = cp_ring_struct->ring_mask; + + do { + cons = RING_CMPL(ring_mask, raw_cons); + txcmp = (struct tx_cmpl *)&cp_desc_ring[cons]; + + if (!CMP_VALID(txcmp, raw_cons, cp_ring_struct)) + break; + + nb_tx_pkts += txcmp->opaque; + raw_cons = NEXT_RAW_CMP(raw_cons); + } while (nb_tx_pkts < ring_mask); + + if (nb_tx_pkts) { + if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE) + bnxt_tx_cmp_vec_fast(txq, nb_tx_pkts); + else + bnxt_tx_cmp_vec(txq, nb_tx_pkts); + cpr->cp_raw_cons = raw_cons; + bnxt_db_cq(cpr); + } +} + +static inline void +bnxt_xmit_one(struct rte_mbuf *mbuf, struct tx_bd_long *txbd, + struct rte_mbuf **tx_buf) +{ + uint64_t dsc_hi, dsc_lo; + __m128i desc; + + *tx_buf = mbuf; + + dsc_hi = mbuf->buf_iova + mbuf->data_off; + dsc_lo = (mbuf->data_len << 16) | + bnxt_xmit_flags_len(mbuf->data_len, TX_BD_FLAGS_NOCMPL); + + desc = _mm_set_epi64x(dsc_hi, dsc_lo); + _mm_store_si128((void *)txbd, desc); +} + +static uint16_t +bnxt_xmit_fixed_burst_vec(struct bnxt_tx_queue *txq, struct rte_mbuf **pkts, + uint16_t nb_pkts) +{ + struct bnxt_tx_ring_info *txr = txq->tx_ring; + uint16_t tx_prod, tx_raw_prod = txr->tx_raw_prod; + struct tx_bd_long *txbd; + struct rte_mbuf **tx_buf; + uint16_t to_send; + + tx_prod = RING_IDX(txr->tx_ring_struct, tx_raw_prod); + txbd = &txr->tx_desc_ring[tx_prod]; + tx_buf = &txr->tx_buf_ring[tx_prod]; + + /* Prefetch next transmit buffer descriptors. */ + rte_prefetch0(txbd); + rte_prefetch0(txbd + 3); + + nb_pkts = RTE_MIN(nb_pkts, bnxt_tx_avail(txq)); + + if (unlikely(nb_pkts == 0)) + return 0; + + /* Handle TX burst request */ + to_send = nb_pkts; + + /* + * If current descriptor is not on a 32-byte boundary, send one packet + * to align for 32-byte stores. + */ + if (tx_prod & 1) { + bnxt_xmit_one(pkts[0], txbd++, tx_buf++); + to_send--; + pkts++; + } + + /* + * Send four packets per loop, with a single store for each pair + * of descriptors. + */ + while (to_send >= BNXT_TX_DESCS_PER_LOOP) { + uint64_t dsc0_hi, dsc0_lo, dsc1_hi, dsc1_lo; + uint64_t dsc2_hi, dsc2_lo, dsc3_hi, dsc3_lo; + __m256i dsc01, dsc23; + + /* Prefetch next transmit buffer descriptors. 
*/ + rte_prefetch0(txbd + 4); + rte_prefetch0(txbd + 7); + + /* Copy four mbuf pointers to tx buf ring. */ +#ifdef RTE_ARCH_X86_64 + __m256i tmp = _mm256_loadu_si256((void *)pkts); + _mm256_storeu_si256((void *)tx_buf, tmp); +#else + __m128i tmp = _mm_loadu_si128((void *)pkts); + _mm_storeu_si128((void *)tx_buf, tmp); +#endif + + dsc0_hi = tx_buf[0]->buf_iova + tx_buf[0]->data_off; + dsc0_lo = (tx_buf[0]->data_len << 16) | + bnxt_xmit_flags_len(tx_buf[0]->data_len, + TX_BD_FLAGS_NOCMPL); + + dsc1_hi = tx_buf[1]->buf_iova + tx_buf[1]->data_off; + dsc1_lo = (tx_buf[1]->data_len << 16) | + bnxt_xmit_flags_len(tx_buf[1]->data_len, + TX_BD_FLAGS_NOCMPL); + + dsc01 = _mm256_set_epi64x(dsc1_hi, dsc1_lo, dsc0_hi, dsc0_lo); + + dsc2_hi = tx_buf[2]->buf_iova + tx_buf[2]->data_off; + dsc2_lo = (tx_buf[2]->data_len << 16) | + bnxt_xmit_flags_len(tx_buf[2]->data_len, + TX_BD_FLAGS_NOCMPL); + + dsc3_hi = tx_buf[3]->buf_iova + tx_buf[3]->data_off; + dsc3_lo = (tx_buf[3]->data_len << 16) | + bnxt_xmit_flags_len(tx_buf[3]->data_len, + TX_BD_FLAGS_NOCMPL); + + dsc23 = _mm256_set_epi64x(dsc3_hi, dsc3_lo, dsc2_hi, dsc2_lo); + + _mm256_store_si256((void *)txbd, dsc01); + _mm256_store_si256((void *)(txbd + 2), dsc23); + + to_send -= BNXT_TX_DESCS_PER_LOOP; + pkts += BNXT_TX_DESCS_PER_LOOP; + txbd += BNXT_TX_DESCS_PER_LOOP; + tx_buf += BNXT_TX_DESCS_PER_LOOP; + } + + /* Send any remaining packets, writing each descriptor individually. */ + while (to_send) { + bnxt_xmit_one(pkts[0], txbd++, tx_buf++); + to_send--; + pkts++; + } + + /* Request a completion for the final packet of the burst. */ + txbd[-1].opaque = nb_pkts; + txbd[-1].flags_type &= ~TX_BD_LONG_FLAGS_NO_CMPL; + + tx_raw_prod += nb_pkts; + bnxt_db_write(&txr->tx_db, tx_raw_prod); + + txr->tx_raw_prod = tx_raw_prod; + + return nb_pkts; +} + +uint16_t +bnxt_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + int nb_sent = 0; + struct bnxt_tx_queue *txq = tx_queue; + struct bnxt_tx_ring_info *txr = txq->tx_ring; + uint16_t ring_size = txr->tx_ring_struct->ring_size; + + /* Tx queue was stopped; wait for it to be restarted */ + if (unlikely(!txq->tx_started)) { + PMD_DRV_LOG(DEBUG, "Tx q stopped;return\n"); + return 0; + } + + /* Handle TX completions */ + if (bnxt_tx_bds_in_hw(txq) >= txq->tx_free_thresh) + bnxt_handle_tx_cp_vec(txq); + + while (nb_pkts) { + uint16_t ret, num; + + /* + * Ensure that no more than RTE_BNXT_MAX_TX_BURST packets + * are transmitted before the next completion. + */ + num = RTE_MIN(nb_pkts, RTE_BNXT_MAX_TX_BURST); + + /* + * Ensure that a ring wrap does not occur within a call to + * bnxt_xmit_fixed_burst_vec(). + */ + num = RTE_MIN(num, ring_size - + (txr->tx_raw_prod & (ring_size - 1))); + ret = bnxt_xmit_fixed_burst_vec(txq, &tx_pkts[nb_sent], num); + nb_sent += ret; + nb_pkts -= ret; + if (ret < num) + break; + } + + return nb_sent; +} diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c index a6e630ea5e..b4e9202568 100644 --- a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c +++ b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c @@ -200,17 +200,20 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) * maximum number of packets to receive to be a multiple of the per- * loop count. 
*/ - if (nb_pkts < RTE_BNXT_DESCS_PER_LOOP) - desc_valid_mask >>= 16 * (RTE_BNXT_DESCS_PER_LOOP - nb_pkts); - else - nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP); + if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC128) { + desc_valid_mask >>= + 16 * (BNXT_RX_DESCS_PER_LOOP_VEC128 - nb_pkts); + } else { + nb_pkts = + RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC128); + } /* Handle RX burst request */ - for (i = 0; i < nb_pkts; i += RTE_BNXT_DESCS_PER_LOOP, - cons += RTE_BNXT_DESCS_PER_LOOP * 2, - mbcons += RTE_BNXT_DESCS_PER_LOOP) { - uint32x4_t rxcmp1[RTE_BNXT_DESCS_PER_LOOP]; - uint32x4_t rxcmp[RTE_BNXT_DESCS_PER_LOOP]; + for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC128, + cons += BNXT_RX_DESCS_PER_LOOP_VEC128 * 2, + mbcons += BNXT_RX_DESCS_PER_LOOP_VEC128) { + uint32x4_t rxcmp1[BNXT_RX_DESCS_PER_LOOP_VEC128]; + uint32x4_t rxcmp[BNXT_RX_DESCS_PER_LOOP_VEC128]; uint32x4_t info3_v; uint64x2_t t0, t1; uint32_t num_valid; @@ -226,7 +229,7 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) #endif /* Prefetch four descriptor pairs for next iteration. */ - if (i + RTE_BNXT_DESCS_PER_LOOP < nb_pkts) { + if (i + BNXT_RX_DESCS_PER_LOOP_VEC128 < nb_pkts) { rte_prefetch0(&cp_desc_ring[cons + 8]); rte_prefetch0(&cp_desc_ring[cons + 12]); } @@ -284,7 +287,7 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) rxr); nb_rx_pkts += num_valid; - if (num_valid < RTE_BNXT_DESCS_PER_LOOP) + if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC128) break; } diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c index fe074f82cf..c479697ac0 100644 --- a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c +++ b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c @@ -191,17 +191,20 @@ recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) * maximum number of packets to receive to be a multiple of the per- * loop count. */ - if (nb_pkts < RTE_BNXT_DESCS_PER_LOOP) - desc_valid_mask >>= 16 * (RTE_BNXT_DESCS_PER_LOOP - nb_pkts); - else - nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP); + if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC128) { + desc_valid_mask >>= + 16 * (BNXT_RX_DESCS_PER_LOOP_VEC128 - nb_pkts); + } else { + nb_pkts = + RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC128); + } /* Handle RX burst request */ - for (i = 0; i < nb_pkts; i += RTE_BNXT_DESCS_PER_LOOP, - cons += RTE_BNXT_DESCS_PER_LOOP * 2, - mbcons += RTE_BNXT_DESCS_PER_LOOP) { - __m128i rxcmp1[RTE_BNXT_DESCS_PER_LOOP]; - __m128i rxcmp[RTE_BNXT_DESCS_PER_LOOP]; + for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC128, + cons += BNXT_RX_DESCS_PER_LOOP_VEC128 * 2, + mbcons += BNXT_RX_DESCS_PER_LOOP_VEC128) { + __m128i rxcmp1[BNXT_RX_DESCS_PER_LOOP_VEC128]; + __m128i rxcmp[BNXT_RX_DESCS_PER_LOOP_VEC128]; __m128i tmp0, tmp1, info3_v; uint32_t num_valid; @@ -216,7 +219,7 @@ recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) #endif /* Prefetch four descriptor pairs for next iteration. 
*/ - if (i + RTE_BNXT_DESCS_PER_LOOP < nb_pkts) { + if (i + BNXT_RX_DESCS_PER_LOOP_VEC128 < nb_pkts) { rte_prefetch0(&cp_desc_ring[cons + 8]); rte_prefetch0(&cp_desc_ring[cons + 12]); } @@ -265,7 +268,7 @@ recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) rxr); nb_rx_pkts += num_valid; - if (num_valid < RTE_BNXT_DESCS_PER_LOOP) + if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC128) break; } @@ -383,7 +386,7 @@ bnxt_xmit_fixed_burst_vec(struct bnxt_tx_queue *txq, struct rte_mbuf **tx_pkts, /* Handle TX burst request */ to_send = nb_pkts; - while (to_send >= RTE_BNXT_DESCS_PER_LOOP) { + while (to_send >= BNXT_TX_DESCS_PER_LOOP) { /* Prefetch next transmit buffer descriptors. */ rte_prefetch0(txbd + 4); rte_prefetch0(txbd + 7); @@ -393,8 +396,8 @@ bnxt_xmit_fixed_burst_vec(struct bnxt_tx_queue *txq, struct rte_mbuf **tx_pkts, bnxt_xmit_one(tx_pkts[2], txbd++, tx_buf++); bnxt_xmit_one(tx_pkts[3], txbd++, tx_buf++); - to_send -= RTE_BNXT_DESCS_PER_LOOP; - tx_pkts += RTE_BNXT_DESCS_PER_LOOP; + to_send -= BNXT_TX_DESCS_PER_LOOP; + tx_pkts += BNXT_TX_DESCS_PER_LOOP; } while (to_send) { diff --git a/drivers/net/bnxt/bnxt_txr.h b/drivers/net/bnxt/bnxt_txr.h index e4bd90f883..6bfdc6d01a 100644 --- a/drivers/net/bnxt/bnxt_txr.h +++ b/drivers/net/bnxt/bnxt_txr.h @@ -11,6 +11,9 @@ #define BNXT_MAX_TSO_SEGS 32 #define BNXT_MIN_PKT_SIZE 52 +/* Number of transmit descriptors processed per inner loop in vector mode. */ +#define BNXT_TX_DESCS_PER_LOOP 4U + struct bnxt_tx_ring_info { uint16_t tx_raw_prod; uint16_t tx_raw_cons; @@ -48,6 +51,10 @@ uint16_t bnxt_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); #endif +#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT) +uint16_t bnxt_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); +#endif int bnxt_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id); int bnxt_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id); diff --git a/drivers/net/bnxt/meson.build b/drivers/net/bnxt/meson.build index 117c753489..41c4796366 100644 --- a/drivers/net/bnxt/meson.build +++ b/drivers/net/bnxt/meson.build @@ -82,6 +82,23 @@ sources = files( if arch_subdir == 'x86' sources += files('bnxt_rxtx_vec_sse.c') + # compile AVX2 version if either: + # a. we have AVX supported in minimum instruction set baseline + # b. it's not minimum instruction set, but supported by compiler + if cc.get_define('__AVX2__', args: machine_args) != '' + cflags += ['-DCC_AVX2_SUPPORT'] + sources += files('bnxt_rxtx_vec_avx2.c') + elif cc.has_argument('-mavx2') + cflags += ['-DCC_AVX2_SUPPORT'] + bnxt_avx2_lib = static_library('bnxt_avx2_lib', + 'bnxt_rxtx_vec_avx2.c', + dependencies: [static_rte_ethdev, + static_rte_bus_pci, + static_rte_kvargs, static_rte_hash], + include_directories: includes, + c_args: [cflags, '-mavx2']) + objs += bnxt_avx2_lib.extract_objects('bnxt_rxtx_vec_avx2.c') + endif elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64') sources += files('bnxt_rxtx_vec_neon.c') endif -- 2.20.1
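
For reference, the run-time burst-mode selection added in bnxt_ethdev.c above boils down to the pattern sketched below. This is a standalone illustration rather than driver code: pick_burst_mode(), enum burst_mode and the offloads_ok parameter are hypothetical names standing in for the offload/scattered-Rx/Truflow checks, CC_AVX2_SUPPORT is the macro defined by the meson change in this patch, and rte_vect_get_max_simd_bitwidth() / rte_cpu_get_flag_enabled() are the DPDK APIs the driver calls.

/*
 * Minimal standalone sketch of the run-time Rx/Tx burst mode selection:
 * the AVX2 path is chosen only when it was compiled in, the EAL maximum
 * SIMD bitwidth allows 256-bit vectors, and the CPU reports AVX2;
 * otherwise fall back to the 128-bit (SSE/Neon) path or to scalar.
 */
#include <stdio.h>

#include <rte_cpuflags.h>
#include <rte_vect.h>

enum burst_mode { BURST_SCALAR, BURST_VEC128, BURST_VEC256 };

static enum burst_mode
pick_burst_mode(int offloads_ok)
{
	/* Scalar when scattered Rx or unsupported offloads are enabled. */
	if (!offloads_ok)
		return BURST_SCALAR;

#if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
	/* AVX2 path: compiled in, allowed by EAL, supported by the CPU. */
	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
	    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
		return BURST_VEC256;
#endif
	/* 128-bit path (SSE on x86, Neon on aarch64). */
	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
		return BURST_VEC128;

	return BURST_SCALAR;
}

int
main(void)
{
	static const char * const names[] = { "scalar", "128-bit", "256-bit" };

	printf("selected burst mode: %s\n", names[pick_burst_mode(1)]);
	return 0;
}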