git.droids-corp.org - dpdk.git/commitdiff
net/bnxt: remove software prefetches from AVX2 Rx
author: Lance Richardson <lance.richardson@broadcom.com>
Mon, 15 Nov 2021 18:24:10 +0000 (13:24 -0500)
committer: Ajit Khaparde <ajit.khaparde@broadcom.com>
Tue, 16 Nov 2021 22:57:27 +0000 (23:57 +0100)
Testing has shown no performance benefit from software prefetching
of receive completion descriptors in the AVX2 burst receive path,
and slightly better performance without them on some CPU families,
so this patch removes them.

Fixes: c4e4c18963b0 ("net/bnxt: add AVX2 RX/Tx")
Cc: stable@dpdk.org
Signed-off-by: Lance Richardson <lance.richardson@broadcom.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
drivers/net/bnxt/bnxt_rxtx_vec_avx2.c

index 54e3af22ac5ce6aef736950c59d51d25a6e20eee..34bd22edf0e7d4789e3bbb2931529db8379eef9c 100644 (file)
@@ -92,12 +92,6 @@ recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
        cons = raw_cons & (cp_ring_size - 1);
        mbcons = (raw_cons / 2) & (rx_ring_size - 1);
 
-       /* Prefetch first four descriptor pairs. */
-       rte_prefetch0(&cp_desc_ring[cons + 0]);
-       rte_prefetch0(&cp_desc_ring[cons + 4]);
-       rte_prefetch0(&cp_desc_ring[cons + 8]);
-       rte_prefetch0(&cp_desc_ring[cons + 12]);
-
        /* Return immediately if there is not at least one completed packet. */
        if (!bnxt_cpr_cmp_valid(&cp_desc_ring[cons], raw_cons, cp_ring_size))
                return 0;
@@ -136,14 +130,6 @@ recv_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
                _mm256_storeu_si256((void *)&rx_pkts[i + 4], t0);
 #endif
 
-               /* Prefetch eight descriptor pairs for next iteration. */
-               if (i + BNXT_RX_DESCS_PER_LOOP_VEC256 < nb_pkts) {
-                       rte_prefetch0(&cp_desc_ring[cons + 16]);
-                       rte_prefetch0(&cp_desc_ring[cons + 20]);
-                       rte_prefetch0(&cp_desc_ring[cons + 24]);
-                       rte_prefetch0(&cp_desc_ring[cons + 28]);
-               }
-
                /*
                 * Load eight receive completion descriptors into 256-bit
                 * registers. Loads are issued in reverse order in order to