net/mlx5: prefetch CQEs for a faster decompression
author Alexander Kozyrev <akozyrev@mellanox.com>
Tue, 24 Mar 2020 14:45:30 +0000 (16:45 +0200)
committer Ferruh Yigit <ferruh.yigit@intel.com>
Tue, 21 Apr 2020 11:57:05 +0000 (13:57 +0200)
Invalidation of consumed CQEs incurs a performance penalty
due to many cache misses caused by non-sequential CQE access.
Prefetch CQEs to get better data locality and speed up their
decompression. Prefetching reduces the CPI rate of the
rxq_cq_decompress_v() function from 1 to 0.85 in my environment,
resulting in a 2% boost in Mpps in a single-core test with 64B frames.
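
For illustration, a minimal standalone sketch of the prefetch change applied in
all three hunks below. cqe_t, DESCS_PER_LOOP, and prefetch0() are hypothetical
stand-ins for the driver's struct mlx5_cqe, MLX5_VPMD_DESCS_PER_LOOP, and
rte_prefetch0(); the real code lives inside the vectorized decompression loop.

    #include <stddef.h>

    /* Hypothetical stand-ins for the driver's types and constants. */
    typedef struct { unsigned char pad[64]; } cqe_t; /* one 64B CQE */
    #define DESCS_PER_LOOP 4 /* mimics MLX5_VPMD_DESCS_PER_LOOP */
    #define likely(x) __builtin_expect(!!(x), 1)

    static inline void prefetch0(const volatile void *p)
    {
            /* Same effect as DPDK's rte_prefetch0(): pull the line
             * into all cache levels. */
            __builtin_prefetch((const void *)p, 0, 3);
    }

    /* Old scheme: one CQE prefetched 8 entries ahead, once every
     * 8 iterations, leaving most of the batch cold. */
    static void prefetch_old(volatile cqe_t *cq, size_t pos, size_t mcqe_n)
    {
            if (!(pos & 0x7) && pos + 8 < mcqe_n)
                    prefetch0(cq + pos + 8);
    }

    /* New scheme: prefetch every CQE the current loop iteration will
     * touch, bounded by mcqe_n, so the vectorized decompression reads
     * cache-resident data. */
    static void prefetch_new(volatile cqe_t *cq, size_t pos, size_t mcqe_n)
    {
            size_t i;

            for (i = 0; i < DESCS_PER_LOOP; ++i)
                    if (likely(pos + i < mcqe_n))
                            prefetch0(cq + pos + i);
    }

The design choice is the same in the altivec, neon, and sse paths: trade a few
extra prefetch instructions per iteration for fewer cache misses on the
non-sequential CQE accesses that follow.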

Signed-off-by: Alexander Kozyrev <akozyrev@mellanox.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
drivers/net/mlx5/mlx5_rxtx_vec_neon.h
drivers/net/mlx5/mlx5_rxtx_vec_sse.h

index d55642e57a31fc4b0147813b14ee454aa69018e3..9778b0bbc1c01a37a39c7813141c91df315b6513 100644 (file)
@@ -155,8 +155,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
                const vector unsigned long shmax = {64, 64};
 #endif
 
-               if (!(pos & 0x7) && pos + 8 < mcqe_n)
-                       rte_prefetch0((void *)(cq + pos + 8));
+               for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+                       if (likely(pos + i < mcqe_n))
+                               rte_prefetch0((void *)(cq + pos + i));
 
                /* A.1 load mCQEs into a 128bit register. */
                mcqe1 = (vector unsigned char)vec_vsx_ld(0,
index 701e5e0cd521cba32c08f6da986fcaf2f49e11e3..7b6c5db491a64de61e5494ddbdaffb415c9deadb 100644 (file)
@@ -145,9 +145,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
                                    -1UL << ((mcqe_n - pos) *
                                             sizeof(uint16_t) * 8) : 0);
 #endif
-
-               if (!(pos & 0x7) && pos + 8 < mcqe_n)
-                       rte_prefetch0((void *)(cq + pos + 8));
+               for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+                       if (likely(pos + i < mcqe_n))
+                               rte_prefetch0((void *)(cq + pos + i));
                __asm__ volatile (
                /* A.1 load mCQEs into a 128bit register. */
                "ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t"
index a4086df2e901ddc3aee13c4b7c6670136bd5b9af..4b711f0f1f4ea77488ad0845df63d007a6504116 100644 (file)
@@ -132,8 +132,10 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
                __m128i byte_cnt, invalid_mask;
 #endif
 
-               if (!(pos & 0x7) && pos + 8 < mcqe_n)
-                       rte_prefetch0((void *)(cq + pos + 8));
+               for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+                       if (likely(pos + i < mcqe_n))
+                               rte_prefetch0((void *)(cq + pos + i));
+
                /* A.1 load mCQEs into a 128bit register. */
                mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
                mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);