From: Alexander Kozyrev Date: Wed, 22 Jul 2020 20:32:38 +0000 (+0000) Subject: net/mlx5: fix vectorized mini-CQE prefetching X-Git-Url: http://git.droids-corp.org/?p=dpdk.git;a=commitdiff_plain;h=6f52bd338374e4bd54ff1f872928e53ebdf1108d net/mlx5: fix vectorized mini-CQE prefetching Previous optimization work added prefetching of all the CQEs before their invalidation. It allowed us to speed up the mini-CQE decompression process by preheating the cache in the vectorized Rx routine. Prefetching of the next mini-CQE, on the other hand, showed no performance difference on the x86 platform, so it was removed. Unfortunately, this caused a performance drop on ARM. Prefetch the mini-CQE as well as all the soon-to-be-invalidated CQEs to get both the CQE and the mini-CQE on the hot path. Fixes: 28a4b96321a3 ("net/mlx5: prefetch CQEs for a faster decompression") Cc: stable@dpdk.org Signed-off-by: Alexander Kozyrev Acked-by: Viacheslav Ovsiienko --- diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h index f5414eebad..cb4ce1a099 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h @@ -158,7 +158,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) if (likely(pos + i < mcqe_n)) rte_prefetch0((void *)(cq + pos + i)); - /* A.1 load mCQEs into a 128bit register. */ mcqe1 = (vector unsigned char)vec_vsx_ld(0, (signed int const *)&mcq[pos % 8]); @@ -287,6 +286,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, pos += MLX5_VPMD_DESCS_PER_LOOP; /* Move to next CQE and invalidate consumed CQEs. 
*/ if (!(pos & 0x7) && pos < mcqe_n) { + if (pos + 8 < mcqe_n) + rte_prefetch0((void *)(cq + pos + 8)); mcq = (void *)&(cq + pos)->pkt_info; for (i = 0; i < 8; ++i) cq[inv++].op_own = MLX5_CQE_INVALIDATE; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index 555c342626..6c3149523e 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -145,6 +145,7 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, -1UL << ((mcqe_n - pos) * sizeof(uint16_t) * 8) : 0); #endif + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) if (likely(pos + i < mcqe_n)) rte_prefetch0((void *)(cq + pos + i)); @@ -227,6 +228,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, pos += MLX5_VPMD_DESCS_PER_LOOP; /* Move to next CQE and invalidate consumed CQEs. */ if (!(pos & 0x7) && pos < mcqe_n) { + if (pos + 8 < mcqe_n) + rte_prefetch0((void *)(cq + pos + 8)); mcq = (void *)&(cq + pos)->pkt_info; for (i = 0; i < 8; ++i) cq[inv++].op_own = MLX5_CQE_INVALIDATE; diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h index 34e3397115..554924d7fc 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h @@ -135,7 +135,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) if (likely(pos + i < mcqe_n)) rte_prefetch0((void *)(cq + pos + i)); - /* A.1 load mCQEs into a 128bit register. */ mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]); mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]); @@ -214,6 +213,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, pos += MLX5_VPMD_DESCS_PER_LOOP; /* Move to next CQE and invalidate consumed CQEs. 
*/ if (!(pos & 0x7) && pos < mcqe_n) { + if (pos + 8 < mcqe_n) + rte_prefetch0((void *)(cq + pos + 8)); mcq = (void *)(cq + pos); for (i = 0; i < 8; ++i) cq[inv++].op_own = MLX5_CQE_INVALIDATE;