]> git.droids-corp.org - dpdk.git/commitdiff
net/bnxt: reduce barriers in NEON vector Rx
authorRuifeng Wang <ruifeng.wang@arm.com>
Mon, 13 Jun 2022 06:22:25 +0000 (14:22 +0800)
committerAjit Khaparde <ajit.khaparde@broadcom.com>
Fri, 24 Jun 2022 21:22:09 +0000 (23:22 +0200)
To read descriptors in expected order, barriers are inserted after each
descriptor read. The excessive use of barriers is unnecessary and could
cause performance drop.

Removed barriers between descriptor reads. And changed counting of valid
packets so as to handle discontinuous valid packets. Because out of
order read could lead to valid descriptors that fetched being
discontinuous.

In VPP L3 routing test, 6% performance gain was observed. The test was
done on a platform with ThunderX2 CPU and Broadcom PS225 NIC.

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
drivers/net/bnxt/bnxt_rxtx_vec_neon.c

index 32f8e59b3a5cc9c68bd629a4558a624edeac6908..6a4ece681be85a60cbe4bc1a5ef982d728f95c0d 100644 (file)
@@ -235,34 +235,32 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
                 * IO barriers are used to ensure consistent state.
                 */
                rxcmp1[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 7]);
-               rte_io_rmb();
+               rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
+               rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
+               rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
+
+               /* Use acquire fence to order loads of descriptor words. */
+               rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
                /* Reload lower 64b of descriptors to make it ordered after info3_v. */
                rxcmp1[3] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 7],
                                vreinterpretq_u64_u32(rxcmp1[3]), 0));
-               rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
-
-               rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
-               rte_io_rmb();
                rxcmp1[2] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 5],
                                vreinterpretq_u64_u32(rxcmp1[2]), 0));
-               rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
-
-               t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
-
-               rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
-               rte_io_rmb();
                rxcmp1[1] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 3],
                                vreinterpretq_u64_u32(rxcmp1[1]), 0));
-               rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
-
-               rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
-               rte_io_rmb();
                rxcmp1[0] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 1],
                                vreinterpretq_u64_u32(rxcmp1[0]), 0));
+
+               rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
+               rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
+
+               t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
+
+               rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
                rxcmp[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 0]);
 
                t0 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[0], rxcmp1[1]));
@@ -278,16 +276,21 @@ recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
                 * bits and count the number of set bits in order to determine
                 * the number of valid descriptors.
                 */
-               valid = vget_lane_u64(vreinterpret_u64_u16(vqmovn_u32(info3_v)),
-                                     0);
+               valid = vget_lane_u64(vreinterpret_u64_s16(vshr_n_s16
+                               (vreinterpret_s16_u16(vshl_n_u16
+                               (vqmovn_u32(info3_v), 15)), 15)), 0);
+
                /*
                 * At this point, 'valid' is a 64-bit value containing four
-                * 16-bit fields, each of which is either 0x0001 or 0x0000.
-                * Compute number of valid descriptors from the index of
-                * the highest non-zero field.
+                * 16-bit fields, each of which is either 0xffff or 0x0000.
+                * Count the number of consecutive 1s from LSB in order to
+                * determine the number of valid descriptors.
                 */
-               num_valid = (sizeof(uint64_t) / sizeof(uint16_t)) -
-                               (__builtin_clzl(valid & desc_valid_mask) / 16);
+               valid = ~(valid & desc_valid_mask);
+               if (valid == 0)
+                       num_valid = 4;
+               else
+                       num_valid = __builtin_ctzl(valid) / 16;
 
                if (num_valid == 0)
                        break;