From 319df9f9bf1ad4a854e1e8b9fe087580909b8263 Mon Sep 17 00:00:00 2001 From: Feifei Wang Date: Fri, 23 Jul 2021 11:10:49 +0800 Subject: [PATCH] net/i40e: reduce L1 cache misses in NEON Rx On the N1 platform, the packet mbuf and descs loads are hot spots that limit the performance of the "desc_to_ptype_v" and "desc_to_olflags_v" functions in the i40e Rx NEON path. This is because the packet mbuf and descs are evicted from the l1d-cache to the l2d-cache. To reduce l1d-cache misses and improve performance, change the code order and move the "desc_to_ptype_v" and "desc_to_olflags_v" functions forward to the location where the packet mbuf and descs have just been loaded. Test Result: dpdk:21.08-rc1 gcc-9 For n1sdp, the patch improves the performance by 1.8%. For thunderx2, there is no performance change. Signed-off-by: Feifei Wang Reviewed-by: Ruifeng Wang --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index 8f3188e910..b2683fda60 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -301,18 +301,6 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq, rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } - /* C.1 4=>2 filter staterr info only */ - sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]), - vreinterpretq_u16_u64(descs[3])); - sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]), - vreinterpretq_u16_u64(descs[2])); - - /* C.2 get 4 pkts staterr value */ - staterr = vzipq_u16(sterr_tmp1.val[1], - sterr_tmp2.val[1]).val[0]; - - desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); - /* pkts shift the pktlen field to be 16-bit aligned*/ uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]), len_shl); @@ -367,10 +355,22 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq, desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl); + desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); + if 
(likely(pos + RTE_I40E_DESCS_PER_LOOP < nb_pkts)) { rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP); } + /* C.1 4=>2 filter staterr info only */ + sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]), + vreinterpretq_u16_u64(descs[3])); + sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]), + vreinterpretq_u16_u64(descs[2])); + + /* C.2 get 4 pkts staterr value */ + staterr = vzipq_u16(sterr_tmp1.val[1], + sterr_tmp2.val[1]).val[0]; + /* C* extract and record EOP bit */ if (split_packet) { uint8x16_t eop_shuf_mask = { -- 2.20.1