From: Pavan Nikhilesh Date: Thu, 24 Feb 2022 16:10:12 +0000 (+0530) Subject: net/cnxk: align prefetches to CN10K cache model X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=592642c494b159abf21b175a32d9621ed7b8a308;p=dpdk.git net/cnxk: align prefetches to CN10K cache model Align prefetches for CN10K cache model for vWQE in Rx and Tx. Move mbuf->next NULL assignment to Tx path and enabled it only when multi segments offload is enabled to reduce L1 pressure. Add macros to detect corrupted mbuf->next values when MEMPOOL_DEBUG is set. Signed-off-by: Pavan Nikhilesh Acked-by: Jerin Jacob --- diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h index 0226896d44..0d0685e28e 100644 --- a/drivers/event/cnxk/cn10k_worker.h +++ b/drivers/event/cnxk/cn10k_worker.h @@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags, uint64_t aura_handle, laddr; uint16_t nb_mbufs, non_vec; uint16_t lmt_id, d_off; + struct rte_mbuf **wqe; struct rte_mbuf *mbuf; uint8_t loff = 0; uint64_t sa_base; - uint64_t **wqe; int i; mbuf_init |= ((uint64_t)port_id) << 48; vec = (struct rte_event_vector *)vwqe; - wqe = vec->u64s; + wqe = vec->mbufs; - rte_prefetch_non_temporal(&vec->ptrs[0]); + rte_prefetch0(&vec->ptrs[0]); #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *)) for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE) - rte_prefetch_non_temporal(&vec->ptrs[i]); + rte_prefetch0(&vec->ptrs[i]); nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP); - nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs, + nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs, flags | NIX_RX_VWQE_F, lookup_mem, tstamp, lbase); wqe += nb_mbufs; @@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags, cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp, flags & NIX_RX_OFFLOAD_TSTAMP_F, (uint64_t *)tstamp_ptr); - wqe[0] = (uint64_t *)mbuf; + wqe[0] = (struct rte_mbuf *)mbuf; non_vec--; wqe++; } @@ -620,6 +620,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev, ev->sched_type, txq_data, flags); } rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec); + rte_prefetch0(ws); return (meta & 0xFFFF); } diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h index 236a1dca6e..de5e41483b 100644 --- a/drivers/net/cnxk/cn10k_rx.h +++ b/drivers/net/cnxk/cn10k_rx.h @@ -36,6 +36,27 @@ (((f) & NIX_RX_VWQE_F) ? \ (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) : \ (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o))) +#define CQE_PTR_DIFF(b, i, o, f) \ + (((f) & NIX_RX_VWQE_F) ? \ + (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) : \ + (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o))) + +#ifdef RTE_LIBRTE_MEMPOOL_DEBUG +static inline void +nix_mbuf_validate_next(struct rte_mbuf *m) +{ + if (m->nb_segs == 1 && m->next) { + rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d", + m->next, m->nb_segs); + } +} +#else +static inline void +nix_mbuf_validate_next(struct rte_mbuf *m) +{ + RTE_SET_USED(m); +} +#endif union mbuf_initializer { struct { @@ -674,17 +695,66 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts, cq0 = (uintptr_t)&mbufs[packets]; } - /* Prefetch N desc ahead */ - rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags)); - rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags)); - rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags)); - rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags)); - - /* Get NIX_RX_SG_S for size and buffer pointer */ - cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags)); - cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags)); - cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags)); - cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags)); + if (flags & NIX_RX_VWQE_F) { + if (pkts - packets > 4) { + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, + 4, 0, flags)); + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, + 5, 0, flags)); + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, + 6, 0, flags)); + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, + 7, 0, flags)); + + if (likely(pkts - packets > 8)) { + rte_prefetch1(CQE_PTR_OFF(cq0, + 8, 0, flags)); + rte_prefetch1(CQE_PTR_OFF(cq0, + 9, 0, flags)); + rte_prefetch1(CQE_PTR_OFF(cq0, + 10, 0, flags)); + rte_prefetch1(CQE_PTR_OFF(cq0, + 11, 0, flags)); + if (pkts - packets > 12) { + rte_prefetch1(CQE_PTR_OFF(cq0, + 12, 0, flags)); + rte_prefetch1(CQE_PTR_OFF(cq0, + 13, 0, flags)); + rte_prefetch1(CQE_PTR_OFF(cq0, + 14, 0, flags)); + rte_prefetch1(CQE_PTR_OFF(cq0, + 15, 0, flags)); + } + } + + rte_prefetch0(CQE_PTR_DIFF(cq0, + 4, RTE_PKTMBUF_HEADROOM, flags)); + rte_prefetch0(CQE_PTR_DIFF(cq0, + 5, RTE_PKTMBUF_HEADROOM, flags)); + rte_prefetch0(CQE_PTR_DIFF(cq0, + 6, RTE_PKTMBUF_HEADROOM, flags)); + rte_prefetch0(CQE_PTR_DIFF(cq0, + 7, RTE_PKTMBUF_HEADROOM, flags)); + + if (likely(pkts - packets > 8)) { + rte_prefetch0(CQE_PTR_DIFF(cq0, + 8, RTE_PKTMBUF_HEADROOM, flags)); + rte_prefetch0(CQE_PTR_DIFF(cq0, + 9, RTE_PKTMBUF_HEADROOM, flags)); + rte_prefetch0(CQE_PTR_DIFF(cq0, + 10, RTE_PKTMBUF_HEADROOM, flags)); + rte_prefetch0(CQE_PTR_DIFF(cq0, + 11, RTE_PKTMBUF_HEADROOM, flags)); + } + } + } else { + if (pkts - packets > 4) { + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags)); + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags)); + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags)); + rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags)); + } + } if (!(flags & NIX_RX_VWQE_F)) { /* Get NIX_RX_SG_S for size and buffer pointer */ @@ -995,19 +1065,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts, nix_cqe_xtract_mseg((union nix_rx_parse_u *) (CQE_PTR_OFF(cq0, 3, 8, flags)), mbuf3, mbuf_initializer, flags); - } else { - /* Update that no more segments */ - mbuf0->next = NULL; - mbuf1->next = NULL; - mbuf2->next = NULL; - mbuf3->next = NULL; } - /* Prefetch mbufs */ - roc_prefetch_store_keep(mbuf0); - roc_prefetch_store_keep(mbuf1); - roc_prefetch_store_keep(mbuf2); - roc_prefetch_store_keep(mbuf3); + /* Mark mempool obj as "get" as it is alloc'ed by NIX */ + RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1); + RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1); + RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1); + RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1); + + nix_mbuf_validate_next(mbuf0); + nix_mbuf_validate_next(mbuf1); + nix_mbuf_validate_next(mbuf2); + nix_mbuf_validate_next(mbuf3); packets += NIX_DESCS_PER_LOOP; diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h index ec6366168c..695e3ed354 100644 --- a/drivers/net/cnxk/cn10k_tx.h +++ b/drivers/net/cnxk/cn10k_tx.h @@ -2569,6 +2569,13 @@ again: lnum += 1; } + if (flags & NIX_TX_MULTI_SEG_F) { + tx_pkts[0]->next = NULL; + tx_pkts[1]->next = NULL; + tx_pkts[2]->next = NULL; + tx_pkts[3]->next = NULL; + } + tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP; }