From: Pavan Nikhilesh Date: Wed, 14 Jul 2021 09:02:07 +0000 (+0530) Subject: event/cnxk: support vectorized Tx event fast path X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=761a321acf9111132f7789ac374e08a7ea7ea0f7;p=dpdk.git event/cnxk: support vectorized Tx event fast path Add Tx event vector fastpath, integrate event vector Tx routine into Tx burst. Signed-off-by: Pavan Nikhilesh Acked-by: Jerin Jacob --- diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index def91649e4..adcf1bf56f 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -122,7 +122,7 @@ New Features * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested is net/cnxk. - * Added support for event vectorization for Rx adapter. + * Added support for event vectorization for Rx/Tx adapter. * **Added cppc_cpufreq support to Power Management library.** diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h index a6030e7d8a..b28f6089cc 100644 --- a/drivers/common/cnxk/roc_sso.h +++ b/drivers/common/cnxk/roc_sso.h @@ -44,6 +44,29 @@ struct roc_sso { uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned; } __plt_cache_aligned; +static __plt_always_inline void +roc_sso_hws_head_wait(uintptr_t tag_op) +{ +#ifdef RTE_ARCH_ARM64 + uint64_t tag; + + asm volatile(PLT_CPU_FEATURE_PREAMBLE + " ldr %[tag], [%[tag_op]] \n" + " tbnz %[tag], 35, done%= \n" + " sevl \n" + "rty%=: wfe \n" + " ldr %[tag], [%[tag_op]] \n" + " tbz %[tag], 35, rty%= \n" + "done%=: \n" + : [tag] "=&r"(tag) + : [tag_op] "r"(tag_op)); +#else + /* Wait for the SWTAG/SWTAG_FULL operation */ + while (!(plt_read64(tag_op) & BIT_ULL(35))) + ; +#endif +} + /* SSO device initialization */ int __roc_api roc_sso_dev_init(struct roc_sso *roc_sso); int __roc_api roc_sso_dev_fini(struct roc_sso *roc_sso); diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c index e85fa4785d..6f37c5bd23 100644 --- a/drivers/event/cnxk/cn10k_eventdev.c +++ b/drivers/event/cnxk/cn10k_eventdev.c @@ -782,7 +782,8 @@ cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev, if (ret) *caps = 0; else - *caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT; + *caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT | + RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR; return 0; } diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h index 7a48a6b17d..9cc0992063 100644 --- a/drivers/event/cnxk/cn10k_worker.h +++ b/drivers/event/cnxk/cn10k_worker.h @@ -308,29 +308,120 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port, NIX_RX_FASTPATH_MODES #undef R -static __rte_always_inline const struct cn10k_eth_txq * +static __rte_always_inline struct cn10k_eth_txq * cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT]) { - return (const struct cn10k_eth_txq *) + return (struct cn10k_eth_txq *) txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)]; } +static __rte_always_inline void +cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs, + uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr, + uint8_t sched_type, uintptr_t base, + const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT], + const uint32_t flags) +{ + uint16_t port[4], queue[4]; + struct cn10k_eth_txq *txq; + uint16_t i, j; + uintptr_t pa; + + for (i = 0; i < nb_mbufs; i += 4) { + port[0] = mbufs[i]->port; + port[1] = mbufs[i + 1]->port; + port[2] = mbufs[i + 2]->port; + port[3] = mbufs[i + 3]->port; + 
+ queue[0] = rte_event_eth_tx_adapter_txq_get(mbufs[i]); + queue[1] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 1]); + queue[2] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 2]); + queue[3] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 3]); + + if (((port[0] ^ port[1]) & (port[2] ^ port[3])) || + ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) { + + for (j = 0; j < 4; j++) { + struct rte_mbuf *m = mbufs[i + j]; + + txq = (struct cn10k_eth_txq *) + txq_data[port[j]][queue[j]]; + cn10k_nix_tx_skeleton(txq, cmd, flags); + /* Perform header writes before barrier + * for TSO + */ + if (flags & NIX_TX_OFFLOAD_TSO_F) + cn10k_nix_xmit_prepare_tso(m, flags); + + cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags, + txq->lso_tun_fmt); + if (flags & NIX_TX_MULTI_SEG_F) { + const uint16_t segdw = + cn10k_nix_prepare_mseg( + m, (uint64_t *)lmt_addr, + flags); + pa = txq->io_addr | ((segdw - 1) << 4); + } else { + pa = txq->io_addr | + (cn10k_nix_tx_ext_subs(flags) + 1) + << 4; + } + if (!sched_type) + roc_sso_hws_head_wait(base + + SSOW_LF_GWS_TAG); + + roc_lmt_submit_steorl(lmt_id, pa); + } + } else { + txq = (struct cn10k_eth_txq *) + txq_data[port[0]][queue[0]]; + cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd, base + + SSOW_LF_GWS_TAG, + flags | NIX_TX_VWQE_F); + } + } +} + static __rte_always_inline uint16_t cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev, uint64_t *cmd, const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT], const uint32_t flags) { - const struct cn10k_eth_txq *txq; - struct rte_mbuf *m = ev->mbuf; - uint16_t ref_cnt = m->refcnt; + struct cn10k_eth_txq *txq; + struct rte_mbuf *m; uintptr_t lmt_addr; + uint16_t ref_cnt; uint16_t lmt_id; uintptr_t pa; lmt_addr = ws->lmt_base; ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id); + + if (ev->event_type & RTE_EVENT_TYPE_VECTOR) { + struct rte_mbuf **mbufs = ev->vec->mbufs; + uint64_t meta = *(uint64_t *)ev->vec; + + if (meta & BIT(31)) { + txq = (struct cn10k_eth_txq *) + txq_data[meta >> 32][meta >> 48]; + + cn10k_nix_xmit_pkts_vector( + txq, mbufs, meta & 0xFFFF, cmd, + ws->tx_base + SSOW_LF_GWS_TAG, + flags | NIX_TX_VWQE_F); + } else { + cn10k_sso_vwqe_split_tx( + mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr, + ev->sched_type, ws->tx_base, txq_data, flags); + } + rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec); + return (meta & 0xFFFF); + } + + m = ev->mbuf; + ref_cnt = m->refcnt; txq = cn10k_sso_hws_xtract_meta(m, txq_data); cn10k_nix_tx_skeleton(txq, cmd, flags); /* Perform header writes before barrier for TSO */ @@ -346,7 +437,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev, pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4; } if (!ev->sched_type) - cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG); + roc_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG); roc_lmt_submit_steorl(lmt_id, pa); @@ -357,7 +448,6 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev, cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG, ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH); - return 1; } diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h index 3f9751211a..cc1e141957 100644 --- a/drivers/event/cnxk/cn9k_worker.h +++ b/drivers/event/cnxk/cn9k_worker.h @@ -466,7 +466,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd, const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags); if (!CNXK_TT_FROM_EVENT(ev->event)) { cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw); - cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG); + 
roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG); cn9k_sso_txq_fc_wait(txq); if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0) cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr, @@ -478,7 +478,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd, } else { if (!CNXK_TT_FROM_EVENT(ev->event)) { cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags); - cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG); + roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG); cn9k_sso_txq_fc_wait(txq); if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0) cn9k_nix_xmit_one(cmd, txq->lmt_addr, diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h index 7891b749df..9f9ceab8a1 100644 --- a/drivers/event/cnxk/cnxk_worker.h +++ b/drivers/event/cnxk/cnxk_worker.h @@ -75,26 +75,4 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op) #endif } -static __rte_always_inline void -cnxk_sso_hws_head_wait(uintptr_t tag_op) -{ -#ifdef RTE_ARCH_ARM64 - uint64_t tag; - - asm volatile(" ldr %[tag], [%[tag_op]] \n" - " tbnz %[tag], 35, done%= \n" - " sevl \n" - "rty%=: wfe \n" - " ldr %[tag], [%[tag_op]] \n" - " tbz %[tag], 35, rty%= \n" - "done%=: \n" - : [tag] "=&r"(tag) - : [tag_op] "r"(tag_op)); -#else - /* Wait for the HEAD to be set */ - while (!(plt_read64(tag_op) & BIT_ULL(35))) - ; -#endif -} - #endif diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c index 1f30bab59a..0e1276c60b 100644 --- a/drivers/net/cnxk/cn10k_tx.c +++ b/drivers/net/cnxk/cn10k_tx.c @@ -16,7 +16,7 @@ !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \ return 0; \ return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, \ - flags); \ + 0, flags); \ } NIX_TX_FASTPATH_MODES diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h index eb148b8e77..f75cae07ae 100644 --- a/drivers/net/cnxk/cn10k_tx.h +++ b/drivers/net/cnxk/cn10k_tx.h @@ -18,6 +18,7 @@ * Defining it from backwards to denote its been * not used as offload flags to pick function */ +#define NIX_TX_VWQE_F BIT(14) #define NIX_TX_MULTI_SEG_F BIT(15) #define NIX_TX_NEED_SEND_HDR_W1 \ @@ -519,7 +520,7 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags) static __rte_always_inline uint16_t cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts, - uint64_t *cmd, const uint16_t flags) + uint64_t *cmd, uintptr_t base, const uint16_t flags) { struct cn10k_eth_txq *txq = tx_queue; const rte_iova_t io_addr = txq->io_addr; @@ -528,14 +529,15 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts, uint64_t lso_tun_fmt; uint64_t data; - NIX_XMIT_FC_OR_RETURN(txq, pkts); + if (!(flags & NIX_TX_VWQE_F)) { + NIX_XMIT_FC_OR_RETURN(txq, pkts); + /* Reduce the cached count */ + txq->fc_cache_pkts -= pkts; + } /* Get cmd skeleton */ cn10k_nix_tx_skeleton(txq, cmd, flags); - /* Reduce the cached count */ - txq->fc_cache_pkts -= pkts; - if (flags & NIX_TX_OFFLOAD_TSO_F) lso_tun_fmt = txq->lso_tun_fmt; @@ -558,6 +560,9 @@ again: lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2); } + if (flags & NIX_TX_VWQE_F) + roc_sso_hws_head_wait(base); + /* Trigger LMTST */ if (burst > 16) { data = cn10k_nix_tx_steor_data(flags); @@ -604,7 +609,8 @@ again: static __rte_always_inline uint16_t cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t pkts, uint64_t *cmd, const uint16_t flags) + uint16_t pkts, uint64_t *cmd, uintptr_t base, + const uint16_t flags) { struct cn10k_eth_txq *txq = tx_queue; uintptr_t pa0, pa1, lmt_addr = txq->lmt_base; @@ -652,6 +658,9 @@ again: shft += 3; } + if (flags & 
NIX_TX_VWQE_F) + roc_sso_hws_head_wait(base); + data0 = (uint64_t)data128; data1 = (uint64_t)(data128 >> 64); /* Make data0 similar to data1 */ @@ -984,7 +993,8 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0, static __rte_always_inline uint16_t cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t pkts, uint64_t *cmd, const uint16_t flags) + uint16_t pkts, uint64_t *cmd, uintptr_t base, + const uint16_t flags) { uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3; uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3; @@ -1013,13 +1023,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts, uint64_t data[2]; } wd; - NIX_XMIT_FC_OR_RETURN(txq, pkts); - - scalar = pkts & (NIX_DESCS_PER_LOOP - 1); - pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP); + if (!(flags & NIX_TX_VWQE_F)) { + NIX_XMIT_FC_OR_RETURN(txq, pkts); + scalar = pkts & (NIX_DESCS_PER_LOOP - 1); + pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP); + /* Reduce the cached count */ + txq->fc_cache_pkts -= pkts; + } else { + scalar = pkts & (NIX_DESCS_PER_LOOP - 1); + pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP); + } - /* Reduce the cached count */ - txq->fc_cache_pkts -= pkts; /* Perform header writes before barrier for TSO */ if (flags & NIX_TX_OFFLOAD_TSO_F) { for (i = 0; i < pkts; i++) @@ -1973,6 +1987,9 @@ again: if (flags & NIX_TX_MULTI_SEG_F) wd.data[0] >>= 16; + if (flags & NIX_TX_VWQE_F) + roc_sso_hws_head_wait(base); + /* Trigger LMTST */ if (lnum > 16) { if (!(flags & NIX_TX_MULTI_SEG_F)) @@ -2029,10 +2046,11 @@ again: if (unlikely(scalar)) { if (flags & NIX_TX_MULTI_SEG_F) pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, - scalar, cmd, flags); + scalar, cmd, base, + flags); else pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar, - cmd, flags); + cmd, base, flags); } return pkts; @@ -2041,13 +2059,15 @@ again: #else static __rte_always_inline uint16_t cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t pkts, uint64_t *cmd, const uint16_t flags) + uint16_t pkts, uint64_t *cmd, uintptr_t base, + const uint16_t flags) { RTE_SET_USED(tx_queue); RTE_SET_USED(tx_pkts); RTE_SET_USED(pkts); RTE_SET_USED(cmd); RTE_SET_USED(flags); + RTE_SET_USED(base); return 0; } #endif diff --git a/drivers/net/cnxk/cn10k_tx_mseg.c b/drivers/net/cnxk/cn10k_tx_mseg.c index 33f6754722..4ea4c8a4e5 100644 --- a/drivers/net/cnxk/cn10k_tx_mseg.c +++ b/drivers/net/cnxk/cn10k_tx_mseg.c @@ -18,7 +18,8 @@ !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \ return 0; \ return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd, \ - (flags) | NIX_TX_MULTI_SEG_F); \ + 0, (flags) \ + | NIX_TX_MULTI_SEG_F); \ } NIX_TX_FASTPATH_MODES diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c index 34e3737501..a0350496ab 100644 --- a/drivers/net/cnxk/cn10k_tx_vec.c +++ b/drivers/net/cnxk/cn10k_tx_vec.c @@ -18,7 +18,7 @@ !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \ return 0; \ return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\ - (flags)); \ + 0, (flags)); \ } NIX_TX_FASTPATH_MODES diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c index 1fad81dbad..7f98f79b97 100644 --- a/drivers/net/cnxk/cn10k_tx_vec_mseg.c +++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c @@ -16,7 +16,7 @@ !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \ return 0; \ return cn10k_nix_xmit_pkts_vector( \ - tx_queue, tx_pkts, pkts, cmd, \ + tx_queue, tx_pkts, pkts, cmd, 0, \ (flags) | 
NIX_TX_MULTI_SEG_F);                         \
 }
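
Below is an illustrative sketch (not part of the patch) of how an application might consume the Tx adapter event vector capability this commit advertises for cn10k. The helper name enqueue_tx_vector, the device/port identifiers, and the caller-provided vec (with nb_elem and mbufs[] already filled) are assumptions made for the example; only the capability check and the attr_valid/port/queue handling mirror what the driver fast path above expects.

#include <rte_eventdev.h>
#include <rte_event_eth_tx_adapter.h>

/* Hypothetical helper: enqueue one event vector to the Tx adapter when the
 * eventdev reports RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR.
 */
static int
enqueue_tx_vector(uint8_t evdev_id, uint8_t ev_port, uint16_t eth_port,
		  uint16_t txq_id, struct rte_event_vector *vec)
{
	struct rte_event ev = {0};
	uint32_t caps = 0;

	if (rte_event_eth_tx_adapter_caps_get(evdev_id, eth_port, &caps) < 0)
		return -1;
	if (!(caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR))
		return -1; /* fall back to one Tx event per mbuf */

	/* All mbufs in this vector target a single port/queue pair, so the
	 * driver can take the non-split cn10k_nix_xmit_pkts_vector() path.
	 */
	vec->attr_valid = 1;
	vec->port = eth_port;
	vec->queue = txq_id;

	ev.event_type = RTE_EVENT_TYPE_CPU_VECTOR;
	ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
	ev.vec = vec;

	return rte_event_eth_tx_adapter_enqueue(evdev_id, ev_port, &ev, 1, 0);
}

If attr_valid is left clear, the driver instead walks the vector in groups of four and resolves each mbuf's port/queue via rte_event_eth_tx_adapter_txq_get(), i.e. the cn10k_sso_vwqe_split_tx() path added above.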