event/cnxk: improve Rx performance
authorPavan Nikhilesh <pbhagavatula@marvell.com>
Thu, 10 Feb 2022 13:20:46 +0000 (18:50 +0530)
committerJerin Jacob <jerinj@marvell.com>
Mon, 14 Feb 2022 15:23:33 +0000 (16:23 +0100)
Improve vWQE and CQ Rx performance by tuning prefetches to 64B
cacheline size.
Also, prefetch the vWQE array offsets at cacheline boundaries.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
drivers/event/cnxk/cn10k_worker.h
drivers/net/cnxk/cn10k_rx.h
drivers/net/cnxk/cn9k_rx.h

index ada230e..cfe729c 100644 (file)
@@ -118,11 +118,17 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
        uint8_t loff = 0;
        uint64_t sa_base;
        uint64_t **wqe;
        uint8_t loff = 0;
        uint64_t sa_base;
        uint64_t **wqe;
+       int i;
 
        mbuf_init |= ((uint64_t)port_id) << 48;
        vec = (struct rte_event_vector *)vwqe;
        wqe = vec->u64s;
 
 
        mbuf_init |= ((uint64_t)port_id) << 48;
        vec = (struct rte_event_vector *)vwqe;
        wqe = vec->u64s;
 
+       rte_prefetch_non_temporal(&vec->ptrs[0]);
+#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
+       for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
+               rte_prefetch_non_temporal(&vec->ptrs[i]);
+
        nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
        nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
                                              flags | NIX_RX_VWQE_F, lookup_mem,
        nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
        nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
                                              flags | NIX_RX_VWQE_F, lookup_mem,
@@ -191,15 +197,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
                uint64_t u64[2];
        } gw;
        uint64_t tstamp_ptr;
                uint64_t u64[2];
        } gw;
        uint64_t tstamp_ptr;
-       uint64_t mbuf;
 
        gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
        asm volatile(
                PLT_CPU_FEATURE_PREAMBLE
 
        gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
        asm volatile(
                PLT_CPU_FEATURE_PREAMBLE
-               "caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-               "sub %[mbuf], %H[wdata], #0x80                          \n"
-               : [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
+               "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+               : [wdata] "+r"(gw.get_work)
                : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
                : "memory");
 #else
                : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
                : "memory");
 #else
@@ -208,14 +212,12 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
                roc_load_pair(gw.u64[0], gw.u64[1],
                              ws->base + SSOW_LF_GWS_WQE0);
        } while (gw.u64[0] & BIT_ULL(63));
                roc_load_pair(gw.u64[0], gw.u64[1],
                              ws->base + SSOW_LF_GWS_WQE0);
        } while (gw.u64[0] & BIT_ULL(63));
-       mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
        ws->gw_rdata = gw.u64[0];
 #endif
        ws->gw_rdata = gw.u64[0];
-       gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
-                   (gw.u64[0] & (0x3FFull << 36)) << 4 |
-                   (gw.u64[0] & 0xffffffff);
-
-       if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+       if (gw.u64[1]) {
+               gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+                           (gw.u64[0] & (0x3FFull << 36)) << 4 |
+                           (gw.u64[0] & 0xffffffff);
                if ((flags & CPT_RX_WQE_F) &&
                    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
                     RTE_EVENT_TYPE_CRYPTODEV)) {
                if ((flags & CPT_RX_WQE_F) &&
                    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
                     RTE_EVENT_TYPE_CRYPTODEV)) {
@@ -223,7 +225,10 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
                } else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
                           RTE_EVENT_TYPE_ETHDEV) {
                        uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
                } else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
                           RTE_EVENT_TYPE_ETHDEV) {
                        uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+                       uint64_t mbuf;
 
 
+                       mbuf = gw.u64[1] - sizeof(struct rte_mbuf);
+                       rte_prefetch0((void *)mbuf);
                        if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
                                struct rte_mbuf *m;
                                uintptr_t sa_base;
                        if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
                                struct rte_mbuf *m;
                                uintptr_t sa_base;
index 8b00fcc..564e50f 100644 (file)
@@ -610,10 +610,10 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
                }
 
                /* Prefetch N desc ahead */
                }
 
                /* Prefetch N desc ahead */
-               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
-               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
-               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
-               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
+               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+               rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
 
                /* Get NIX_RX_SG_S for size and buffer pointer */
                cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
 
                /* Get NIX_RX_SG_S for size and buffer pointer */
                cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
index 1178f95..d36f292 100644 (file)
@@ -388,16 +388,16 @@ skip_parse:
                ol_flags =
                        nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
 
                ol_flags =
                        nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
 
-       mbuf->pkt_len = len;
-       mbuf->data_len = len;
-       *(uint64_t *)(&mbuf->rearm_data) = val;
-
        mbuf->ol_flags = ol_flags;
        mbuf->ol_flags = ol_flags;
+       *(uint64_t *)(&mbuf->rearm_data) = val;
+       mbuf->pkt_len = len;
 
 
-       if (flag & NIX_RX_MULTI_SEG_F)
+       if (flag & NIX_RX_MULTI_SEG_F) {
                nix_cqe_xtract_mseg(rx, mbuf, val, flag);
                nix_cqe_xtract_mseg(rx, mbuf, val, flag);
-       else
+       } else {
+               mbuf->data_len = len;
                mbuf->next = NULL;
                mbuf->next = NULL;
+       }
 }
 
 static inline uint16_t
 }
 
 static inline uint16_t
@@ -769,10 +769,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
                vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
                vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
                vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
                vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
-               /* Store the mbufs to rx_pkts */
-               vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-               vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
-
                if (flags & NIX_RX_MULTI_SEG_F) {
                        /* Multi segment is enable build mseg list for
                         * individual mbufs in scalar mode.
                if (flags & NIX_RX_MULTI_SEG_F) {
                        /* Multi segment is enable build mseg list for
                         * individual mbufs in scalar mode.
@@ -797,6 +793,10 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
                        mbuf3->next = NULL;
                }
 
                        mbuf3->next = NULL;
                }
 
+               /* Store the mbufs to rx_pkts */
+               vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
+               vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+
                /* Prefetch mbufs */
                roc_prefetch_store_keep(mbuf0);
                roc_prefetch_store_keep(mbuf1);
                /* Prefetch mbufs */
                roc_prefetch_store_keep(mbuf0);
                roc_prefetch_store_keep(mbuf1);