From: Pavan Nikhilesh
Date: Fri, 22 Nov 2019 15:44:27 +0000 (+0530)
Subject: event/octeontx2: use opposite bucket for current chunk
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=6a697cf00736e744b12634a936124504ccffc03e;p=dpdk.git

event/octeontx2: use opposite bucket for current chunk

Since TIM buckets are always aligned to 32B and our cache line size is
128B, we will always have a cache miss when reading the current_chunk
pointer. Avoid the cache miss by storing the current_chunk pointer in
the bucket opposite to the current bucket, i.e. half the ring away.
(A standalone sketch of the mirror-bucket arithmetic follows the diff.)

Signed-off-by: Pavan Nikhilesh
---

diff --git a/drivers/event/octeontx2/otx2_tim_worker.h b/drivers/event/octeontx2/otx2_tim_worker.h
index c896b54338..7b771fbca7 100644
--- a/drivers/event/octeontx2/otx2_tim_worker.h
+++ b/drivers/event/octeontx2/otx2_tim_worker.h
@@ -115,20 +115,29 @@ tim_bkt_clr_nent(struct otx2_tim_bkt *bktp)
 	return __atomic_and_fetch(&bktp->w1, v, __ATOMIC_ACQ_REL);
 }
 
-static __rte_always_inline struct otx2_tim_bkt *
+static __rte_always_inline void
 tim_get_target_bucket(struct otx2_tim_ring * const tim_ring,
-		      const uint32_t rel_bkt, const uint8_t flag)
+		      const uint32_t rel_bkt, struct otx2_tim_bkt **bkt,
+		      struct otx2_tim_bkt **mirr_bkt, const uint8_t flag)
 {
 	const uint64_t bkt_cyc = rte_rdtsc() - tim_ring->ring_start_cyc;
 	uint32_t bucket = rte_reciprocal_divide_u64(bkt_cyc,
						    &tim_ring->fast_div) + rel_bkt;
+	uint32_t mirr_bucket = 0;
 
-	if (flag & OTX2_TIM_BKT_MOD)
+	if (flag & OTX2_TIM_BKT_MOD) {
 		bucket = bucket % tim_ring->nb_bkts;
-	if (flag & OTX2_TIM_BKT_AND)
+		mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) %
+			tim_ring->nb_bkts;
+	}
+	if (flag & OTX2_TIM_BKT_AND) {
 		bucket = bucket & (tim_ring->nb_bkts - 1);
+		mirr_bucket = (bucket + (tim_ring->nb_bkts >> 1)) &
+			(tim_ring->nb_bkts - 1);
+	}
 
-	return &tim_ring->bkt[bucket];
+	*bkt = &tim_ring->bkt[bucket];
+	*mirr_bkt = &tim_ring->bkt[mirr_bucket];
 }
 
 static struct otx2_tim_ent *
@@ -153,6 +162,7 @@ tim_clr_bkt(struct otx2_tim_ring * const tim_ring,
 
 static struct otx2_tim_ent *
 tim_refill_chunk(struct otx2_tim_bkt * const bkt,
+		 struct otx2_tim_bkt * const mirr_bkt,
 		 struct otx2_tim_ring * const tim_ring)
 {
 	struct otx2_tim_ent *chunk;
@@ -162,8 +172,8 @@ tim_refill_chunk(struct otx2_tim_bkt * const bkt,
					     (void **)&chunk)))
			return NULL;
		if (bkt->nb_entry) {
-			*(uint64_t *)(((struct otx2_tim_ent *)(uintptr_t)
-						bkt->current_chunk) +
+			*(uint64_t *)(((struct otx2_tim_ent *)
+						mirr_bkt->current_chunk) +
					tim_ring->nb_chunk_slots) =
				(uintptr_t)chunk;
		} else {
@@ -180,6 +190,7 @@ tim_refill_chunk(struct otx2_tim_bkt * const bkt,
 
 static struct otx2_tim_ent *
 tim_insert_chunk(struct otx2_tim_bkt * const bkt,
+		 struct otx2_tim_bkt * const mirr_bkt,
 		 struct otx2_tim_ring * const tim_ring)
 {
 	struct otx2_tim_ent *chunk;
@@ -190,7 +201,7 @@ tim_insert_chunk(struct otx2_tim_bkt * const bkt,
 	*(uint64_t *)(chunk + tim_ring->nb_chunk_slots) = 0;
 	if (bkt->nb_entry) {
 		*(uint64_t *)(((struct otx2_tim_ent *)(uintptr_t)
-					bkt->current_chunk) +
+					mirr_bkt->current_chunk) +
				tim_ring->nb_chunk_slots) = (uintptr_t)chunk;
 	} else {
 		bkt->first_chunk = (uintptr_t)chunk;
@@ -205,14 +216,15 @@ tim_add_entry_sp(struct otx2_tim_ring * const tim_ring,
		 const struct otx2_tim_ent * const pent,
		 const uint8_t flags)
 {
+	struct otx2_tim_bkt *mirr_bkt;
 	struct otx2_tim_ent *chunk;
 	struct otx2_tim_bkt *bkt;
 	uint64_t lock_sema;
 	int16_t rem;
 
-	bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
-
 __retry:
+	tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
+
 	/* Get Bucket sema*/
 	lock_sema = tim_bkt_fetch_sema_lock(bkt);
 
@@ -232,7 +244,7 @@ __retry:
			     : [hbt] "=&r" (hbt_state)
			     : [w1] "r" ((&bkt->w1))
			     : "memory"
-			     );
+				);
 #else
		do {
			hbt_state = __atomic_load_n(&bkt->w1,
@@ -246,15 +258,14 @@ __retry:
			}
		}
	}
-
 	/* Insert the work. */
	rem = tim_bkt_fetch_rem(lock_sema);
 
	if (!rem) {
		if (flags & OTX2_TIM_ENA_FB)
-			chunk = tim_refill_chunk(bkt, tim_ring);
+			chunk = tim_refill_chunk(bkt, mirr_bkt, tim_ring);
		if (flags & OTX2_TIM_ENA_DFB)
-			chunk = tim_insert_chunk(bkt, tim_ring);
+			chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
 
		if (unlikely(chunk == NULL)) {
			bkt->chunk_remainder = 0;
@@ -264,10 +275,10 @@ __retry:
			tim->state = RTE_EVENT_TIMER_ERROR;
			return -ENOMEM;
		}
-		bkt->current_chunk = (uintptr_t)chunk;
+		mirr_bkt->current_chunk = (uintptr_t)chunk;
		bkt->chunk_remainder = tim_ring->nb_chunk_slots - 1;
	} else {
-		chunk = (struct otx2_tim_ent *)(uintptr_t)bkt->current_chunk;
+		chunk = (struct otx2_tim_ent *)mirr_bkt->current_chunk;
		chunk += tim_ring->nb_chunk_slots - rem;
	}
 
@@ -291,13 +302,14 @@ tim_add_entry_mp(struct otx2_tim_ring * const tim_ring,
		 const struct otx2_tim_ent * const pent,
		 const uint8_t flags)
 {
+	struct otx2_tim_bkt *mirr_bkt;
 	struct otx2_tim_ent *chunk;
 	struct otx2_tim_bkt *bkt;
 	uint64_t lock_sema;
 	int16_t rem;
 
 __retry:
-	bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
+	tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
 
 	/* Get Bucket sema*/
 	lock_sema = tim_bkt_fetch_sema_lock(bkt);
@@ -317,7 +329,7 @@ __retry:
			     : [hbt] "=&r" (hbt_state)
			     : [w1] "r" ((&bkt->w1))
			     : "memory"
-			     );
+				);
 #else
		do {
			hbt_state = __atomic_load_n(&bkt->w1,
@@ -358,9 +370,9 @@ __retry:
	} else if (!rem) {
		/* Only one thread can be here*/
		if (flags & OTX2_TIM_ENA_FB)
-			chunk = tim_refill_chunk(bkt, tim_ring);
+			chunk = tim_refill_chunk(bkt, mirr_bkt, tim_ring);
		if (flags & OTX2_TIM_ENA_DFB)
-			chunk = tim_insert_chunk(bkt, tim_ring);
+			chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
 
		if (unlikely(chunk == NULL)) {
			tim_bkt_set_rem(bkt, 0);
@@ -375,11 +387,11 @@ __retry:
		       (-tim_bkt_fetch_rem(lock_sema)))
			lock_sema = __atomic_load_n(&bkt->w1,
						    __ATOMIC_ACQUIRE);
-		bkt->current_chunk = (uintptr_t)chunk;
+		mirr_bkt->current_chunk = (uintptr_t)chunk;
		__atomic_store_n(&bkt->chunk_remainder,
				 tim_ring->nb_chunk_slots - 1, __ATOMIC_RELEASE);
	} else {
-		chunk = (struct otx2_tim_ent *)bkt->current_chunk;
+		chunk = (struct otx2_tim_ent *)mirr_bkt->current_chunk;
		chunk += tim_ring->nb_chunk_slots - rem;
		*chunk = *pent;
	}
@@ -420,6 +432,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
		   const uint16_t nb_timers, const uint8_t flags)
 {
 	struct otx2_tim_ent *chunk = NULL;
+	struct otx2_tim_bkt *mirr_bkt;
 	struct otx2_tim_bkt *bkt;
 	uint16_t chunk_remainder;
 	uint16_t index = 0;
@@ -428,7 +441,7 @@ tim_add_entry_brst(struct otx2_tim_ring * const tim_ring,
	uint8_t lock_cnt;
 
 __retry:
-	bkt = tim_get_target_bucket(tim_ring, rel_bkt, flags);
+	tim_get_target_bucket(tim_ring, rel_bkt, &bkt, &mirr_bkt, flags);
 
	/* Only one thread beyond this. */
	lock_sema = tim_bkt_inc_lock(bkt);
@@ -477,7 +490,7 @@ __retry:
		crem = tim_ring->nb_chunk_slots - chunk_remainder;
		if (chunk_remainder && crem) {
			chunk = ((struct otx2_tim_ent *)
-				 (uintptr_t)bkt->current_chunk) + crem;
+				 mirr_bkt->current_chunk) + crem;
 
			index = tim_cpy_wrk(index, chunk_remainder, chunk,
					    tim, ents, bkt);
@@ -486,9 +499,9 @@ __retry:
		}
 
		if (flags & OTX2_TIM_ENA_FB)
-			chunk = tim_refill_chunk(bkt, tim_ring);
+			chunk = tim_refill_chunk(bkt, mirr_bkt, tim_ring);
		if (flags & OTX2_TIM_ENA_DFB)
-			chunk = tim_insert_chunk(bkt, tim_ring);
+			chunk = tim_insert_chunk(bkt, mirr_bkt, tim_ring);
 
		if (unlikely(chunk == NULL)) {
			tim_bkt_dec_lock(bkt);
@@ -497,14 +510,14 @@ __retry:
			return crem;
		}
		*(uint64_t *)(chunk + tim_ring->nb_chunk_slots) = 0;
-		bkt->current_chunk = (uintptr_t)chunk;
+		mirr_bkt->current_chunk = (uintptr_t)chunk;
		tim_cpy_wrk(index, nb_timers, chunk, tim, ents, bkt);
 
		rem = nb_timers - chunk_remainder;
		tim_bkt_set_rem(bkt, tim_ring->nb_chunk_slots - rem);
		tim_bkt_add_nent(bkt, rem);
	} else {
-		chunk = (struct otx2_tim_ent *)(uintptr_t)bkt->current_chunk;
+		chunk = (struct otx2_tim_ent *)mirr_bkt->current_chunk;
		chunk += (tim_ring->nb_chunk_slots - chunk_remainder);
 
		tim_cpy_wrk(index, nb_timers, chunk, tim, ents, bkt);
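
For reference, the mirror-bucket arithmetic above is self-contained and easy
to check in isolation. The sketch below is not part of the patch: it uses
hypothetical toy_ring/toy_bkt types that keep only the fields relevant here,
and shows the OTX2_TIM_BKT_AND (power-of-two ring) path. Since buckets are
32B and a cache line is 128B, four buckets share a line; placing bucket i's
chunk pointer nb_bkts/2 slots away keeps it off the line holding bucket i's
contended semaphore word whenever the ring has at least eight buckets.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for otx2_tim_bkt/otx2_tim_ring; only the
 * fields needed to illustrate the mirror-bucket index arithmetic. */
struct toy_bkt {
	uint64_t w1;            /* lock semaphore, written under contention */
	uint64_t current_chunk; /* stored in the mirror bucket instead */
};

struct toy_ring {
	uint32_t nb_bkts;       /* power of two, as in the BKT_AND path */
	struct toy_bkt *bkt;
};

/* Same shape as the reworked tim_get_target_bucket(): return both the
 * target bucket and the bucket half the ring away. */
static void
toy_get_target_bucket(struct toy_ring *ring, uint32_t bucket,
		      struct toy_bkt **bkt, struct toy_bkt **mirr_bkt)
{
	uint32_t mirr_bucket;

	bucket = bucket & (ring->nb_bkts - 1);
	mirr_bucket = (bucket + (ring->nb_bkts >> 1)) & (ring->nb_bkts - 1);

	*bkt = &ring->bkt[bucket];
	*mirr_bkt = &ring->bkt[mirr_bucket];
}

int main(void)
{
	struct toy_bkt bkts[8] = { { 0, 0 } };
	struct toy_ring ring = { 8, bkts };
	struct toy_bkt *bkt, *mirr_bkt;

	/* Bucket 1 keeps its chunk pointer in bucket 5 (= 1 + 8/2). */
	toy_get_target_bucket(&ring, 1, &bkt, &mirr_bkt);
	printf("bucket %td, mirror %td\n", bkt - bkts, mirr_bkt - bkts);
	return 0;
}

Note that the mapping is its own inverse: bucket i + nb_bkts/2 stores its
chunk pointer back in bucket i, so every current_chunk slot is still used
exactly once and no extra storage is needed.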