X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fmempool%2Focteontx2%2Fotx2_mempool_ops.c;h=5229a7cfba05f756e0033741fde3d7de51c3a5c1;hb=53313910bcfd5a05cc07ef5db97e7c65db7a93a3;hp=0e7b7a77cfe51e6356b45a6aaa8b9853c600ab12;hpb=7bcc47cbe2fa29da6a624721e0b52148913418ca;p=dpdk.git

diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index 0e7b7a77cf..5229a7cfba 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -7,6 +7,340 @@
 
 #include "otx2_mempool.h"
 
+static int __rte_hot
+otx2_npa_enq(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
+{
+	unsigned int index; const uint64_t aura_handle = mp->pool_id;
+	const uint64_t reg = npa_lf_aura_handle_to_aura(aura_handle);
+	const uint64_t addr = npa_lf_aura_handle_to_base(aura_handle) +
+				 NPA_LF_AURA_OP_FREE0;
+
+	for (index = 0; index < n; index++)
+		otx2_store_pair((uint64_t)obj_table[index], reg, addr);
+
+	return 0;
+}
+
+static __rte_noinline int
+npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
+			 void **obj_table, uint8_t i)
+{
+	uint8_t retry = 4;
+
+	do {
+		obj_table[i] = (void *)otx2_atomic64_add_nosync(wdata, addr);
+		if (obj_table[i] != NULL)
+			return 0;
+
+	} while (retry--);
+
+	return -ENOENT;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __rte_noinline int
+npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
+			    void **obj_table, unsigned int n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		if (obj_table[i] != NULL)
+			continue;
+		if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
+			return -ENOENT;
+	}
+
+	return 0;
+}
+
+static __rte_noinline int
+npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
+			  unsigned int n, void **obj_table)
+{
+	register const uint64_t wdata64 __asm("x26") = wdata;
+	register const uint64_t wdata128 __asm("x27") = wdata;
+	uint64x2_t failed = vdupq_n_u64(~0);
+
+	switch (n) {
+	case 32:
+	{
+		asm volatile (
+		".cpu generic+lse\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, x8\n"
+		"fmov v20.D[1], x9\n"
+		"fmov d21, x10\n"
+		"fmov v21.D[1], x11\n"
+		"fmov d22, x12\n"
+		"fmov v22.D[1], x13\n"
+		"fmov d23, x14\n"
+		"fmov v23.D[1], x15\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		"fmov d16, x16\n"
+		"fmov v16.D[1], x17\n"
+		"fmov d17, x18\n"
+		"fmov v17.D[1], x19\n"
+		"fmov d18, x20\n"
+		"fmov v18.D[1], x21\n"
+		"fmov d19, x22\n"
+		"fmov v19.D[1], x23\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, x0\n"
+		"fmov v20.D[1], x1\n"
+		"fmov d21, x2\n"
+		"fmov v21.D[1], x3\n"
+		"fmov d22, x4\n"
+		"fmov v22.D[1], x5\n"
+		"fmov d23, x6\n"
+		"fmov v23.D[1], x7\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+		"x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
+		"v18", "v19", "v20", "v21", "v22", "v23"
+		);
+		break;
+	}
+	case 16:
+	{
+		asm volatile (
+		".cpu generic+lse\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, x8\n"
+		"fmov v20.D[1], x9\n"
+		"fmov d21, x10\n"
+		"fmov v21.D[1], x11\n"
+		"fmov d22, x12\n"
+		"fmov v22.D[1], x13\n"
+		"fmov d23, x14\n"
+		"fmov v23.D[1], x15\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
+		"v17", "v18", "v19", "v20", "v21", "v22", "v23"
+		);
+		break;
+	}
+	case 8:
+	{
+		asm volatile (
+		".cpu generic+lse\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+ "and %[failed].16B, %[failed].16B, v16.16B\n" + "and %[failed].16B, %[failed].16B, v17.16B\n" + "and %[failed].16B, %[failed].16B, v18.16B\n" + "and %[failed].16B, %[failed].16B, v19.16B\n" + "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n" + : "+Q" (*addr), [failed] "=&w" (failed) + : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128), + [dst] "r" (obj_table), [loc] "r" (addr) + : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "v16", "v17", "v18", "v19" + ); + break; + } + case 4: + { + asm volatile ( + ".cpu generic+lse\n" + "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n" + "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n" + "fmov d16, x0\n" + "fmov v16.D[1], x1\n" + "fmov d17, x2\n" + "fmov v17.D[1], x3\n" + "and %[failed].16B, %[failed].16B, v16.16B\n" + "and %[failed].16B, %[failed].16B, v17.16B\n" + "st1 { v16.2d, v17.2d}, [%[dst]], 32\n" + : "+Q" (*addr), [failed] "=&w" (failed) + : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128), + [dst] "r" (obj_table), [loc] "r" (addr) + : "memory", "x0", "x1", "x2", "x3", "v16", "v17" + ); + break; + } + case 2: + { + asm volatile ( + ".cpu generic+lse\n" + "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n" + "fmov d16, x0\n" + "fmov v16.D[1], x1\n" + "and %[failed].16B, %[failed].16B, v16.16B\n" + "st1 { v16.2d}, [%[dst]], 16\n" + : "+Q" (*addr), [failed] "=&w" (failed) + : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128), + [dst] "r" (obj_table), [loc] "r" (addr) + : "memory", "x0", "x1", "v16" + ); + break; + } + case 1: + return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0); + } + + if (unlikely(!(vgetq_lane_u64(failed, 0) & vgetq_lane_u64(failed, 1)))) + return npa_lf_aura_op_search_alloc(wdata, addr, (void **) + ((char *)obj_table - (sizeof(uint64_t) * n)), n); + + return 0; +} + +static __rte_noinline void +otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (obj_table[i] != NULL) { + otx2_npa_enq(mp, &obj_table[i], 1); + obj_table[i] = NULL; + } + } +} + +static __rte_noinline int __rte_hot +otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id); + void **obj_table_bak = obj_table; + const unsigned int nfree = n; + unsigned int parts; + + int64_t * const addr = (int64_t * const) + (npa_lf_aura_handle_to_base(mp->pool_id) + + NPA_LF_AURA_OP_ALLOCX(0)); + while (n) { + parts = n > 31 ? 
32 : rte_align32prevpow2(n); + n -= parts; + if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr, + parts, obj_table))) { + otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n); + return -ENOENT; + } + obj_table += parts; + } + + return 0; +} + +#else + +static inline int __rte_hot +otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id); + unsigned int index; + uint64_t obj; + + int64_t * const addr = (int64_t *) + (npa_lf_aura_handle_to_base(mp->pool_id) + + NPA_LF_AURA_OP_ALLOCX(0)); + for (index = 0; index < n; index++, obj_table++) { + obj = npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0); + if (obj == 0) { + for (; index > 0; index--) { + obj_table--; + otx2_npa_enq(mp, obj_table, 1); + } + return -ENOENT; + } + *obj_table = (void *)obj; + } + + return 0; +} + +#endif + +static unsigned int +otx2_npa_get_count(const struct rte_mempool *mp) +{ + return (unsigned int)npa_lf_aura_op_available(mp->pool_id); +} + static int npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id, struct npa_aura_s *aura, struct npa_pool_s *pool) @@ -14,21 +348,26 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id, struct npa_aq_enq_req *aura_init_req, *pool_init_req; struct npa_aq_enq_rsp *aura_init_rsp, *pool_init_rsp; struct otx2_mbox_dev *mdev = &mbox->dev[0]; + struct otx2_idev_cfg *idev; int rc, off; + idev = otx2_intra_dev_get_cfg(); + if (idev == NULL) + return -ENOMEM; + aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); aura_init_req->aura_id = aura_id; aura_init_req->ctype = NPA_AQ_CTYPE_AURA; aura_init_req->op = NPA_AQ_INSTOP_INIT; - memcpy(&aura_init_req->aura, aura, sizeof(*aura)); + otx2_mbox_memcpy(&aura_init_req->aura, aura, sizeof(*aura)); pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); pool_init_req->aura_id = aura_id; pool_init_req->ctype = NPA_AQ_CTYPE_POOL; pool_init_req->op = NPA_AQ_INSTOP_INIT; - memcpy(&pool_init_req->pool, pool, sizeof(*pool)); + otx2_mbox_memcpy(&pool_init_req->pool, pool, sizeof(*pool)); otx2_mbox_msg_send(mbox, 0); rc = otx2_mbox_wait_for_rsp(mbox, 0); @@ -45,6 +384,131 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id, return 0; else return NPA_LF_ERR_AURA_POOL_INIT; + + if (!(idev->npa_lock_mask & BIT_ULL(aura_id))) + return 0; + + aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); + aura_init_req->aura_id = aura_id; + aura_init_req->ctype = NPA_AQ_CTYPE_AURA; + aura_init_req->op = NPA_AQ_INSTOP_LOCK; + + pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); + if (!pool_init_req) { + /* The shared memory buffer can be full. 
+		 * Flush it and retry
+		 */
+		otx2_mbox_msg_send(mbox, 0);
+		rc = otx2_mbox_wait_for_rsp(mbox, 0);
+		if (rc < 0) {
+			otx2_err("Failed to LOCK AURA context");
+			return -ENOMEM;
+		}
+
+		pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+		if (!pool_init_req) {
+			otx2_err("Failed to LOCK POOL context");
+			return -ENOMEM;
+		}
+	}
+	pool_init_req->aura_id = aura_id;
+	pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
+	pool_init_req->op = NPA_AQ_INSTOP_LOCK;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc < 0) {
+		otx2_err("Failed to lock POOL ctx to NDC");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int
+npa_lf_aura_pool_fini(struct otx2_mbox *mbox,
+		      uint32_t aura_id,
+		      uint64_t aura_handle)
+{
+	struct npa_aq_enq_req *aura_req, *pool_req;
+	struct npa_aq_enq_rsp *aura_rsp, *pool_rsp;
+	struct otx2_mbox_dev *mdev = &mbox->dev[0];
+	struct ndc_sync_op *ndc_req;
+	struct otx2_idev_cfg *idev;
+	int rc, off;
+
+	idev = otx2_intra_dev_get_cfg();
+	if (idev == NULL)
+		return -EINVAL;
+
+	/* Procedure for disabling an aura/pool */
+	rte_delay_us(10);
+	npa_lf_aura_op_alloc(aura_handle, 0);
+
+	pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	pool_req->aura_id = aura_id;
+	pool_req->ctype = NPA_AQ_CTYPE_POOL;
+	pool_req->op = NPA_AQ_INSTOP_WRITE;
+	pool_req->pool.ena = 0;
+	pool_req->pool_mask.ena = ~pool_req->pool_mask.ena;
+
+	aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	aura_req->aura_id = aura_id;
+	aura_req->ctype = NPA_AQ_CTYPE_AURA;
+	aura_req->op = NPA_AQ_INSTOP_WRITE;
+	aura_req->aura.ena = 0;
+	aura_req->aura_mask.ena = ~aura_req->aura_mask.ena;
+
+	otx2_mbox_msg_send(mbox, 0);
+	rc = otx2_mbox_wait_for_rsp(mbox, 0);
+	if (rc < 0)
+		return rc;
+
+	off = mbox->rx_start +
+	      RTE_ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN);
+	pool_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);
+
+	off = mbox->rx_start + pool_rsp->hdr.next_msgoff;
+	aura_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);
+
+	if (rc != 2 || aura_rsp->hdr.rc != 0 || pool_rsp->hdr.rc != 0)
+		return NPA_LF_ERR_AURA_POOL_FINI;
+
+	/* Sync NDC-NPA for LF */
+	ndc_req = otx2_mbox_alloc_msg_ndc_sync_op(mbox);
+	ndc_req->npa_lf_sync = 1;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc) {
+		otx2_err("Error on NDC-NPA LF sync, rc %d", rc);
+		return NPA_LF_ERR_AURA_POOL_FINI;
+	}
+
+	if (!(idev->npa_lock_mask & BIT_ULL(aura_id)))
+		return 0;
+
+	aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	aura_req->aura_id = aura_id;
+	aura_req->ctype = NPA_AQ_CTYPE_AURA;
+	aura_req->op = NPA_AQ_INSTOP_UNLOCK;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc < 0) {
+		otx2_err("Failed to unlock AURA ctx to NDC");
+		return -EINVAL;
+	}
+
+	pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	pool_req->aura_id = aura_id;
+	pool_req->ctype = NPA_AQ_CTYPE_POOL;
+	pool_req->op = NPA_AQ_INSTOP_UNLOCK;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc < 0) {
+		otx2_err("Failed to unlock POOL ctx to NDC");
+		return -EINVAL;
+	}
+
+	return 0;
 }
 
 static inline char*
@@ -65,6 +529,18 @@ npa_lf_stack_dma_alloc(struct otx2_npa_lf *lf, char *name,
 				 RTE_MEMZONE_IOVA_CONTIG, OTX2_ALIGN);
 }
 
+static inline int
+npa_lf_stack_dma_free(struct otx2_npa_lf *lf, char *name, int pool_id)
+{
+	const struct rte_memzone *mz;
+
+	mz = rte_memzone_lookup(npa_lf_stack_memzone_name(lf, pool_id, name));
+	if (mz == NULL)
+		return -EINVAL;
+
+	return rte_memzone_free(mz);
+}
+
 static inline int
 bitmap_ctzll(uint64_t slab)
 {
@@ -179,14 +655,67 @@ exit:
 	return rc;
 }
 
+static int
+npa_lf_aura_pool_pair_free(struct otx2_npa_lf *lf, uint64_t aura_handle)
+{
+	char name[RTE_MEMZONE_NAMESIZE];
+	int aura_id, pool_id, rc;
+
+	if (!lf || !aura_handle)
+		return NPA_LF_ERR_PARAM;
+
+	aura_id = pool_id = npa_lf_aura_handle_to_aura(aura_handle);
+	rc = npa_lf_aura_pool_fini(lf->mbox, aura_id, aura_handle);
+	rc |= npa_lf_stack_dma_free(lf, name, pool_id);
+
+	rte_bitmap_set(lf->npa_bmp, aura_id);
+
+	return rc;
+}
+
+static int
+npa_lf_aura_range_update_check(uint64_t aura_handle)
+{
+	uint64_t aura_id = npa_lf_aura_handle_to_aura(aura_handle);
+	struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
+	struct npa_aura_lim *lim = lf->aura_lim;
+	__otx2_io struct npa_pool_s *pool;
+	struct npa_aq_enq_req *req;
+	struct npa_aq_enq_rsp *rsp;
+	int rc;
+
+	req = otx2_mbox_alloc_msg_npa_aq_enq(lf->mbox);
+
+	req->aura_id = aura_id;
+	req->ctype = NPA_AQ_CTYPE_POOL;
+	req->op = NPA_AQ_INSTOP_READ;
+
+	rc = otx2_mbox_process_msg(lf->mbox, (void *)&rsp);
+	if (rc) {
+		otx2_err("Failed to get pool(0x%"PRIx64") context", aura_id);
+		return rc;
+	}
+
+	pool = &rsp->pool;
+
+	if (lim[aura_id].ptr_start != pool->ptr_start ||
+	    lim[aura_id].ptr_end != pool->ptr_end) {
+		otx2_err("Range update failed on pool(0x%"PRIx64")", aura_id);
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
 static int
 otx2_npa_alloc(struct rte_mempool *mp)
 {
 	uint32_t block_size, block_count;
+	uint64_t aura_handle = 0;
 	struct otx2_npa_lf *lf;
 	struct npa_aura_s aura;
 	struct npa_pool_s pool;
-	uint64_t aura_handle;
+	size_t padding;
 	int rc;
 
 	lf = otx2_npa_lf_obj_get();
@@ -196,6 +725,18 @@ otx2_npa_alloc(struct rte_mempool *mp)
 	}
 
 	block_size = mp->elt_size + mp->header_size + mp->trailer_size;
+	/*
+	 * OCTEON TX2 has 8 sets, 41 ways L1D cache, VA<9:7> bits dictate
+	 * the set selection.
+	 * Add additional padding to ensure that the element size always
+	 * occupies odd number of cachelines to ensure even distribution
+	 * of elements among L1D cache sets.
+	 */
+	padding = ((block_size / RTE_CACHE_LINE_SIZE) % 2) ? 0 :
+				RTE_CACHE_LINE_SIZE;
+	mp->trailer_size += padding;
+	block_size += padding;
+
 	block_count = mp->size;
 
 	if (block_size % OTX2_ALIGN != 0) {
@@ -238,9 +779,117 @@ error:
 	return rc;
 }
 
+static void
+otx2_npa_free(struct rte_mempool *mp)
+{
+	struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
+	int rc = 0;
+
+	otx2_npa_dbg("lf=%p aura_handle=0x%"PRIx64, lf, mp->pool_id);
+	if (lf != NULL)
+		rc = npa_lf_aura_pool_pair_free(lf, mp->pool_id);
+
+	if (rc)
+		otx2_err("Failed to free pool or aura rc=%d", rc);
+
+	/* Release the reference of npalf */
+	otx2_npa_lf_fini();
+}
+
+static ssize_t
+otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
+		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
+{
+	size_t total_elt_sz;
+
+	/* Need space for one more obj on each chunk to fulfill
+	 * alignment requirements.
+	 */
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						total_elt_sz, min_chunk_size,
+						align);
+}
+
+static uint8_t
+otx2_npa_l1d_way_set_get(uint64_t iova)
+{
+	return (iova >> rte_log2_u32(RTE_CACHE_LINE_SIZE)) & 0x7;
+}
+
+static int
+otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
+		  rte_iova_t iova, size_t len,
+		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+{
+#define OTX2_L1D_NB_SETS	8
+	uint64_t distribution[OTX2_L1D_NB_SETS];
+	rte_iova_t start_iova;
+	size_t total_elt_sz;
+	uint8_t set;
+	size_t off;
+	int i;
+
+	if (iova == RTE_BAD_IOVA)
+		return -EINVAL;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	/* Align object start address to a multiple of total_elt_sz */
+	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
+
+	if (len < off)
+		return -EINVAL;
+
+
+	vaddr = (char *)vaddr + off;
+	iova += off;
+	len -= off;
+
+	memset(distribution, 0, sizeof(uint64_t) * OTX2_L1D_NB_SETS);
+	start_iova = iova;
+	while (start_iova < iova + len) {
+		set = otx2_npa_l1d_way_set_get(start_iova + mp->header_size);
+		distribution[set]++;
+		start_iova += total_elt_sz;
+	}
+
+	otx2_npa_dbg("iova %"PRIx64", aligned iova %"PRIx64"", iova - off,
+		     iova);
+	otx2_npa_dbg("length %"PRIu64", aligned length %"PRIu64"",
+		     (uint64_t)(len + off), (uint64_t)len);
+	otx2_npa_dbg("element size %"PRIu64"", (uint64_t)total_elt_sz);
+	otx2_npa_dbg("requested objects %"PRIu64", possible objects %"PRIu64"",
+		     (uint64_t)max_objs, (uint64_t)(len / total_elt_sz));
+	otx2_npa_dbg("L1D set distribution :");
+	for (i = 0; i < OTX2_L1D_NB_SETS; i++)
+		otx2_npa_dbg("set[%d] : objects : %"PRIu64"", i,
+			     distribution[i]);
+
+	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
+
+	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
+		return -EBUSY;
+
+	return rte_mempool_op_populate_helper(mp,
+					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+					max_objs, vaddr, iova, len,
+					obj_cb, obj_cb_arg);
+}
+
 static struct rte_mempool_ops otx2_npa_ops = {
 	.name = "octeontx2_npa",
 	.alloc = otx2_npa_alloc,
+	.free = otx2_npa_free,
+	.enqueue = otx2_npa_enq,
+	.get_count = otx2_npa_get_count,
+	.calc_mem_size = otx2_npa_calc_mem_size,
+	.populate = otx2_npa_populate,
+#if defined(RTE_ARCH_ARM64)
+	.dequeue = otx2_npa_deq_arm64,
+#else
+	.dequeue = otx2_npa_deq,
+#endif
 };
 
 MEMPOOL_REGISTER_OPS(otx2_npa_ops);
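
Note on the padding added in otx2_npa_alloc(): keeping each element an odd number of cache lines long makes consecutive elements start in different L1D sets (the 8 sets are selected by VA<9:7>), so objects spread evenly across the cache. A minimal standalone sketch of the same arithmetic, using a hypothetical pad_to_odd_cachelines() helper and assuming a 128-byte cache line as on OCTEON TX2:

#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_SZ 128 /* RTE_CACHE_LINE_SIZE on OCTEON TX2 */

/* Hypothetical helper mirroring the rule in otx2_npa_alloc(): pad the block
 * so it spans an odd number of cache lines, which rotates successive
 * elements across the 8 L1D sets selected by VA<9:7>. */
static uint32_t
pad_to_odd_cachelines(uint32_t block_size)
{
	uint32_t padding = ((block_size / CACHE_LINE_SZ) % 2) ? 0 : CACHE_LINE_SZ;

	return block_size + padding;
}

int
main(void)
{
	printf("%u\n", pad_to_odd_cachelines(2176)); /* 17 lines, odd: stays 2176 */
	printf("%u\n", pad_to_odd_cachelines(2304)); /* 18 lines, even: becomes 2432 */
	return 0;
}

A 2304-byte block (18 cache lines) gains one cache line of trailer padding, while a 2176-byte block (17 lines) is left untouched; in the driver the padding is accounted to mp->trailer_size so the usable element size is unchanged.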
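
Note on the start-address alignment in otx2_npa_populate(): the expression off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1) is the distance from vaddr up to the next multiple of total_elt_sz, and evaluates to zero when vaddr is already aligned. A small sketch with a hypothetical helper name, illustrative only:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Same expression as in otx2_npa_populate(): bytes to skip so that 'addr'
 * lands on the next multiple of 'total_elt_sz' (0 when already aligned).
 * Unlike mask-based rounding, it also works when total_elt_sz is not a
 * power of two. */
static uintptr_t
offset_to_next_multiple(uintptr_t addr, uintptr_t total_elt_sz)
{
	return total_elt_sz - (((addr - 1) % total_elt_sz) + 1);
}

int
main(void)
{
	printf("%" PRIuPTR "\n", offset_to_next_multiple(1000, 384)); /* 152 -> 1152 */
	printf("%" PRIuPTR "\n", offset_to_next_multiple(768, 384));  /* 0, aligned */
	return 0;
}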
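
The ops above are registered under the name "octeontx2_npa", so a mempool can opt into them through the standard rte_mempool API. A hedged usage sketch; the pool name, object count, element size and cache depth are made-up values, and error handling is trimmed:

#include <rte_mempool.h>

/* Create an empty mempool, switch it to the "octeontx2_npa" ops and then
 * populate it (which ends up in otx2_npa_alloc()/otx2_npa_populate()).
 * On a non-OCTEON TX2 target rte_mempool_set_ops_byname() would fail. */
static struct rte_mempool *
create_npa_backed_pool(void)
{
	struct rte_mempool *mp;

	mp = rte_mempool_create_empty("npa_pool", 8192, 2048,
				      256 /* per-lcore cache */, 0,
				      SOCKET_ID_ANY, 0);
	if (mp == NULL)
		return NULL;

	if (rte_mempool_set_ops_byname(mp, "octeontx2_npa", NULL) != 0 ||
	    rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}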