X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fmempool%2Focteontx2%2Fotx2_mempool_ops.c;h=5229a7cfba05f756e0033741fde3d7de51c3a5c1;hb=53313910bcfd5a05cc07ef5db97e7c65db7a93a3;hp=0e7b7a77cfe51e6356b45a6aaa8b9853c600ab12;hpb=7bcc47cbe2fa29da6a624721e0b52148913418ca;p=dpdk.git

diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index 0e7b7a77cf..5229a7cfba 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -7,6 +7,340 @@
 
 #include "otx2_mempool.h"
 
+static int __rte_hot
+otx2_npa_enq(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
+{
+	unsigned int index; const uint64_t aura_handle = mp->pool_id;
+	const uint64_t reg = npa_lf_aura_handle_to_aura(aura_handle);
+	const uint64_t addr = npa_lf_aura_handle_to_base(aura_handle) +
+				 NPA_LF_AURA_OP_FREE0;
+
+	for (index = 0; index < n; index++)
+		otx2_store_pair((uint64_t)obj_table[index], reg, addr);
+
+	return 0;
+}
+
+static __rte_noinline int
+npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
+			 void **obj_table, uint8_t i)
+{
+	uint8_t retry = 4;
+
+	do {
+		obj_table[i] = (void *)otx2_atomic64_add_nosync(wdata, addr);
+		if (obj_table[i] != NULL)
+			return 0;
+
+	} while (retry--);
+
+	return -ENOENT;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __rte_noinline int
+npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
+			    void **obj_table, unsigned int n)
+{
+	uint8_t i;
+
+	for (i = 0; i < n; i++) {
+		if (obj_table[i] != NULL)
+			continue;
+		if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
+			return -ENOENT;
+	}
+
+	return 0;
+}
+
+static __rte_noinline int
+npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
+			  unsigned int n, void **obj_table)
+{
+	register const uint64_t wdata64 __asm("x26") = wdata;
+	register const uint64_t wdata128 __asm("x27") = wdata;
+	uint64x2_t failed = vdupq_n_u64(~0);
+
+	switch (n) {
+	case 32:
+	{
+		asm volatile (
+		".cpu generic+lse\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, x8\n"
+		"fmov v20.D[1], x9\n"
+		"fmov d21, x10\n"
+		"fmov v21.D[1], x11\n"
+		"fmov d22, x12\n"
+		"fmov v22.D[1], x13\n"
+		"fmov d23, x14\n"
+		"fmov v23.D[1], x15\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		"fmov d16, x16\n"
+		"fmov v16.D[1], x17\n"
+		"fmov d17, x18\n"
+		"fmov v17.D[1], x19\n"
+		"fmov d18, x20\n"
+		"fmov v18.D[1], x21\n"
+		"fmov d19, x22\n"
+		"fmov v19.D[1], x23\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, x0\n"
+		"fmov v20.D[1], x1\n"
+		"fmov d21, x2\n"
+		"fmov v21.D[1], x3\n"
+		"fmov d22, x4\n"
+		"fmov v22.D[1], x5\n"
+		"fmov d23, x6\n"
+		"fmov v23.D[1], x7\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+		"x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
+		"v18", "v19", "v20", "v21", "v22", "v23"
+		);
+		break;
+	}
+	case 16:
+	{
+		asm volatile (
+		".cpu generic+lse\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+		"and %[failed].16B, %[failed].16B, v16.16B\n"
+		"and %[failed].16B, %[failed].16B, v17.16B\n"
+		"and %[failed].16B, %[failed].16B, v18.16B\n"
+		"and %[failed].16B, %[failed].16B, v19.16B\n"
+		"fmov d20, x8\n"
+		"fmov v20.D[1], x9\n"
+		"fmov d21, x10\n"
+		"fmov v21.D[1], x11\n"
+		"fmov d22, x12\n"
+		"fmov v22.D[1], x13\n"
+		"fmov d23, x14\n"
+		"fmov v23.D[1], x15\n"
+		"and %[failed].16B, %[failed].16B, v20.16B\n"
+		"and %[failed].16B, %[failed].16B, v21.16B\n"
+		"and %[failed].16B, %[failed].16B, v22.16B\n"
+		"and %[failed].16B, %[failed].16B, v23.16B\n"
+		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+		: "+Q" (*addr), [failed] "=&w" (failed)
+		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+		[dst] "r" (obj_table), [loc] "r" (addr)
+		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
+		"v17", "v18", "v19", "v20", "v21", "v22", "v23"
+		);
+		break;
+	}
+	case 8:
+	{
+		asm volatile (
+		".cpu generic+lse\n"
+		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+		"fmov d16, x0\n"
+		"fmov v16.D[1], x1\n"
+		"fmov d17, x2\n"
+		"fmov v17.D[1], x3\n"
+		"fmov d18, x4\n"
+		"fmov v18.D[1], x5\n"
+		"fmov d19, x6\n"
+		"fmov v19.D[1], x7\n"
+ "and %[failed].16B, %[failed].16B, v16.16B\n" + "and %[failed].16B, %[failed].16B, v17.16B\n" + "and %[failed].16B, %[failed].16B, v18.16B\n" + "and %[failed].16B, %[failed].16B, v19.16B\n" + "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n" + : "+Q" (*addr), [failed] "=&w" (failed) + : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128), + [dst] "r" (obj_table), [loc] "r" (addr) + : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "v16", "v17", "v18", "v19" + ); + break; + } + case 4: + { + asm volatile ( + ".cpu generic+lse\n" + "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n" + "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n" + "fmov d16, x0\n" + "fmov v16.D[1], x1\n" + "fmov d17, x2\n" + "fmov v17.D[1], x3\n" + "and %[failed].16B, %[failed].16B, v16.16B\n" + "and %[failed].16B, %[failed].16B, v17.16B\n" + "st1 { v16.2d, v17.2d}, [%[dst]], 32\n" + : "+Q" (*addr), [failed] "=&w" (failed) + : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128), + [dst] "r" (obj_table), [loc] "r" (addr) + : "memory", "x0", "x1", "x2", "x3", "v16", "v17" + ); + break; + } + case 2: + { + asm volatile ( + ".cpu generic+lse\n" + "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n" + "fmov d16, x0\n" + "fmov v16.D[1], x1\n" + "and %[failed].16B, %[failed].16B, v16.16B\n" + "st1 { v16.2d}, [%[dst]], 16\n" + : "+Q" (*addr), [failed] "=&w" (failed) + : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128), + [dst] "r" (obj_table), [loc] "r" (addr) + : "memory", "x0", "x1", "v16" + ); + break; + } + case 1: + return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0); + } + + if (unlikely(!(vgetq_lane_u64(failed, 0) & vgetq_lane_u64(failed, 1)))) + return npa_lf_aura_op_search_alloc(wdata, addr, (void **) + ((char *)obj_table - (sizeof(uint64_t) * n)), n); + + return 0; +} + +static __rte_noinline void +otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; i++) { + if (obj_table[i] != NULL) { + otx2_npa_enq(mp, &obj_table[i], 1); + obj_table[i] = NULL; + } + } +} + +static __rte_noinline int __rte_hot +otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id); + void **obj_table_bak = obj_table; + const unsigned int nfree = n; + unsigned int parts; + + int64_t * const addr = (int64_t * const) + (npa_lf_aura_handle_to_base(mp->pool_id) + + NPA_LF_AURA_OP_ALLOCX(0)); + while (n) { + parts = n > 31 ? 
32 : rte_align32prevpow2(n); + n -= parts; + if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr, + parts, obj_table))) { + otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n); + return -ENOENT; + } + obj_table += parts; + } + + return 0; +} + +#else + +static inline int __rte_hot +otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n) +{ + const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id); + unsigned int index; + uint64_t obj; + + int64_t * const addr = (int64_t *) + (npa_lf_aura_handle_to_base(mp->pool_id) + + NPA_LF_AURA_OP_ALLOCX(0)); + for (index = 0; index < n; index++, obj_table++) { + obj = npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0); + if (obj == 0) { + for (; index > 0; index--) { + obj_table--; + otx2_npa_enq(mp, obj_table, 1); + } + return -ENOENT; + } + *obj_table = (void *)obj; + } + + return 0; +} + +#endif + +static unsigned int +otx2_npa_get_count(const struct rte_mempool *mp) +{ + return (unsigned int)npa_lf_aura_op_available(mp->pool_id); +} + static int npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id, struct npa_aura_s *aura, struct npa_pool_s *pool) @@ -14,21 +348,26 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id, struct npa_aq_enq_req *aura_init_req, *pool_init_req; struct npa_aq_enq_rsp *aura_init_rsp, *pool_init_rsp; struct otx2_mbox_dev *mdev = &mbox->dev[0]; + struct otx2_idev_cfg *idev; int rc, off; + idev = otx2_intra_dev_get_cfg(); + if (idev == NULL) + return -ENOMEM; + aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); aura_init_req->aura_id = aura_id; aura_init_req->ctype = NPA_AQ_CTYPE_AURA; aura_init_req->op = NPA_AQ_INSTOP_INIT; - memcpy(&aura_init_req->aura, aura, sizeof(*aura)); + otx2_mbox_memcpy(&aura_init_req->aura, aura, sizeof(*aura)); pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); pool_init_req->aura_id = aura_id; pool_init_req->ctype = NPA_AQ_CTYPE_POOL; pool_init_req->op = NPA_AQ_INSTOP_INIT; - memcpy(&pool_init_req->pool, pool, sizeof(*pool)); + otx2_mbox_memcpy(&pool_init_req->pool, pool, sizeof(*pool)); otx2_mbox_msg_send(mbox, 0); rc = otx2_mbox_wait_for_rsp(mbox, 0); @@ -45,6 +384,131 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id, return 0; else return NPA_LF_ERR_AURA_POOL_INIT; + + if (!(idev->npa_lock_mask & BIT_ULL(aura_id))) + return 0; + + aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); + aura_init_req->aura_id = aura_id; + aura_init_req->ctype = NPA_AQ_CTYPE_AURA; + aura_init_req->op = NPA_AQ_INSTOP_LOCK; + + pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox); + if (!pool_init_req) { + /* The shared memory buffer can be full. 
+		 * Flush it and retry
+		 */
+		otx2_mbox_msg_send(mbox, 0);
+		rc = otx2_mbox_wait_for_rsp(mbox, 0);
+		if (rc < 0) {
+			otx2_err("Failed to LOCK AURA context");
+			return -ENOMEM;
+		}
+
+		pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+		if (!pool_init_req) {
+			otx2_err("Failed to LOCK POOL context");
+			return -ENOMEM;
+		}
+	}
+	pool_init_req->aura_id = aura_id;
+	pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
+	pool_init_req->op = NPA_AQ_INSTOP_LOCK;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc < 0) {
+		otx2_err("Failed to lock POOL ctx to NDC");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int
+npa_lf_aura_pool_fini(struct otx2_mbox *mbox,
+		      uint32_t aura_id,
+		      uint64_t aura_handle)
+{
+	struct npa_aq_enq_req *aura_req, *pool_req;
+	struct npa_aq_enq_rsp *aura_rsp, *pool_rsp;
+	struct otx2_mbox_dev *mdev = &mbox->dev[0];
+	struct ndc_sync_op *ndc_req;
+	struct otx2_idev_cfg *idev;
+	int rc, off;
+
+	idev = otx2_intra_dev_get_cfg();
+	if (idev == NULL)
+		return -EINVAL;
+
+	/* Procedure for disabling an aura/pool */
+	rte_delay_us(10);
+	npa_lf_aura_op_alloc(aura_handle, 0);
+
+	pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	pool_req->aura_id = aura_id;
+	pool_req->ctype = NPA_AQ_CTYPE_POOL;
+	pool_req->op = NPA_AQ_INSTOP_WRITE;
+	pool_req->pool.ena = 0;
+	pool_req->pool_mask.ena = ~pool_req->pool_mask.ena;
+
+	aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	aura_req->aura_id = aura_id;
+	aura_req->ctype = NPA_AQ_CTYPE_AURA;
+	aura_req->op = NPA_AQ_INSTOP_WRITE;
+	aura_req->aura.ena = 0;
+	aura_req->aura_mask.ena = ~aura_req->aura_mask.ena;
+
+	otx2_mbox_msg_send(mbox, 0);
+	rc = otx2_mbox_wait_for_rsp(mbox, 0);
+	if (rc < 0)
+		return rc;
+
+	off = mbox->rx_start +
+	      RTE_ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN);
+	pool_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);
+
+	off = mbox->rx_start + pool_rsp->hdr.next_msgoff;
+	aura_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);
+
+	if (rc != 2 || aura_rsp->hdr.rc != 0 || pool_rsp->hdr.rc != 0)
+		return NPA_LF_ERR_AURA_POOL_FINI;
+
+	/* Sync NDC-NPA for LF */
+	ndc_req = otx2_mbox_alloc_msg_ndc_sync_op(mbox);
+	ndc_req->npa_lf_sync = 1;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc) {
+		otx2_err("Error on NDC-NPA LF sync, rc %d", rc);
+		return NPA_LF_ERR_AURA_POOL_FINI;
+	}
+
+	if (!(idev->npa_lock_mask & BIT_ULL(aura_id)))
+		return 0;
+
+	aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	aura_req->aura_id = aura_id;
+	aura_req->ctype = NPA_AQ_CTYPE_AURA;
+	aura_req->op = NPA_AQ_INSTOP_UNLOCK;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc < 0) {
+		otx2_err("Failed to unlock AURA ctx to NDC");
+		return -EINVAL;
+	}
+
+	pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+	pool_req->aura_id = aura_id;
+	pool_req->ctype = NPA_AQ_CTYPE_POOL;
+	pool_req->op = NPA_AQ_INSTOP_UNLOCK;
+
+	rc = otx2_mbox_process(mbox);
+	if (rc < 0) {
+		otx2_err("Failed to unlock POOL ctx to NDC");
+		return -EINVAL;
+	}
+
+	return 0;
 }
 
 static inline char*
@@ -65,6 +529,18 @@ npa_lf_stack_dma_alloc(struct otx2_npa_lf *lf, char *name,
 				 RTE_MEMZONE_IOVA_CONTIG, OTX2_ALIGN);
 }
 
+static inline int
+npa_lf_stack_dma_free(struct otx2_npa_lf *lf, char *name, int pool_id)
+{
+	const struct rte_memzone *mz;
+
+	mz = rte_memzone_lookup(npa_lf_stack_memzone_name(lf, pool_id, name));
+	if (mz == NULL)
+		return -EINVAL;
+
+	return rte_memzone_free(mz);
+}
+
 static inline int
 bitmap_ctzll(uint64_t slab)
 {
@@ -179,14 +655,67 @@ exit:
 	return rc;
 }
 
+static int
+npa_lf_aura_pool_pair_free(struct otx2_npa_lf *lf, uint64_t aura_handle)
+{
+	char name[RTE_MEMZONE_NAMESIZE];
+	int aura_id, pool_id, rc;
+
+	if (!lf || !aura_handle)
+		return NPA_LF_ERR_PARAM;
+
+	aura_id = pool_id = npa_lf_aura_handle_to_aura(aura_handle);
+	rc = npa_lf_aura_pool_fini(lf->mbox, aura_id, aura_handle);
+	rc |= npa_lf_stack_dma_free(lf, name, pool_id);
+
+	rte_bitmap_set(lf->npa_bmp, aura_id);
+
+	return rc;
+}
+
+static int
+npa_lf_aura_range_update_check(uint64_t aura_handle)
+{
+	uint64_t aura_id = npa_lf_aura_handle_to_aura(aura_handle);
+	struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
+	struct npa_aura_lim *lim = lf->aura_lim;
+	__otx2_io struct npa_pool_s *pool;
+	struct npa_aq_enq_req *req;
+	struct npa_aq_enq_rsp *rsp;
+	int rc;
+
+	req = otx2_mbox_alloc_msg_npa_aq_enq(lf->mbox);
+
+	req->aura_id = aura_id;
+	req->ctype = NPA_AQ_CTYPE_POOL;
+	req->op = NPA_AQ_INSTOP_READ;
+
+	rc = otx2_mbox_process_msg(lf->mbox, (void *)&rsp);
+	if (rc) {
+		otx2_err("Failed to get pool(0x%"PRIx64") context", aura_id);
+		return rc;
+	}
+
+	pool = &rsp->pool;
+
+	if (lim[aura_id].ptr_start != pool->ptr_start ||
+	    lim[aura_id].ptr_end != pool->ptr_end) {
+		otx2_err("Range update failed on pool(0x%"PRIx64")", aura_id);
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
 static int
 otx2_npa_alloc(struct rte_mempool *mp)
 {
 	uint32_t block_size, block_count;
+	uint64_t aura_handle = 0;
 	struct otx2_npa_lf *lf;
 	struct npa_aura_s aura;
 	struct npa_pool_s pool;
-	uint64_t aura_handle;
+	size_t padding;
 	int rc;
 
 	lf = otx2_npa_lf_obj_get();
@@ -196,6 +725,18 @@ otx2_npa_alloc(struct rte_mempool *mp)
 	}
 
 	block_size = mp->elt_size + mp->header_size + mp->trailer_size;
+	/*
+	 * OCTEON TX2 has 8 sets, 41 ways L1D cache, VA<9:7> bits dictate
+	 * the set selection.
+	 * Add additional padding to ensure that the element size always
+	 * occupies odd number of cachelines to ensure even distribution
+	 * of elements among L1D cache sets.
+	 */
+	padding = ((block_size / RTE_CACHE_LINE_SIZE) % 2) ? 0 :
+				RTE_CACHE_LINE_SIZE;
+	mp->trailer_size += padding;
+	block_size += padding;
+
 	block_count = mp->size;
 
 	if (block_size % OTX2_ALIGN != 0) {
@@ -238,9 +779,117 @@ error:
 	return rc;
 }
 
+static void
+otx2_npa_free(struct rte_mempool *mp)
+{
+	struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
+	int rc = 0;
+
+	otx2_npa_dbg("lf=%p aura_handle=0x%"PRIx64, lf, mp->pool_id);
+	if (lf != NULL)
+		rc = npa_lf_aura_pool_pair_free(lf, mp->pool_id);
+
+	if (rc)
+		otx2_err("Failed to free pool or aura rc=%d", rc);
+
+	/* Release the reference of npalf */
+	otx2_npa_lf_fini();
+}
+
+static ssize_t
+otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
+		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
+{
+	size_t total_elt_sz;
+
+	/* Need space for one more obj on each chunk to fulfill
+	 * alignment requirements.
+	 */
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						total_elt_sz, min_chunk_size,
+						align);
+}
+
+static uint8_t
+otx2_npa_l1d_way_set_get(uint64_t iova)
+{
+	return (iova >> rte_log2_u32(RTE_CACHE_LINE_SIZE)) & 0x7;
+}
+
+static int
+otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
+		  rte_iova_t iova, size_t len,
+		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+{
+#define OTX2_L1D_NB_SETS	8
+	uint64_t distribution[OTX2_L1D_NB_SETS];
+	rte_iova_t start_iova;
+	size_t total_elt_sz;
+	uint8_t set;
+	size_t off;
+	int i;
+
+	if (iova == RTE_BAD_IOVA)
+		return -EINVAL;
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	/* Align object start address to a multiple of total_elt_sz */
+	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
+
+	if (len < off)
+		return -EINVAL;
+
+
+	vaddr = (char *)vaddr + off;
+	iova += off;
+	len -= off;
+
+	memset(distribution, 0, sizeof(uint64_t) * OTX2_L1D_NB_SETS);
+	start_iova = iova;
+	while (start_iova < iova + len) {
+		set = otx2_npa_l1d_way_set_get(start_iova + mp->header_size);
+		distribution[set]++;
+		start_iova += total_elt_sz;
+	}
+
+	otx2_npa_dbg("iova %"PRIx64", aligned iova %"PRIx64"", iova - off,
+		     iova);
+	otx2_npa_dbg("length %"PRIu64", aligned length %"PRIu64"",
+		     (uint64_t)(len + off), (uint64_t)len);
+	otx2_npa_dbg("element size %"PRIu64"", (uint64_t)total_elt_sz);
+	otx2_npa_dbg("requested objects %"PRIu64", possible objects %"PRIu64"",
+		     (uint64_t)max_objs, (uint64_t)(len / total_elt_sz));
+	otx2_npa_dbg("L1D set distribution :");
+	for (i = 0; i < OTX2_L1D_NB_SETS; i++)
+		otx2_npa_dbg("set[%d] : objects : %"PRIu64"", i,
+			     distribution[i]);
+
+	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
+
+	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
+		return -EBUSY;
+
+	return rte_mempool_op_populate_helper(mp,
+					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+					max_objs, vaddr, iova, len,
+					obj_cb, obj_cb_arg);
+}
+
 static struct rte_mempool_ops otx2_npa_ops = {
 	.name = "octeontx2_npa",
 	.alloc = otx2_npa_alloc,
+	.free = otx2_npa_free,
+	.enqueue = otx2_npa_enq,
+	.get_count = otx2_npa_get_count,
+	.calc_mem_size = otx2_npa_calc_mem_size,
+	.populate = otx2_npa_populate,
+#if defined(RTE_ARCH_ARM64)
+	.dequeue = otx2_npa_deq_arm64,
+#else
+	.dequeue = otx2_npa_deq,
+#endif
 };
 
 MEMPOOL_REGISTER_OPS(otx2_npa_ops);
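
Note on the padding added in otx2_npa_alloc(): keeping each element an odd number of cache lines long makes consecutive elements start in different L1D sets (the 8 sets are selected by VA<9:7>), so objects spread evenly across the cache. A minimal standalone sketch of the same arithmetic, using a hypothetical pad_to_odd_cachelines() helper and assuming a 128-byte cache line as on OCTEON TX2:

#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_SZ 128 /* RTE_CACHE_LINE_SIZE on OCTEON TX2 */

/* Hypothetical helper mirroring the rule in otx2_npa_alloc(): pad the block
 * so it spans an odd number of cache lines, which rotates successive
 * elements across the 8 L1D sets selected by VA<9:7>. */
static uint32_t
pad_to_odd_cachelines(uint32_t block_size)
{
	uint32_t padding = ((block_size / CACHE_LINE_SZ) % 2) ? 0 : CACHE_LINE_SZ;

	return block_size + padding;
}

int
main(void)
{
	printf("%u\n", pad_to_odd_cachelines(2176)); /* 17 lines, odd: stays 2176 */
	printf("%u\n", pad_to_odd_cachelines(2304)); /* 18 lines, even: becomes 2432 */
	return 0;
}

A 2304-byte block (18 cache lines) gains one cache line of trailer padding, while a 2176-byte block (17 lines) is left untouched; in the driver the padding is accounted to mp->trailer_size so the usable element size is unchanged.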
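
Note on the start-address alignment in otx2_npa_populate(): the expression off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1) is the distance from vaddr up to the next multiple of total_elt_sz, and evaluates to zero when vaddr is already aligned. A small sketch with a hypothetical helper name, illustrative only:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Same expression as in otx2_npa_populate(): bytes to skip so that 'addr'
 * lands on the next multiple of 'total_elt_sz' (0 when already aligned).
 * Unlike mask-based rounding, it also works when total_elt_sz is not a
 * power of two. */
static uintptr_t
offset_to_next_multiple(uintptr_t addr, uintptr_t total_elt_sz)
{
	return total_elt_sz - (((addr - 1) % total_elt_sz) + 1);
}

int
main(void)
{
	printf("%" PRIuPTR "\n", offset_to_next_multiple(1000, 384)); /* 152 -> 1152 */
	printf("%" PRIuPTR "\n", offset_to_next_multiple(768, 384));  /* 0, aligned */
	return 0;
}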
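
The ops above are registered under the name "octeontx2_npa", so a mempool can opt into them through the standard rte_mempool API. A hedged usage sketch; the pool name, object count, element size and cache depth are made-up values, and error handling is trimmed:

#include <rte_mempool.h>

/* Create an empty mempool, switch it to the "octeontx2_npa" ops and then
 * populate it (which ends up in otx2_npa_alloc()/otx2_npa_populate()).
 * On a non-OCTEON TX2 target rte_mempool_set_ops_byname() would fail. */
static struct rte_mempool *
create_npa_backed_pool(void)
{
	struct rte_mempool *mp;

	mp = rte_mempool_create_empty("npa_pool", 8192, 2048,
				      256 /* per-lcore cache */, 0,
				      SOCKET_ID_ANY, 0);
	if (mp == NULL)
		return NULL;

	if (rte_mempool_set_ops_byname(mp, "octeontx2_npa", NULL) != 0 ||
	    rte_mempool_populate_default(mp) < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	return mp;
}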