net/af_xdp: allow bigger batch sizes
[dpdk.git] / drivers / mempool / octeontx2 / otx2_mempool_ops.c
index 97146d1..9ff71bc 100644 (file)
@@ -7,7 +7,7 @@
 
 #include "otx2_mempool.h"
 
-static int __hot
+static int __rte_hot
 otx2_npa_enq(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
 {
        unsigned int index; const uint64_t aura_handle = mp->pool_id;
@@ -15,6 +15,10 @@ otx2_npa_enq(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
        const uint64_t addr = npa_lf_aura_handle_to_base(aura_handle) +
                                 NPA_LF_AURA_OP_FREE0;
 
+       /* Ensure mbuf init changes are written before the free pointers
+        * are enqueued to the stack.
+        */
+       rte_io_wmb();
        for (index = 0; index < n; index++)
                otx2_store_pair((uint64_t)obj_table[index], reg, addr);
 
@@ -54,233 +58,206 @@ npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
        return 0;
 }
 
-/*
- * Some versions of the compiler don't have support for __int128_t for
- * CASP inline-asm. i.e. if the optimization level is reduced to -O0 the
- * CASP restrictions aren't followed and the compiler might end up violation the
- * CASP rules. Fix it by explicitly providing ((optimize("-O3"))).
- *
- * Example:
- * ccSPMGzq.s:1648: Error: reg pair must start from even reg at
- * operand 1 - `casp x21,x22,x0,x1,[x19]'
- */
-static  __attribute__((optimize("-O3"))) __rte_noinline int __hot
+static __rte_noinline int
 npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
                          unsigned int n, void **obj_table)
 {
-       const __uint128_t wdata128 = ((__uint128_t)wdata << 64) | wdata;
+       register const uint64_t wdata64 __asm("x26") = wdata;
+       register const uint64_t wdata128 __asm("x27") = wdata;
        uint64x2_t failed = vdupq_n_u64(~0);
 
        switch (n) {
        case 32:
        {
-               __uint128_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
-               __uint128_t t10, t11;
-
                asm volatile (
                ".cpu  generic+lse\n"
-               "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t8], %H[t8], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t9], %H[t9], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t10], %H[t10], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t11], %H[t11], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d16, %[t0]\n"
-               "fmov v16.D[1], %H[t0]\n"
-               "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d17, %[t1]\n"
-               "fmov v17.D[1], %H[t1]\n"
-               "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d18, %[t2]\n"
-               "fmov v18.D[1], %H[t2]\n"
-               "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d19, %[t3]\n"
-               "fmov v19.D[1], %H[t3]\n"
-               "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d18, x4\n"
+               "fmov v18.D[1], x5\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d19, x6\n"
+               "fmov v19.D[1], x7\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
                "and %[failed].16B, %[failed].16B, v16.16B\n"
                "and %[failed].16B, %[failed].16B, v17.16B\n"
                "and %[failed].16B, %[failed].16B, v18.16B\n"
                "and %[failed].16B, %[failed].16B, v19.16B\n"
-               "fmov d20, %[t4]\n"
-               "fmov v20.D[1], %H[t4]\n"
-               "fmov d21, %[t5]\n"
-               "fmov v21.D[1], %H[t5]\n"
-               "fmov d22, %[t6]\n"
-               "fmov v22.D[1], %H[t6]\n"
-               "fmov d23, %[t7]\n"
-               "fmov v23.D[1], %H[t7]\n"
+               "fmov d20, x8\n"
+               "fmov v20.D[1], x9\n"
+               "fmov d21, x10\n"
+               "fmov v21.D[1], x11\n"
+               "fmov d22, x12\n"
+               "fmov v22.D[1], x13\n"
+               "fmov d23, x14\n"
+               "fmov v23.D[1], x15\n"
                "and %[failed].16B, %[failed].16B, v20.16B\n"
                "and %[failed].16B, %[failed].16B, v21.16B\n"
                "and %[failed].16B, %[failed].16B, v22.16B\n"
                "and %[failed].16B, %[failed].16B, v23.16B\n"
                "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
                "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
-               "fmov d16, %[t8]\n"
-               "fmov v16.D[1], %H[t8]\n"
-               "fmov d17, %[t9]\n"
-               "fmov v17.D[1], %H[t9]\n"
-               "fmov d18, %[t10]\n"
-               "fmov v18.D[1], %H[t10]\n"
-               "fmov d19, %[t11]\n"
-               "fmov v19.D[1], %H[t11]\n"
+               "fmov d16, x16\n"
+               "fmov v16.D[1], x17\n"
+               "fmov d17, x18\n"
+               "fmov v17.D[1], x19\n"
+               "fmov d18, x20\n"
+               "fmov v18.D[1], x21\n"
+               "fmov d19, x22\n"
+               "fmov v19.D[1], x23\n"
                "and %[failed].16B, %[failed].16B, v16.16B\n"
                "and %[failed].16B, %[failed].16B, v17.16B\n"
                "and %[failed].16B, %[failed].16B, v18.16B\n"
                "and %[failed].16B, %[failed].16B, v19.16B\n"
-               "fmov d20, %[t0]\n"
-               "fmov v20.D[1], %H[t0]\n"
-               "fmov d21, %[t1]\n"
-               "fmov v21.D[1], %H[t1]\n"
-               "fmov d22, %[t2]\n"
-               "fmov v22.D[1], %H[t2]\n"
-               "fmov d23, %[t3]\n"
-               "fmov v23.D[1], %H[t3]\n"
+               "fmov d20, x0\n"
+               "fmov v20.D[1], x1\n"
+               "fmov d21, x2\n"
+               "fmov v21.D[1], x3\n"
+               "fmov d22, x4\n"
+               "fmov v22.D[1], x5\n"
+               "fmov d23, x6\n"
+               "fmov v23.D[1], x7\n"
                "and %[failed].16B, %[failed].16B, v20.16B\n"
                "and %[failed].16B, %[failed].16B, v21.16B\n"
                "and %[failed].16B, %[failed].16B, v22.16B\n"
                "and %[failed].16B, %[failed].16B, v23.16B\n"
                "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
                "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
-               : "+Q" (*addr), [failed] "=&w" (failed),
-               [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
-               [t3] "=&r" (t3), [t4] "=&r" (t4), [t5] "=&r" (t5),
-               [t6] "=&r" (t6), [t7] "=&r" (t7), [t8] "=&r" (t8),
-               [t9] "=&r" (t9), [t10] "=&r" (t10), [t11] "=&r" (t11)
-               : [wdata] "r" (wdata128), [dst] "r" (obj_table),
-               [loc] "r" (addr)
-               : "memory", "v16", "v17", "v18",
-               "v19", "v20", "v21", "v22", "v23"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+               "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+               "x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
+               "v18", "v19", "v20", "v21", "v22", "v23"
                );
                break;
        }
        case 16:
        {
-               __uint128_t t0, t1, t2, t3, t4, t5, t6, t7;
-
                asm volatile (
                ".cpu  generic+lse\n"
-               "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t4], %H[t4], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t5], %H[t5], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t6], %H[t6], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t7], %H[t7], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d16, %[t0]\n"
-               "fmov v16.D[1], %H[t0]\n"
-               "fmov d17, %[t1]\n"
-               "fmov v17.D[1], %H[t1]\n"
-               "fmov d18, %[t2]\n"
-               "fmov v18.D[1], %H[t2]\n"
-               "fmov d19, %[t3]\n"
-               "fmov v19.D[1], %H[t3]\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "fmov d18, x4\n"
+               "fmov v18.D[1], x5\n"
+               "fmov d19, x6\n"
+               "fmov v19.D[1], x7\n"
                "and %[failed].16B, %[failed].16B, v16.16B\n"
                "and %[failed].16B, %[failed].16B, v17.16B\n"
                "and %[failed].16B, %[failed].16B, v18.16B\n"
                "and %[failed].16B, %[failed].16B, v19.16B\n"
-               "fmov d20, %[t4]\n"
-               "fmov v20.D[1], %H[t4]\n"
-               "fmov d21, %[t5]\n"
-               "fmov v21.D[1], %H[t5]\n"
-               "fmov d22, %[t6]\n"
-               "fmov v22.D[1], %H[t6]\n"
-               "fmov d23, %[t7]\n"
-               "fmov v23.D[1], %H[t7]\n"
+               "fmov d20, x8\n"
+               "fmov v20.D[1], x9\n"
+               "fmov d21, x10\n"
+               "fmov v21.D[1], x11\n"
+               "fmov d22, x12\n"
+               "fmov v22.D[1], x13\n"
+               "fmov d23, x14\n"
+               "fmov v23.D[1], x15\n"
                "and %[failed].16B, %[failed].16B, v20.16B\n"
                "and %[failed].16B, %[failed].16B, v21.16B\n"
                "and %[failed].16B, %[failed].16B, v22.16B\n"
                "and %[failed].16B, %[failed].16B, v23.16B\n"
                "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
                "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
-               : "+Q" (*addr), [failed] "=&w" (failed),
-               [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
-               [t3] "=&r" (t3), [t4] "=&r" (t4), [t5] "=&r" (t5),
-               [t6] "=&r" (t6), [t7] "=&r" (t7)
-               : [wdata] "r" (wdata128), [dst] "r" (obj_table),
-               [loc] "r" (addr)
-               : "memory", "v16", "v17", "v18", "v19",
-                 "v20", "v21", "v22", "v23"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+               "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
+               "v17", "v18", "v19", "v20", "v21", "v22", "v23"
                );
                break;
        }
        case 8:
        {
-               __uint128_t t0, t1, t2, t3;
-
                asm volatile (
                ".cpu  generic+lse\n"
-               "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t2], %H[t2], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t3], %H[t3], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d16, %[t0]\n"
-               "fmov v16.D[1], %H[t0]\n"
-               "fmov d17, %[t1]\n"
-               "fmov v17.D[1], %H[t1]\n"
-               "fmov d18, %[t2]\n"
-               "fmov v18.D[1], %H[t2]\n"
-               "fmov d19, %[t3]\n"
-               "fmov v19.D[1], %H[t3]\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "fmov d18, x4\n"
+               "fmov v18.D[1], x5\n"
+               "fmov d19, x6\n"
+               "fmov v19.D[1], x7\n"
                "and %[failed].16B, %[failed].16B, v16.16B\n"
                "and %[failed].16B, %[failed].16B, v17.16B\n"
                "and %[failed].16B, %[failed].16B, v18.16B\n"
                "and %[failed].16B, %[failed].16B, v19.16B\n"
                "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
-               : "+Q" (*addr), [failed] "=&w" (failed),
-               [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2),
-               [t3] "=&r" (t3)
-               : [wdata] "r" (wdata128), [dst] "r" (obj_table),
-               [loc] "r" (addr)
-               : "memory", "v16", "v17", "v18", "v19"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+               "v16", "v17", "v18", "v19"
                );
                break;
        }
        case 4:
        {
-               __uint128_t t0, t1;
-
                asm volatile (
                ".cpu  generic+lse\n"
-               "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-               "casp %[t1], %H[t1], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d16, %[t0]\n"
-               "fmov v16.D[1], %H[t0]\n"
-               "fmov d17, %[t1]\n"
-               "fmov v17.D[1], %H[t1]\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
                "and %[failed].16B, %[failed].16B, v16.16B\n"
                "and %[failed].16B, %[failed].16B, v17.16B\n"
                "st1 { v16.2d, v17.2d}, [%[dst]], 32\n"
-               : "+Q" (*addr), [failed] "=&w" (failed),
-               [t0] "=&r" (t0), [t1] "=&r" (t1)
-               : [wdata] "r" (wdata128), [dst] "r" (obj_table),
-               [loc] "r" (addr)
-               : "memory", "v16", "v17"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "v16", "v17"
                );
                break;
        }
        case 2:
        {
-               __uint128_t t0;
-
                asm volatile (
                ".cpu  generic+lse\n"
-               "casp %[t0], %H[t0], %[wdata], %H[wdata], [%[loc]]\n"
-               "fmov d16, %[t0]\n"
-               "fmov v16.D[1], %H[t0]\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
                "and %[failed].16B, %[failed].16B, v16.16B\n"
                "st1 { v16.2d}, [%[dst]], 16\n"
-               : "+Q" (*addr), [failed] "=&w" (failed),
-               [t0] "=&r" (t0)
-               : [wdata] "r" (wdata128), [dst] "r" (obj_table),
-               [loc] "r" (addr)
-               : "memory", "v16"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "v16"
                );
                break;
        }
@@ -308,7 +285,7 @@ otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n)
        }
 }
 
-static inline int __hot
+static __rte_noinline int __rte_hot
 otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
        const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
@@ -332,9 +309,10 @@ otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
 
        return 0;
 }
-#endif
 
-static inline int __hot
+#else
+
+static inline int __rte_hot
 otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
        const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
@@ -359,6 +337,8 @@ otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
        return 0;
 }
 
+#endif
+
 static unsigned int
 otx2_npa_get_count(const struct rte_mempool *mp)
 {
@@ -372,21 +352,26 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id,
        struct npa_aq_enq_req *aura_init_req, *pool_init_req;
        struct npa_aq_enq_rsp *aura_init_rsp, *pool_init_rsp;
        struct otx2_mbox_dev *mdev = &mbox->dev[0];
+       struct otx2_idev_cfg *idev;
        int rc, off;
 
+       idev = otx2_intra_dev_get_cfg();
+       if (idev == NULL)
+               return -ENOMEM;
+
        aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
 
        aura_init_req->aura_id = aura_id;
        aura_init_req->ctype = NPA_AQ_CTYPE_AURA;
        aura_init_req->op = NPA_AQ_INSTOP_INIT;
-       memcpy(&aura_init_req->aura, aura, sizeof(*aura));
+       otx2_mbox_memcpy(&aura_init_req->aura, aura, sizeof(*aura));
 
        pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
 
        pool_init_req->aura_id = aura_id;
        pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
        pool_init_req->op = NPA_AQ_INSTOP_INIT;
-       memcpy(&pool_init_req->pool, pool, sizeof(*pool));
+       otx2_mbox_memcpy(&pool_init_req->pool, pool, sizeof(*pool));
 
        otx2_mbox_msg_send(mbox, 0);
        rc = otx2_mbox_wait_for_rsp(mbox, 0);
@@ -403,6 +388,44 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id,
                return 0;
        else
                return NPA_LF_ERR_AURA_POOL_INIT;
+
+       if (!(idev->npa_lock_mask & BIT_ULL(aura_id)))
+               return 0;
+
+       aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+       aura_init_req->aura_id = aura_id;
+       aura_init_req->ctype = NPA_AQ_CTYPE_AURA;
+       aura_init_req->op = NPA_AQ_INSTOP_LOCK;
+
+       pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+       if (!pool_init_req) {
+               /* The shared memory buffer may be full.
+                * Flush it and retry.
+                */
+               otx2_mbox_msg_send(mbox, 0);
+               rc = otx2_mbox_wait_for_rsp(mbox, 0);
+               if (rc < 0) {
+                       otx2_err("Failed to LOCK AURA context");
+                       return -ENOMEM;
+               }
+
+               pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+               if (!pool_init_req) {
+                       otx2_err("Failed to LOCK POOL context");
+                       return -ENOMEM;
+               }
+       }
+       pool_init_req->aura_id = aura_id;
+       pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
+       pool_init_req->op = NPA_AQ_INSTOP_LOCK;
+
+       rc = otx2_mbox_process(mbox);
+       if (rc < 0) {
+               otx2_err("Failed to lock POOL ctx to NDC");
+               return -ENOMEM;
+       }
+
+       return 0;
 }
 
 static int
@@ -414,8 +437,13 @@ npa_lf_aura_pool_fini(struct otx2_mbox *mbox,
        struct npa_aq_enq_rsp *aura_rsp, *pool_rsp;
        struct otx2_mbox_dev *mdev = &mbox->dev[0];
        struct ndc_sync_op *ndc_req;
+       struct otx2_idev_cfg *idev;
        int rc, off;
 
+       idev = otx2_intra_dev_get_cfg();
+       if (idev == NULL)
+               return -EINVAL;
+
        /* Procedure for disabling an aura/pool */
        rte_delay_us(10);
        npa_lf_aura_op_alloc(aura_handle, 0);
@@ -458,6 +486,32 @@ npa_lf_aura_pool_fini(struct otx2_mbox *mbox,
                otx2_err("Error on NDC-NPA LF sync, rc %d", rc);
                return NPA_LF_ERR_AURA_POOL_FINI;
        }
+
+       if (!(idev->npa_lock_mask & BIT_ULL(aura_id)))
+               return 0;
+
+       aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+       aura_req->aura_id = aura_id;
+       aura_req->ctype = NPA_AQ_CTYPE_AURA;
+       aura_req->op = NPA_AQ_INSTOP_UNLOCK;
+
+       rc = otx2_mbox_process(mbox);
+       if (rc < 0) {
+               otx2_err("Failed to unlock AURA ctx to NDC");
+               return -EINVAL;
+       }
+
+       pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
+       pool_req->aura_id = aura_id;
+       pool_req->ctype = NPA_AQ_CTYPE_POOL;
+       pool_req->op = NPA_AQ_INSTOP_UNLOCK;
+
+       rc = otx2_mbox_process(mbox);
+       if (rc < 0) {
+               otx2_err("Failed to unlock POOL ctx to NDC");
+               return -EINVAL;
+       }
+
        return 0;
 }
 
@@ -623,14 +677,49 @@ npa_lf_aura_pool_pair_free(struct otx2_npa_lf *lf, uint64_t aura_handle)
        return rc;
 }
 
+static int
+npa_lf_aura_range_update_check(uint64_t aura_handle)
+{
+       uint64_t aura_id = npa_lf_aura_handle_to_aura(aura_handle);
+       struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
+       struct npa_aura_lim *lim = lf->aura_lim;
+       __otx2_io struct npa_pool_s *pool;
+       struct npa_aq_enq_req *req;
+       struct npa_aq_enq_rsp *rsp;
+       int rc;
+
+       req  = otx2_mbox_alloc_msg_npa_aq_enq(lf->mbox);
+
+       req->aura_id = aura_id;
+       req->ctype = NPA_AQ_CTYPE_POOL;
+       req->op = NPA_AQ_INSTOP_READ;
+
+       rc = otx2_mbox_process_msg(lf->mbox, (void *)&rsp);
+       if (rc) {
+               otx2_err("Failed to get pool(0x%"PRIx64") context", aura_id);
+               return rc;
+       }
+
+       pool = &rsp->pool;
+
+       if (lim[aura_id].ptr_start != pool->ptr_start ||
+               lim[aura_id].ptr_end != pool->ptr_end) {
+               otx2_err("Range update failed on pool(0x%"PRIx64")", aura_id);
+               return -ERANGE;
+       }
+
+       return 0;
+}
+
 static int
 otx2_npa_alloc(struct rte_mempool *mp)
 {
        uint32_t block_size, block_count;
+       uint64_t aura_handle = 0;
        struct otx2_npa_lf *lf;
        struct npa_aura_s aura;
        struct npa_pool_s pool;
-       uint64_t aura_handle;
+       size_t padding;
        int rc;
 
        lf = otx2_npa_lf_obj_get();
@@ -640,6 +729,18 @@ otx2_npa_alloc(struct rte_mempool *mp)
        }
 
        block_size = mp->elt_size + mp->header_size + mp->trailer_size;
+       /*
+        * OCTEON TX2 has an 8-set, 41-way L1D cache; VA<9:7> bits dictate
+        * the set selection.
+        * Add padding so that the element size always occupies an odd
+        * number of cache lines, which ensures an even distribution
+        * of elements among the L1D cache sets.
+        */
+       padding = ((block_size / RTE_CACHE_LINE_SIZE) % 2) ? 0 :
+                               RTE_CACHE_LINE_SIZE;
+       mp->trailer_size += padding;
+       block_size += padding;
+
        block_count = mp->size;
 
        if (block_size % OTX2_ALIGN != 0) {
@@ -703,24 +804,21 @@ static ssize_t
 otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
                       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
 {
-       ssize_t mem_size;
+       size_t total_elt_sz;
 
-       /*
-        * Simply need space for one more object to be able to
-        * fulfill alignment requirements.
+       /* Need space for one more obj on each chunk to fulfill
+        * alignment requirements.
         */
-       mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
-                                                       pg_shift,
-                                                       min_chunk_size, align);
-       if (mem_size >= 0) {
-               /*
-                * Memory area which contains objects must be physically
-                * contiguous.
-                */
-               *min_chunk_size = mem_size;
-       }
+       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+       return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+                                               total_elt_sz, min_chunk_size,
+                                               align);
+}
 
-       return mem_size;
+static uint8_t
+otx2_npa_l1d_way_set_get(uint64_t iova)
+{
+       return (iova >> rte_log2_u32(RTE_CACHE_LINE_SIZE)) & 0x7;
 }
 
 static int
@@ -728,8 +826,13 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
                  rte_iova_t iova, size_t len,
                  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
+#define OTX2_L1D_NB_SETS       8
+       uint64_t distribution[OTX2_L1D_NB_SETS];
+       rte_iova_t start_iova;
        size_t total_elt_sz;
+       uint8_t set;
        size_t off;
+       int i;
 
        if (iova == RTE_BAD_IOVA)
                return -EINVAL;
@@ -737,19 +840,45 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
        total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
        /* Align object start address to a multiple of total_elt_sz */
-       off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+       off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
 
        if (len < off)
                return -EINVAL;
 
+
        vaddr = (char *)vaddr + off;
        iova += off;
        len -= off;
 
+       memset(distribution, 0, sizeof(uint64_t) * OTX2_L1D_NB_SETS);
+       start_iova = iova;
+       while (start_iova < iova + len) {
+               set = otx2_npa_l1d_way_set_get(start_iova + mp->header_size);
+               distribution[set]++;
+               start_iova += total_elt_sz;
+       }
+
+       otx2_npa_dbg("iova %"PRIx64", aligned iova %"PRIx64"", iova - off,
+                    iova);
+       otx2_npa_dbg("length %"PRIu64", aligned length %"PRIu64"",
+                    (uint64_t)(len + off), (uint64_t)len);
+       otx2_npa_dbg("element size %"PRIu64"", (uint64_t)total_elt_sz);
+       otx2_npa_dbg("requested objects %"PRIu64", possible objects %"PRIu64"",
+                    (uint64_t)max_objs, (uint64_t)(len / total_elt_sz));
+       otx2_npa_dbg("L1D set distribution :");
+       for (i = 0; i < OTX2_L1D_NB_SETS; i++)
+               otx2_npa_dbg("set[%d] : objects : %"PRIu64"", i,
+                            distribution[i]);
+
        npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
 
-       return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
-                                              obj_cb, obj_cb_arg);
+       if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
+               return -EBUSY;
+
+       return rte_mempool_op_populate_helper(mp,
+                                       RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+                                       max_objs, vaddr, iova, len,
+                                       obj_cb, obj_cb_arg);
 }
 
 static struct rte_mempool_ops otx2_npa_ops = {