+static int __rte_hot
+otx2_npa_enq(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
+{
+ unsigned int index; const uint64_t aura_handle = mp->pool_id;
+ const uint64_t reg = npa_lf_aura_handle_to_aura(aura_handle);
+ const uint64_t addr = npa_lf_aura_handle_to_base(aura_handle) +
+ NPA_LF_AURA_OP_FREE0;
+
+ /* Ensure mbuf init changes are written before the free pointers
+ * are enqueued to the stack.
+ */
+ rte_io_wmb();
+ for (index = 0; index < n; index++)
+ otx2_store_pair((uint64_t)obj_table[index], reg, addr);
+
+ return 0;
+}
+
+static __rte_noinline int
+npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
+ void **obj_table, uint8_t i)
+{
+ uint8_t retry = 4;
+
+ do {
+ obj_table[i] = (void *)otx2_atomic64_add_nosync(wdata, addr);
+ if (obj_table[i] != NULL)
+ return 0;
+
+ } while (retry--);
+
+ return -ENOENT;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __rte_noinline int
+npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
+ void **obj_table, unsigned int n)
+{
+ uint8_t i;
+
+ for (i = 0; i < n; i++) {
+ if (obj_table[i] != NULL)
+ continue;
+ if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
/*
 * Allocate a burst of @n pointers from the aura using 128-bit CASP
 * (ARMv8.1 LSE) atomics on the ALLOC register, streaming the results
 * into @obj_table via NEON stores.
 *
 * @n must be a power of two <= 32 (caller guarantees this; the switch
 * has no default and other values would fall through returning 0
 * without touching obj_table).
 *
 * Each CASP returns a pair of pointers in two GPRs; a failed
 * allocation yields 0 in that lane. All returned pairs are AND-ed
 * into @failed: NPA pointers share non-zero base bits, so presumably
 * the AND stays non-zero iff every allocation succeeded — on a zero
 * lane we fall back to the slot-by-slot search path. TODO confirm the
 * non-zero-AND invariant against the NPA pointer layout.
 *
 * Returns 0 on full success, -ENOENT if the fallback also fails
 * (obj_table then holds a partial result for the caller to roll back).
 */
static __rte_noinline int
npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
			  unsigned int n, void **obj_table)
{
	/* Pin the CASP compare/swap data in x26/x27 (GCC explicit-register
	 * variables): the asm bodies below clobber x0-x23, so the inputs
	 * must live outside that range. NOTE(review): x18 is the platform
	 * register on some ABIs — confirm the target permits clobbering it.
	 */
	register const uint64_t wdata64 __asm("x26") = wdata;
	register const uint64_t wdata128 __asm("x27") = wdata;
	/* All-ones accumulator; AND-ed with every returned pointer pair. */
	uint64x2_t failed = vdupq_n_u64(~0);

	switch (n) {
	case 32:
	{
		/* 16 CASPs = 32 pointers; loads are interleaved with
		 * fmov/NEON moves and st1 stores to hide latency.
		 */
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
		/* Move first pairs to NEON regs, reusing x0-x7 for the
		 * last four CASPs while earlier results drain.
		 */
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d18, x4\n"
		"fmov v18.D[1], x5\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d19, x6\n"
		"fmov v19.D[1], x7\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"fmov d20, x8\n"
		"fmov v20.D[1], x9\n"
		"fmov d21, x10\n"
		"fmov v21.D[1], x11\n"
		"fmov d22, x12\n"
		"fmov v22.D[1], x13\n"
		"fmov d23, x14\n"
		"fmov v23.D[1], x15\n"
		"and %[failed].16B, %[failed].16B, v20.16B\n"
		"and %[failed].16B, %[failed].16B, v21.16B\n"
		"and %[failed].16B, %[failed].16B, v22.16B\n"
		"and %[failed].16B, %[failed].16B, v23.16B\n"
		/* Store first 16 pointers; dst post-increments by 64. */
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
		"fmov d16, x16\n"
		"fmov v16.D[1], x17\n"
		"fmov d17, x18\n"
		"fmov v17.D[1], x19\n"
		"fmov d18, x20\n"
		"fmov v18.D[1], x21\n"
		"fmov d19, x22\n"
		"fmov v19.D[1], x23\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"fmov d20, x0\n"
		"fmov v20.D[1], x1\n"
		"fmov d21, x2\n"
		"fmov v21.D[1], x3\n"
		"fmov d22, x4\n"
		"fmov v22.D[1], x5\n"
		"fmov d23, x6\n"
		"fmov v23.D[1], x7\n"
		"and %[failed].16B, %[failed].16B, v20.16B\n"
		"and %[failed].16B, %[failed].16B, v21.16B\n"
		"and %[failed].16B, %[failed].16B, v22.16B\n"
		"and %[failed].16B, %[failed].16B, v23.16B\n"
		/* Store remaining 16 pointers. */
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
		: "+Q" (*addr), [failed] "=&w" (failed)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[dst] "r" (obj_table), [loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
		"x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
		"v18", "v19", "v20", "v21", "v22", "v23"
		);
		break;
	}
	case 16:
	{
		/* 8 CASPs = 16 pointers. */
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"fmov d18, x4\n"
		"fmov v18.D[1], x5\n"
		"fmov d19, x6\n"
		"fmov v19.D[1], x7\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"fmov d20, x8\n"
		"fmov v20.D[1], x9\n"
		"fmov d21, x10\n"
		"fmov v21.D[1], x11\n"
		"fmov d22, x12\n"
		"fmov v22.D[1], x13\n"
		"fmov d23, x14\n"
		"fmov v23.D[1], x15\n"
		"and %[failed].16B, %[failed].16B, v20.16B\n"
		"and %[failed].16B, %[failed].16B, v21.16B\n"
		"and %[failed].16B, %[failed].16B, v22.16B\n"
		"and %[failed].16B, %[failed].16B, v23.16B\n"
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
		: "+Q" (*addr), [failed] "=&w" (failed)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[dst] "r" (obj_table), [loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
		"v17", "v18", "v19", "v20", "v21", "v22", "v23"
		);
		break;
	}
	case 8:
	{
		/* 4 CASPs = 8 pointers. */
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"fmov d18, x4\n"
		"fmov v18.D[1], x5\n"
		"fmov d19, x6\n"
		"fmov v19.D[1], x7\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		: "+Q" (*addr), [failed] "=&w" (failed)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[dst] "r" (obj_table), [loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
		"v16", "v17", "v18", "v19"
		);
		break;
	}
	case 4:
	{
		/* 2 CASPs = 4 pointers. */
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"st1 { v16.2d, v17.2d}, [%[dst]], 32\n"
		: "+Q" (*addr), [failed] "=&w" (failed)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[dst] "r" (obj_table), [loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "v16", "v17"
		);
		break;
	}
	case 2:
	{
		/* Single CASP = 2 pointers. */
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"st1 { v16.2d}, [%[dst]], 16\n"
		: "+Q" (*addr), [failed] "=&w" (failed)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[dst] "r" (obj_table), [loc] "r" (addr)
		: "memory", "x0", "x1", "v16"
		);
		break;
	}
	case 1:
		/* Scalar path; returns 0 or -ENOENT directly. */
		return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0);
	}

	/* Any zero lane in the AND accumulator means at least one CASP
	 * lane returned 0 (empty aura); retry the NULL slots one by one.
	 * obj_table was post-incremented by the st1 stores, so rewind it
	 * by n entries before handing it to the search path.
	 */
	if (unlikely(!(vgetq_lane_u64(failed, 0) & vgetq_lane_u64(failed, 1))))
		return npa_lf_aura_op_search_alloc(wdata, addr, (void **)
			((char *)obj_table - (sizeof(uint64_t) * n)), n);

	return 0;
}
+
+static __rte_noinline void
+otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++) {
+ if (obj_table[i] != NULL) {
+ otx2_npa_enq(mp, &obj_table[i], 1);
+ obj_table[i] = NULL;
+ }
+ }
+}
+
+static __rte_noinline int __rte_hot
+otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+ const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
+ void **obj_table_bak = obj_table;
+ const unsigned int nfree = n;
+ unsigned int parts;
+
+ int64_t * const addr = (int64_t * const)
+ (npa_lf_aura_handle_to_base(mp->pool_id) +
+ NPA_LF_AURA_OP_ALLOCX(0));
+ while (n) {
+ parts = n > 31 ? 32 : rte_align32prevpow2(n);
+ n -= parts;
+ if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr,
+ parts, obj_table))) {
+ otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n);
+ return -ENOENT;
+ }
+ obj_table += parts;
+ }
+
+ return 0;
+}
+
+#else
+
+static inline int __rte_hot
+otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+ const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
+ unsigned int index;
+ uint64_t obj;
+
+ int64_t * const addr = (int64_t *)
+ (npa_lf_aura_handle_to_base(mp->pool_id) +
+ NPA_LF_AURA_OP_ALLOCX(0));
+ for (index = 0; index < n; index++, obj_table++) {
+ obj = npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0);
+ if (obj == 0) {
+ for (; index > 0; index--) {
+ obj_table--;
+ otx2_npa_enq(mp, obj_table, 1);
+ }
+ return -ENOENT;
+ }
+ *obj_table = (void *)obj;
+ }
+
+ return 0;
+}
+
+#endif
+
+static unsigned int
+otx2_npa_get_count(const struct rte_mempool *mp)
+{
+ return (unsigned int)npa_lf_aura_op_available(mp->pool_id);
+}
+