mempool: introduce helpers for populate and required size
[dpdk.git] / drivers / mempool / octeontx2 / otx2_mempool_ops.c
index c59bd73..3aea92a 100644 (file)
@@ -37,6 +37,277 @@ npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
        return -ENOENT;
 }
 
+#if defined(RTE_ARCH_ARM64)
+/*
+ * Slow retry path for npa_lf_aura_op_alloc_bulk(): walk a partially
+ * filled obj_table and allocate one object at a time into every slot the
+ * CASP burst left NULL.  Already-filled slots are left untouched.
+ * Returns 0 once every hole is filled, -ENOENT as soon as a single
+ * allocation fails.
+ */
+static __rte_noinline int
+npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
+               void **obj_table, unsigned int n)
+{
+       /* n is at most 32 (the bulk burst size), so a uint8_t index suffices */
+       uint8_t i;
+
+       for (i = 0; i < n; i++) {
+               if (obj_table[i] != NULL)
+                       continue;
+               if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
+                       return -ENOENT;
+       }
+
+       return 0;
+}
+
+/*
+ * Burst-allocate n object pointers from the NPA aura using LSE CASP
+ * (compare-and-swap pair) operations on the aura's ALLOCX register.
+ * Each CASP returns two object pointers in an even/odd GPR pair; pairs
+ * are moved into NEON registers, AND-ed into `failed' for post-burst
+ * error detection, and stored to obj_table via post-incrementing ST1.
+ *
+ * n must be a power of two in [1, 32]; any other value falls through
+ * the switch untouched and returns success without allocating (the
+ * caller, otx2_npa_deq_arm64(), only ever passes such values).
+ *
+ * wdata is pinned in x26/x27: CASP takes its data operands as a
+ * consecutive register pair, and those two stay clear of the x0-x23
+ * scratch registers the asm clobbers for results.
+ */
+static __rte_noinline int
+npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
+                         unsigned int n, void **obj_table)
+{
+       register const uint64_t wdata64 __asm("x26") = wdata;
+       register const uint64_t wdata128 __asm("x27") = wdata;
+       /* Start all-ones; any 0 (failed) allocation pulls bits down */
+       uint64x2_t failed = vdupq_n_u64(~0);
+
+       switch (n) {
+       case 32:
+       {
+               /* 16 CASPs -> 32 pointers; GPR->NEON moves and stores are
+                * interleaved with the later CASPs to hide latency.
+                */
+               asm volatile (
+               ".cpu  generic+lse\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d18, x4\n"
+               "fmov v18.D[1], x5\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d19, x6\n"
+               "fmov v19.D[1], x7\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "and %[failed].16B, %[failed].16B, v16.16B\n"
+               "and %[failed].16B, %[failed].16B, v17.16B\n"
+               "and %[failed].16B, %[failed].16B, v18.16B\n"
+               "and %[failed].16B, %[failed].16B, v19.16B\n"
+               "fmov d20, x8\n"
+               "fmov v20.D[1], x9\n"
+               "fmov d21, x10\n"
+               "fmov v21.D[1], x11\n"
+               "fmov d22, x12\n"
+               "fmov v22.D[1], x13\n"
+               "fmov d23, x14\n"
+               "fmov v23.D[1], x15\n"
+               "and %[failed].16B, %[failed].16B, v20.16B\n"
+               "and %[failed].16B, %[failed].16B, v21.16B\n"
+               "and %[failed].16B, %[failed].16B, v22.16B\n"
+               "and %[failed].16B, %[failed].16B, v23.16B\n"
+               "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+               "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+               "fmov d16, x16\n"
+               "fmov v16.D[1], x17\n"
+               "fmov d17, x18\n"
+               "fmov v17.D[1], x19\n"
+               "fmov d18, x20\n"
+               "fmov v18.D[1], x21\n"
+               "fmov d19, x22\n"
+               "fmov v19.D[1], x23\n"
+               "and %[failed].16B, %[failed].16B, v16.16B\n"
+               "and %[failed].16B, %[failed].16B, v17.16B\n"
+               "and %[failed].16B, %[failed].16B, v18.16B\n"
+               "and %[failed].16B, %[failed].16B, v19.16B\n"
+               "fmov d20, x0\n"
+               "fmov v20.D[1], x1\n"
+               "fmov d21, x2\n"
+               "fmov v21.D[1], x3\n"
+               "fmov d22, x4\n"
+               "fmov v22.D[1], x5\n"
+               "fmov d23, x6\n"
+               "fmov v23.D[1], x7\n"
+               "and %[failed].16B, %[failed].16B, v20.16B\n"
+               "and %[failed].16B, %[failed].16B, v21.16B\n"
+               "and %[failed].16B, %[failed].16B, v22.16B\n"
+               "and %[failed].16B, %[failed].16B, v23.16B\n"
+               "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+               "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+               "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+               "x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
+               "v18", "v19", "v20", "v21", "v22", "v23"
+               );
+               break;
+       }
+       case 16:
+       {
+               /* 8 CASPs -> 16 pointers */
+               asm volatile (
+               ".cpu  generic+lse\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "fmov d18, x4\n"
+               "fmov v18.D[1], x5\n"
+               "fmov d19, x6\n"
+               "fmov v19.D[1], x7\n"
+               "and %[failed].16B, %[failed].16B, v16.16B\n"
+               "and %[failed].16B, %[failed].16B, v17.16B\n"
+               "and %[failed].16B, %[failed].16B, v18.16B\n"
+               "and %[failed].16B, %[failed].16B, v19.16B\n"
+               "fmov d20, x8\n"
+               "fmov v20.D[1], x9\n"
+               "fmov d21, x10\n"
+               "fmov v21.D[1], x11\n"
+               "fmov d22, x12\n"
+               "fmov v22.D[1], x13\n"
+               "fmov d23, x14\n"
+               "fmov v23.D[1], x15\n"
+               "and %[failed].16B, %[failed].16B, v20.16B\n"
+               "and %[failed].16B, %[failed].16B, v21.16B\n"
+               "and %[failed].16B, %[failed].16B, v22.16B\n"
+               "and %[failed].16B, %[failed].16B, v23.16B\n"
+               "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+               "st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+               "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
+               "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+               );
+               break;
+       }
+       case 8:
+       {
+               /* 4 CASPs -> 8 pointers */
+               asm volatile (
+               ".cpu  generic+lse\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "fmov d18, x4\n"
+               "fmov v18.D[1], x5\n"
+               "fmov d19, x6\n"
+               "fmov v19.D[1], x7\n"
+               "and %[failed].16B, %[failed].16B, v16.16B\n"
+               "and %[failed].16B, %[failed].16B, v17.16B\n"
+               "and %[failed].16B, %[failed].16B, v18.16B\n"
+               "and %[failed].16B, %[failed].16B, v19.16B\n"
+               "st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+               "v16", "v17", "v18", "v19"
+               );
+               break;
+       }
+       case 4:
+       {
+               /* 2 CASPs -> 4 pointers */
+               asm volatile (
+               ".cpu  generic+lse\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "fmov d17, x2\n"
+               "fmov v17.D[1], x3\n"
+               "and %[failed].16B, %[failed].16B, v16.16B\n"
+               "and %[failed].16B, %[failed].16B, v17.16B\n"
+               "st1 { v16.2d, v17.2d}, [%[dst]], 32\n"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "x2", "x3", "v16", "v17"
+               );
+               break;
+       }
+       case 2:
+       {
+               /* 1 CASP -> 2 pointers */
+               asm volatile (
+               ".cpu  generic+lse\n"
+               "casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
+               "fmov d16, x0\n"
+               "fmov v16.D[1], x1\n"
+               "and %[failed].16B, %[failed].16B, v16.16B\n"
+               "st1 { v16.2d}, [%[dst]], 16\n"
+               : "+Q" (*addr), [failed] "=&w" (failed)
+               : [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
+               [dst] "r" (obj_table), [loc] "r" (addr)
+               : "memory", "x0", "x1", "v16"
+               );
+               break;
+       }
+       case 1:
+               /* Single object: the plain scalar allocator is cheaper */
+               return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0);
+       }
+
+       /*
+        * A 0 return from the aura (empty pool) forces the accumulated AND
+        * toward 0 in one of the lanes.  NOTE(review): this assumes valid
+        * object pointers from a pool always share non-zero bits in both
+        * 64-bit lanes - confirm against the NPA object address layout.
+        *
+        * NOTE(review): [dst] is an input-only asm operand, yet the ST1
+        * post-indexing writes the register back; the rewind below relies
+        * on the compiler re-reading the advanced register rather than the
+        * original value.  A "+r" constraint would make this well-defined.
+        */
+       if (unlikely(!(vgetq_lane_u64(failed, 0) & vgetq_lane_u64(failed, 1))))
+               return npa_lf_aura_op_search_alloc(wdata, addr, (void **)
+                       ((char *)obj_table - (sizeof(uint64_t) * n)), n);
+
+       return 0;
+}
+
+/*
+ * Undo a partially successful bulk dequeue: return every object pointer
+ * already placed in obj_table[0..n-1] to the pool and reset its slot to
+ * NULL, so the caller can report a clean all-or-nothing -ENOENT.
+ */
+static __rte_noinline void
+otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+       unsigned int i;
+
+       for (i = 0; i < n; i++) {
+               if (obj_table[i] != NULL) {
+                       otx2_npa_enq(mp, &obj_table[i], 1);
+                       obj_table[i] = NULL;
+               }
+       }
+}
+
+/*
+ * arm64 mempool dequeue: split the request into power-of-two bursts of
+ * at most 32 objects and hand each burst to the CASP-based bulk
+ * allocator.  On any burst failure everything allocated so far is
+ * returned to the pool and -ENOENT is reported (all-or-nothing
+ * semantics, as rte_mempool dequeue requires).
+ */
+static __rte_noinline int __hot
+otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
+{
+       const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
+       void **obj_table_bak = obj_table;       /* kept for rollback */
+       const unsigned int nfree = n;
+       unsigned int parts;
+
+       /* Address of the aura's ALLOCX(0) operation register */
+       int64_t * const addr = (int64_t * const)
+                       (npa_lf_aura_handle_to_base(mp->pool_id) +
+                               NPA_LF_AURA_OP_ALLOCX(0));
+       while (n) {
+               /* Largest power of two <= n, capped at 32 */
+               parts = n > 31 ? 32 : rte_align32prevpow2(n);
+               n -= parts;
+               if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr,
+                               parts, obj_table))) {
+                       /* nfree - n == number of slots attempted so far */
+                       otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n);
+                       return -ENOENT;
+               }
+               obj_table += parts;
+       }
+
+       return 0;
+}
+
+#else
+
 static inline int __hot
 otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
 {
@@ -44,7 +315,7 @@ otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
        unsigned int index;
        uint64_t obj;
 
-       int64_t * const addr = (int64_t * const)
+       int64_t * const addr = (int64_t *)
                        (npa_lf_aura_handle_to_base(mp->pool_id) +
                                NPA_LF_AURA_OP_ALLOCX(0));
        for (index = 0; index < n; index++, obj_table++) {
@@ -62,6 +333,8 @@ otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
        return 0;
 }
 
+#endif
+
 static unsigned int
 otx2_npa_get_count(const struct rte_mempool *mp)
 {
@@ -82,14 +355,14 @@ npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id,
        aura_init_req->aura_id = aura_id;
        aura_init_req->ctype = NPA_AQ_CTYPE_AURA;
        aura_init_req->op = NPA_AQ_INSTOP_INIT;
-       memcpy(&aura_init_req->aura, aura, sizeof(*aura));
+       otx2_mbox_memcpy(&aura_init_req->aura, aura, sizeof(*aura));
 
        pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
 
        pool_init_req->aura_id = aura_id;
        pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
        pool_init_req->op = NPA_AQ_INSTOP_INIT;
-       memcpy(&pool_init_req->pool, pool, sizeof(*pool));
+       otx2_mbox_memcpy(&pool_init_req->pool, pool, sizeof(*pool));
 
        otx2_mbox_msg_send(mbox, 0);
        rc = otx2_mbox_wait_for_rsp(mbox, 0);
@@ -326,6 +599,40 @@ npa_lf_aura_pool_pair_free(struct otx2_npa_lf *lf, uint64_t aura_handle)
        return rc;
 }
 
+/*
+ * Verify that the pool address range programmed earlier actually took
+ * effect in hardware: read the pool context back over the AF mailbox and
+ * compare its ptr_start/ptr_end against the limits cached in the local
+ * aura_lim table.  Returns 0 on match, the mailbox error code if the
+ * context read fails, or -ERANGE on mismatch.
+ */
+static int
+npa_lf_aura_range_update_check(uint64_t aura_handle)
+{
+       uint64_t aura_id = npa_lf_aura_handle_to_aura(aura_handle);
+       struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
+       struct npa_aura_lim *lim = lf->aura_lim;
+       __otx2_io struct npa_pool_s *pool;
+       struct npa_aq_enq_req *req;
+       struct npa_aq_enq_rsp *rsp;
+       int rc;
+
+       /* NOTE(review): req is dereferenced without a NULL check - confirm
+        * otx2_mbox_alloc_msg_npa_aq_enq() cannot fail in this context.
+        */
+       req  = otx2_mbox_alloc_msg_npa_aq_enq(lf->mbox);
+
+       req->aura_id = aura_id;
+       req->ctype = NPA_AQ_CTYPE_POOL;
+       req->op = NPA_AQ_INSTOP_READ;
+
+       rc = otx2_mbox_process_msg(lf->mbox, (void *)&rsp);
+       if (rc) {
+               otx2_err("Failed to get pool(0x%"PRIx64") context", aura_id);
+               return rc;
+       }
+
+       pool = &rsp->pool;
+
+       if (lim[aura_id].ptr_start != pool->ptr_start ||
+               lim[aura_id].ptr_end != pool->ptr_end) {
+               otx2_err("Range update failed on pool(0x%"PRIx64")", aura_id);
+               return -ERANGE;
+       }
+
+       return 0;
+}
+
 static int
 otx2_npa_alloc(struct rte_mempool *mp)
 {
@@ -406,24 +713,12 @@ static ssize_t
 otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
                       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
 {
-       ssize_t mem_size;
-
        /*
         * Simply need space for one more object to be able to
         * fulfill alignment requirements.
         */
-       mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
-                                                       pg_shift,
-                                                       min_chunk_size, align);
-       if (mem_size >= 0) {
-               /*
-                * Memory area which contains objects must be physically
-                * contiguous.
-                */
-               *min_chunk_size = mem_size;
-       }
-
-       return mem_size;
+       return rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1, pg_shift,
+                                                   min_chunk_size, align);
 }
 
 static int
@@ -451,7 +746,10 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 
        npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
 
-       return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+       if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
+               return -EBUSY;
+
+       return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
                                               obj_cb, obj_cb_arg);
 }
 
@@ -463,7 +761,11 @@ static struct rte_mempool_ops otx2_npa_ops = {
        .get_count = otx2_npa_get_count,
        .calc_mem_size = otx2_npa_calc_mem_size,
        .populate = otx2_npa_populate,
+#if defined(RTE_ARCH_ARM64)
+       .dequeue = otx2_npa_deq_arm64,
+#else
        .dequeue = otx2_npa_deq,
+#endif
 };
 
 MEMPOOL_REGISTER_OPS(otx2_npa_ops);