From 2409f53a27ab64099ea8c63b1a0f87138d27e68c Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Tue, 6 Apr 2021 20:11:06 +0530
Subject: [PATCH] common/cnxk: support NPA bulk alloc/free

Add APIs to alloc/free in bulk from NPA pool.

Signed-off-by: Ashwin Sekhar T K
Acked-by: Nithin Dabilpuram
---
 drivers/common/cnxk/roc_npa.h | 277 ++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)

diff --git a/drivers/common/cnxk/roc_npa.h b/drivers/common/cnxk/roc_npa.h
index 698384984e..b829b2319e 100644
--- a/drivers/common/cnxk/roc_npa.h
+++ b/drivers/common/cnxk/roc_npa.h
@@ -8,6 +8,11 @@
 #define ROC_AURA_ID_MASK (BIT_ULL(16) - 1)
 #define ROC_AURA_OP_LIMIT_MASK (BIT_ULL(36) - 1)
 
+/* 16 CASP instructions can be outstanding in CN9k, but we use only 15
+ * outstanding CASPs as we run out of registers.
+ */
+#define ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS 30
+
 /*
  * Generate 64bit handle to have optimized alloc and free aura operation.
  * 0 - ROC_AURA_ID_MASK for storing the aura_id.
@@ -141,6 +146,278 @@ roc_npa_aura_op_available(uint64_t aura_handle)
 	return reg & 0xFFFFFFFFF;
 }
 
+/*
+ * Return 'num' pointers from 'buf' to the aura, one pointer per
+ * roc_npa_aura_op_free() call. 'fabs' is forwarded unchanged to each call.
+ */
+static inline void
+roc_npa_aura_op_bulk_free(uint64_t aura_handle, uint64_t const *buf,
+			  unsigned int num, const int fabs)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		const uint64_t inbuf = buf[i];
+
+		roc_npa_aura_op_free(aura_handle, fabs, inbuf);
+	}
+}
+
+/*
+ * Allocate 'num' pointers from the aura into 'buf' in a single burst.
+ *
+ * On aarch64, bursts of 30, 16, 8, 4 and 2 pointers are issued as
+ * back-to-back CASP instructions on the aura ALLOC address, each CASP
+ * producing one register pair that is stored to 'buf'. Any other count is
+ * allocated one pointer at a time with roc_npa_aura_op_alloc(), as on
+ * non-aarch64 builds.
+ *
+ * Entries that come back as zero are squeezed out, so the valid pointers
+ * end up contiguous at the start of 'buf'.
+ *
+ * 'drop' sets the DROP bit in the alloc request on the CASP paths and is
+ * passed to roc_npa_aura_op_alloc() on the scalar paths.
+ *
+ * Returns the number of non-zero pointers written to 'buf'.
+ */
+static inline unsigned int
+roc_npa_aura_bulk_alloc(uint64_t aura_handle, uint64_t *buf, unsigned int num,
+			const int drop)
+{
+#if defined(__aarch64__)
+	uint64_t wdata = roc_npa_aura_handle_to_aura(aura_handle);
+	unsigned int i, count;
+	uint64_t addr;
+
+	if (drop)
+		wdata |= BIT_ULL(63); /* DROP */
+
+	addr = roc_npa_aura_handle_to_base(aura_handle) +
+	       NPA_LF_AURA_OP_ALLOCX(0);
+
+	switch (num) {
+	case 30:
+		/* x0-x29 receive the 15 result pairs and x30 holds the
+		 * address, so the operands and x24-x30 are parked in v18-v22
+		 * for the duration and moved back before leaving the asm.
+		 */
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov v18.d[0], %[dst]\n"
+			"mov v18.d[1], %[loc]\n"
+			"mov v19.d[0], %[wdata]\n"
+			"mov v19.d[1], x30\n"
+			"mov v20.d[0], x24\n"
+			"mov v20.d[1], x25\n"
+			"mov v21.d[0], x26\n"
+			"mov v21.d[1], x27\n"
+			"mov v22.d[0], x28\n"
+			"mov v22.d[1], x29\n"
+			"mov x28, v19.d[0]\n"
+			"mov x29, v19.d[0]\n"
+			"mov x30, v18.d[1]\n"
+			"casp x0, x1, x28, x29, [x30]\n"
+			"casp x2, x3, x28, x29, [x30]\n"
+			"casp x4, x5, x28, x29, [x30]\n"
+			"casp x6, x7, x28, x29, [x30]\n"
+			"casp x8, x9, x28, x29, [x30]\n"
+			"casp x10, x11, x28, x29, [x30]\n"
+			"casp x12, x13, x28, x29, [x30]\n"
+			"casp x14, x15, x28, x29, [x30]\n"
+			"casp x16, x17, x28, x29, [x30]\n"
+			"casp x18, x19, x28, x29, [x30]\n"
+			"casp x20, x21, x28, x29, [x30]\n"
+			"casp x22, x23, x28, x29, [x30]\n"
+			"casp x24, x25, x28, x29, [x30]\n"
+			"casp x26, x27, x28, x29, [x30]\n"
+			"casp x28, x29, x28, x29, [x30]\n"
+			"mov x30, v18.d[0]\n"
+			"stp x0, x1, [x30]\n"
+			"stp x2, x3, [x30, #16]\n"
+			"stp x4, x5, [x30, #32]\n"
+			"stp x6, x7, [x30, #48]\n"
+			"stp x8, x9, [x30, #64]\n"
+			"stp x10, x11, [x30, #80]\n"
+			"stp x12, x13, [x30, #96]\n"
+			"stp x14, x15, [x30, #112]\n"
+			"stp x16, x17, [x30, #128]\n"
+			"stp x18, x19, [x30, #144]\n"
+			"stp x20, x21, [x30, #160]\n"
+			"stp x22, x23, [x30, #176]\n"
+			"stp x24, x25, [x30, #192]\n"
+			"stp x26, x27, [x30, #208]\n"
+			"stp x28, x29, [x30, #224]\n"
+			"mov %[dst], v18.d[0]\n"
+			"mov %[loc], v18.d[1]\n"
+			"mov %[wdata], v19.d[0]\n"
+			"mov x30, v19.d[1]\n"
+			"mov x24, v20.d[0]\n"
+			"mov x25, v20.d[1]\n"
+			"mov x26, v21.d[0]\n"
+			"mov x27, v21.d[1]\n"
+			"mov x28, v22.d[0]\n"
+			"mov x29, v22.d[1]\n"
+			:
+			: [wdata] "r"(wdata), [loc] "r"(addr), [dst] "r"(buf)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
+			  "x15", "x16", "x17", "x18", "x19", "x20", "x21",
+			  "x22", "x23", "v18", "v19", "v20", "v21", "v22");
+		break;
+	case 16:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"casp x4, x5, x16, x17, [%[loc]]\n"
+			"casp x6, x7, x16, x17, [%[loc]]\n"
+			"casp x8, x9, x16, x17, [%[loc]]\n"
+			"casp x10, x11, x16, x17, [%[loc]]\n"
+			"casp x12, x13, x16, x17, [%[loc]]\n"
+			"casp x14, x15, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			"stp x4, x5, [%[dst], #32]\n"
+			"stp x6, x7, [%[dst], #48]\n"
+			"stp x8, x9, [%[dst], #64]\n"
+			"stp x10, x11, [%[dst], #80]\n"
+			"stp x12, x13, [%[dst], #96]\n"
+			"stp x14, x15, [%[dst], #112]\n"
+			:
+			: [wdata] "r" (wdata), [dst] "r" (buf), [loc] "r" (addr)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
+			  "x15", "x16", "x17"
+		);
+		break;
+	case 8:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"casp x4, x5, x16, x17, [%[loc]]\n"
+			"casp x6, x7, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			"stp x4, x5, [%[dst], #32]\n"
+			"stp x6, x7, [%[dst], #48]\n"
+			:
+			: [wdata] "r" (wdata), [dst] "r" (buf), [loc] "r" (addr)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x16", "x17"
+		);
+		break;
+	case 4:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			:
+			: [wdata] "r" (wdata), [dst] "r" (buf), [loc] "r" (addr)
+			: "memory", "x0", "x1", "x2", "x3", "x16", "x17"
+		);
+		break;
+	case 2:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			:
+			: [wdata] "r" (wdata), [dst] "r" (buf), [loc] "r" (addr)
+			: "memory", "x0", "x1", "x16", "x17"
+		);
+		break;
+	case 1:
+		buf[0] = roc_npa_aura_op_alloc(aura_handle, drop);
+		return !!buf[0];
+	default:
+		/* No CASP sequence exists for this count. Falling through to
+		 * the packing loop would read entries of 'buf' that were never
+		 * written, so allocate one pointer at a time instead.
+		 */
+		for (i = 0, count = 0; i < num; i++) {
+			buf[count] = roc_npa_aura_op_alloc(aura_handle, drop);
+			if (buf[count])
+				count++;
+		}
+		return count;
+	}
+
+	/* Pack the pointers */
+	for (i = 0, count = 0; i < num; i++)
+		if (buf[i])
+			buf[count++] = buf[i];
+
+	return count;
+#else
+	unsigned int i, count;
+
+	for (i = 0, count = 0; i < num; i++) {
+		buf[count] = roc_npa_aura_op_alloc(aura_handle, drop);
+		if (buf[count])
+			count++;
+	}
+
+	return count;
+#endif
+}
+
+/*
+ * Allocate 'num' pointers from the aura into 'buf'.
+ *
+ * The request is issued in bursts of ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS
+ * pointers while at least that many remain, then in bursts sized by
+ * plt_align32prevpow2() of the remaining count. Allocation stops at the
+ * first burst that comes up short.
+ *
+ * If fewer than 'num' pointers were obtained and 'partial' is zero, the
+ * pointers obtained so far are freed again and 0 is returned. Otherwise the
+ * number of pointers written to the start of 'buf' is returned.
+ */
+static inline unsigned int
+roc_npa_aura_op_bulk_alloc(uint64_t aura_handle, uint64_t *buf,
+			   unsigned int num, const int drop, const int partial)
+{
+	unsigned int chunk, count, num_alloc;
+
+	count = 0;
+	while (num) {
+		chunk = (num >= ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS) ?
+				      ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS :
+				      plt_align32prevpow2(num);
+
+		num_alloc =
+			roc_npa_aura_bulk_alloc(aura_handle, buf, chunk, drop);
+
+		count += num_alloc;
+		buf += num_alloc;
+		num -= num_alloc;
+
+		if (unlikely(num_alloc != chunk))
+			break;
+	}
+
+	/* If the requested number of pointers was not allocated and if partial
+	 * alloc is not desired, then free allocated pointers.
+	 */
+	if (unlikely(num != 0 && !partial)) {
+		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
+		count = 0;
+	}
+
+	return count;
+}
+
 struct roc_npa {
 	struct plt_pci_device *pci_dev;
 
-- 
2.20.1