X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fcommon%2Fcnxk%2Froc_npa.h;h=89f5c6facb854ae3da3f1d3ed1feedae10d36145;hb=218d022e1f3f4b5b54ebe7c02c51fb1b495e4398;hp=698384984e6e9200cbfbbdc34e68af20db5399bc;hpb=f765f561124014045702051c8965dd5d41b430fd;p=dpdk.git

diff --git a/drivers/common/cnxk/roc_npa.h b/drivers/common/cnxk/roc_npa.h
index 698384984e..89f5c6facb 100644
--- a/drivers/common/cnxk/roc_npa.h
+++ b/drivers/common/cnxk/roc_npa.h
@@ -8,6 +8,14 @@
 #define ROC_AURA_ID_MASK       (BIT_ULL(16) - 1)
 #define ROC_AURA_OP_LIMIT_MASK (BIT_ULL(36) - 1)
 
+#define ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS 512
+#define ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS  15
+
+/* Up to 16 CASP instructions can be outstanding on CN9K, but we use only 15
+ * outstanding CASPs as we run out of registers.
+ */
+#define ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS 30
+
 /*
  * Generate 64bit handle to have optimized alloc and free aura operation.
  * 0 - ROC_AURA_ID_MASK for storing the aura_id.
@@ -141,6 +149,478 @@ roc_npa_aura_op_available(uint64_t aura_handle)
 	return reg & 0xFFFFFFFFF;
 }
 
+static inline uint64_t
+roc_npa_pool_op_performance_counter(uint64_t aura_handle, const int drop)
+{
+	union {
+		uint64_t u;
+		struct npa_aura_op_wdata_s s;
+	} op_wdata;
+	int64_t *addr;
+	uint64_t reg;
+
+	op_wdata.u = 0;
+	op_wdata.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	if (drop)
+		op_wdata.s.drop = 1; /* DROP */
+
+	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
+			   NPA_LF_POOL_OP_PC);
+
+	reg = roc_atomic64_add_nosync(op_wdata.u, addr);
+	/*
+	 * NPA_LF_POOL_OP_PC Read Data
+	 *
+	 *   63       49 48     48 47    0
+	 *   -----------------------------
+	 *   | Reserved | OP_ERR | OP_PC |
+	 *   -----------------------------
+	 */
+
+	if (reg & BIT_ULL(48) /* OP_ERR */)
+		return 0;
+	else
+		return reg & 0xFFFFFFFFFFFF;
+}
+
+static inline int
+roc_npa_aura_batch_alloc_issue(uint64_t aura_handle, uint64_t *buf,
+			       unsigned int num, const int dis_wait,
+			       const int drop)
+{
+	unsigned int i;
+	int64_t *addr;
+	uint64_t res;
+	union {
+		uint64_t u;
+		struct npa_batch_alloc_compare_s compare_s;
+	} cmp;
+
+	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
+		return -1;
+
+	/* Zero the first word of every cache line */
+	for (i = 0; i < num; i += (ROC_ALIGN / sizeof(uint64_t)))
+		buf[i] = 0;
+
+	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
+			   NPA_LF_AURA_BATCH_ALLOC);
+	cmp.u = 0;
+	cmp.compare_s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	cmp.compare_s.drop = drop;
+	cmp.compare_s.stype = ALLOC_STYPE_STSTP;
+	cmp.compare_s.dis_wait = dis_wait;
+	cmp.compare_s.count = num;
+
+	res = roc_atomic64_cas(cmp.u, (uint64_t)buf, addr);
+	if (res != ALLOC_RESULT_ACCEPTED && res != ALLOC_RESULT_NOCORE)
+		return -1;
+
+	return 0;
+}
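For illustration (not part of the patch), a minimal caller of the low-level
issue path might look as follows. This is a sketch with assumptions: a valid
aura_handle obtained from the roc_npa pool setup APIs, the cnxk umbrella
header roc_api.h on the include path, and a caller-owned buffer aligned to
the 128-byte ROC cache line. It pairs the issue above with
roc_npa_aura_batch_alloc_count() and roc_npa_aura_batch_alloc_extract(),
which are defined just below.

    #include "roc_api.h"

    /* Hypothetical helper: issue a batch allocation of 32 pointers and
     * wait for the hardware to complete it.
     */
    static int
    example_batch_alloc_poll(uint64_t aura_handle)
    {
            /* NPA writes whole cache lines, so the result buffer must be
             * ROC_ALIGN (128-byte) aligned.
             */
            uint64_t buf[32] __attribute__((aligned(ROC_ALIGN)));

            /* dis_wait = 0: allow the hardware to wait for free buffers;
             * drop = 0: do not apply the RED drop check.
             */
            if (roc_npa_aura_batch_alloc_issue(aura_handle, buf, 32, 0, 0))
                    return -1;

            /* Spins on the per-cache-line status words until the batch
             * completes; call roc_npa_aura_batch_alloc_extract() afterwards
             * to compact the pointers.
             */
            return (int)roc_npa_aura_batch_alloc_count(buf, 32);
    }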
+
+static inline unsigned int
+roc_npa_aura_batch_alloc_count(uint64_t *aligned_buf, unsigned int num)
+{
+	unsigned int count, i;
+
+	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
+		return 0;
+
+	count = 0;
+	/* Check each ROC cache line one by one */
+	for (i = 0; i < num; i += (ROC_ALIGN >> 3)) {
+		struct npa_batch_alloc_status_s *status;
+		int ccode;
+
+		status = (struct npa_batch_alloc_status_s *)&aligned_buf[i];
+
+		/* Status is updated in the first 7 bits of each 128-byte
+		 * cache line. Wait until the status gets updated.
+		 */
+		do {
+			ccode = (volatile int)status->ccode;
+		} while (ccode == ALLOC_CCODE_INVAL);
+
+		count += status->count;
+	}
+
+	return count;
+}
+
+static inline unsigned int
+roc_npa_aura_batch_alloc_extract(uint64_t *buf, uint64_t *aligned_buf,
+				 unsigned int num)
+{
+	unsigned int count, i;
+
+	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
+		return 0;
+
+	count = 0;
+	/* Check each ROC cache line one by one */
+	for (i = 0; i < num; i += (ROC_ALIGN >> 3)) {
+		struct npa_batch_alloc_status_s *status;
+		int line_count, ccode;
+
+		status = (struct npa_batch_alloc_status_s *)&aligned_buf[i];
+
+		/* Status is updated in the first 7 bits of each 128-byte
+		 * cache line. Wait until the status gets updated.
+		 */
+		do {
+			ccode = (volatile int)status->ccode;
+		} while (ccode == ALLOC_CCODE_INVAL);
+
+		line_count = status->count;
+
+		/* Clear the status from the cache line */
+		status->ccode = 0;
+		status->count = 0;
+
+		/* 'Compress' the allocated buffers as there can
+		 * be 'holes' at the end of the 128-byte cache
+		 * lines.
+		 */
+		memmove(&buf[count], &aligned_buf[i],
+			line_count * sizeof(uint64_t));
+
+		count += line_count;
+	}
+
+	return count;
+}
+
+static inline void
+roc_npa_aura_op_bulk_free(uint64_t aura_handle, uint64_t const *buf,
+			  unsigned int num, const int fabs)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		const uint64_t inbuf = buf[i];
+
+		roc_npa_aura_op_free(aura_handle, fabs, inbuf);
+	}
+}
+
+static inline unsigned int
+roc_npa_aura_op_batch_alloc(uint64_t aura_handle, uint64_t *buf,
+			    uint64_t *aligned_buf, unsigned int num,
+			    const int dis_wait, const int drop,
+			    const int partial)
+{
+	unsigned int count, chunk, num_alloc;
+
+	/* The buffer should be 128-byte cache line aligned */
+	if (((uint64_t)aligned_buf & (ROC_ALIGN - 1)) != 0)
+		return 0;
+
+	count = 0;
+	while (num) {
+		chunk = (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS) ?
+				ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS :
+				num;
+
+		if (roc_npa_aura_batch_alloc_issue(aura_handle, aligned_buf,
+						   chunk, dis_wait, drop))
+			break;
+
+		num_alloc = roc_npa_aura_batch_alloc_extract(buf, aligned_buf,
+							     chunk);
+
+		count += num_alloc;
+		buf += num_alloc;
+		num -= num_alloc;
+
+		if (num_alloc != chunk)
+			break;
+	}
+
+	/* If the requested number of pointers was not allocated and partial
+	 * allocation is not desired, then free the allocated pointers.
+	 */
+	if (unlikely(num != 0 && !partial)) {
+		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
+		count = 0;
+	}
+
+	return count;
+}
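A usage sketch for the wrapper above (hypothetical helper; aura_handle is
again assumed to come from pool setup). It requests 100 pointers and shows
the effect of the partial flag: with partial = 1 a shortfall returns
whatever was allocated, while with partial = 0 the wrapper frees everything
back to the aura and returns 0.

    #include "roc_api.h"

    static unsigned int
    example_op_batch_alloc(uint64_t aura_handle)
    {
            /* Staging area for the batch engine: 100 pointers round up to
             * 7 cache lines (112 64-bit words), 128-byte aligned.
             */
            uint64_t aligned_buf[112] __attribute__((aligned(ROC_ALIGN)));
            uint64_t ptrs[100];

            return roc_npa_aura_op_batch_alloc(aura_handle, ptrs, aligned_buf,
                                               100, 0 /* dis_wait */,
                                               0 /* drop */, 1 /* partial */);
    }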
+
+static inline void
+roc_npa_aura_batch_free(uint64_t aura_handle, uint64_t const *buf,
+			unsigned int num, const int fabs, uint64_t lmt_addr,
+			uint64_t lmt_id)
+{
+	uint64_t addr, tar_addr, free0;
+	volatile uint64_t *lmt_data;
+	unsigned int i;
+
+	if (num > ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS)
+		return;
+
+	lmt_data = (uint64_t *)lmt_addr;
+
+	addr = roc_npa_aura_handle_to_base(aura_handle) +
+	       NPA_LF_AURA_BATCH_FREE0;
+
+	/*
+	 * NPA_LF_AURA_BATCH_FREE0
+	 *
+	 *   63   63 62  33 32        32 31  20 19    0
+	 *   -----------------------------------------
+	 *   | FABS | Rsvd | COUNT_EOT | Rsvd | AURA |
+	 *   -----------------------------------------
+	 */
+	free0 = roc_npa_aura_handle_to_aura(aura_handle);
+	if (fabs)
+		free0 |= (0x1UL << 63);
+	if (num & 0x1)
+		free0 |= (0x1UL << 32);
+
+	/* tar_addr[6:4] is LMTST size-1 in units of 128 bits */
+	tar_addr = addr | ((num >> 1) << 4);
+
+	lmt_data[0] = free0;
+	for (i = 0; i < num; i++)
+		lmt_data[i + 1] = buf[i];
+
+	roc_lmt_submit_steorl(lmt_id, tar_addr);
+	plt_io_wmb();
+}
+
+static inline void
+roc_npa_aura_op_batch_free(uint64_t aura_handle, uint64_t const *buf,
+			   unsigned int num, const int fabs, uint64_t lmt_addr,
+			   uint64_t lmt_id)
+{
+	unsigned int chunk;
+
+	while (num) {
+		chunk = (num >= ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS) ?
+				ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS :
+				num;
+
+		roc_npa_aura_batch_free(aura_handle, buf, chunk, fabs, lmt_addr,
+					lmt_id);
+
+		buf += chunk;
+		num -= chunk;
+	}
+}
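A batch-free usage sketch. The LMT line plumbing is platform setup outside
this header, so both LMT parameters are assumptions here: lmt_addr must
point to an LMT line reserved for the calling core and lmt_id must be the
matching LMT identifier, obtained during per-core initialization.

    #include "roc_api.h"

    static void
    example_op_batch_free(uint64_t aura_handle, uint64_t lmt_addr,
                          uint64_t lmt_id, uint64_t *ptrs, unsigned int num)
    {
            /* fabs = 0: free the pointers without requesting the FABS
             * (free absolute) translation; the wrapper splits num into
             * chunks of at most 15 pointers per LMTST.
             */
            roc_npa_aura_op_batch_free(aura_handle, ptrs, num, 0, lmt_addr,
                                       lmt_id);
    }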
+
+static inline unsigned int
+roc_npa_aura_bulk_alloc(uint64_t aura_handle, uint64_t *buf, unsigned int num,
+			const int drop)
+{
+#if defined(__aarch64__)
+	uint64_t wdata = roc_npa_aura_handle_to_aura(aura_handle);
+	unsigned int i, count;
+	uint64_t addr;
+
+	if (drop)
+		wdata |= BIT_ULL(63); /* DROP */
+
+	addr = roc_npa_aura_handle_to_base(aura_handle) +
+	       NPA_LF_AURA_OP_ALLOCX(0);
+
+	switch (num) {
+	case 30:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov v18.d[0], %[dst]\n"
+			"mov v18.d[1], %[loc]\n"
+			"mov v19.d[0], %[wdata]\n"
+			"mov v19.d[1], x30\n"
+			"mov v20.d[0], x24\n"
+			"mov v20.d[1], x25\n"
+			"mov v21.d[0], x26\n"
+			"mov v21.d[1], x27\n"
+			"mov v22.d[0], x28\n"
+			"mov v22.d[1], x29\n"
+			"mov x28, v19.d[0]\n"
+			"mov x29, v19.d[0]\n"
+			"mov x30, v18.d[1]\n"
+			"casp x0, x1, x28, x29, [x30]\n"
+			"casp x2, x3, x28, x29, [x30]\n"
+			"casp x4, x5, x28, x29, [x30]\n"
+			"casp x6, x7, x28, x29, [x30]\n"
+			"casp x8, x9, x28, x29, [x30]\n"
+			"casp x10, x11, x28, x29, [x30]\n"
+			"casp x12, x13, x28, x29, [x30]\n"
+			"casp x14, x15, x28, x29, [x30]\n"
+			"casp x16, x17, x28, x29, [x30]\n"
+			"casp x18, x19, x28, x29, [x30]\n"
+			"casp x20, x21, x28, x29, [x30]\n"
+			"casp x22, x23, x28, x29, [x30]\n"
+			"casp x24, x25, x28, x29, [x30]\n"
+			"casp x26, x27, x28, x29, [x30]\n"
+			"casp x28, x29, x28, x29, [x30]\n"
+			"mov x30, v18.d[0]\n"
+			"stp x0, x1, [x30]\n"
+			"stp x2, x3, [x30, #16]\n"
+			"stp x4, x5, [x30, #32]\n"
+			"stp x6, x7, [x30, #48]\n"
+			"stp x8, x9, [x30, #64]\n"
+			"stp x10, x11, [x30, #80]\n"
+			"stp x12, x13, [x30, #96]\n"
+			"stp x14, x15, [x30, #112]\n"
+			"stp x16, x17, [x30, #128]\n"
+			"stp x18, x19, [x30, #144]\n"
+			"stp x20, x21, [x30, #160]\n"
+			"stp x22, x23, [x30, #176]\n"
+			"stp x24, x25, [x30, #192]\n"
+			"stp x26, x27, [x30, #208]\n"
+			"stp x28, x29, [x30, #224]\n"
+			"mov %[dst], v18.d[0]\n"
+			"mov %[loc], v18.d[1]\n"
+			"mov %[wdata], v19.d[0]\n"
+			"mov x30, v19.d[1]\n"
+			"mov x24, v20.d[0]\n"
+			"mov x25, v20.d[1]\n"
+			"mov x26, v21.d[0]\n"
+			"mov x27, v21.d[1]\n"
+			"mov x28, v22.d[0]\n"
+			"mov x29, v22.d[1]\n"
+			:
+			: [wdata] "r"(wdata), [loc] "r"(addr), [dst] "r"(buf)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
+			  "x15", "x16", "x17", "x18", "x19", "x20", "x21",
+			  "x22", "x23", "v18", "v19", "v20", "v21", "v22");
+		break;
+	case 16:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"casp x4, x5, x16, x17, [%[loc]]\n"
+			"casp x6, x7, x16, x17, [%[loc]]\n"
+			"casp x8, x9, x16, x17, [%[loc]]\n"
+			"casp x10, x11, x16, x17, [%[loc]]\n"
+			"casp x12, x13, x16, x17, [%[loc]]\n"
+			"casp x14, x15, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			"stp x4, x5, [%[dst], #32]\n"
+			"stp x6, x7, [%[dst], #48]\n"
+			"stp x8, x9, [%[dst], #64]\n"
+			"stp x10, x11, [%[dst], #80]\n"
+			"stp x12, x13, [%[dst], #96]\n"
+			"stp x14, x15, [%[dst], #112]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
+			  "x15", "x16", "x17");
+		break;
+	case 8:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"casp x4, x5, x16, x17, [%[loc]]\n"
+			"casp x6, x7, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			"stp x4, x5, [%[dst], #32]\n"
+			"stp x6, x7, [%[dst], #48]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x16", "x17");
+		break;
+	case 4:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x2", "x3", "x16", "x17");
+		break;
+	case 2:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x16", "x17");
+		break;
+	case 1:
+		buf[0] = roc_npa_aura_op_alloc(aura_handle, drop);
+		return !!buf[0];
+	}
+
+	/* Pack the pointers */
+	for (i = 0, count = 0; i < num; i++)
+		if (buf[i])
+			buf[count++] = buf[i];
+
+	return count;
+#else
+	unsigned int i, count;
+
+	for (i = 0, count = 0; i < num; i++) {
+		buf[count] = roc_npa_aura_op_alloc(aura_handle, drop);
+		if (buf[count])
+			count++;
+	}
+
+	return count;
+#endif
+}
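The CASP helper above only services the exact batch sizes in its switch, so
a hypothetical direct call (sketch; aura_handle assumed valid) must use one
of 1, 2, 4, 8, 16 or 30 pointers. The roc_npa_aura_op_bulk_alloc() wrapper
below splits arbitrary counts into such chunks, e.g. a request for 25
pointers is attempted as 16, then 8, then 1.

    #include "roc_api.h"

    static unsigned int
    example_bulk_alloc_16(uint64_t aura_handle, uint64_t ptrs[16])
    {
            /* Returns how many pointers were obtained; they are packed at
             * the front of ptrs[], with NULL results squeezed out.
             */
            return roc_npa_aura_bulk_alloc(aura_handle, ptrs, 16,
                                           0 /* drop */);
    }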
+
+static inline unsigned int
+roc_npa_aura_op_bulk_alloc(uint64_t aura_handle, uint64_t *buf,
+			   unsigned int num, const int drop, const int partial)
+{
+	unsigned int chunk, count, num_alloc;
+
+	count = 0;
+	while (num) {
+		chunk = (num >= ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS) ?
+				ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS :
+				plt_align32prevpow2(num);
+
+		num_alloc =
+			roc_npa_aura_bulk_alloc(aura_handle, buf, chunk, drop);
+
+		count += num_alloc;
+		buf += num_alloc;
+		num -= num_alloc;
+
+		if (unlikely(num_alloc != chunk))
+			break;
+	}
+
+	/* If the requested number of pointers was not allocated and partial
+	 * allocation is not desired, then free the allocated pointers.
+	 */
+	if (unlikely(num != 0 && !partial)) {
+		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
+		count = 0;
+	}
+
+	return count;
+}
+
 struct roc_npa {
 	struct plt_pci_device *pci_dev;
 
@@ -167,4 +647,7 @@ void __roc_api roc_npa_aura_op_range_set(uint64_t aura_handle,
 int __roc_api roc_npa_ctx_dump(void);
 int __roc_api roc_npa_dump(void);
 
+/* Reset operation performance counter. */
+int __roc_api roc_npa_pool_op_pc_reset(uint64_t aura_handle);
+
 #endif /* _ROC_NPA_H_ */
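Finally, a sketch pairing the new roc_npa_pool_op_pc_reset() declaration
with the inline counter read from earlier in the header (hypothetical
helper; aura_handle assumed valid). It clears NPA_LF_POOL_OP_PC and reads
it back after a workload, yielding the number of pool operations performed
in between.

    #include "roc_api.h"

    static uint64_t
    example_pool_op_count(uint64_t aura_handle)
    {
            /* Reset the counter; a non-zero return signals an error. */
            if (roc_npa_pool_op_pc_reset(aura_handle))
                    return 0;

            /* ... run the workload to be measured ... */

            /* drop = 0: plain counter read (48-bit OP_PC value). */
            return roc_npa_pool_op_performance_counter(aura_handle, 0);
    }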