X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fcommon%2Fcnxk%2Froc_npa.h;h=89f5c6facb854ae3da3f1d3ed1feedae10d36145;hb=218d022e1f3f4b5b54ebe7c02c51fb1b495e4398;hp=698384984e6e9200cbfbbdc34e68af20db5399bc;hpb=f765f561124014045702051c8965dd5d41b430fd;p=dpdk.git

diff --git a/drivers/common/cnxk/roc_npa.h b/drivers/common/cnxk/roc_npa.h
index 698384984e..89f5c6facb 100644
--- a/drivers/common/cnxk/roc_npa.h
+++ b/drivers/common/cnxk/roc_npa.h
@@ -8,6 +8,14 @@
 #define ROC_AURA_ID_MASK       (BIT_ULL(16) - 1)
 #define ROC_AURA_OP_LIMIT_MASK (BIT_ULL(36) - 1)
 
+#define ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS 512
+#define ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS  15
+
+/* Up to 16 CASP instructions can be outstanding on CN9K, but we use only 15
+ * outstanding CASPs as we run out of registers.
+ */
+#define ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS 30
+
 /*
  * Generate 64bit handle to have optimized alloc and free aura operation.
  * 0 - ROC_AURA_ID_MASK for storing the aura_id.
@@ -141,6 +149,478 @@ roc_npa_aura_op_available(uint64_t aura_handle)
 	return reg & 0xFFFFFFFFF;
 }
 
+static inline uint64_t
+roc_npa_pool_op_performance_counter(uint64_t aura_handle, const int drop)
+{
+	union {
+		uint64_t u;
+		struct npa_aura_op_wdata_s s;
+	} op_wdata;
+	int64_t *addr;
+	uint64_t reg;
+
+	op_wdata.u = 0;
+	op_wdata.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	if (drop)
+		op_wdata.s.drop = 1; /* DROP */
+
+	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
+			   NPA_LF_POOL_OP_PC);
+
+	reg = roc_atomic64_add_nosync(op_wdata.u, addr);
+	/*
+	 * NPA_LF_POOL_OP_PC Read Data
+	 *
+	 *   63       49 48     48 47    0
+	 *   -----------------------------
+	 *   | Reserved | OP_ERR | OP_PC |
+	 *   -----------------------------
+	 */
+
+	if (reg & BIT_ULL(48) /* OP_ERR */)
+		return 0;
+	else
+		return reg & 0xFFFFFFFFFFFF;
+}
+
+static inline int
+roc_npa_aura_batch_alloc_issue(uint64_t aura_handle, uint64_t *buf,
+			       unsigned int num, const int dis_wait,
+			       const int drop)
+{
+	unsigned int i;
+	int64_t *addr;
+	uint64_t res;
+	union {
+		uint64_t u;
+		struct npa_batch_alloc_compare_s compare_s;
+	} cmp;
+
+	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
+		return -1;
+
+	/* Zero the first word of every cache line */
+	for (i = 0; i < num; i += (ROC_ALIGN / sizeof(uint64_t)))
+		buf[i] = 0;
+
+	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
+			   NPA_LF_AURA_BATCH_ALLOC);
+	cmp.u = 0;
+	cmp.compare_s.aura = roc_npa_aura_handle_to_aura(aura_handle);
+	cmp.compare_s.drop = drop;
+	cmp.compare_s.stype = ALLOC_STYPE_STSTP;
+	cmp.compare_s.dis_wait = dis_wait;
+	cmp.compare_s.count = num;
+
+	res = roc_atomic64_cas(cmp.u, (uint64_t)buf, addr);
+	if (res != ALLOC_RESULT_ACCEPTED && res != ALLOC_RESULT_NOCORE)
+		return -1;
+
+	return 0;
+}
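For illustration (not part of the patch), a minimal caller of the low-level
issue path might look as follows. This is a sketch with assumptions: a valid
aura_handle obtained from the roc_npa pool setup APIs, the cnxk umbrella
header roc_api.h on the include path, and a caller-owned buffer aligned to
the 128-byte ROC cache line. It pairs the issue above with
roc_npa_aura_batch_alloc_count() and roc_npa_aura_batch_alloc_extract(),
which are defined just below.

    #include "roc_api.h"

    /* Hypothetical helper: issue a batch allocation of 32 pointers and
     * wait for the hardware to complete it.
     */
    static int
    example_batch_alloc_poll(uint64_t aura_handle)
    {
            /* NPA writes whole cache lines, so the result buffer must be
             * ROC_ALIGN (128-byte) aligned.
             */
            uint64_t buf[32] __attribute__((aligned(ROC_ALIGN)));

            /* dis_wait = 0: allow the hardware to wait for free buffers;
             * drop = 0: do not apply the RED drop check.
             */
            if (roc_npa_aura_batch_alloc_issue(aura_handle, buf, 32, 0, 0))
                    return -1;

            /* Spins on the per-cache-line status words until the batch
             * completes; call roc_npa_aura_batch_alloc_extract() afterwards
             * to compact the pointers.
             */
            return (int)roc_npa_aura_batch_alloc_count(buf, 32);
    }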
+
+static inline unsigned int
+roc_npa_aura_batch_alloc_count(uint64_t *aligned_buf, unsigned int num)
+{
+	unsigned int count, i;
+
+	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
+		return 0;
+
+	count = 0;
+	/* Check each ROC cache line one by one */
+	for (i = 0; i < num; i += (ROC_ALIGN >> 3)) {
+		struct npa_batch_alloc_status_s *status;
+		int ccode;
+
+		status = (struct npa_batch_alloc_status_s *)&aligned_buf[i];
+
+		/* Status is updated in the first 7 bits of each 128-byte
+		 * cache line. Wait until the status gets updated.
+		 */
+		do {
+			ccode = (volatile int)status->ccode;
+		} while (ccode == ALLOC_CCODE_INVAL);
+
+		count += status->count;
+	}
+
+	return count;
+}
+
+static inline unsigned int
+roc_npa_aura_batch_alloc_extract(uint64_t *buf, uint64_t *aligned_buf,
+				 unsigned int num)
+{
+	unsigned int count, i;
+
+	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
+		return 0;
+
+	count = 0;
+	/* Check each ROC cache line one by one */
+	for (i = 0; i < num; i += (ROC_ALIGN >> 3)) {
+		struct npa_batch_alloc_status_s *status;
+		int line_count, ccode;
+
+		status = (struct npa_batch_alloc_status_s *)&aligned_buf[i];
+
+		/* Status is updated in the first 7 bits of each 128-byte
+		 * cache line. Wait until the status gets updated.
+		 */
+		do {
+			ccode = (volatile int)status->ccode;
+		} while (ccode == ALLOC_CCODE_INVAL);
+
+		line_count = status->count;
+
+		/* Clear the status from the cache line */
+		status->ccode = 0;
+		status->count = 0;
+
+		/* 'Compress' the allocated buffers as there can
+		 * be 'holes' at the end of the 128-byte cache
+		 * lines.
+		 */
+		memmove(&buf[count], &aligned_buf[i],
+			line_count * sizeof(uint64_t));
+
+		count += line_count;
+	}
+
+	return count;
+}
+
+static inline void
+roc_npa_aura_op_bulk_free(uint64_t aura_handle, uint64_t const *buf,
+			  unsigned int num, const int fabs)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++) {
+		const uint64_t inbuf = buf[i];
+
+		roc_npa_aura_op_free(aura_handle, fabs, inbuf);
+	}
+}
+
+static inline unsigned int
+roc_npa_aura_op_batch_alloc(uint64_t aura_handle, uint64_t *buf,
+			    uint64_t *aligned_buf, unsigned int num,
+			    const int dis_wait, const int drop,
+			    const int partial)
+{
+	unsigned int count, chunk, num_alloc;
+
+	/* The buffer should be 128-byte cache line aligned */
+	if (((uint64_t)aligned_buf & (ROC_ALIGN - 1)) != 0)
+		return 0;
+
+	count = 0;
+	while (num) {
+		chunk = (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS) ?
+				ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS :
+				num;
+
+		if (roc_npa_aura_batch_alloc_issue(aura_handle, aligned_buf,
+						   chunk, dis_wait, drop))
+			break;
+
+		num_alloc = roc_npa_aura_batch_alloc_extract(buf, aligned_buf,
+							     chunk);
+
+		count += num_alloc;
+		buf += num_alloc;
+		num -= num_alloc;
+
+		if (num_alloc != chunk)
+			break;
+	}
+
+	/* If the requested number of pointers was not allocated and partial
+	 * allocation is not desired, then free the allocated pointers.
+	 */
+	if (unlikely(num != 0 && !partial)) {
+		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
+		count = 0;
+	}
+
+	return count;
+}
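A usage sketch for the wrapper above (hypothetical helper; aura_handle is
again assumed to come from pool setup). It requests 100 pointers and shows
the effect of the partial flag: with partial = 1 a shortfall returns
whatever was allocated, while with partial = 0 the wrapper frees everything
back to the aura and returns 0.

    #include "roc_api.h"

    static unsigned int
    example_op_batch_alloc(uint64_t aura_handle)
    {
            /* Staging area for the batch engine: 100 pointers round up to
             * 7 cache lines (112 64-bit words), 128-byte aligned.
             */
            uint64_t aligned_buf[112] __attribute__((aligned(ROC_ALIGN)));
            uint64_t ptrs[100];

            return roc_npa_aura_op_batch_alloc(aura_handle, ptrs, aligned_buf,
                                               100, 0 /* dis_wait */,
                                               0 /* drop */, 1 /* partial */);
    }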
+
+static inline void
+roc_npa_aura_batch_free(uint64_t aura_handle, uint64_t const *buf,
+			unsigned int num, const int fabs, uint64_t lmt_addr,
+			uint64_t lmt_id)
+{
+	uint64_t addr, tar_addr, free0;
+	volatile uint64_t *lmt_data;
+	unsigned int i;
+
+	if (num > ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS)
+		return;
+
+	lmt_data = (uint64_t *)lmt_addr;
+
+	addr = roc_npa_aura_handle_to_base(aura_handle) +
+	       NPA_LF_AURA_BATCH_FREE0;
+
+	/*
+	 * NPA_LF_AURA_BATCH_FREE0
+	 *
+	 *   63   63 62  33 32        32 31  20 19    0
+	 *   -----------------------------------------
+	 *   | FABS | Rsvd | COUNT_EOT | Rsvd | AURA |
+	 *   -----------------------------------------
+	 */
+	free0 = roc_npa_aura_handle_to_aura(aura_handle);
+	if (fabs)
+		free0 |= (0x1UL << 63);
+	if (num & 0x1)
+		free0 |= (0x1UL << 32);
+
+	/* tar_addr[6:4] is LMTST size-1 in units of 128 bits */
+	tar_addr = addr | ((num >> 1) << 4);
+
+	lmt_data[0] = free0;
+	for (i = 0; i < num; i++)
+		lmt_data[i + 1] = buf[i];
+
+	roc_lmt_submit_steorl(lmt_id, tar_addr);
+	plt_io_wmb();
+}
+
+static inline void
+roc_npa_aura_op_batch_free(uint64_t aura_handle, uint64_t const *buf,
+			   unsigned int num, const int fabs, uint64_t lmt_addr,
+			   uint64_t lmt_id)
+{
+	unsigned int chunk;
+
+	while (num) {
+		chunk = (num >= ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS) ?
+				ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS :
+				num;
+
+		roc_npa_aura_batch_free(aura_handle, buf, chunk, fabs, lmt_addr,
+					lmt_id);
+
+		buf += chunk;
+		num -= chunk;
+	}
+}
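A batch-free usage sketch. The LMT line plumbing is platform setup outside
this header, so both LMT parameters are assumptions here: lmt_addr must
point to an LMT line reserved for the calling core and lmt_id must be the
matching LMT identifier, obtained during per-core initialization.

    #include "roc_api.h"

    static void
    example_op_batch_free(uint64_t aura_handle, uint64_t lmt_addr,
                          uint64_t lmt_id, uint64_t *ptrs, unsigned int num)
    {
            /* fabs = 0: free the pointers without requesting the FABS
             * (free absolute) translation; the wrapper splits num into
             * chunks of at most 15 pointers per LMTST.
             */
            roc_npa_aura_op_batch_free(aura_handle, ptrs, num, 0, lmt_addr,
                                       lmt_id);
    }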
+
+static inline unsigned int
+roc_npa_aura_bulk_alloc(uint64_t aura_handle, uint64_t *buf, unsigned int num,
+			const int drop)
+{
+#if defined(__aarch64__)
+	uint64_t wdata = roc_npa_aura_handle_to_aura(aura_handle);
+	unsigned int i, count;
+	uint64_t addr;
+
+	if (drop)
+		wdata |= BIT_ULL(63); /* DROP */
+
+	addr = roc_npa_aura_handle_to_base(aura_handle) +
+	       NPA_LF_AURA_OP_ALLOCX(0);
+
+	switch (num) {
+	case 30:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov v18.d[0], %[dst]\n"
+			"mov v18.d[1], %[loc]\n"
+			"mov v19.d[0], %[wdata]\n"
+			"mov v19.d[1], x30\n"
+			"mov v20.d[0], x24\n"
+			"mov v20.d[1], x25\n"
+			"mov v21.d[0], x26\n"
+			"mov v21.d[1], x27\n"
+			"mov v22.d[0], x28\n"
+			"mov v22.d[1], x29\n"
+			"mov x28, v19.d[0]\n"
+			"mov x29, v19.d[0]\n"
+			"mov x30, v18.d[1]\n"
+			"casp x0, x1, x28, x29, [x30]\n"
+			"casp x2, x3, x28, x29, [x30]\n"
+			"casp x4, x5, x28, x29, [x30]\n"
+			"casp x6, x7, x28, x29, [x30]\n"
+			"casp x8, x9, x28, x29, [x30]\n"
+			"casp x10, x11, x28, x29, [x30]\n"
+			"casp x12, x13, x28, x29, [x30]\n"
+			"casp x14, x15, x28, x29, [x30]\n"
+			"casp x16, x17, x28, x29, [x30]\n"
+			"casp x18, x19, x28, x29, [x30]\n"
+			"casp x20, x21, x28, x29, [x30]\n"
+			"casp x22, x23, x28, x29, [x30]\n"
+			"casp x24, x25, x28, x29, [x30]\n"
+			"casp x26, x27, x28, x29, [x30]\n"
+			"casp x28, x29, x28, x29, [x30]\n"
+			"mov x30, v18.d[0]\n"
+			"stp x0, x1, [x30]\n"
+			"stp x2, x3, [x30, #16]\n"
+			"stp x4, x5, [x30, #32]\n"
+			"stp x6, x7, [x30, #48]\n"
+			"stp x8, x9, [x30, #64]\n"
+			"stp x10, x11, [x30, #80]\n"
+			"stp x12, x13, [x30, #96]\n"
+			"stp x14, x15, [x30, #112]\n"
+			"stp x16, x17, [x30, #128]\n"
+			"stp x18, x19, [x30, #144]\n"
+			"stp x20, x21, [x30, #160]\n"
+			"stp x22, x23, [x30, #176]\n"
+			"stp x24, x25, [x30, #192]\n"
+			"stp x26, x27, [x30, #208]\n"
+			"stp x28, x29, [x30, #224]\n"
+			"mov %[dst], v18.d[0]\n"
+			"mov %[loc], v18.d[1]\n"
+			"mov %[wdata], v19.d[0]\n"
+			"mov x30, v19.d[1]\n"
+			"mov x24, v20.d[0]\n"
+			"mov x25, v20.d[1]\n"
+			"mov x26, v21.d[0]\n"
+			"mov x27, v21.d[1]\n"
+			"mov x28, v22.d[0]\n"
+			"mov x29, v22.d[1]\n"
+			:
+			: [wdata] "r"(wdata), [loc] "r"(addr), [dst] "r"(buf)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
+			  "x15", "x16", "x17", "x18", "x19", "x20", "x21",
+			  "x22", "x23", "v18", "v19", "v20", "v21", "v22");
+		break;
+	case 16:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"casp x4, x5, x16, x17, [%[loc]]\n"
+			"casp x6, x7, x16, x17, [%[loc]]\n"
+			"casp x8, x9, x16, x17, [%[loc]]\n"
+			"casp x10, x11, x16, x17, [%[loc]]\n"
+			"casp x12, x13, x16, x17, [%[loc]]\n"
+			"casp x14, x15, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			"stp x4, x5, [%[dst], #32]\n"
+			"stp x6, x7, [%[dst], #48]\n"
+			"stp x8, x9, [%[dst], #64]\n"
+			"stp x10, x11, [%[dst], #80]\n"
+			"stp x12, x13, [%[dst], #96]\n"
+			"stp x14, x15, [%[dst], #112]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
+			  "x15", "x16", "x17");
+		break;
+	case 8:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"casp x4, x5, x16, x17, [%[loc]]\n"
+			"casp x6, x7, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			"stp x4, x5, [%[dst], #32]\n"
+			"stp x6, x7, [%[dst], #48]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
+			  "x7", "x16", "x17");
+		break;
+	case 4:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"casp x2, x3, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			"stp x2, x3, [%[dst], #16]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x2", "x3", "x16", "x17");
+		break;
+	case 2:
+		asm volatile(
+			".cpu generic+lse\n"
+			"mov x16, %[wdata]\n"
+			"mov x17, %[wdata]\n"
+			"casp x0, x1, x16, x17, [%[loc]]\n"
+			"stp x0, x1, [%[dst]]\n"
+			:
+			: [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
+			: "memory", "x0", "x1", "x16", "x17");
+		break;
+	case 1:
+		buf[0] = roc_npa_aura_op_alloc(aura_handle, drop);
+		return !!buf[0];
+	}
+
+	/* Pack the pointers */
+	for (i = 0, count = 0; i < num; i++)
+		if (buf[i])
+			buf[count++] = buf[i];
+
+	return count;
+#else
+	unsigned int i, count;
+
+	for (i = 0, count = 0; i < num; i++) {
+		buf[count] = roc_npa_aura_op_alloc(aura_handle, drop);
+		if (buf[count])
+			count++;
+	}
+
+	return count;
+#endif
+}
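The CASP helper above only services the exact batch sizes in its switch, so
a hypothetical direct call (sketch; aura_handle assumed valid) must use one
of 1, 2, 4, 8, 16 or 30 pointers. The roc_npa_aura_op_bulk_alloc() wrapper
below splits arbitrary counts into such chunks, e.g. a request for 25
pointers is attempted as 16, then 8, then 1.

    #include "roc_api.h"

    static unsigned int
    example_bulk_alloc_16(uint64_t aura_handle, uint64_t ptrs[16])
    {
            /* Returns how many pointers were obtained; they are packed at
             * the front of ptrs[], with NULL results squeezed out.
             */
            return roc_npa_aura_bulk_alloc(aura_handle, ptrs, 16,
                                           0 /* drop */);
    }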
+
+static inline unsigned int
+roc_npa_aura_op_bulk_alloc(uint64_t aura_handle, uint64_t *buf,
+			   unsigned int num, const int drop, const int partial)
+{
+	unsigned int chunk, count, num_alloc;
+
+	count = 0;
+	while (num) {
+		chunk = (num >= ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS) ?
+				ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS :
+				plt_align32prevpow2(num);
+
+		num_alloc =
+			roc_npa_aura_bulk_alloc(aura_handle, buf, chunk, drop);
+
+		count += num_alloc;
+		buf += num_alloc;
+		num -= num_alloc;
+
+		if (unlikely(num_alloc != chunk))
+			break;
+	}
+
+	/* If the requested number of pointers was not allocated and partial
+	 * allocation is not desired, then free the allocated pointers.
+	 */
+	if (unlikely(num != 0 && !partial)) {
+		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
+		count = 0;
+	}
+
+	return count;
+}
+
 struct roc_npa {
 	struct plt_pci_device *pci_dev;
 
@@ -167,4 +647,7 @@ void __roc_api roc_npa_aura_op_range_set(uint64_t aura_handle,
 int __roc_api roc_npa_ctx_dump(void);
 int __roc_api roc_npa_dump(void);
 
+/* Reset operation performance counter. */
+int __roc_api roc_npa_pool_op_pc_reset(uint64_t aura_handle);
+
 #endif /* _ROC_NPA_H_ */
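Finally, a sketch pairing the new roc_npa_pool_op_pc_reset() declaration
with the inline counter read from earlier in the header (hypothetical
helper; aura_handle assumed valid). It clears NPA_LF_POOL_OP_PC and reads
it back after a workload, yielding the number of pool operations performed
in between.

    #include "roc_api.h"

    static uint64_t
    example_pool_op_count(uint64_t aura_handle)
    {
            /* Reset the counter; a non-zero return signals an error. */
            if (roc_npa_pool_op_pc_reset(aura_handle))
                    return 0;

            /* ... run the workload to be measured ... */

            /* drop = 0: plain counter read (48-bit OP_PC value). */
            return roc_npa_pool_op_performance_counter(aura_handle, 0);
    }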