/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2021 Marvell.
 */

#ifndef _ROC_NPA_H_
#define _ROC_NPA_H_
#define ROC_AURA_ID_MASK       (BIT_ULL(16) - 1)
#define ROC_AURA_OP_LIMIT_MASK (BIT_ULL(36) - 1)

#define ROC_NPA_MAX_BLOCK_SZ		   (128 * 1024)
#define ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS 512
#define ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS  15U
#define ROC_CN10K_NPA_BATCH_FREE_BURST_MAX 16U
/* This value controls how much of the present average resource level is used to
 * calculate the new resource level.
 */
#define ROC_NPA_AVG_CONT 0xE0
/* 16 CASP instructions can be outstanding in CN9k, but we use only 15
 * outstanding CASPs as we run out of registers.
 */
#define ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS 30
/*
 * Generate 64bit handle to have optimized alloc and free aura operation.
 * 0 - ROC_AURA_ID_MASK for storing the aura_id.
 * [ROC_AURA_ID_MASK+1, (2^64 - 1)] for storing the lf base address.
 * This scheme is valid when OS can give ROC_AURA_ID_MASK
 * aligned address for lf base address.
 */
static inline uint64_t
roc_npa_aura_handle_gen(uint32_t aura_id, uintptr_t addr)
{
	uint64_t val;

	val = aura_id & ROC_AURA_ID_MASK;
	return (uint64_t)addr | val;
}
static inline uint64_t
roc_npa_aura_handle_to_aura(uint64_t aura_handle)
{
	return aura_handle & ROC_AURA_ID_MASK;
}
static inline uintptr_t
roc_npa_aura_handle_to_base(uint64_t aura_handle)
{
	return (uintptr_t)(aura_handle & ~ROC_AURA_ID_MASK);
}
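
/* Illustrative sketch (not part of the API): encoding and decoding an aura
 * handle. aura_id and lf_base are placeholders; lf_base is assumed to be
 * ROC_AURA_ID_MASK aligned as required by the scheme described above.
 *
 *	uint64_t handle = roc_npa_aura_handle_gen(aura_id, lf_base);
 *	uint64_t id     = roc_npa_aura_handle_to_aura(handle);   (== aura_id)
 *	uintptr_t base  = roc_npa_aura_handle_to_base(handle);   (== lf_base)
 */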
static inline uint64_t
roc_npa_aura_op_alloc(uint64_t aura_handle, const int drop)
{
	uint64_t wdata = roc_npa_aura_handle_to_aura(aura_handle);
	int64_t *addr;

	if (drop)
		wdata |= BIT_ULL(63); /* DROP */

	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
			   NPA_LF_AURA_OP_ALLOCX(0));
	return roc_atomic64_add_nosync(wdata, addr);
}
static inline void
roc_npa_aura_op_free(uint64_t aura_handle, const int fabs, uint64_t iova)
{
	uint64_t reg = roc_npa_aura_handle_to_aura(aura_handle);
	const uint64_t addr =
		roc_npa_aura_handle_to_base(aura_handle) + NPA_LF_AURA_OP_FREE0;

	if (fabs)
		reg |= BIT_ULL(63); /* FABS */

	roc_store_pair(iova, reg, addr);
}
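
/* Illustrative usage sketch: a single pointer alloc/free round trip. A zero
 * return from roc_npa_aura_op_alloc() means no pointer was returned (the
 * bulk alloc helpers below rely on the same convention).
 *
 *	uint64_t iova = roc_npa_aura_op_alloc(aura_handle, 0);
 *	if (iova)
 *		roc_npa_aura_op_free(aura_handle, 0, iova);
 */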
static inline uint64_t
roc_npa_aura_op_cnt_get(uint64_t aura_handle)
{
	uint64_t wdata;
	int64_t *addr;
	uint64_t reg;

	wdata = roc_npa_aura_handle_to_aura(aura_handle) << 44;
	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
			   NPA_LF_AURA_OP_CNT);
	reg = roc_atomic64_add_nosync(wdata, addr);

	if (reg & BIT_ULL(42) /* OP_ERR */)
		return 0;
	else
		return reg & 0xFFFFFFFFF;
}
static inline void
roc_npa_aura_op_cnt_set(uint64_t aura_handle, const int sign, uint64_t count)
{
	uint64_t reg = count & (BIT_ULL(36) - 1);

	if (sign)
		reg |= BIT_ULL(43); /* CNT_ADD */

	reg |= (roc_npa_aura_handle_to_aura(aura_handle) << 44);

	plt_write64(reg, roc_npa_aura_handle_to_base(aura_handle) +
				 NPA_LF_AURA_OP_CNT);
}
static inline uint64_t
roc_npa_aura_op_limit_get(uint64_t aura_handle)
{
	uint64_t wdata;
	int64_t *addr;
	uint64_t reg;

	wdata = roc_npa_aura_handle_to_aura(aura_handle) << 44;
	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
			   NPA_LF_AURA_OP_LIMIT);
	reg = roc_atomic64_add_nosync(wdata, addr);

	if (reg & BIT_ULL(42) /* OP_ERR */)
		return 0;
	else
		return reg & ROC_AURA_OP_LIMIT_MASK;
}
static inline void
roc_npa_aura_op_limit_set(uint64_t aura_handle, uint64_t limit)
{
	uint64_t reg = limit & ROC_AURA_OP_LIMIT_MASK;

	reg |= (roc_npa_aura_handle_to_aura(aura_handle) << 44);

	plt_write64(reg, roc_npa_aura_handle_to_base(aura_handle) +
				 NPA_LF_AURA_OP_LIMIT);
}
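
/* Illustrative sketch: reading and tightening the aura limit. The getter
 * already folds OP_ERR into a zero return, so only a non-zero value is used.
 *
 *	uint64_t limit = roc_npa_aura_op_limit_get(aura_handle);
 *	if (limit)
 *		roc_npa_aura_op_limit_set(aura_handle, limit / 2);
 */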
static inline uint64_t
roc_npa_aura_op_available(uint64_t aura_handle)
{
	uint64_t wdata;
	uint64_t reg;
	int64_t *addr;

	wdata = roc_npa_aura_handle_to_aura(aura_handle) << 44;
	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
			   NPA_LF_POOL_OP_AVAILABLE);
	reg = roc_atomic64_add_nosync(wdata, addr);

	if (reg & BIT_ULL(42) /* OP_ERR */)
		return 0;
	else
		return reg & 0xFFFFFFFFF;
}
/* Wait for a given timeout, repeatedly checking whether the number of
 * available pointers has reached the given count. Returns the available
 * pointer count once it has reached the given count or the timeout has
 * expired.
 */
static inline uint32_t
roc_npa_aura_op_available_wait(uint64_t aura_handle, uint32_t count,
			       uint32_t tmo_ms)
{
#define OP_AVAIL_WAIT_MS_DEFAULT   (100)
#define OP_AVAIL_CHECK_INTERVAL_MS (1)
	uint32_t op_avail;
	int retry;

	tmo_ms = tmo_ms ? tmo_ms : OP_AVAIL_WAIT_MS_DEFAULT;

	retry = tmo_ms / OP_AVAIL_CHECK_INTERVAL_MS;
	op_avail = roc_npa_aura_op_available(aura_handle);
	while (retry && (op_avail < count)) {
		plt_delay_ms(OP_AVAIL_CHECK_INTERVAL_MS);
		op_avail = roc_npa_aura_op_available(aura_handle);
		retry--;
	}

	return op_avail;
}
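
/* Illustrative sketch: wait (with the default 100 ms timeout) until at least
 * nb_bufs pointers are back in the pool, e.g. before tearing it down. nb_bufs
 * is a placeholder for the expected pointer count.
 *
 *	uint32_t avail = roc_npa_aura_op_available_wait(aura_handle, nb_bufs, 0);
 *	if (avail < nb_bufs)
 *		(timed out: some pointers are still held elsewhere)
 */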
static inline uint64_t
roc_npa_pool_op_performance_counter(uint64_t aura_handle, const int drop)
{
	union {
		uint64_t u;
		struct npa_aura_op_wdata_s s;
	} op_wdata;
	int64_t *addr;
	uint64_t reg;

	op_wdata.u = 0;
	op_wdata.s.aura = roc_npa_aura_handle_to_aura(aura_handle);
	if (drop)
		op_wdata.s.drop |= BIT_ULL(63); /* DROP */

	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
			   NPA_LF_POOL_OP_PC);

	reg = roc_atomic64_add_nosync(op_wdata.u, addr);
	/*
	 * NPA_LF_POOL_OP_PC Read Data
	 *
	 *  63     49 48    48 47     0
	 * -----------------------------
	 * | Reserved | OP_ERR | OP_PC |
	 * -----------------------------
	 */
	if (reg & BIT_ULL(48) /* OP_ERR */)
		return 0;
	else
		return reg & 0xFFFFFFFFFFFF;
}
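
/* Illustrative sketch: measure pool operations over an interval using the
 * OP_PC counter together with roc_npa_pool_op_pc_reset(), declared near the
 * end of this file.
 *
 *	roc_npa_pool_op_pc_reset(aura_handle);
 *	(... run traffic ...)
 *	uint64_t ops = roc_npa_pool_op_performance_counter(aura_handle, 0);
 */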
static inline int
roc_npa_aura_batch_alloc_issue(uint64_t aura_handle, uint64_t *buf,
			       unsigned int num, const int dis_wait,
			       const int drop)
{
	int64_t *addr;
	uint64_t res;
	unsigned int i;
	union {
		uint64_t u;
		struct npa_batch_alloc_compare_s compare_s;
	} cmp;

	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
		return -1;

	/* Zero first word of every cache line */
	for (i = 0; i < num; i += (ROC_ALIGN / sizeof(uint64_t)))
		buf[i] = 0;

	addr = (int64_t *)(roc_npa_aura_handle_to_base(aura_handle) +
			   NPA_LF_AURA_BATCH_ALLOC);
	cmp.u = 0;
	cmp.compare_s.aura = roc_npa_aura_handle_to_aura(aura_handle);
	cmp.compare_s.drop = drop;
	cmp.compare_s.stype = ALLOC_STYPE_STF;
	cmp.compare_s.dis_wait = dis_wait;
	cmp.compare_s.count = num;

	res = roc_atomic64_casl(cmp.u, (uint64_t)buf, addr);
	if (res != ALLOC_RESULT_ACCEPTED && res != ALLOC_RESULT_NOCORE)
		return -1;

	return 0;
}
static inline void
roc_npa_batch_alloc_wait(uint64_t *cache_line)
{
	/* Batch alloc status code is updated in bits [5:6] of the first word
	 * of the 128 byte cache line.
	 */
	while (((__atomic_load_n(cache_line, __ATOMIC_RELAXED) >> 5) & 0x3) ==
	       ALLOC_CCODE_INVAL)
		;
}
static inline unsigned int
roc_npa_aura_batch_alloc_count(uint64_t *aligned_buf, unsigned int num)
{
	unsigned int count, i;

	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
		return 0;

	count = 0;
	/* Check each ROC cache line one by one */
	for (i = 0; i < num; i += (ROC_ALIGN >> 3)) {
		struct npa_batch_alloc_status_s *status;

		status = (struct npa_batch_alloc_status_s *)&aligned_buf[i];

		roc_npa_batch_alloc_wait(&aligned_buf[i]);
		count += status->count;
	}

	return count;
}
static inline unsigned int
roc_npa_aura_batch_alloc_extract(uint64_t *buf, uint64_t *aligned_buf,
				 unsigned int num)
{
	unsigned int count, i;

	if (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS)
		return 0;

	count = 0;
	/* Check each ROC cache line one by one */
	for (i = 0; i < num; i += (ROC_ALIGN >> 3)) {
		struct npa_batch_alloc_status_s *status;
		int line_count;

		status = (struct npa_batch_alloc_status_s *)&aligned_buf[i];

		roc_npa_batch_alloc_wait(&aligned_buf[i]);

		line_count = status->count;

		/* Clear the status from the cache line */
		status->ccode = 0;
		status->count = 0;

		/* 'Compress' the allocated buffers as there can
		 * be 'holes' at the end of the 128 byte cache
		 * lines.
		 */
		memmove(&buf[count], &aligned_buf[i],
			line_count * sizeof(uint64_t));
		count += line_count;
	}

	return count;
}
static inline void
roc_npa_aura_op_bulk_free(uint64_t aura_handle, uint64_t const *buf,
			  unsigned int num, const int fabs)
{
	unsigned int i;

	for (i = 0; i < num; i++) {
		const uint64_t inbuf = buf[i];

		roc_npa_aura_op_free(aura_handle, fabs, inbuf);
	}
}
static inline unsigned int
roc_npa_aura_op_batch_alloc(uint64_t aura_handle, uint64_t *buf,
			    uint64_t *aligned_buf, unsigned int num,
			    const int dis_wait, const int drop,
			    const int partial)
{
	unsigned int count, chunk, num_alloc;

	/* The buffer should be 128 byte cache line aligned */
	if (((uint64_t)aligned_buf & (ROC_ALIGN - 1)) != 0)
		return 0;

	count = 0;
	while (num) {
		chunk = (num > ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS) ?
				ROC_CN10K_NPA_BATCH_ALLOC_MAX_PTRS :
				num;

		if (roc_npa_aura_batch_alloc_issue(aura_handle, aligned_buf,
						   chunk, dis_wait, drop))
			break;

		num_alloc = roc_npa_aura_batch_alloc_extract(buf, aligned_buf,
							     chunk);

		count += num_alloc;
		buf += num_alloc;
		num -= num_alloc;

		if (num_alloc != chunk)
			break;
	}

	/* If the requested number of pointers was not allocated and if partial
	 * alloc is not desired, then free allocated pointers.
	 */
	if (unlikely(num != 0 && !partial)) {
		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
		count = 0;
	}

	return count;
}
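
/* Illustrative usage sketch for the CN10K batch alloc path. The scratch
 * buffer must be ROC_ALIGN (128 byte) aligned; here __plt_cache_aligned is
 * assumed to provide that alignment on this platform. The trailing arguments
 * are dis_wait = 0, drop = 0 and partial = 1, so the call returns however
 * many pointers could be allocated.
 *
 *	uint64_t ptrs[64];
 *	uint64_t scratch[64] __plt_cache_aligned;
 *	unsigned int got;
 *
 *	got = roc_npa_aura_op_batch_alloc(aura_handle, ptrs, scratch, 64,
 *					  0, 0, 1);
 */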
static inline void
roc_npa_aura_batch_free(uint64_t aura_handle, uint64_t const *buf,
			unsigned int num, const int fabs, uint64_t lmt_addr,
			uint64_t lmt_id)
{
	uint64_t addr, tar_addr, free0;
	volatile uint64_t *lmt_data;
	unsigned int i;

	lmt_data = (uint64_t *)lmt_addr;

	addr = roc_npa_aura_handle_to_base(aura_handle) +
	       NPA_LF_AURA_BATCH_FREE0;

	/*
	 * NPA_LF_AURA_BATCH_FREE0
	 *
	 * 63   63 62  33 32       32 31  20 19    0
	 * -----------------------------------------
	 * | FABS | Rsvd | COUNT_EOT | Rsvd | AURA |
	 * -----------------------------------------
	 */
	free0 = roc_npa_aura_handle_to_aura(aura_handle);
	free0 |= ((uint64_t)!!fabs << 63);
	free0 |= ((uint64_t)(num & 0x1) << 32);

	/* tar_addr[4:6] is LMTST size-1 in units of 128b */
	tar_addr = addr | ((num >> 1) << 4);

	lmt_data[0] = free0;
	for (i = 0; i < num; i++)
		lmt_data[i + 1] = buf[i];

	roc_lmt_submit_steorl(lmt_id, tar_addr);
}
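
/* Worked example of the single LMT line encoding above, for num = 5 pointers:
 * the line carries FREE0 plus five IOVAs, i.e. six 64-bit words or three
 * 128-bit units, so tar_addr[6:4] = (5 >> 1) = 2 (LMTST size minus one) and
 * FREE0.COUNT_EOT = (5 & 0x1) = 1, i.e. the pointer count is odd and the
 * final 128-bit unit holds only one valid word.
 */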
static inline void
roc_npa_aura_batch_free_burst(uint64_t aura_handle, uint64_t const *buf,
			      unsigned int num, const int fabs,
			      uint64_t lmt_addr, uint64_t lmt_id)
{
	uint64_t addr, tar_addr, free0, send_data, lmtline;
	uint64_t *lmt_data;

	/* 63   52 51  20 19   7 6           4 3  0
	 * ----------------------------------------
	 * | RSVD | ADDR | RSVD | LMTST SZ(0) | 0 |
	 * ----------------------------------------
	 */
	addr = roc_npa_aura_handle_to_base(aura_handle) +
	       NPA_LF_AURA_BATCH_FREE0;
	tar_addr = addr | (0x7 << 4);

	/* 63   63 62  33 32       32 31  20 19    0
	 * -----------------------------------------
	 * | FABS | Rsvd | COUNT_EOT | Rsvd | AURA |
	 * -----------------------------------------
	 */
	free0 = roc_npa_aura_handle_to_aura(aura_handle);
	free0 |= ((uint64_t)!!fabs << 63);
	free0 |= (0x1UL << 32);

	/* Fill the lmt lines */
	lmt_data = (uint64_t *)lmt_addr;
	lmtline = 0;
	while (num) {
		lmt_data[lmtline * 16] = free0;
		memcpy(&lmt_data[(lmtline * 16) + 1], buf,
		       ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS * sizeof(uint64_t));
		lmtline++;
		num -= ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS;
		buf += ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS;
	}

	/* 63                           19 18  16 15   12 11    11 10  0
	 * ---------------------------------------------------------------
	 * | LMTST SZ(15) ... LMTST SZ(1) | Rsvd | CNTM1 | Rsvd | LMT_ID |
	 * ---------------------------------------------------------------
	 */
	send_data = lmt_id | ((lmtline - 1) << 12) | (0x1FFFFFFFFFFFUL << 19);
	roc_lmt_submit_steorl(send_data, tar_addr);
}
static inline void
roc_npa_aura_op_batch_free(uint64_t aura_handle, uint64_t const *buf,
			   unsigned int num, const int fabs, uint64_t lmt_addr,
			   uint64_t lmt_id)
{
	unsigned int max_burst, chunk, bnum;

	max_burst = ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS *
		    ROC_CN10K_NPA_BATCH_FREE_BURST_MAX;
	bnum = num / ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS;
	bnum *= ROC_CN10K_NPA_BATCH_FREE_MAX_PTRS;
	num -= bnum;

	while (bnum) {
		chunk = (bnum >= max_burst) ? max_burst : bnum;
		roc_npa_aura_batch_free_burst(aura_handle, buf, chunk, fabs,
					      lmt_addr, lmt_id);
		buf += chunk;
		bnum -= chunk;
	}

	if (num)
		roc_npa_aura_batch_free(aura_handle, buf, num, fabs, lmt_addr,
					lmt_id);
}
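
/* Illustrative usage sketch for the CN10K batch free path. lmt_addr and
 * lmt_id are assumed to describe an LMT line reserved for the calling core;
 * how they are obtained is outside the scope of this header. fabs is 0 here.
 *
 *	roc_npa_aura_op_batch_free(aura_handle, ptrs, got, 0, lmt_addr, lmt_id);
 */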
static inline unsigned int
roc_npa_aura_bulk_alloc(uint64_t aura_handle, uint64_t *buf, unsigned int num,
			const int drop)
{
#if defined(__aarch64__)
	uint64_t wdata = roc_npa_aura_handle_to_aura(aura_handle);
	unsigned int i, count;
	uint64_t addr;

	if (drop)
		wdata |= BIT_ULL(63); /* DROP */

	addr = roc_npa_aura_handle_to_base(aura_handle) +
	       NPA_LF_AURA_OP_ALLOCX(0);

	switch (num) {
	case ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS:
		asm volatile(
512 ".arch_extension lse\n"
513 "mov v18.d[0], %[dst]\n"
514 "mov v18.d[1], %[loc]\n"
515 "mov v19.d[0], %[wdata]\n"
516 "mov v19.d[1], x30\n"
517 "mov v20.d[0], x24\n"
518 "mov v20.d[1], x25\n"
519 "mov v21.d[0], x26\n"
520 "mov v21.d[1], x27\n"
521 "mov v22.d[0], x28\n"
522 "mov v22.d[1], x29\n"
523 "mov x28, v19.d[0]\n"
524 "mov x29, v19.d[0]\n"
525 "mov x30, v18.d[1]\n"
526 "casp x0, x1, x28, x29, [x30]\n"
527 "casp x2, x3, x28, x29, [x30]\n"
528 "casp x4, x5, x28, x29, [x30]\n"
529 "casp x6, x7, x28, x29, [x30]\n"
530 "casp x8, x9, x28, x29, [x30]\n"
531 "casp x10, x11, x28, x29, [x30]\n"
532 "casp x12, x13, x28, x29, [x30]\n"
533 "casp x14, x15, x28, x29, [x30]\n"
534 "casp x16, x17, x28, x29, [x30]\n"
535 "casp x18, x19, x28, x29, [x30]\n"
536 "casp x20, x21, x28, x29, [x30]\n"
537 "casp x22, x23, x28, x29, [x30]\n"
538 "casp x24, x25, x28, x29, [x30]\n"
539 "casp x26, x27, x28, x29, [x30]\n"
540 "casp x28, x29, x28, x29, [x30]\n"
541 "mov x30, v18.d[0]\n"
542 "stp x0, x1, [x30]\n"
543 "stp x2, x3, [x30, #16]\n"
544 "stp x4, x5, [x30, #32]\n"
545 "stp x6, x7, [x30, #48]\n"
546 "stp x8, x9, [x30, #64]\n"
547 "stp x10, x11, [x30, #80]\n"
548 "stp x12, x13, [x30, #96]\n"
549 "stp x14, x15, [x30, #112]\n"
550 "stp x16, x17, [x30, #128]\n"
551 "stp x18, x19, [x30, #144]\n"
552 "stp x20, x21, [x30, #160]\n"
553 "stp x22, x23, [x30, #176]\n"
554 "stp x24, x25, [x30, #192]\n"
555 "stp x26, x27, [x30, #208]\n"
556 "stp x28, x29, [x30, #224]\n"
557 "mov %[dst], v18.d[0]\n"
558 "mov %[loc], v18.d[1]\n"
559 "mov %[wdata], v19.d[0]\n"
560 "mov x30, v19.d[1]\n"
561 "mov x24, v20.d[0]\n"
562 "mov x25, v20.d[1]\n"
563 "mov x26, v21.d[0]\n"
564 "mov x27, v21.d[1]\n"
565 "mov x28, v22.d[0]\n"
566 "mov x29, v22.d[1]\n"
568 : [wdata] "r"(wdata), [loc] "r"(addr), [dst] "r"(buf)
569 : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
570 "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
571 "x15", "x16", "x17", "x18", "x19", "x20", "x21",
572 "x22", "x23", "v18", "v19", "v20", "v21", "v22");
576 ".arch_extension lse\n"
577 "mov x16, %[wdata]\n"
578 "mov x17, %[wdata]\n"
579 "casp x0, x1, x16, x17, [%[loc]]\n"
580 "casp x2, x3, x16, x17, [%[loc]]\n"
581 "casp x4, x5, x16, x17, [%[loc]]\n"
582 "casp x6, x7, x16, x17, [%[loc]]\n"
583 "casp x8, x9, x16, x17, [%[loc]]\n"
584 "casp x10, x11, x16, x17, [%[loc]]\n"
585 "casp x12, x13, x16, x17, [%[loc]]\n"
586 "casp x14, x15, x16, x17, [%[loc]]\n"
587 "stp x0, x1, [%[dst]]\n"
588 "stp x2, x3, [%[dst], #16]\n"
589 "stp x4, x5, [%[dst], #32]\n"
590 "stp x6, x7, [%[dst], #48]\n"
591 "stp x8, x9, [%[dst], #64]\n"
592 "stp x10, x11, [%[dst], #80]\n"
593 "stp x12, x13, [%[dst], #96]\n"
594 "stp x14, x15, [%[dst], #112]\n"
596 : [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
597 : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
598 "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14",
599 "x15", "x16", "x17");
603 ".arch_extension lse\n"
604 "mov x16, %[wdata]\n"
605 "mov x17, %[wdata]\n"
606 "casp x0, x1, x16, x17, [%[loc]]\n"
607 "casp x2, x3, x16, x17, [%[loc]]\n"
608 "casp x4, x5, x16, x17, [%[loc]]\n"
609 "casp x6, x7, x16, x17, [%[loc]]\n"
610 "stp x0, x1, [%[dst]]\n"
611 "stp x2, x3, [%[dst], #16]\n"
612 "stp x4, x5, [%[dst], #32]\n"
613 "stp x6, x7, [%[dst], #48]\n"
615 : [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
616 : "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6",
621 ".arch_extension lse\n"
622 "mov x16, %[wdata]\n"
623 "mov x17, %[wdata]\n"
624 "casp x0, x1, x16, x17, [%[loc]]\n"
625 "casp x2, x3, x16, x17, [%[loc]]\n"
626 "stp x0, x1, [%[dst]]\n"
627 "stp x2, x3, [%[dst], #16]\n"
629 : [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
630 : "memory", "x0", "x1", "x2", "x3", "x16", "x17");
634 ".arch_extension lse\n"
635 "mov x16, %[wdata]\n"
636 "mov x17, %[wdata]\n"
637 "casp x0, x1, x16, x17, [%[loc]]\n"
638 "stp x0, x1, [%[dst]]\n"
640 : [wdata] "r"(wdata), [dst] "r"(buf), [loc] "r"(addr)
641 : "memory", "x0", "x1", "x16", "x17");
644 buf[0] = roc_npa_aura_op_alloc(aura_handle, drop);
	/* Pack the pointers */
	for (i = 0, count = 0; i < num; i++)
		if (buf[i])
			buf[count++] = buf[i];

	return count;
#else
	unsigned int i, count;

	for (i = 0, count = 0; i < num; i++) {
		buf[count] = roc_npa_aura_op_alloc(aura_handle, drop);
		if (buf[count])
			count++;
	}

	return count;
#endif
}
static inline unsigned int
roc_npa_aura_op_bulk_alloc(uint64_t aura_handle, uint64_t *buf,
			   unsigned int num, const int drop, const int partial)
{
	unsigned int chunk, count, num_alloc;

	count = 0;
	while (num) {
		chunk = (num >= ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS) ?
				ROC_CN9K_NPA_BULK_ALLOC_MAX_PTRS :
				plt_align32prevpow2(num);

		num_alloc =
			roc_npa_aura_bulk_alloc(aura_handle, buf, chunk, drop);

		count += num_alloc;
		buf += num_alloc;
		num -= num_alloc;

		if (unlikely(num_alloc != chunk))
			break;
	}

	/* If the requested number of pointers was not allocated and if partial
	 * alloc is not desired, then free allocated pointers.
	 */
	if (unlikely(num != 0 && !partial)) {
		roc_npa_aura_op_bulk_free(aura_handle, buf - count, count, 1);
		count = 0;
	}

	return count;
}
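
/* Illustrative usage sketch for the CN9K bulk alloc path. With partial = 1
 * the call returns however many pointers it could get; everything allocated
 * here is simply returned to the pool with roc_npa_aura_op_bulk_free().
 *
 *	uint64_t ptrs[64];
 *	unsigned int got;
 *
 *	got = roc_npa_aura_op_bulk_alloc(aura_handle, ptrs, 64, 0, 1);
 *	(... use ptrs[0..got - 1] ...)
 *	roc_npa_aura_op_bulk_free(aura_handle, ptrs, got, 0);
 */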
struct roc_npa {
	struct plt_pci_device *pci_dev;

#define ROC_NPA_MEM_SZ (1 * 1024)
	uint8_t reserved[ROC_NPA_MEM_SZ] __plt_cache_aligned;
} __plt_cache_aligned;
int __roc_api roc_npa_dev_init(struct roc_npa *roc_npa);
int __roc_api roc_npa_dev_fini(struct roc_npa *roc_npa);
int __roc_api roc_npa_pool_create(uint64_t *aura_handle, uint32_t block_size,
				  uint32_t block_count, struct npa_aura_s *aura,
				  struct npa_pool_s *pool);
int __roc_api roc_npa_aura_limit_modify(uint64_t aura_handle,
					uint16_t aura_limit);
int __roc_api roc_npa_pool_destroy(uint64_t aura_handle);
int __roc_api roc_npa_pool_range_update_check(uint64_t aura_handle);
void __roc_api roc_npa_aura_op_range_set(uint64_t aura_handle,
					 uint64_t start_iova,
					 uint64_t end_iova);
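
/* Illustrative pool lifecycle sketch using the declarations above. The
 * npa_aura_s/npa_pool_s contents, the block size/count, and start_iova/
 * end_iova are placeholders; real callers fill them according to their
 * requirements before the create call.
 *
 *	struct npa_aura_s aura;
 *	struct npa_pool_s pool;
 *	uint64_t aura_handle;
 *	int rc;
 *
 *	memset(&aura, 0, sizeof(aura));
 *	memset(&pool, 0, sizeof(pool));
 *	rc = roc_npa_pool_create(&aura_handle, 2048, 1024, &aura, &pool);
 *	if (rc == 0) {
 *		roc_npa_aura_op_range_set(aura_handle, start_iova, end_iova);
 *		(... use aura_handle with the inline helpers above ...)
 *		roc_npa_pool_destroy(aura_handle);
 *	}
 */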
typedef int (*roc_npa_lf_init_cb_t)(struct plt_pci_device *pci_dev);
int __roc_api roc_npa_lf_init_cb_register(roc_npa_lf_init_cb_t cb);
int __roc_api roc_npa_ctx_dump(void);
int __roc_api roc_npa_dump(void);
/* Reset operation performance counter. */
int __roc_api roc_npa_pool_op_pc_reset(uint64_t aura_handle);
int __roc_api roc_npa_aura_drop_set(uint64_t aura_handle, uint64_t limit,
				    bool ena);
#endif /* _ROC_NPA_H_ */