From 74464005a2af86a30c752709953a0c5b01873fb7 Mon Sep 17 00:00:00 2001
From: Bruce Richardson <bruce.richardson@intel.com>
Date: Tue, 4 May 2021 14:14:56 +0100
Subject: [PATCH] raw/ioat: rework SW ring layout

The ring management in the idxd part of the driver is more complex than
it needs to be, tracking individual batches in a ring and having null
descriptors as padding to avoid having single-operation batches. This
can be simplified by using a regular ring-based layout, with additional
overflow space at the end to ensure that a batch never needs to wrap
within the ring.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/raw/ioat/idxd_pci.c            |   5 +-
 drivers/raw/ioat/ioat_common.c         |  99 +++++------
 drivers/raw/ioat/ioat_rawdev_test.c    |   1 +
 drivers/raw/ioat/rte_idxd_rawdev_fns.h | 233 +++++++++++++------------
 4 files changed, 179 insertions(+), 159 deletions(-)

diff --git a/drivers/raw/ioat/idxd_pci.c b/drivers/raw/ioat/idxd_pci.c
index b48e565b4c..13515dbc6c 100644
--- a/drivers/raw/ioat/idxd_pci.c
+++ b/drivers/raw/ioat/idxd_pci.c
@@ -90,7 +90,7 @@ idxd_pci_dev_start(struct rte_rawdev *dev)
 		return 0;
 	}
 
-	if (idxd->public.batch_ring == NULL) {
+	if (idxd->public.desc_ring == NULL) {
 		IOAT_PMD_ERR("WQ %d has not been fully configured", idxd->qid);
 		return -EINVAL;
 	}
@@ -337,7 +337,8 @@ idxd_rawdev_destroy(const char *name)
 	/* free device memory */
 	IOAT_PMD_DEBUG("Freeing device driver memory");
 	rdev->dev_private = NULL;
-	rte_free(idxd->public.batch_ring);
+	rte_free(idxd->public.batch_idx_ring);
+	rte_free(idxd->public.desc_ring);
 	rte_free(idxd->public.hdl_ring);
 	rte_memzone_free(idxd->mz);
 
diff --git a/drivers/raw/ioat/ioat_common.c b/drivers/raw/ioat/ioat_common.c
index d055c36a2a..fcb30572e6 100644
--- a/drivers/raw/ioat/ioat_common.c
+++ b/drivers/raw/ioat/ioat_common.c
@@ -84,21 +84,21 @@ idxd_dev_dump(struct rte_rawdev *dev, FILE *f)
 	fprintf(f, "Driver: %s\n\n", dev->driver_name);
 
 	fprintf(f, "Portal: %p\n", rte_idxd->portal);
-	fprintf(f, "Batch Ring size: %u\n", rte_idxd->batch_ring_sz);
-	fprintf(f, "Comp Handle Ring size: %u\n\n", rte_idxd->hdl_ring_sz);
-
-	fprintf(f, "Next batch: %u\n", rte_idxd->next_batch);
-	fprintf(f, "Next batch to be completed: %u\n", rte_idxd->next_completed);
-	for (i = 0; i < rte_idxd->batch_ring_sz; i++) {
-		struct rte_idxd_desc_batch *b = &rte_idxd->batch_ring[i];
-		fprintf(f, "Batch %u @%p: submitted=%u, op_count=%u, hdl_end=%u\n",
-				i, b, b->submitted, b->op_count, b->hdl_end);
-	}
-
-	fprintf(f, "\n");
-	fprintf(f, "Next free hdl: %u\n", rte_idxd->next_free_hdl);
-	fprintf(f, "Last completed hdl: %u\n", rte_idxd->last_completed_hdl);
-	fprintf(f, "Next returned hdl: %u\n", rte_idxd->next_ret_hdl);
+	fprintf(f, "Config: {ring_size: %u, hdls_disable: %u}\n\n",
+			rte_idxd->cfg.ring_size, rte_idxd->cfg.hdls_disable);
+
+	fprintf(f, "max batches: %u\n", rte_idxd->max_batches);
+	fprintf(f, "batch idx read: %u\n", rte_idxd->batch_idx_read);
+	fprintf(f, "batch idx write: %u\n", rte_idxd->batch_idx_write);
+	fprintf(f, "batch idxes:");
+	for (i = 0; i < rte_idxd->max_batches + 1; i++)
+		fprintf(f, "%u ", rte_idxd->batch_idx_ring[i]);
+	fprintf(f, "\n\n");
+
+	fprintf(f, "hdls read: %u\n", rte_idxd->hdls_read);
+	fprintf(f, "hdls avail: %u\n", rte_idxd->hdls_avail);
+	fprintf(f, "batch start: %u\n", rte_idxd->batch_start);
+	fprintf(f, "batch size: %u\n", rte_idxd->batch_size);
 
 	return 0;
 }
@@ -114,10 +114,8 @@ idxd_dev_info_get(struct rte_rawdev *dev, rte_rawdev_obj_t dev_info,
 	if (info_size != sizeof(*cfg))
 		return -EINVAL;
 
-	if (cfg != NULL) {
-		cfg->ring_size = rte_idxd->hdl_ring_sz;
-		cfg->hdls_disable = rte_idxd->hdls_disable;
-	}
+	if (cfg != NULL)
+		*cfg = rte_idxd->cfg;
 	return 0;
 }
@@ -129,8 +127,6 @@ idxd_dev_configure(const struct rte_rawdev *dev,
 	struct rte_idxd_rawdev *rte_idxd = &idxd->public;
 	struct rte_ioat_rawdev_config *cfg = config;
 	uint16_t max_desc = cfg->ring_size;
-	uint16_t max_batches = max_desc / BATCH_SIZE;
-	uint16_t i;
 
 	if (config_size != sizeof(*cfg))
 		return -EINVAL;
@@ -140,47 +136,34 @@ idxd_dev_configure(const struct rte_rawdev *dev,
 		return -EAGAIN;
 	}
 
-	rte_idxd->hdls_disable = cfg->hdls_disable;
+	rte_idxd->cfg = *cfg;
 
-	/* limit the batches to what can be stored in hardware */
-	if (max_batches > idxd->max_batches) {
-		IOAT_PMD_DEBUG("Ring size of %u is too large for this device, need to limit to %u batches of %u",
-				max_desc, idxd->max_batches, BATCH_SIZE);
-		max_batches = idxd->max_batches;
-		max_desc = max_batches * BATCH_SIZE;
-	}
 	if (!rte_is_power_of_2(max_desc))
 		max_desc = rte_align32pow2(max_desc);
-	IOAT_PMD_DEBUG("Rawdev %u using %u descriptors in %u batches",
-			dev->dev_id, max_desc, max_batches);
+	IOAT_PMD_DEBUG("Rawdev %u using %u descriptors",
+			dev->dev_id, max_desc);
+	rte_idxd->desc_ring_mask = max_desc - 1;
 
 	/* in case we are reconfiguring a device, free any existing memory */
-	rte_free(rte_idxd->batch_ring);
+	rte_free(rte_idxd->desc_ring);
 	rte_free(rte_idxd->hdl_ring);
 
-	rte_idxd->batch_ring = rte_zmalloc(NULL,
-			sizeof(*rte_idxd->batch_ring) * max_batches, 0);
-	if (rte_idxd->batch_ring == NULL)
+	/* allocate the descriptor ring at 2x size as batches can't wrap */
+	rte_idxd->desc_ring = rte_zmalloc(NULL,
+			sizeof(*rte_idxd->desc_ring) * max_desc * 2, 0);
+	if (rte_idxd->desc_ring == NULL)
 		return -ENOMEM;
+	rte_idxd->desc_iova = rte_mem_virt2iova(rte_idxd->desc_ring);
 
 	rte_idxd->hdl_ring = rte_zmalloc(NULL,
 			sizeof(*rte_idxd->hdl_ring) * max_desc, 0);
 	if (rte_idxd->hdl_ring == NULL) {
-		rte_free(rte_idxd->batch_ring);
-		rte_idxd->batch_ring = NULL;
+		rte_free(rte_idxd->desc_ring);
+		rte_idxd->desc_ring = NULL;
 		return -ENOMEM;
 	}
-	rte_idxd->batch_ring_sz = max_batches;
-	rte_idxd->hdl_ring_sz = max_desc;
-
-	for (i = 0; i < rte_idxd->batch_ring_sz; i++) {
-		struct rte_idxd_desc_batch *b = &rte_idxd->batch_ring[i];
-		b->batch_desc.completion = rte_mem_virt2iova(&b->comp);
-		b->batch_desc.desc_addr = rte_mem_virt2iova(&b->null_desc);
-		b->batch_desc.op_flags = (idxd_op_batch << IDXD_CMD_OP_SHIFT) |
-				IDXD_FLAG_COMPLETION_ADDR_VALID |
-				IDXD_FLAG_REQUEST_COMPLETION;
-	}
+	rte_idxd->hdls_read = rte_idxd->batch_start = 0;
+	rte_idxd->batch_size = 0;
 
 	return 0;
 }
@@ -191,6 +174,7 @@ idxd_rawdev_create(const char *name, struct rte_device *dev,
 		   const struct rte_rawdev_ops *ops)
 {
 	struct idxd_rawdev *idxd;
+	struct rte_idxd_rawdev *public;
 	struct rte_rawdev *rawdev = NULL;
 	const struct rte_memzone *mz = NULL;
 	char mz_name[RTE_MEMZONE_NAMESIZE];
@@ -245,13 +229,30 @@ idxd_rawdev_create(const char *name, struct rte_device *dev,
 	idxd = rawdev->dev_private;
 	*idxd = *base_idxd; /* copy over the main fields already passed in */
 
-	idxd->public.type = RTE_IDXD_DEV;
 	idxd->rawdev = rawdev;
 	idxd->mz = mz;
 
+	public = &idxd->public;
+	public->type = RTE_IDXD_DEV;
+	public->max_batches = idxd->max_batches;
+	public->batch_idx_read = 0;
+	public->batch_idx_write = 0;
+	/* allocate batch index ring. The +1 is because we can never fully use
+	 * the ring, otherwise read == write means both full and empty.
+	 */
+	public->batch_idx_ring = rte_zmalloc(NULL,
+			sizeof(uint16_t) * (idxd->max_batches + 1), 0);
+	if (public->batch_idx_ring == NULL) {
+		IOAT_PMD_ERR("Unable to reserve memory for batch data\n");
+		ret = -ENOMEM;
+		goto cleanup;
+	}
+
 	return 0;
 
 cleanup:
+	if (mz)
+		rte_memzone_free(mz);
 	if (rawdev)
 		rte_rawdev_pmd_release(rawdev);
 
diff --git a/drivers/raw/ioat/ioat_rawdev_test.c b/drivers/raw/ioat/ioat_rawdev_test.c
index a5064d739d..51eebe152f 100644
--- a/drivers/raw/ioat/ioat_rawdev_test.c
+++ b/drivers/raw/ioat/ioat_rawdev_test.c
@@ -206,6 +206,7 @@ test_enqueue_copies(int dev_id)
 	if (rte_ioat_completed_ops(dev_id, max_completions, (void *)&completed[0],
 			(void *)&completed[max_completions]) != max_ops) {
 		PRINT_ERR("Error with rte_ioat_completed_ops\n");
+		rte_rawdev_dump(dev_id, stdout);
 		return -1;
 	}
 	if (completed[0] != src || completed[max_completions] != dst) {
diff --git a/drivers/raw/ioat/rte_idxd_rawdev_fns.h b/drivers/raw/ioat/rte_idxd_rawdev_fns.h
index 0e8e5cf8c3..dfc591bb80 100644
--- a/drivers/raw/ioat/rte_idxd_rawdev_fns.h
+++ b/drivers/raw/ioat/rte_idxd_rawdev_fns.h
@@ -18,7 +18,7 @@
 #include
 
 /*
- * Defines used in the data path for interacting with hardware.
+ * Defines used in the data path for interacting with IDXD hardware.
  */
 #define IDXD_CMD_OP_SHIFT 24
 enum rte_idxd_ops {
@@ -78,26 +78,6 @@ struct rte_idxd_completion {
 	uint32_t invalid_flags;
 } __rte_aligned(32);
 
-#define BATCH_SIZE 64
-
-/**
- * Structure used inside the driver for building up and submitting
- * a batch of operations to the DSA hardware.
- */
-struct rte_idxd_desc_batch {
-	struct rte_idxd_completion comp; /* the completion record for batch */
-
-	uint16_t submitted;
-	uint16_t op_count;
-	uint16_t hdl_end;
-
-	struct rte_idxd_hw_desc batch_desc;
-
-	/* batches must always have 2 descriptors, so put a null at the start */
-	struct rte_idxd_hw_desc null_desc;
-	struct rte_idxd_hw_desc ops[BATCH_SIZE];
-};
-
 /**
  * structure used to save the "handles" provided by the user to be
  * returned to the user on job completion.
@@ -117,51 +97,65 @@ struct rte_idxd_rawdev {
 
 	void *portal; /* address to write the batch descriptor */
 
-	/* counters to track the batches and the individual op handles */
-	uint16_t batch_ring_sz; /* size of batch ring */
-	uint16_t hdl_ring_sz; /* size of the user hdl ring */
+	struct rte_ioat_rawdev_config cfg;
+	rte_iova_t desc_iova; /* base address of desc ring, needed for completions */
 
-	uint16_t next_batch; /* where we write descriptor ops */
-	uint16_t next_completed; /* batch where we read completions */
-	uint16_t next_ret_hdl; /* the next user hdl to return */
-	uint16_t last_completed_hdl; /* the last user hdl that has completed */
-	uint16_t next_free_hdl; /* where the handle for next op will go */
-	uint16_t hdls_disable; /* disable tracking completion handles */
+	/* counters to track the batches */
+	unsigned short max_batches;
+	unsigned short batch_idx_read;
+	unsigned short batch_idx_write;
+	unsigned short *batch_idx_ring; /* store where each batch ends */
 
+	/* track descriptors and handles */
+	unsigned short desc_ring_mask;
+	unsigned short hdls_avail; /* handles for ops completed */
+	unsigned short hdls_read; /* the read pointer for hdls/desc rings */
+	unsigned short batch_start; /* start+size == write pointer for hdls/desc */
+	unsigned short batch_size;
+
+	struct rte_idxd_hw_desc *desc_ring;
 	struct rte_idxd_user_hdl *hdl_ring;
-	struct rte_idxd_desc_batch *batch_ring;
 };
 
+static __rte_always_inline rte_iova_t
+__desc_idx_to_iova(struct rte_idxd_rawdev *idxd, uint16_t n)
+{
+	return idxd->desc_iova + (n * sizeof(struct rte_idxd_hw_desc));
+}
+
 static __rte_always_inline int
-__idxd_write_desc(int dev_id, const struct rte_idxd_hw_desc *desc,
+__idxd_write_desc(int dev_id,
+		const uint32_t op_flags,
+		const rte_iova_t src,
+		const rte_iova_t dst,
+		const uint32_t size,
 		const struct rte_idxd_user_hdl *hdl)
 {
 	struct rte_idxd_rawdev *idxd =
 			(struct rte_idxd_rawdev *)rte_rawdevs[dev_id].dev_private;
-	struct rte_idxd_desc_batch *b = &idxd->batch_ring[idxd->next_batch];
+	uint16_t write_idx = idxd->batch_start + idxd->batch_size;
 
-	/* check for room in the handle ring */
-	if (((idxd->next_free_hdl + 1) & (idxd->hdl_ring_sz - 1)) == idxd->next_ret_hdl)
+	/* first check batch ring space then desc ring space */
+	if ((idxd->batch_idx_read == 0 && idxd->batch_idx_write == idxd->max_batches) ||
+			idxd->batch_idx_write + 1 == idxd->batch_idx_read)
 		goto failed;
-
-	/* check for space in current batch */
-	if (b->op_count >= BATCH_SIZE)
-		goto failed;
-
-	/* check that we can actually use the current batch */
-	if (b->submitted)
+	if (((write_idx + 1) & idxd->desc_ring_mask) == idxd->hdls_read)
 		goto failed;
 
-	/* write the descriptor */
-	b->ops[b->op_count++] = *desc;
+	/* write desc and handle. Note, descriptors don't wrap */
+	idxd->desc_ring[write_idx].pasid = 0;
+	idxd->desc_ring[write_idx].op_flags = op_flags | IDXD_FLAG_COMPLETION_ADDR_VALID;
+	idxd->desc_ring[write_idx].completion = __desc_idx_to_iova(idxd, write_idx);
+	idxd->desc_ring[write_idx].src = src;
+	idxd->desc_ring[write_idx].dst = dst;
+	idxd->desc_ring[write_idx].size = size;
 
-	/* store the completion details */
-	if (!idxd->hdls_disable)
-		idxd->hdl_ring[idxd->next_free_hdl] = *hdl;
-	if (++idxd->next_free_hdl == idxd->hdl_ring_sz)
-		idxd->next_free_hdl = 0;
+	idxd->hdl_ring[write_idx & idxd->desc_ring_mask] = *hdl;
+	idxd->batch_size++;
 
 	idxd->xstats.enqueued++;
+
+	rte_prefetch0_write(&idxd->desc_ring[write_idx + 1]);
 	return 1;
 
 failed:
@@ -174,53 +168,42 @@ static __rte_always_inline int
 __idxd_enqueue_fill(int dev_id, uint64_t pattern, rte_iova_t dst,
 		unsigned int length, uintptr_t dst_hdl)
 {
-	const struct rte_idxd_hw_desc desc = {
-			.op_flags = (idxd_op_fill << IDXD_CMD_OP_SHIFT) |
-				IDXD_FLAG_CACHE_CONTROL,
-			.src = pattern,
-			.dst = dst,
-			.size = length
-	};
 	const struct rte_idxd_user_hdl hdl = {
 			.dst = dst_hdl
 	};
-	return __idxd_write_desc(dev_id, &desc, &hdl);
+	return __idxd_write_desc(dev_id,
+			(idxd_op_fill << IDXD_CMD_OP_SHIFT) | IDXD_FLAG_CACHE_CONTROL,
+			pattern, dst, length, &hdl);
 }
 
 static __rte_always_inline int
 __idxd_enqueue_copy(int dev_id, rte_iova_t src, rte_iova_t dst,
 		unsigned int length, uintptr_t src_hdl, uintptr_t dst_hdl)
 {
-	const struct rte_idxd_hw_desc desc = {
-			.op_flags = (idxd_op_memmove << IDXD_CMD_OP_SHIFT) |
-				IDXD_FLAG_CACHE_CONTROL,
-			.src = src,
-			.dst = dst,
-			.size = length
-	};
 	const struct rte_idxd_user_hdl hdl = {
 			.src = src_hdl,
 			.dst = dst_hdl
 	};
-	return __idxd_write_desc(dev_id, &desc, &hdl);
+	return __idxd_write_desc(dev_id,
+			(idxd_op_memmove << IDXD_CMD_OP_SHIFT) | IDXD_FLAG_CACHE_CONTROL,
+			src, dst, length, &hdl);
 }
 
 static __rte_always_inline int
 __idxd_fence(int dev_id)
 {
-	static const struct rte_idxd_hw_desc fence = {
-			.op_flags = IDXD_FLAG_FENCE
-	};
 	static const struct rte_idxd_user_hdl null_hdl;
-	return __idxd_write_desc(dev_id, &fence, &null_hdl);
+	/* only op field needs filling - zero src, dst and length */
+	return __idxd_write_desc(dev_id, IDXD_FLAG_FENCE, 0, 0, 0, &null_hdl);
 }
 
 static __rte_always_inline void
-__idxd_movdir64b(volatile void *dst, const void *src)
+__idxd_movdir64b(volatile void *dst, const struct rte_idxd_hw_desc *src)
 {
 	asm volatile (".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
 			:
-			: "a" (dst), "d" (src));
+			: "a" (dst), "d" (src)
+			: "memory");
 }
 
 static __rte_always_inline int
@@ -228,19 +211,49 @@ __idxd_perform_ops(int dev_id)
 {
 	struct rte_idxd_rawdev *idxd =
 			(struct rte_idxd_rawdev *)rte_rawdevs[dev_id].dev_private;
-	struct rte_idxd_desc_batch *b = &idxd->batch_ring[idxd->next_batch];
+	/* write completion to last desc in the batch */
+	uint16_t comp_idx = idxd->batch_start + idxd->batch_size - 1;
+	if (comp_idx > idxd->desc_ring_mask) {
+		comp_idx &= idxd->desc_ring_mask;
+		*((uint64_t *)&idxd->desc_ring[comp_idx]) = 0; /* zero start of desc */
+	}
 
-	if (b->submitted || b->op_count == 0)
+	if (idxd->batch_size == 0)
 		return 0;
-	b->hdl_end = idxd->next_free_hdl;
-	b->comp.status = 0;
-	b->submitted = 1;
-	b->batch_desc.size = b->op_count + 1;
-	__idxd_movdir64b(idxd->portal, &b->batch_desc);
-
-	if (++idxd->next_batch == idxd->batch_ring_sz)
-		idxd->next_batch = 0;
-	idxd->xstats.started = idxd->xstats.enqueued;
+
+	_mm_sfence(); /* fence before writing desc to device */
+	if (idxd->batch_size > 1) {
+		struct rte_idxd_hw_desc batch_desc = {
+				.op_flags = (idxd_op_batch << IDXD_CMD_OP_SHIFT) |
+					IDXD_FLAG_COMPLETION_ADDR_VALID |
+					IDXD_FLAG_REQUEST_COMPLETION,
+				.desc_addr = __desc_idx_to_iova(idxd, idxd->batch_start),
+				.completion = __desc_idx_to_iova(idxd, comp_idx),
+				.size = idxd->batch_size,
+		};
+
+		__idxd_movdir64b(idxd->portal, &batch_desc);
+	} else {
+		/* special case batch size of 1, as not allowed by HW */
+		/* comp_idx == batch_start */
+		struct rte_idxd_hw_desc *desc = &idxd->desc_ring[comp_idx];
+		desc->op_flags |= IDXD_FLAG_COMPLETION_ADDR_VALID |
+				IDXD_FLAG_REQUEST_COMPLETION;
+		desc->completion = __desc_idx_to_iova(idxd, comp_idx);
+
+		__idxd_movdir64b(idxd->portal, desc);
+	}
+
+	idxd->xstats.started += idxd->batch_size;
+
+	idxd->batch_start += idxd->batch_size;
+	idxd->batch_start &= idxd->desc_ring_mask;
+	idxd->batch_size = 0;
+
+	idxd->batch_idx_ring[idxd->batch_idx_write++] = comp_idx;
+	if (idxd->batch_idx_write > idxd->max_batches)
+		idxd->batch_idx_write = 0;
+
 	return 0;
 }
@@ -250,35 +263,39 @@ __idxd_completed_ops(int dev_id, uint8_t max_ops,
 {
 	struct rte_idxd_rawdev *idxd =
 			(struct rte_idxd_rawdev *)rte_rawdevs[dev_id].dev_private;
-	struct rte_idxd_desc_batch *b = &idxd->batch_ring[idxd->next_completed];
-	uint16_t h_idx = idxd->next_ret_hdl;
-	int n = 0;
-
-	while (b->submitted && b->comp.status != 0) {
-		idxd->last_completed_hdl = b->hdl_end;
-		b->submitted = 0;
-		b->op_count = 0;
-		if (++idxd->next_completed == idxd->batch_ring_sz)
-			idxd->next_completed = 0;
-		b = &idxd->batch_ring[idxd->next_completed];
+	unsigned short n, h_idx;
+
+	while (idxd->batch_idx_read != idxd->batch_idx_write) {
+		uint16_t idx_to_chk = idxd->batch_idx_ring[idxd->batch_idx_read];
+		volatile struct rte_idxd_completion *comp_to_chk =
+				(struct rte_idxd_completion *)&idxd->desc_ring[idx_to_chk];
+		if (comp_to_chk->status == 0)
+			break;
+		/* avail points to one after the last one written */
+		idxd->hdls_avail = (idx_to_chk + 1) & idxd->desc_ring_mask;
+		idxd->batch_idx_read++;
+		if (idxd->batch_idx_read > idxd->max_batches)
+			idxd->batch_idx_read = 0;
 	}
 
-	if (!idxd->hdls_disable)
-		for (n = 0; n < max_ops && h_idx != idxd->last_completed_hdl; n++) {
-			src_hdls[n] = idxd->hdl_ring[h_idx].src;
-			dst_hdls[n] = idxd->hdl_ring[h_idx].dst;
-			if (++h_idx == idxd->hdl_ring_sz)
-				h_idx = 0;
-		}
-	else
-		while (h_idx != idxd->last_completed_hdl) {
-			n++;
-			if (++h_idx == idxd->hdl_ring_sz)
-				h_idx = 0;
-		}
-
-	idxd->next_ret_hdl = h_idx;
+	if (idxd->cfg.hdls_disable) {
+		n = (idxd->hdls_avail < idxd->hdls_read) ?
+				(idxd->hdls_avail + idxd->desc_ring_mask + 1 - idxd->hdls_read) :
+				(idxd->hdls_avail - idxd->hdls_read);
+		idxd->hdls_read = idxd->hdls_avail;
+		goto out;
+	}
+
+	for (n = 0, h_idx = idxd->hdls_read;
+			n < max_ops && h_idx != idxd->hdls_avail; n++) {
+		src_hdls[n] = idxd->hdl_ring[h_idx].src;
+		dst_hdls[n] = idxd->hdl_ring[h_idx].dst;
+		if (++h_idx > idxd->desc_ring_mask)
+			h_idx = 0;
+	}
+	idxd->hdls_read = h_idx;
+
+out:
 	idxd->xstats.completed += n;
 	return n;
 }
-- 
2.20.1
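
Supplementary note (not part of the patch above): the standalone sketch below is a minimal illustration of the index arithmetic this rework introduces in __idxd_write_desc() and __idxd_perform_ops(): a power-of-two descriptor ring allocated at twice its nominal size so a batch never wraps, plus a batch-index ring of max_batches + 1 slots where read == write means empty. All names here (demo_ring, demo_enqueue, demo_submit, demo_complete, RING_SIZE, MAX_BATCHES) are hypothetical and only mirror the patch's ring bookkeeping; the real driver additionally writes hardware descriptors, completion records and user handles, which are omitted.

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8		/* power-of-two ring size, like max_desc */
#define MAX_BATCHES 4		/* batches the device may have outstanding */

struct demo_ring {
	uint64_t desc[RING_SIZE * 2];	/* 2x allocation: a batch never wraps */
	uint16_t batch_idx_ring[MAX_BATCHES + 1]; /* +1 so read == write is "empty" */
	uint16_t mask;			/* RING_SIZE - 1 */
	uint16_t batch_idx_read, batch_idx_write;
	uint16_t hdls_read;		/* read pointer for completed slots */
	uint16_t batch_start, batch_size;
};

/* space checks and write index, mirroring __idxd_write_desc() */
static int demo_enqueue(struct demo_ring *r, uint64_t op)
{
	uint16_t write_idx = r->batch_start + r->batch_size;

	/* batch-index ring full? */
	if ((r->batch_idx_read == 0 && r->batch_idx_write == MAX_BATCHES) ||
			r->batch_idx_write + 1 == r->batch_idx_read)
		return -1;
	/* descriptor/handle ring full? */
	if (((write_idx + 1) & r->mask) == r->hdls_read)
		return -1;

	r->desc[write_idx] = op;	/* may land in the overflow half */
	r->batch_size++;
	return 0;
}

/* index bookkeeping on submit, mirroring __idxd_perform_ops() */
static uint16_t demo_submit(struct demo_ring *r)
{
	/* completion slot wraps back into the ring even though descs don't */
	uint16_t comp_idx = (r->batch_start + r->batch_size - 1) & r->mask;

	r->batch_start = (r->batch_start + r->batch_size) & r->mask;
	r->batch_size = 0;

	r->batch_idx_ring[r->batch_idx_write++] = comp_idx;
	if (r->batch_idx_write > MAX_BATCHES)
		r->batch_idx_write = 0;
	return comp_idx;
}

/* pretend the oldest submitted batch has completed, as in __idxd_completed_ops() */
static void demo_complete(struct demo_ring *r)
{
	uint16_t idx = r->batch_idx_ring[r->batch_idx_read];

	r->hdls_read = (idx + 1) & r->mask;
	r->batch_idx_read++;
	if (r->batch_idx_read > MAX_BATCHES)
		r->batch_idx_read = 0;
}

int main(void)
{
	struct demo_ring r = { .mask = RING_SIZE - 1 };
	int i;

	for (i = 0; i < 6; i++)		/* batch 1 fills slots 0..5 */
		demo_enqueue(&r, i);
	printf("batch 1 completion slot: %u\n", (unsigned)demo_submit(&r));
	demo_complete(&r);		/* frees slots 0..5 for reuse */

	for (i = 0; i < 4; i++)		/* batch 2 fills 6, 7, then 8, 9 (overflow) */
		demo_enqueue(&r, i);
	printf("batch 2 completion slot: %u\n", (unsigned)demo_submit(&r));
	return 0;
}

Running the sketch prints completion slots 5 and 1: the second batch occupies contiguous physical slots 6 to 9 (8 and 9 being the overflow area beyond the ring mask), while its completion index wraps back to 1, which is the value batch_idx_ring records for the completion-polling side to check.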