dma/idxd: add data path job submission
author Kevin Laatz <kevin.laatz@intel.com>
Wed, 20 Oct 2021 16:30:06 +0000 (16:30 +0000)
committer Thomas Monjalon <thomas@monjalon.net>
Fri, 22 Oct 2021 20:40:59 +0000 (22:40 +0200)
Add data path functions for enqueuing and submitting operations to DSA
devices.

Documentation updates are included for the dmadev library and IDXD driver docs
as appropriate.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
Reviewed-by: Conor Walsh <conor.walsh@intel.com>
Reviewed-by: Chengwen Feng <fengchengwen@huawei.com>
doc/guides/dmadevs/idxd.rst
doc/guides/prog_guide/dmadev.rst
drivers/dma/idxd/idxd_common.c
drivers/dma/idxd/idxd_internal.h
drivers/dma/idxd/meson.build

index 711890b..d548c47 100644 (file)
@@ -138,3 +138,12 @@ IDXD configuration requirements:
 
 Once configured, the device can then be made ready for use by calling the
 ``rte_dma_start()`` API.
+
+Performing Data Copies
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Refer to the :ref:`Enqueue / Dequeue APIs <dmadev_enqueue_dequeue>` section of the dmadev library
+documentation for details on operation enqueue and submission API usage.
+
+It is expected that, for efficiency reasons, a burst of operations will be enqueued to the
+device via multiple enqueue calls between calls to the ``rte_dma_submit()`` function.
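+
+A minimal sketch of this pattern is shown below; the ``dev_id`` and ``vchan`` values, the
+source and destination buffers, and the ``NUM_OPS``, ``DMA_BURST_SZ`` and ``COPY_LEN``
+constants are illustrative placeholders, and error handling is omitted for brevity:
+
+.. code-block:: C
+
+   struct rte_mbuf *srcs[NUM_OPS], *dsts[NUM_OPS];
+   unsigned int i;
+
+   for (i = 0; i < RTE_DIM(srcs); i++) {
+      /* enqueue a copy without ringing the doorbell for each operation */
+      if (rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova(srcs[i]),
+            rte_pktmbuf_iova(dsts[i]), COPY_LEN, 0) < 0)
+         break;
+      /* ring the doorbell once per burst of enqueued operations */
+      if ((i % DMA_BURST_SZ) == DMA_BURST_SZ - 1)
+         rte_dma_submit(dev_id, vchan);
+   }
+   /* submit any operations remaining from a final, partial burst */
+   rte_dma_submit(dev_id, vchan);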
index 32f7147..30734f3 100644 (file)
@@ -67,6 +67,8 @@ can be used to get the device info and supported features.
 Silent mode is a special device capability which does not require the
 application to invoke dequeue APIs.
 
+.. _dmadev_enqueue_dequeue:
+
 
 Enqueue / Dequeue APIs
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -80,6 +82,23 @@ The ``rte_dma_submit`` API is used to issue doorbell to hardware.
 Alternatively the ``RTE_DMA_OP_FLAG_SUBMIT`` flag can be passed to the enqueue
 APIs to also issue the doorbell to hardware.
 
+The following code demonstrates how to enqueue a burst of copies to the
+device and start the hardware processing of them:
+
+.. code-block:: C
+
+   struct rte_mbuf *srcs[DMA_BURST_SZ], *dsts[DMA_BURST_SZ];
+   unsigned int i;
+
+   for (i = 0; i < RTE_DIM(srcs); i++) {
+      if (rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova(srcs[i]),
+            rte_pktmbuf_iova(dsts[i]), COPY_LEN, 0) < 0) {
+         PRINT_ERR("Error with rte_dma_copy for buffer %u\n", i);
+         return -1;
+      }
+   }
+   rte_dma_submit(dev_id, vchan);
+
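+The results of the operations enqueued above can later be gathered using the dequeue APIs
+described below. A minimal sketch, reusing the ``dev_id``, ``vchan`` and ``DMA_BURST_SZ``
+values from the example above, might look as follows:
+
+.. code-block:: C
+
+   uint16_t n_done, last_idx;
+   bool has_error = false;
+
+   /* gather up to DMA_BURST_SZ completed operations from the device */
+   n_done = rte_dma_completed(dev_id, vchan, DMA_BURST_SZ, &last_idx, &has_error);
+   if (has_error) {
+      /* at least one operation failed; rte_dma_completed_status() can be used
+       * to retrieve per-operation status codes
+       */
+   }
+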
 There are two dequeue APIs ``rte_dma_completed`` and
 ``rte_dma_completed_status``, these are used to obtain the results of the
 enqueue requests. ``rte_dma_completed`` will return the number of successfully
index 70d094e..616829c 100644 (file)
  * Copyright 2021 Intel Corporation
  */
 
+#include <x86intrin.h>
+
 #include <rte_malloc.h>
 #include <rte_common.h>
 #include <rte_log.h>
+#include <rte_prefetch.h>
 
 #include "idxd_internal.h"
 
 #define IDXD_PMD_NAME_STR "dmadev_idxd"
 
+static __rte_always_inline rte_iova_t
+__desc_idx_to_iova(struct idxd_dmadev *idxd, uint16_t n)
+{
+       return idxd->desc_iova + (n * sizeof(struct idxd_hw_desc));
+}
+
+static __rte_always_inline void
+__idxd_movdir64b(volatile void *dst, const struct idxd_hw_desc *src)
+{
+       asm volatile (".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+                       :
+                       : "a" (dst), "d" (src)
+                       : "memory");
+}
+
+static __rte_always_inline void
+__submit(struct idxd_dmadev *idxd)
+{
+       rte_prefetch1(&idxd->batch_comp_ring[idxd->batch_idx_read]);
+
+       if (idxd->batch_size == 0)
+               return;
+
+       /* write completion to batch comp ring */
+       rte_iova_t comp_addr = idxd->batch_iova +
+                       (idxd->batch_idx_write * sizeof(struct idxd_completion));
+
+       if (idxd->batch_size == 1) {
+               /* submit batch directly */
+               struct idxd_hw_desc desc =
+                               idxd->desc_ring[idxd->batch_start & idxd->desc_ring_mask];
+               desc.completion = comp_addr;
+               desc.op_flags |= IDXD_FLAG_REQUEST_COMPLETION;
+               _mm_sfence(); /* fence before writing desc to device */
+               __idxd_movdir64b(idxd->portal, &desc);
+       } else {
+               const struct idxd_hw_desc batch_desc = {
+                               .op_flags = (idxd_op_batch << IDXD_CMD_OP_SHIFT) |
+                               IDXD_FLAG_COMPLETION_ADDR_VALID |
+                               IDXD_FLAG_REQUEST_COMPLETION,
+                               .desc_addr = __desc_idx_to_iova(idxd,
+                                               idxd->batch_start & idxd->desc_ring_mask),
+                               .completion = comp_addr,
+                               .size = idxd->batch_size,
+               };
+               _mm_sfence(); /* fence before writing desc to device */
+               __idxd_movdir64b(idxd->portal, &batch_desc);
+       }
+
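+       /* move to the next batch slot, wrapping to the start of the ring if needed */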
+       if (++idxd->batch_idx_write > idxd->max_batches)
+               idxd->batch_idx_write = 0;
+
+       idxd->batch_start += idxd->batch_size;
+       idxd->batch_size = 0;
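+       /* record where the next batch starts and clear its completion record */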
+       idxd->batch_idx_ring[idxd->batch_idx_write] = idxd->batch_start;
+       _mm256_store_si256((void *)&idxd->batch_comp_ring[idxd->batch_idx_write],
+                       _mm256_setzero_si256());
+}
+
+static __rte_always_inline int
+__idxd_write_desc(struct idxd_dmadev *idxd,
+               const uint32_t op_flags,
+               const rte_iova_t src,
+               const rte_iova_t dst,
+               const uint32_t size,
+               const uint32_t flags)
+{
+       uint16_t mask = idxd->desc_ring_mask;
+       uint16_t job_id = idxd->batch_start + idxd->batch_size;
+       /* we never wrap batches, so we only mask the start and allow start+size to overflow */
+       uint16_t write_idx = (idxd->batch_start & mask) + idxd->batch_size;
+
+       /* first check batch ring space then desc ring space */
+       if ((idxd->batch_idx_read == 0 && idxd->batch_idx_write == idxd->max_batches) ||
+                       idxd->batch_idx_write + 1 == idxd->batch_idx_read)
+               return -ENOSPC;
+       if (((write_idx + 1) & mask) == (idxd->ids_returned & mask))
+               return -ENOSPC;
+
+       /* write desc. Note: descriptors don't wrap, but the completion address does */
+       const uint64_t op_flags64 = (uint64_t)(op_flags | IDXD_FLAG_COMPLETION_ADDR_VALID) << 32;
+       const uint64_t comp_addr = __desc_idx_to_iova(idxd, write_idx & mask);
+       _mm256_store_si256((void *)&idxd->desc_ring[write_idx],
+                       _mm256_set_epi64x(dst, src, comp_addr, op_flags64));
+       _mm256_store_si256((void *)&idxd->desc_ring[write_idx].size,
+                       _mm256_set_epi64x(0, 0, 0, size));
+
+       idxd->batch_size++;
+
+       rte_prefetch0_write(&idxd->desc_ring[write_idx + 1]);
+
+       if (flags & RTE_DMA_OP_FLAG_SUBMIT)
+               __submit(idxd);
+
+       return job_id;
+}
+
+int
+idxd_enqueue_copy(void *dev_private, uint16_t qid __rte_unused, rte_iova_t src,
+               rte_iova_t dst, unsigned int length, uint64_t flags)
+{
+       /* we can take advantage of the fact that the fence flags in dmadev and DSA are the same,
+        * but check it at compile time to be sure.
+        */
+       RTE_BUILD_BUG_ON(RTE_DMA_OP_FLAG_FENCE != IDXD_FLAG_FENCE);
+       uint32_t memmove = (idxd_op_memmove << IDXD_CMD_OP_SHIFT) |
+                       IDXD_FLAG_CACHE_CONTROL | (flags & IDXD_FLAG_FENCE);
+       return __idxd_write_desc(dev_private, memmove, src, dst, length,
+                       flags);
+}
+
+int
+idxd_enqueue_fill(void *dev_private, uint16_t qid __rte_unused, uint64_t pattern,
+               rte_iova_t dst, unsigned int length, uint64_t flags)
+{
+       uint32_t fill = (idxd_op_fill << IDXD_CMD_OP_SHIFT) |
+                       IDXD_FLAG_CACHE_CONTROL | (flags & IDXD_FLAG_FENCE);
+       return __idxd_write_desc(dev_private, fill, pattern, dst, length,
+                       flags);
+}
+
+int
+idxd_submit(void *dev_private, uint16_t qid __rte_unused)
+{
+       __submit(dev_private);
+       return 0;
+}
+
 int
 idxd_dump(const struct rte_dma_dev *dev, FILE *f)
 {
@@ -139,6 +270,10 @@ idxd_dmadev_create(const char *name, struct rte_device *dev,
        dmadev->dev_ops = ops;
        dmadev->device = dev;
 
+       dmadev->fp_obj->copy = idxd_enqueue_copy;
+       dmadev->fp_obj->fill = idxd_enqueue_fill;
+       dmadev->fp_obj->submit = idxd_submit;
+
        idxd = dmadev->data->dev_private;
        *idxd = *base_idxd; /* copy over the main fields already passed in */
        idxd->dmadev = dmadev;
index 1dbe31a..ab4d710 100644 (file)
@@ -87,5 +87,10 @@ int idxd_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
                const struct rte_dma_vchan_conf *qconf, uint32_t qconf_sz);
 int idxd_info_get(const struct rte_dma_dev *dev, struct rte_dma_info *dev_info,
                uint32_t size);
+int idxd_enqueue_copy(void *dev_private, uint16_t qid, rte_iova_t src,
+               rte_iova_t dst, unsigned int length, uint64_t flags);
+int idxd_enqueue_fill(void *dev_private, uint16_t qid, uint64_t pattern,
+               rte_iova_t dst, unsigned int length, uint64_t flags);
+int idxd_submit(void *dev_private, uint16_t qid);
 
 #endif /* _IDXD_INTERNAL_H_ */
index dcc0a29..f1396be 100644 (file)
@@ -5,6 +5,7 @@ build = dpdk_conf.has('RTE_ARCH_X86')
 reason = 'only supported on x86'
 
 deps += ['bus_pci']
+cflags += '-mavx2' # all platforms with idxd HW support AVX2
 sources = files(
         'idxd_common.c',
         'idxd_pci.c',