vdpa/mlx5: prepare memory regions
authorMatan Azrad <matan@mellanox.com>
Sun, 2 Feb 2020 16:03:44 +0000 (16:03 +0000)
committerFerruh Yigit <ferruh.yigit@intel.com>
Wed, 5 Feb 2020 08:51:21 +0000 (09:51 +0100)
Memory regions are created in order to map the guest physical addresses
used by the virtio device on the guest side to the host physical
addresses used by the HW on the host side.

This way, for example, the HW can translate the addresses of the
packets posted by the guest and take the packets from the correct
place.

The design is to work with a single MR which will be configured to the
virtio queues in the HW, hence many direct MRs are grouped into a single
indirect MR.

Create functions to prepare and release MRs with all the related
resources that are required for it.

Create a new file mlx5_vdpa_mem.c to manage all the MR related code
in the driver.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
drivers/vdpa/mlx5/Makefile
drivers/vdpa/mlx5/meson.build
drivers/vdpa/mlx5/mlx5_vdpa.c
drivers/vdpa/mlx5/mlx5_vdpa.h [new file with mode: 0644]
drivers/vdpa/mlx5/mlx5_vdpa_mem.c [new file with mode: 0644]

index 1ab5296..bceab1e 100644 (file)
@@ -8,6 +8,7 @@ LIB = librte_pmd_mlx5_vdpa.a
 
 # Sources.
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
 
 # Basic CFLAGS.
 CFLAGS += -O3
@@ -15,6 +16,7 @@ CFLAGS += -std=c11 -Wall -Wextra
 CFLAGS += -g
 CFLAGS += -I$(RTE_SDK)/drivers/common/mlx5
 CFLAGS += -I$(RTE_SDK)/drivers/net/mlx5_vdpa
+CFLAGS += -I$(RTE_SDK)/lib/librte_sched
 CFLAGS += -I$(BUILDDIR)/drivers/common/mlx5
 CFLAGS += -D_BSD_SOURCE
 CFLAGS += -D_DEFAULT_SOURCE
@@ -22,7 +24,7 @@ CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -Wno-strict-prototypes
 LDLIBS += -lrte_common_mlx5
-LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci
+LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci -lrte_sched
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
index 6d3ab98..47f9537 100644 (file)
@@ -9,9 +9,10 @@ endif
 
 fmt_name = 'mlx5_vdpa'
 allow_experimental_apis = true
-deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal']
+deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
 sources = files(
        'mlx5_vdpa.c',
+       'mlx5_vdpa_mem.c',
 )
 cflags_options = [
        '-std=c11',
@@ -30,4 +31,4 @@ if get_option('buildtype').contains('debug')
        cflags += [ '-pedantic', '-DPEDANTIC' ]
 else
        cflags += [ '-UPEDANTIC' ]
-endif
\ No newline at end of file
+endif
index 00d3a19..16107cf 100644 (file)
@@ -7,13 +7,6 @@
 #include <rte_log.h>
 #include <rte_errno.h>
 #include <rte_bus_pci.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic ignored "-Wpedantic"
-#endif
-#include <rte_vdpa.h>
-#ifdef PEDANTIC
-#pragma GCC diagnostic error "-Wpedantic"
-#endif
 
 #include <mlx5_glue.h>
 #include <mlx5_common.h>
 #include <mlx5_prm.h>
 
 #include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
 
 
-struct mlx5_vdpa_priv {
-       TAILQ_ENTRY(mlx5_vdpa_priv) next;
-       int id; /* vDPA device id. */
-       struct ibv_context *ctx; /* Device context. */
-       struct rte_vdpa_dev_addr dev_addr;
-       struct mlx5_hca_vdpa_attr caps;
-};
-
 #ifndef VIRTIO_F_ORDER_PLATFORM
 #define VIRTIO_F_ORDER_PLATFORM 36
 #endif
@@ -243,6 +229,7 @@ mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                rte_errno = rte_errno ? rte_errno : EINVAL;
                goto error;
        }
+       SLIST_INIT(&priv->mr_list);
        pthread_mutex_lock(&priv_list_lock);
        TAILQ_INSERT_TAIL(&priv_list, priv, next);
        pthread_mutex_unlock(&priv_list_lock);
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
new file mode 100644 (file)
index 0000000..f367991
--- /dev/null
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_VDPA_H_
+#define RTE_PMD_MLX5_VDPA_H_
+
+#include <sys/queue.h>
+
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <rte_vdpa.h>
+#include <rte_vhost.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+
+struct mlx5_vdpa_query_mr { /* One memory region tracked in priv->mr_list. */
+       SLIST_ENTRY(mlx5_vdpa_query_mr) next; /* Link in priv->mr_list. */
+       void *addr; /* Host user address of the region (direct entries). */
+       uint64_t length; /* Region size (direct entries). */
+       struct mlx5dv_devx_umem *umem; /* DevX umem; left zeroed if indirect. */
+       struct mlx5_devx_obj *mkey; /* DevX mkey object of this entry. */
+       int is_indirect; /* 1 for the indirect mkey entry, 0 for a direct one. */
+};
+
+struct mlx5_vdpa_priv { /* Per-device private data of the vDPA driver. */
+       TAILQ_ENTRY(mlx5_vdpa_priv) next;
+       int id; /* vDPA device id. */
+       int vid; /* vhost device id. */
+       struct ibv_context *ctx; /* Device context. */
+       struct rte_vdpa_dev_addr dev_addr;
+       struct mlx5_hca_vdpa_attr caps;
+       uint32_t pdn; /* Protection Domain number. */
+       struct ibv_pd *pd; /* Protection domain the MRs are created on. */
+       uint32_t gpa_mkey_index; /* Id of the indirect mkey of guest memory. */
+       struct ibv_mr *null_mr; /* Null MR mapping the holes between regions. */
+       struct rte_vhost_memory *vmem; /* Guest memory table, sorted by GPA. */
+       SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list; /* Prepared MRs. */
+};
+
+/**
+ * Release all the prepared memory regions and all their related resources.
+ *
+ * Every entry of the MR list is destroyed (its mkey and, for direct entries,
+ * its umem), then the null MR, the protection domain and the vhost memory
+ * table are released when present.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Register all the memory regions of the virtio device to the HW and allocate
+ * all their related resources.
+ *
+ * On failure, everything that was prepared so far is released through
+ * mlx5_vdpa_mem_dereg().
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
+
+#endif /* RTE_PMD_MLX5_VDPA_H_ */
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_mem.c b/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
new file mode 100644 (file)
index 0000000..398ca35
--- /dev/null
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <stdlib.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_common.h>
+#include <rte_sched_common.h>
+
+#include <mlx5_prm.h>
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+/**
+ * Allocate the protection domain of the device and query its number.
+ *
+ * Nothing is done when the protection domain is already allocated.
+ *
+ * @param[in, out] priv
+ *   The vdpa driver private structure, pd and pdn are set on success.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_vdpa_pd_prepare(struct mlx5_vdpa_priv *priv)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+       struct mlx5dv_obj obj;
+       struct mlx5dv_pd pd_info;
+       int ret;
+
+       if (priv->pd)
+               return 0;
+       priv->pd = mlx5_glue->alloc_pd(priv->ctx);
+       if (priv->pd == NULL) {
+               DRV_LOG(ERR, "Failed to allocate PD.");
+               return errno ? -errno : -ENOMEM;
+       }
+       obj.pd.in = priv->pd;
+       obj.pd.out = &pd_info;
+       ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+       if (ret) {
+               /*
+                * Capture errno before logging and cleanup can overwrite it,
+                * and never report success when it is unexpectedly zero.
+                */
+               ret = errno ? -errno : -EINVAL;
+               DRV_LOG(ERR, "Fail to get PD object info.");
+               mlx5_glue->dealloc_pd(priv->pd);
+               priv->pd = NULL;
+               return ret;
+       }
+       priv->pdn = pd_info.pdn;
+       return 0;
+#else
+       (void)priv;
+       DRV_LOG(ERR, "Cannot get pdn - no DV support.");
+       return -ENOTSUP;
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+}
+
+/*
+ * Destroy every entry of the MR list (mkey first, then the umem of direct
+ * entries), then release the null MR, the protection domain and the vhost
+ * memory table.
+ */
+void
+mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv)
+{
+       struct mlx5_vdpa_query_mr *mr;
+
+       /* Drain the list from its head until nothing is left. */
+       while (!SLIST_EMPTY(&priv->mr_list)) {
+               mr = SLIST_FIRST(&priv->mr_list);
+               claim_zero(mlx5_devx_cmd_destroy(mr->mkey));
+               /* Only direct entries own a umem. */
+               if (!mr->is_indirect)
+                       claim_zero(mlx5_glue->devx_umem_dereg(mr->umem));
+               SLIST_REMOVE_HEAD(&priv->mr_list, next);
+               rte_free(mr);
+       }
+       SLIST_INIT(&priv->mr_list);
+       if (priv->null_mr != NULL) {
+               claim_zero(mlx5_glue->dereg_mr(priv->null_mr));
+               priv->null_mr = NULL;
+       }
+       if (priv->pd != NULL) {
+               claim_zero(mlx5_glue->dealloc_pd(priv->pd));
+               priv->pd = NULL;
+       }
+       /* free(NULL) is a no-op, so no guard is needed here. */
+       free(priv->vmem);
+       priv->vmem = NULL;
+}
+
+/*
+ * qsort() comparator ordering vhost memory regions by ascending guest
+ * physical address.
+ */
+static int
+mlx5_vdpa_regions_addr_cmp(const void *a, const void *b)
+{
+       const struct rte_vhost_mem_region *lhs = a;
+       const struct rte_vhost_mem_region *rhs = b;
+
+       /* Yields -1, 0 or 1 without risking overflow of a subtraction. */
+       return (lhs->guest_phys_addr > rhs->guest_phys_addr) -
+              (lhs->guest_phys_addr < rhs->guest_phys_addr);
+}
+
+/* Number of maximum-sized KLM entries needed to cover sz bytes. */
+#define KLM_NUM_MAX_ALIGN(sz) (RTE_ALIGN_CEIL(sz, MLX5_MAX_KLM_BYTE_COUNT) / \
+                              MLX5_MAX_KLM_BYTE_COUNT)
+
+/*
+ * Allocate and sort the region list and choose indirect mkey mode:
+ *   1. Calculate GCD, guest memory size and indirect mkey entries num per mode.
+ *   2. Align GCD to the maximum allowed size(2G) and to be power of 2.
+ *   3. Decide the indirect mkey mode according to the next rules:
+ *         a. If both KLM_FBS entries number and KLM entries number are bigger
+ *            than the maximum allowed(MLX5_DEVX_MAX_KLM_ENTRIES) - error.
+ *         b. KLM mode if KLM_FBS entries number is bigger than the maximum
+ *            allowed(MLX5_DEVX_MAX_KLM_ENTRIES).
+ *         c. KLM mode if GCD is smaller than the minimum allowed(4K).
+ *         d. KLM mode if the total size of KLM entries is in one cache line
+ *            and the total size of KLM_FBS entries is not in one cache line.
+ *         e. Otherwise, KLM_FBS mode.
+ *
+ * @param[in] vid
+ *   The vhost device id.
+ * @param[out] mode
+ *   The chosen indirect mkey access mode.
+ * @param[out] mem_size
+ *   Size of the guest physical range from the first region start to the last
+ *   region end, holes included.
+ * @param[out] gcd
+ *   The adjusted GCD of all the region and hole sizes.
+ * @param[out] entries_num
+ *   Number of indirect mkey entries for the chosen mode. Written only on
+ *   success.
+ *
+ * @return
+ *   The sorted memory table (to be released with free()) on success, NULL
+ *   otherwise and rte_errno is set.
+ */
+static struct rte_vhost_memory *
+mlx5_vdpa_vhost_mem_regions_prepare(int vid, uint8_t *mode, uint64_t *mem_size,
+                                   uint64_t *gcd, uint32_t *entries_num)
+{
+       struct rte_vhost_memory *mem;
+       uint64_t size;
+       uint64_t klm_entries_num = 0;
+       uint64_t klm_fbs_entries_num;
+       uint64_t chosen_entries_num;
+       uint32_t i;
+       int ret = rte_vhost_get_mem_table(vid, &mem);
+
+       if (ret < 0) {
+               DRV_LOG(ERR, "Failed to get VM memory layout vid =%d.", vid);
+               rte_errno = EINVAL;
+               return NULL;
+       }
+       /* The first/last region accesses below need at least one region. */
+       if (mem->nregions == 0) {
+               DRV_LOG(ERR, "VM memory layout of vid %d has no regions.",
+                       vid);
+               rte_errno = EINVAL;
+               goto error;
+       }
+       qsort(mem->regions, mem->nregions, sizeof(mem->regions[0]),
+             mlx5_vdpa_regions_addr_cmp);
+       *mem_size = (mem->regions[(mem->nregions - 1)].guest_phys_addr) +
+                                     (mem->regions[(mem->nregions - 1)].size) -
+                                             (mem->regions[0].guest_phys_addr);
+       *gcd = 0;
+       /*
+        * NOTE(review): the sizes passed to rte_get_gcd() are 64-bit - confirm
+        * that rte_get_gcd() does not truncate its operands.
+        */
+       for (i = 0; i < mem->nregions; ++i) {
+               DRV_LOG(INFO,  "Region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64
+                       ", size 0x%" PRIx64 ".", i,
+                       mem->regions[i].host_user_addr,
+                       mem->regions[i].guest_phys_addr, mem->regions[i].size);
+               if (i > 0) {
+                       /* Hole handle. */
+                       size = mem->regions[i].guest_phys_addr -
+                               (mem->regions[i - 1].guest_phys_addr +
+                                mem->regions[i - 1].size);
+                       *gcd = rte_get_gcd(*gcd, size);
+                       klm_entries_num += KLM_NUM_MAX_ALIGN(size);
+               }
+               size = mem->regions[i].size;
+               *gcd = rte_get_gcd(*gcd, size);
+               klm_entries_num += KLM_NUM_MAX_ALIGN(size);
+       }
+       /* A zero GCD cannot be used as the divisor of the memory size. */
+       if (*gcd == 0) {
+               DRV_LOG(ERR, "Invalid memory region sizes of vid %d.", vid);
+               rte_errno = EINVAL;
+               goto error;
+       }
+       if (*gcd > MLX5_MAX_KLM_BYTE_COUNT)
+               *gcd = rte_get_gcd(*gcd, MLX5_MAX_KLM_BYTE_COUNT);
+       if (!RTE_IS_POWER_OF_2(*gcd)) {
+               uint64_t candidate_gcd = rte_align64prevpow2(*gcd);
+
+               /* Biggest power of 2 that still divides the GCD. */
+               while (candidate_gcd > 1 && (*gcd % candidate_gcd))
+                       candidate_gcd /= 2;
+               DRV_LOG(DEBUG, "GCD 0x%" PRIx64 " is not power of 2. Adjusted "
+                       "GCD is 0x%" PRIx64 ".", *gcd, candidate_gcd);
+               *gcd = candidate_gcd;
+       }
+       klm_fbs_entries_num = *mem_size / *gcd;
+       if (*gcd < MLX5_MIN_KLM_FIXED_BUFFER_SIZE || klm_fbs_entries_num >
+           MLX5_DEVX_MAX_KLM_ENTRIES ||
+           ((klm_entries_num * sizeof(struct mlx5_klm)) <=
+           RTE_CACHE_LINE_SIZE && (klm_fbs_entries_num *
+                                   sizeof(struct mlx5_klm)) >
+                                                       RTE_CACHE_LINE_SIZE)) {
+               *mode = MLX5_MKC_ACCESS_MODE_KLM;
+               chosen_entries_num = klm_entries_num;
+               DRV_LOG(INFO, "Indirect mkey mode is KLM.");
+       } else {
+               *mode = MLX5_MKC_ACCESS_MODE_KLM_FBS;
+               chosen_entries_num = klm_fbs_entries_num;
+               DRV_LOG(INFO, "Indirect mkey mode is KLM Fixed Buffer Size.");
+       }
+       DRV_LOG(DEBUG, "Memory registration information: nregions = %u, "
+               "mem_size = 0x%" PRIx64 ", GCD = 0x%" PRIx64
+               ", klm_fbs_entries_num = 0x%" PRIx64 ", klm_entries_num = 0x%"
+               PRIx64 ".", mem->nregions, *mem_size, *gcd, klm_fbs_entries_num,
+               klm_entries_num);
+       /* Check the 64-bit count, before it is narrowed to 32 bits. */
+       if (chosen_entries_num > MLX5_DEVX_MAX_KLM_ENTRIES) {
+               DRV_LOG(ERR, "Failed to prepare memory of vid %d - memory is "
+                       "too fragmented.", vid);
+               rte_errno = ENOMEM;
+               goto error;
+       }
+       *entries_num = (uint32_t)chosen_entries_num;
+       return mem;
+error:
+       free(mem);
+       return NULL;
+}
+
+/* Size of a KLM entry covering sz bytes, capped to the maximum allowed. */
+#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \
+                               MLX5_MAX_KLM_BYTE_COUNT : (sz))
+
+/*
+ * Append to klm_array, starting at klm_index, the entries mapping the guest
+ * physical range [addr, addr + len) to mkey in chunks of klm_size bytes (the
+ * last chunk may be shorter). Return the index following the last entry.
+ */
+static int
+mlx5_vdpa_klm_range_fill(struct mlx5_klm *klm_array, int klm_index,
+                        uint64_t addr, uint64_t len, uint64_t klm_size,
+                        uint32_t mkey)
+{
+       uint64_t k;
+
+       for (k = 0; k < len; k += klm_size) {
+               klm_array[klm_index].byte_count = len - k < klm_size ?
+                                                 len - k : klm_size;
+               klm_array[klm_index].mkey = mkey;
+               klm_array[klm_index].address = addr + k;
+               klm_index++;
+       }
+       return klm_index;
+}
+
+/*
+ * The target here is to group all the physical memory regions of the
+ * virtio device in one indirect mkey.
+ * For KLM Fixed Buffer Size mode (HW find the translation entry in one
+ * read according to the guest physical address):
+ * All the sub-direct mkeys of it must be in the same size, hence, each
+ * one of them should be in the GCD size of all the virtio memory
+ * regions and the holes between them.
+ * For KLM mode (each entry may be in different size so HW must iterate
+ * the entries):
+ * Each virtio memory region and each hole between them have one entry,
+ * just need to cover the maximum allowed size(2G) by splitting entries
+ * which their associated memory regions are bigger than 2G.
+ * It means that each virtio memory region may be mapped to more than
+ * one direct mkey in the 2 modes.
+ * All the holes of invalid memory between the virtio memory regions
+ * will be mapped to the null memory region for security.
+ *
+ * On success the MR list of priv holds one direct entry per region plus the
+ * indirect entry, and gpa_mkey_index is the id of the indirect mkey.
+ * On failure everything prepared so far is released and a negative errno
+ * value is returned with rte_errno set.
+ */
+int
+mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
+{
+       /* Zero all the attributes so no field is passed uninitialized. */
+       struct mlx5_devx_mkey_attr mkey_attr = { .klm_num = 0 };
+       struct mlx5_vdpa_query_mr *entry = NULL;
+       struct rte_vhost_mem_region *reg = NULL;
+       struct mlx5_klm *klm_array = NULL;
+       uint8_t mode = 0;
+       uint32_t entries_num = 0;
+       uint32_t i;
+       uint64_t gcd = 0;
+       uint64_t klm_size;
+       uint64_t mem_size = 0;
+       int klm_index = 0;
+       int ret;
+       struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
+                             (priv->vid, &mode, &mem_size, &gcd, &entries_num);
+
+       /* Never report success when the memory table was not prepared. */
+       if (!mem)
+               return rte_errno ? -rte_errno : -EINVAL;
+       priv->vmem = mem;
+       /*
+        * The KLM array is allocated only now that entries_num is validated,
+        * and from the heap so that its size cannot exhaust the stack.
+        */
+       klm_array = rte_zmalloc(__func__, entries_num * sizeof(*klm_array), 0);
+       if (!klm_array) {
+               DRV_LOG(ERR, "Failed to allocate KLM array memory.");
+               ret = -ENOMEM;
+               goto error;
+       }
+       ret = mlx5_vdpa_pd_prepare(priv);
+       if (ret)
+               goto error;
+       priv->null_mr = mlx5_glue->alloc_null_mr(priv->pd);
+       if (!priv->null_mr) {
+               ret = errno ? -errno : -ENOMEM;
+               DRV_LOG(ERR, "Failed to allocate null MR.");
+               goto error;
+       }
+       DRV_LOG(DEBUG, "Dump fill Mkey = %u.", priv->null_mr->lkey);
+       for (i = 0; i < mem->nregions; i++) {
+               reg = &mem->regions[i];
+               entry = rte_zmalloc(__func__, sizeof(*entry), 0);
+               if (!entry) {
+                       ret = -ENOMEM;
+                       DRV_LOG(ERR, "Failed to allocate mem entry memory.");
+                       goto error;
+               }
+               entry->umem = mlx5_glue->devx_umem_reg(priv->ctx,
+                                        (void *)(uintptr_t)reg->host_user_addr,
+                                            reg->size, IBV_ACCESS_LOCAL_WRITE);
+               if (!entry->umem) {
+                       ret = errno ? -errno : -ENOMEM;
+                       DRV_LOG(ERR, "Failed to register Umem by Devx.");
+                       goto error;
+               }
+               mkey_attr.addr = (uintptr_t)(reg->guest_phys_addr);
+               mkey_attr.size = reg->size;
+               mkey_attr.umem_id = entry->umem->umem_id;
+               mkey_attr.pd = priv->pdn;
+               mkey_attr.pg_access = 1;
+               mkey_attr.klm_array = NULL;
+               mkey_attr.klm_num = 0;
+               entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+               if (!entry->mkey) {
+                       ret = rte_errno ? -rte_errno : -EINVAL;
+                       DRV_LOG(ERR, "Failed to create direct Mkey.");
+                       goto error;
+               }
+               entry->addr = (void *)(uintptr_t)(reg->host_user_addr);
+               entry->length = reg->size;
+               entry->is_indirect = 0;
+               if (i > 0) {
+                       /* Map the hole before this region to the null MR. */
+                       uint64_t sadd = mem->regions[i - 1].guest_phys_addr +
+                                       mem->regions[i - 1].size;
+                       uint64_t empty_region_sz = reg->guest_phys_addr - sadd;
+
+                       if (empty_region_sz > 0) {
+                               klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
+                                     KLM_SIZE_MAX_ALIGN(empty_region_sz) : gcd;
+                               klm_index = mlx5_vdpa_klm_range_fill
+                                       (klm_array, klm_index, sadd,
+                                        empty_region_sz, klm_size,
+                                        priv->null_mr->lkey);
+                       }
+               }
+               klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
+                                           KLM_SIZE_MAX_ALIGN(reg->size) : gcd;
+               klm_index = mlx5_vdpa_klm_range_fill(klm_array, klm_index,
+                                                    reg->guest_phys_addr,
+                                                    reg->size, klm_size,
+                                                    entry->mkey->id);
+               SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
+               /* The list owns the entry now, the error path must not. */
+               entry = NULL;
+       }
+       mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr);
+       mkey_attr.size = mem_size;
+       mkey_attr.pd = priv->pdn;
+       mkey_attr.umem_id = 0;
+       /* Must be zero for KLM mode. */
+       mkey_attr.log_entity_size = mode == MLX5_MKC_ACCESS_MODE_KLM_FBS ?
+                                                         rte_log2_u64(gcd) : 0;
+       mkey_attr.pg_access = 0;
+       mkey_attr.klm_array = klm_array;
+       mkey_attr.klm_num = klm_index;
+       entry = rte_zmalloc(__func__, sizeof(*entry), 0);
+       if (!entry) {
+               DRV_LOG(ERR, "Failed to allocate memory for indirect entry.");
+               ret = -ENOMEM;
+               goto error;
+       }
+       entry->mkey = mlx5_devx_cmd_mkey_create(priv->ctx, &mkey_attr);
+       if (!entry->mkey) {
+               ret = rte_errno ? -rte_errno : -EINVAL;
+               DRV_LOG(ERR, "Failed to create indirect Mkey.");
+               goto error;
+       }
+       entry->is_indirect = 1;
+       SLIST_INSERT_HEAD(&priv->mr_list, entry, next);
+       priv->gpa_mkey_index = entry->mkey->id;
+       /* The array was only an input of the indirect mkey creation. */
+       rte_free(klm_array);
+       return 0;
+error:
+       /* Release the entry that was not inserted to the list yet, if any. */
+       if (entry) {
+               if (entry->mkey)
+                       mlx5_devx_cmd_destroy(entry->mkey);
+               if (entry->umem)
+                       mlx5_glue->devx_umem_dereg(entry->umem);
+               rte_free(entry);
+       }
+       rte_free(klm_array);
+       mlx5_vdpa_mem_dereg(priv);
+       rte_errno = -ret;
+       return ret;
+}