vfio: fix expanding DMA area in ppc64le
[dpdk.git] / lib / librte_eal / linux / eal / eal_vfio.c
index c821e83..ed04231 100644 (file)
@@ -15,6 +15,7 @@
 #include <rte_vfio.h>
 
 #include "eal_filesystem.h"
+#include "eal_memcfg.h"
 #include "eal_vfio.h"
 #include "eal_private.h"
 
@@ -635,8 +636,6 @@ int
 rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                int *vfio_dev_fd, struct vfio_device_info *device_info)
 {
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
        struct vfio_group_status group_status = {
                        .argsz = sizeof(group_status)
        };
@@ -685,7 +684,9 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                rte_vfio_clear_group(vfio_group_fd);
                return -1;
        } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
-               RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", dev_addr);
+               RTE_LOG(ERR, EAL, "  %s VFIO group is not viable! "
+                               "Not all devices in IOMMU group bound to VFIO or unbound\n",
+                               dev_addr);
                close(vfio_group_fd);
                rte_vfio_clear_group(vfio_group_fd);
                return -1;
@@ -737,7 +738,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                        /* lock memory hotplug before mapping and release it
                         * after registering callback, to prevent races
                         */
-                       rte_rwlock_read_lock(mem_lock);
+                       rte_mcfg_mem_read_lock();
                        if (vfio_cfg == default_vfio_cfg)
                                ret = t->dma_map_func(vfio_container_fd);
                        else
@@ -748,7 +749,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                                        dev_addr, errno, strerror(errno));
                                close(vfio_group_fd);
                                rte_vfio_clear_group(vfio_group_fd);
-                               rte_rwlock_read_unlock(mem_lock);
+                               rte_mcfg_mem_read_unlock();
                                return -1;
                        }
 
@@ -779,7 +780,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                                                        map->len);
                                        rte_spinlock_recursive_unlock(
                                                        &user_mem_maps->lock);
-                                       rte_rwlock_read_unlock(mem_lock);
+                                       rte_mcfg_mem_read_unlock();
                                        return -1;
                                }
                        }
@@ -793,7 +794,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                        else
                                ret = 0;
                        /* unlock memory hotplug */
-                       rte_rwlock_read_unlock(mem_lock);
+                       rte_mcfg_mem_read_unlock();
 
                        if (ret && rte_errno != ENOTSUP) {
                                RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
@@ -860,8 +861,6 @@ int
 rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
                    int vfio_dev_fd)
 {
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
        struct vfio_group_status group_status = {
                        .argsz = sizeof(group_status)
        };
@@ -874,7 +873,7 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
         * VFIO device, because this might be the last device and we might need
         * to unregister the callback.
         */
-       rte_rwlock_read_lock(mem_lock);
+       rte_mcfg_mem_read_lock();
 
        /* get group number */
        ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
@@ -945,7 +944,7 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
        ret = 0;
 
 out:
-       rte_rwlock_read_unlock(mem_lock);
+       rte_mcfg_mem_read_unlock();
        return ret;
 }
 
@@ -1264,9 +1263,21 @@ vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 
                ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
                if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
-                               errno, strerror(errno));
+                       /**
+                        * In case the mapping was already done EEXIST will be
+                        * returned from kernel.
+                        */
+                       if (errno == EEXIST) {
+                               RTE_LOG(DEBUG, EAL,
+                                       " Memory segment is already mapped,"
+                                       " skipping\n");
+                       } else {
+                               RTE_LOG(ERR, EAL,
+                                       "  cannot set up DMA remapping,"
+                                       " error %i (%s)\n",
+                                       errno, strerror(errno));
                                return -1;
+                       }
                }
        } else {
                memset(&dma_unmap, 0, sizeof(dma_unmap));
@@ -1325,20 +1336,24 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 
                ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
                if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
-                               errno, strerror(errno));
+                       /**
+                        * In case the mapping was already done EBUSY will be
+                        * returned from kernel.
+                        */
+                       if (errno == EBUSY) {
+                               RTE_LOG(DEBUG, EAL,
+                                       " Memory segment is already mapped,"
+                                       " skipping\n");
+                       } else {
+                               RTE_LOG(ERR, EAL,
+                                       "  cannot set up DMA remapping,"
+                                       " error %i (%s)\n", errno,
+                                       strerror(errno));
                                return -1;
+                       }
                }
 
        } else {
-               ret = ioctl(vfio_container_fd,
-                               VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
-                                       errno, strerror(errno));
-                       return -1;
-               }
-
                memset(&dma_unmap, 0, sizeof(dma_unmap));
                dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
                dma_unmap.size = len;
@@ -1351,28 +1366,56 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                                        errno, strerror(errno));
                        return -1;
                }
+
+               ret = ioctl(vfio_container_fd,
+                               VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
+                                       errno, strerror(errno));
+                       return -1;
+               }
        }
 
        return 0;
 }
 
+struct spapr_remap_walk_param {
+       int vfio_container_fd;
+       uint64_t addr_64;
+};
+
 static int
 vfio_spapr_map_walk(const struct rte_memseg_list *msl,
                const struct rte_memseg *ms, void *arg)
 {
-       int *vfio_container_fd = arg;
+       struct spapr_remap_walk_param *param = arg;
 
-       if (msl->external)
+       if (msl->external || ms->addr_64 == param->addr_64)
                return 0;
 
-       return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
+       return vfio_spapr_dma_do_map(param->vfio_container_fd, ms->addr_64, ms->iova,
                        ms->len, 1);
 }
 
+static int
+vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
+               const struct rte_memseg *ms, void *arg)
+{
+       struct spapr_remap_walk_param *param = arg;
+
+       if (msl->external || ms->addr_64 == param->addr_64)
+               return 0;
+
+       return vfio_spapr_dma_do_map(param->vfio_container_fd, ms->addr_64, ms->iova,
+                       ms->len, 0);
+}
+
 struct spapr_walk_param {
        uint64_t window_size;
        uint64_t hugepage_sz;
+       uint64_t addr_64;
 };
+
 static int
 vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
                const struct rte_memseg *ms, void *arg)
@@ -1383,6 +1426,10 @@ vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
        if (msl->external)
                return 0;
 
+       /* do not iterate ms we haven't mapped yet  */
+       if (param->addr_64 && ms->addr_64 == param->addr_64)
+               return 0;
+
        if (max > param->window_size) {
                param->hugepage_sz = ms->hugepage_sz;
                param->window_size = max;
@@ -1422,9 +1469,31 @@ vfio_spapr_create_new_dma_window(int vfio_container_fd,
        /* create new DMA window */
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
        if (ret) {
-               RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
-                               "error %i (%s)\n", errno, strerror(errno));
-               return -1;
+#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
+               /* try possible page_shift and levels for workaround */
+               uint32_t levels;
+
+               for (levels = 1; levels <= info.ddw.levels; levels++) {
+                       uint32_t pgsizes = info.ddw.pgsizes;
+
+                       while (pgsizes != 0) {
+                               create->page_shift = 31 - __builtin_clz(pgsizes);
+                               create->levels = levels;
+                               ret = ioctl(vfio_container_fd,
+                                       VFIO_IOMMU_SPAPR_TCE_CREATE, create);
+                               if (!ret)
+                                       break;
+                               pgsizes &= ~(1 << create->page_shift);
+                       }
+                       if (!ret)
+                               break;
+               }
+#endif
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
+                                       "error %i (%s)\n", errno, strerror(errno));
+                       return -1;
+               }
        }
 
        if (create->start_addr != 0) {
@@ -1458,6 +1527,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 
        /* check if window size needs to be adjusted */
        memset(&param, 0, sizeof(param));
+       param.addr_64 = vaddr;
 
        /* we're inside a callback so use thread-unsafe version */
        if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
@@ -1471,7 +1541,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        for (i = 0; i < user_mem_maps->n_maps; i++) {
                uint64_t max = user_mem_maps->maps[i].iova +
                                user_mem_maps->maps[i].len;
-               create.window_size = RTE_MAX(create.window_size, max);
+               param.window_size = RTE_MAX(param.window_size, max);
        }
 
        /* sPAPR requires window size to be a power of 2 */
@@ -1480,9 +1550,33 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        create.levels = 1;
 
        if (do_map) {
-               void *addr;
                /* re-create window and remap the entire memory */
-               if (iova > create.window_size) {
+               if (iova + len > create.window_size) {
+                       struct spapr_remap_walk_param remap_param = {
+                               .vfio_container_fd = vfio_container_fd,
+                               .addr_64 = vaddr,
+                       };
+
+                       /* release all maps before recreating the window */
+                       if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
+                                       &remap_param) < 0) {
+                               RTE_LOG(ERR, EAL, "Could not release DMA maps\n");
+                               ret = -1;
+                               goto out;
+                       }
+                       /* release all user maps */
+                       for (i = 0; i < user_mem_maps->n_maps; i++) {
+                               struct user_mem_map *map =
+                                               &user_mem_maps->maps[i];
+                               if (vfio_spapr_dma_do_map(vfio_container_fd,
+                                               map->addr, map->iova, map->len,
+                                               0)) {
+                                       RTE_LOG(ERR, EAL, "Could not release user DMA maps\n");
+                                       ret = -1;
+                                       goto out;
+                               }
+                       }
+                       create.window_size = rte_align64pow2(iova + len);
                        if (vfio_spapr_create_new_dma_window(vfio_container_fd,
                                        &create) < 0) {
                                RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
@@ -1492,7 +1586,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                        /* we're inside a callback, so use thread-unsafe version
                         */
                        if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
-                                       &vfio_container_fd) < 0) {
+                                       &remap_param) < 0) {
                                RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
                                ret = -1;
                                goto out;
@@ -1510,23 +1604,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                                }
                        }
                }
-
-               /* now that we've remapped all of the memory that was present
-                * before, map the segment that we were requested to map.
-                *
-                * however, if we were called by the callback, the memory we
-                * were called with was already in the memseg list, so previous
-                * mapping should've mapped that segment already.
-                *
-                * virt2memseg_list is a relatively cheap check, so use that. if
-                * memory is within any memseg list, it's a memseg, so it's
-                * already mapped.
-                */
-               addr = (void *)(uintptr_t)vaddr;
-               if (rte_mem_virt2memseg_list(addr) == NULL &&
-                               vfio_spapr_dma_do_map(vfio_container_fd,
-                                       vaddr, iova, len, 1) < 0) {
-                       RTE_LOG(ERR, EAL, "Could not map segment\n");
+               if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
+                       RTE_LOG(ERR, EAL, "Failed to map DMA\n");
                        ret = -1;
                        goto out;
                }
@@ -1554,6 +1633,7 @@ vfio_spapr_dma_map(int vfio_container_fd)
        struct spapr_walk_param param;
 
        memset(&param, 0, sizeof(param));
+       param.addr_64 = 0UL;
 
        /* create DMA window from 0 to max(phys_addr + len) */
        rte_memseg_walk(vfio_spapr_window_size_walk, &param);
@@ -1808,7 +1888,7 @@ rte_vfio_container_create(void)
        return vfio_cfgs[i].vfio_container_fd;
 }
 
-int __rte_experimental
+int
 rte_vfio_container_destroy(int container_fd)
 {
        struct vfio_config *vfio_cfg;