eal: move common header files
[dpdk.git] / lib/librte_eal/linux/eal/eal_vfio.c
index 7053ebe..4502aef 100644
@@ -264,7 +264,7 @@ vfio_open_group_fd(int iommu_group_num)
        int vfio_group_fd;
        char filename[PATH_MAX];
        struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
+       struct rte_mp_reply mp_reply = {0};
        struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
        struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
@@ -320,9 +320,9 @@ vfio_open_group_fd(int iommu_group_num)
                        RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
                        vfio_group_fd = 0;
                }
-               free(mp_reply.msgs);
        }
 
+       free(mp_reply.msgs);
        if (vfio_group_fd < 0)
                RTE_LOG(ERR, EAL, "  cannot request group fd\n");
        return vfio_group_fd;
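Note on the reply-handling hunks above and below: mp_reply is now zero-initialized, so mp_reply.msgs is guaranteed to be NULL on every path where rte_mp_request_sync() fails or never populates the reply, which lets the free() move out of the success branch (free(NULL) is a no-op) and closes the leak when the reply was allocated but the follow-up checks failed. A minimal sketch of the pattern; fill_request() is a hypothetical helper standing in for the request setup done in the diff:

    #include <stdlib.h>
    #include <time.h>
    #include <rte_eal.h>

    extern void fill_request(struct rte_mp_msg *req); /* hypothetical */

    /* sketch of the zero-init + unconditional-free reply pattern */
    static int
    request_fd_sketch(void)
    {
            struct rte_mp_msg mp_req, *mp_rep;
            struct rte_mp_reply mp_reply = {0};  /* msgs starts out NULL */
            struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
            int fd = -1;

            fill_request(&mp_req);
            if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
                            mp_reply.nb_received == 1) {
                    mp_rep = &mp_reply.msgs[0];
                    if (mp_rep->num_fds == 1)
                            fd = mp_rep->fds[0];
            }
            /* safe on every path: msgs is either allocated or still NULL */
            free(mp_reply.msgs);
            return fd;
    }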
@@ -412,6 +412,9 @@ get_vfio_cfg_by_container_fd(int container_fd)
 {
        int i;
 
+       if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
+               return default_vfio_cfg;
+
        for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
                if (vfio_cfgs[i].vfio_container_fd == container_fd)
                        return &vfio_cfgs[i];
@@ -511,9 +514,11 @@ static void
 vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
                void *arg __rte_unused)
 {
+       rte_iova_t iova_start, iova_expected;
        struct rte_memseg_list *msl;
        struct rte_memseg *ms;
        size_t cur_len = 0;
+       uint64_t va_start;
 
        msl = rte_mem_virt2memseg_list(addr);
 
@@ -529,32 +534,95 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
                return;
        }
 
+#ifdef RTE_ARCH_PPC_64
+       ms = rte_mem_virt2memseg(addr, msl);
+       while (cur_len < len) {
+               int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+
+               rte_fbarray_set_free(&msl->memseg_arr, idx);
+               cur_len += ms->len;
+               ++ms;
+       }
+       cur_len = 0;
+#endif
        /* memsegs are contiguous in memory */
        ms = rte_mem_virt2memseg(addr, msl);
+
+       /*
+        * This memory is not guaranteed to be contiguous, but it still could
+        * be, or it could have some small contiguous chunks. Since the number
+        * of VFIO mappings is limited, and VFIO appears to not concatenate
+        * adjacent mappings, we have to do this ourselves.
+        *
+        * So, find contiguous chunks, then map them.
+        */
+       va_start = ms->addr_64;
+       iova_start = iova_expected = ms->iova;
        while (cur_len < len) {
+               bool new_contig_area = ms->iova != iova_expected;
+               bool last_seg = (len - cur_len) == ms->len;
+               bool skip_last = false;
+
+               /* only do mappings when current contiguous area ends */
+               if (new_contig_area) {
+                       if (type == RTE_MEM_EVENT_ALLOC)
+                               vfio_dma_mem_map(default_vfio_cfg, va_start,
+                                               iova_start,
+                                               iova_expected - iova_start, 1);
+                       else
+                               vfio_dma_mem_map(default_vfio_cfg, va_start,
+                                               iova_start,
+                                               iova_expected - iova_start, 0);
+                       va_start = ms->addr_64;
+                       iova_start = ms->iova;
+               }
                /* some memory segments may have invalid IOVA */
                if (ms->iova == RTE_BAD_IOVA) {
                        RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
                                        ms->addr);
-                       goto next;
+                       skip_last = true;
                }
-               if (type == RTE_MEM_EVENT_ALLOC)
-                       vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
-                                       ms->iova, ms->len, 1);
-               else
-                       vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
-                                       ms->iova, ms->len, 0);
-next:
+               iova_expected = ms->iova + ms->len;
+               cur_len += ms->len;
+               ++ms;
+
+               /*
+                * don't count previous segment, and don't attempt to
+                * dereference a potentially invalid pointer.
+                */
+               if (skip_last && !last_seg) {
+                       iova_expected = iova_start = ms->iova;
+                       va_start = ms->addr_64;
+               } else if (!skip_last && last_seg) {
+                       /* this is the last segment and we're not skipping */
+                       if (type == RTE_MEM_EVENT_ALLOC)
+                               vfio_dma_mem_map(default_vfio_cfg, va_start,
+                                               iova_start,
+                                               iova_expected - iova_start, 1);
+                       else
+                               vfio_dma_mem_map(default_vfio_cfg, va_start,
+                                               iova_start,
+                                               iova_expected - iova_start, 0);
+               }
+       }
+#ifdef RTE_ARCH_PPC_64
+       cur_len = 0;
+       ms = rte_mem_virt2memseg(addr, msl);
+       while (cur_len < len) {
+               int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+
+               rte_fbarray_set_used(&msl->memseg_arr, idx);
                cur_len += ms->len;
                ++ms;
        }
+#endif
 }
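The rewritten vfio_mem_event_callback() above no longer maps page by page: it walks the memsegs covering the event range, detects where the IOVAs stop being contiguous, and issues one vfio_dma_mem_map() call per contiguous run (the RTE_BAD_IOVA and last-segment checks guard the corner cases). A stripped-down sketch of the coalescing idea, with a hypothetical do_map() standing in for vfio_dma_mem_map() and the corner-case handling omitted:

    #include <stddef.h>
    #include <stdint.h>
    #include <rte_memory.h>

    extern void do_map(uint64_t va, rte_iova_t iova, uint64_t len, int alloc); /* hypothetical */

    /* map each run of IOVA-contiguous memsegs covering [addr, addr + len) */
    static void
    map_contig_runs(const void *addr, size_t len,
                    const struct rte_memseg_list *msl, int alloc)
    {
            struct rte_memseg *ms = rte_mem_virt2memseg(addr, msl);
            uint64_t va_start = ms->addr_64;
            rte_iova_t iova_start = ms->iova;
            rte_iova_t iova_expected = ms->iova;
            size_t cur_len = 0;

            while (cur_len < len) {
                    if (ms->iova != iova_expected) {
                            /* the contiguous run ended on the previous segment */
                            do_map(va_start, iova_start,
                                            iova_expected - iova_start, alloc);
                            va_start = ms->addr_64;
                            iova_start = ms->iova;
                    }
                    iova_expected = ms->iova + ms->len;
                    cur_len += ms->len;
                    ++ms;
            }
            /* flush the final run */
            do_map(va_start, iova_start, iova_expected - iova_start, alloc);
    }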
 
 static int
 vfio_sync_default_container(void)
 {
        struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
+       struct rte_mp_reply mp_reply = {0};
        struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
        struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
        int iommu_type_id;
@@ -584,8 +652,8 @@ vfio_sync_default_container(void)
                p = (struct vfio_mp_param *)mp_rep->param;
                if (p->result == SOCKET_OK)
                        iommu_type_id = p->iommu_type_id;
-               free(mp_reply.msgs);
        }
+       free(mp_reply.msgs);
        if (iommu_type_id < 0) {
                RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
                return -1;
@@ -1021,7 +1089,7 @@ int
 vfio_get_default_container_fd(void)
 {
        struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
+       struct rte_mp_reply mp_reply = {0};
        struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
        struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
@@ -1049,9 +1117,9 @@ vfio_get_default_container_fd(void)
                        free(mp_reply.msgs);
                        return mp_rep->fds[0];
                }
-               free(mp_reply.msgs);
        }
 
+       free(mp_reply.msgs);
        RTE_LOG(ERR, EAL, "  cannot request default container fd\n");
        return -1;
 }
@@ -1127,7 +1195,7 @@ rte_vfio_get_container_fd(void)
 {
        int ret, vfio_container_fd;
        struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
+       struct rte_mp_reply mp_reply = {0};
        struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
        struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
 
@@ -1181,9 +1249,9 @@ rte_vfio_get_container_fd(void)
                        free(mp_reply.msgs);
                        return vfio_container_fd;
                }
-               free(mp_reply.msgs);
        }
 
+       free(mp_reply.msgs);
        RTE_LOG(ERR, EAL, "  cannot request container fd\n");
        return -1;
 }
@@ -1231,13 +1299,35 @@ rte_vfio_get_group_num(const char *sysfs_base,
        return 1;
 }
 
+static int
+type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+               size_t len, void *arg)
+{
+       int *vfio_container_fd = arg;
+
+       if (msl->external)
+               return 0;
+
+       return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+                       len, 1);
+}
+
 static int
 type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
                void *arg)
 {
        int *vfio_container_fd = arg;
 
-       if (msl->external)
+       /* skip external memory that isn't a heap */
+       if (msl->external && !msl->heap)
+               return 0;
+
+       /* skip any segments with invalid IOVA addresses */
+       if (ms->iova == RTE_BAD_IOVA)
+               return 0;
+
+       /* if IOVA mode is VA, we've already mapped the internal segments */
+       if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
                return 0;
 
        return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
@@ -1269,7 +1359,7 @@ vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                         */
                        if (errno == EEXIST) {
                                RTE_LOG(DEBUG, EAL,
-                                       " Memory segment is allready mapped,"
+                                       " Memory segment is already mapped,"
                                        " skipping");
                        } else {
                                RTE_LOG(ERR, EAL,
@@ -1300,6 +1390,18 @@ vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
 static int
 vfio_type1_dma_map(int vfio_container_fd)
 {
+       if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+               /* with IOVA as VA mode, we can get away with mapping contiguous
+                * chunks rather than going page-by-page.
+                */
+               int ret = rte_memseg_contig_walk(type1_map_contig,
+                               &vfio_container_fd);
+               if (ret)
+                       return ret;
+               /* we have to continue the walk because we've skipped the
+                * external segments during the contig walk.
+                */
+       }
        return rte_memseg_walk(type1_map, &vfio_container_fd);
 }
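With the two type1 hunks above, IOVA-as-VA setups first map internal memory through rte_memseg_contig_walk(), whose callback sees whole VA-contiguous areas (and in VA mode, VA-contiguous implies IOVA-contiguous), while the following rte_memseg_walk() only picks up what was skipped, since type1_map() now ignores internal segments in VA mode. A minimal sketch of how the two walkers differ and how the container fd is threaded through as the opaque argument; map_range() is a hypothetical stand-in for vfio_type1_dma_mem_map(), and the external/heap and RTE_BAD_IOVA checks from the diff are left out:

    #include <stdint.h>
    #include <rte_memory.h>

    extern int map_range(int fd, uint64_t va, uint64_t iova, uint64_t len); /* hypothetical */

    /* contig-walk callbacks receive the length of the whole contiguous area */
    static int
    contig_cb(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
                    size_t len, void *arg)
    {
            int *container_fd = arg;

            (void)msl;
            return map_range(*container_fd, ms->addr_64, ms->iova, len);
    }

    /* per-segment callbacks see one memseg at a time */
    static int
    seg_cb(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
                    void *arg)
    {
            int *container_fd = arg;

            (void)msl;
            return map_range(*container_fd, ms->addr_64, ms->iova, ms->len);
    }

    static int
    map_all(int container_fd)
    {
            if (rte_memseg_contig_walk(contig_cb, &container_fd) < 0)
                    return -1;
            return rte_memseg_walk(seg_cb, &container_fd);
    }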
 
@@ -1342,7 +1444,7 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                         */
                        if (errno == EBUSY) {
                                RTE_LOG(DEBUG, EAL,
-                                       " Memory segment is allready mapped,"
+                                       " Memory segment is already mapped,"
                                        " skipping");
                        } else {
                                RTE_LOG(ERR, EAL,
@@ -1354,14 +1456,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                }
 
        } else {
-               ret = ioctl(vfio_container_fd,
-                               VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
-                                       errno, strerror(errno));
-                       return -1;
-               }
-
                memset(&dma_unmap, 0, sizeof(dma_unmap));
                dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
                dma_unmap.size = len;
@@ -1374,6 +1468,14 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                                        errno, strerror(errno));
                        return -1;
                }
+
+               ret = ioctl(vfio_container_fd,
+                               VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
+                                       errno, strerror(errno));
+                       return -1;
+               }
        }
 
        return 0;
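The hunk above reverses the sPAPR teardown order: the range is unmapped first, and the pre-registered memory behind it is unregistered only afterwards, so the memory is no longer referenced by an active mapping when the unregister ioctl runs. A self-contained sketch of the corrected order, with error handling reduced to the return value:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* unmap a DMA range, then unregister the memory that backed it */
    static int
    spapr_unmap_range(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                    uint64_t len)
    {
            struct vfio_iommu_type1_dma_unmap dma_unmap;
            struct vfio_iommu_spapr_register_memory reg;

            memset(&dma_unmap, 0, sizeof(dma_unmap));
            dma_unmap.argsz = sizeof(dma_unmap);
            dma_unmap.iova = iova;
            dma_unmap.size = len;
            if (ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap))
                    return -1;

            memset(&reg, 0, sizeof(reg));
            reg.argsz = sizeof(reg);
            reg.vaddr = (uintptr_t)vaddr;
            reg.size = len;
            return ioctl(vfio_container_fd,
                            VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    }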
@@ -1385,17 +1487,41 @@ vfio_spapr_map_walk(const struct rte_memseg_list *msl,
 {
        int *vfio_container_fd = arg;
 
-       if (msl->external)
+       /* skip external memory that isn't a heap */
+       if (msl->external && !msl->heap)
+               return 0;
+
+       /* skip any segments with invalid IOVA addresses */
+       if (ms->iova == RTE_BAD_IOVA)
                return 0;
 
        return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
                        ms->len, 1);
 }
 
+static int
+vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
+               const struct rte_memseg *ms, void *arg)
+{
+       int *vfio_container_fd = arg;
+
+       /* skip external memory that isn't a heap */
+       if (msl->external && !msl->heap)
+               return 0;
+
+       /* skip any segments with invalid IOVA addresses */
+       if (ms->iova == RTE_BAD_IOVA)
+               return 0;
+
+       return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
+                       ms->len, 0);
+}
+
 struct spapr_walk_param {
        uint64_t window_size;
        uint64_t hugepage_sz;
 };
+
 static int
 vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
                const struct rte_memseg *ms, void *arg)
@@ -1403,7 +1529,12 @@ vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
        struct spapr_walk_param *param = arg;
        uint64_t max = ms->iova + ms->len;
 
-       if (msl->external)
+       /* skip external memory that isn't a heap */
+       if (msl->external && !msl->heap)
+               return 0;
+
+       /* skip any segments with invalid IOVA addresses */
+       if (ms->iova == RTE_BAD_IOVA)
                return 0;
 
        if (max > param->window_size) {
@@ -1445,24 +1576,17 @@ vfio_spapr_create_new_dma_window(int vfio_container_fd,
        /* create new DMA window */
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
        if (ret) {
+#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
                /* try possible page_shift and levels for workaround */
                uint32_t levels;
 
-               for (levels = 1; levels <= info.ddw.levels; levels++) {
-                       uint32_t pgsizes = info.ddw.pgsizes;
-
-                       while (pgsizes != 0) {
-                               create->page_shift = 31 - __builtin_clz(pgsizes);
-                               create->levels = levels;
-                               ret = ioctl(vfio_container_fd,
-                                       VFIO_IOMMU_SPAPR_TCE_CREATE, create);
-                               if (!ret)
-                                       break;
-                               pgsizes &= ~(1 << create->page_shift);
-                       }
-                       if (!ret)
-                               break;
+               for (levels = create->levels + 1;
+                       ret && levels <= info.ddw.levels; levels++) {
+                       create->levels = levels;
+                       ret = ioctl(vfio_container_fd,
+                               VFIO_IOMMU_SPAPR_TCE_CREATE, create);
                }
+#endif
                if (ret) {
                        RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
                                        "error %i (%s)\n", errno, strerror(errno));
@@ -1514,7 +1638,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        for (i = 0; i < user_mem_maps->n_maps; i++) {
                uint64_t max = user_mem_maps->maps[i].iova +
                                user_mem_maps->maps[i].len;
-               create.window_size = RTE_MAX(create.window_size, max);
+               param.window_size = RTE_MAX(param.window_size, max);
        }
 
        /* sPAPR requires window size to be a power of 2 */
@@ -1523,9 +1647,28 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        create.levels = 1;
 
        if (do_map) {
-               void *addr;
                /* re-create window and remap the entire memory */
-               if (iova > create.window_size) {
+               if (iova + len > create.window_size) {
+                       /* release all maps before recreating the window */
+                       if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
+                                       &vfio_container_fd) < 0) {
+                               RTE_LOG(ERR, EAL, "Could not release DMA maps\n");
+                               ret = -1;
+                               goto out;
+                       }
+                       /* release all user maps */
+                       for (i = 0; i < user_mem_maps->n_maps; i++) {
+                               struct user_mem_map *map =
+                                               &user_mem_maps->maps[i];
+                               if (vfio_spapr_dma_do_map(vfio_container_fd,
+                                               map->addr, map->iova, map->len,
+                                               0)) {
+                                       RTE_LOG(ERR, EAL, "Could not release user DMA maps\n");
+                                       ret = -1;
+                                       goto out;
+                               }
+                       }
+                       create.window_size = rte_align64pow2(iova + len);
                        if (vfio_spapr_create_new_dma_window(vfio_container_fd,
                                        &create) < 0) {
                                RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
@@ -1553,23 +1696,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                                }
                        }
                }
-
-               /* now that we've remapped all of the memory that was present
-                * before, map the segment that we were requested to map.
-                *
-                * however, if we were called by the callback, the memory we
-                * were called with was already in the memseg list, so previous
-                * mapping should've mapped that segment already.
-                *
-                * virt2memseg_list is a relatively cheap check, so use that. if
-                * memory is within any memseg list, it's a memseg, so it's
-                * already mapped.
-                */
-               addr = (void *)(uintptr_t)vaddr;
-               if (rte_mem_virt2memseg_list(addr) == NULL &&
-                               vfio_spapr_dma_do_map(vfio_container_fd,
-                                       vaddr, iova, len, 1) < 0) {
-                       RTE_LOG(ERR, EAL, "Could not map segment\n");
+               if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
+                       RTE_LOG(ERR, EAL, "Failed to map DMA\n");
                        ret = -1;
                        goto out;
                }
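The reworked sPAPR map path above grows the DMA window on demand: when iova + len no longer fits, every existing memseg and user mapping is released, the window is recreated at rte_align64pow2(iova + len), the existing memory is remapped (the remap loop itself sits in unchanged context between these hunks), and only then is the requested range mapped. A condensed control-flow sketch; the helpers are hypothetical stand-ins for the walks and loops in the diff:

    #include <stdint.h>
    #include <rte_common.h>

    /* hypothetical stand-ins for the unmap/remap walks and user-map loops */
    extern int unmap_everything(int container_fd);
    extern int recreate_window(int container_fd, uint64_t window_size);
    extern int remap_everything(int container_fd);
    extern int map_requested(int container_fd, uint64_t va, uint64_t iova,
                    uint64_t len);

    static int
    spapr_map(int container_fd, uint64_t vaddr, uint64_t iova, uint64_t len,
                    uint64_t cur_window_size)
    {
            if (iova + len > cur_window_size) {
                    if (unmap_everything(container_fd) < 0)
                            return -1;
                    /* the window must be a power of two and cover iova + len */
                    if (recreate_window(container_fd,
                                    rte_align64pow2(iova + len)) < 0)
                            return -1;
                    if (remap_everything(container_fd) < 0)
                            return -1;
            }
            /* finally, map the range that triggered the call */
            return map_requested(container_fd, vaddr, iova, len);
    }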
@@ -1772,28 +1900,6 @@ out:
        return ret;
 }
 
-int
-rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
-{
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       return container_dma_map(default_vfio_cfg, vaddr, iova, len);
-}
-
-int
-rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
-{
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       return container_dma_unmap(default_vfio_cfg, vaddr, iova, len);
-}
-
 int
 rte_vfio_noiommu_is_enabled(void)
 {
@@ -1971,20 +2077,6 @@ rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
 
 #else
 
-int
-rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
-                 __rte_unused uint64_t len)
-{
-       return -1;
-}
-
-int
-rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
-                   __rte_unused uint64_t len)
-{
-       return -1;
-}
-
 int
 rte_vfio_setup_device(__rte_unused const char *sysfs_base,
                __rte_unused const char *dev_addr,
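The last two hunks remove the rte_vfio_dma_map()/rte_vfio_dma_unmap() wrappers (both the VFIO build and the non-VFIO stubs). Code that used them can target the default container through the container API instead; a minimal usage sketch for a buffer whose IOVA is already known:

    #include <stdint.h>
    #include <rte_vfio.h>

    /* map a range for DMA via the default VFIO container, then unmap it */
    static int
    dma_roundtrip(uint64_t vaddr, uint64_t iova, uint64_t len)
    {
            if (rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
                            vaddr, iova, len) < 0)
                    return -1;

            /* ... device performs DMA into the mapped range ... */

            return rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
                            vaddr, iova, len);
    }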