diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 3f569676bd..5101c04fc7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,44 +1,18 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation */ +#include #include #include #include #include +#include #include #include #include +#include #include "eal_filesystem.h" #include "eal_vfio.h" @@ -50,19 +24,227 @@ static struct vfio_config vfio_cfg; static int vfio_type1_dma_map(int); +static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); static int vfio_spapr_dma_map(int); +static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); static int vfio_noiommu_dma_map(int); +static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, + int do_map); /* IOMMU types we support */ static const struct vfio_iommu_type iommu_types[] = { /* x86 IOMMU, otherwise known as type 1 */ - { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, + { + .type_id = RTE_VFIO_TYPE1, + .name = "Type 1", + .dma_map_func = &vfio_type1_dma_map, + .dma_user_map_func = &vfio_type1_dma_mem_map + }, /* ppc64 IOMMU, otherwise known as spapr */ - { RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map}, + { + .type_id = RTE_VFIO_SPAPR, + .name = "sPAPR", + .dma_map_func = &vfio_spapr_dma_map, + .dma_user_map_func = &vfio_spapr_dma_mem_map + }, /* IOMMU-less mode */ - { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, + { + .type_id = RTE_VFIO_NOIOMMU, + .name = "No-IOMMU", + .dma_map_func = &vfio_noiommu_dma_map, + .dma_user_map_func = &vfio_noiommu_dma_mem_map + }, }; +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can + * recreate the mappings for DPDK segments, but we cannot do so for memory that + * was registered by the user themselves, so we need to store the user mappings + * somewhere, to recreate them later. + */ +#define VFIO_MAX_USER_MEM_MAPS 256 +struct user_mem_map { + uint64_t addr; + uint64_t iova; + uint64_t len; +}; +static struct { + rte_spinlock_t lock; + int n_maps; + struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; +} user_mem_maps = { + .lock = RTE_SPINLOCK_INITIALIZER +}; + +static int +is_null_map(const struct user_mem_map *map) +{ + return map->addr == 0 && map->iova == 0 && map->len == 0; +} + +/* we may need to merge user mem maps together in case of user mapping/unmapping + * chunks of memory, so we'll need a comparator function to sort segments. + */ +static int +user_mem_map_cmp(const void *a, const void *b) +{ + const struct user_mem_map *umm_a = a; + const struct user_mem_map *umm_b = b; + + /* move null entries to end */ + if (is_null_map(umm_a)) + return 1; + if (is_null_map(umm_b)) + return -1; + + /* sort by iova first */ + if (umm_a->iova < umm_b->iova) + return -1; + if (umm_a->iova > umm_b->iova) + return 1; + + if (umm_a->addr < umm_b->addr) + return -1; + if (umm_a->addr > umm_b->addr) + return 1; + + if (umm_a->len < umm_b->len) + return -1; + if (umm_a->len > umm_b->len) + return 1; + + return 0; +} + +/* adjust user map entry. this may result in shortening of existing map, or in + * splitting existing map in two pieces. 
+ */ +static void +adjust_map(struct user_mem_map *src, struct user_mem_map *end, + uint64_t remove_va_start, uint64_t remove_len) +{ + /* if va start is same as start address, we're simply moving start */ + if (remove_va_start == src->addr) { + src->addr += remove_len; + src->iova += remove_len; + src->len -= remove_len; + } else if (remove_va_start + remove_len == src->addr + src->len) { + /* we're shrinking mapping from the end */ + src->len -= remove_len; + } else { + /* we're blowing a hole in the middle */ + struct user_mem_map tmp; + uint64_t total_len = src->len; + + /* adjust source segment length */ + src->len = remove_va_start - src->addr; + + /* create temporary segment in the middle */ + tmp.addr = src->addr + src->len; + tmp.iova = src->iova + src->len; + tmp.len = remove_len; + + /* populate end segment - this one we will be keeping */ + end->addr = tmp.addr + tmp.len; + end->iova = tmp.iova + tmp.len; + end->len = total_len - src->len - tmp.len; + } +} + +/* try merging two maps into one, return 1 if succeeded */ +static int +merge_map(struct user_mem_map *left, struct user_mem_map *right) +{ + if (left->addr + left->len != right->addr) + return 0; + if (left->iova + left->len != right->iova) + return 0; + + left->len += right->len; + + memset(right, 0, sizeof(*right)); + + return 1; +} + +static struct user_mem_map * +find_user_mem_map(uint64_t addr, uint64_t iova, uint64_t len) +{ + uint64_t va_end = addr + len; + uint64_t iova_end = iova + len; + int i; + + for (i = 0; i < user_mem_maps.n_maps; i++) { + struct user_mem_map *map = &user_mem_maps.maps[i]; + uint64_t map_va_end = map->addr + map->len; + uint64_t map_iova_end = map->iova + map->len; + + /* check start VA */ + if (addr < map->addr || addr >= map_va_end) + continue; + /* check if IOVA end is within boundaries */ + if (va_end <= map->addr || va_end >= map_va_end) + continue; + + /* check start PA */ + if (iova < map->iova || iova >= map_iova_end) + continue; + /* check if IOVA end is within boundaries */ + if (iova_end <= map->iova || iova_end >= map_iova_end) + continue; + + /* we've found our map */ + return map; + } + return NULL; +} + +/* this will sort all user maps, and merge/compact any adjacent maps */ +static void +compact_user_maps(void) +{ + int i, n_merged, cur_idx; + + qsort(user_mem_maps.maps, user_mem_maps.n_maps, + sizeof(user_mem_maps.maps[0]), user_mem_map_cmp); + + /* we'll go over the list backwards when merging */ + n_merged = 0; + for (i = user_mem_maps.n_maps - 2; i >= 0; i--) { + struct user_mem_map *l, *r; + + l = &user_mem_maps.maps[i]; + r = &user_mem_maps.maps[i + 1]; + + if (is_null_map(l) || is_null_map(r)) + continue; + + if (merge_map(l, r)) + n_merged++; + } + + /* the entries are still sorted, but now they have holes in them, so + * walk through the list and remove the holes + */ + if (n_merged > 0) { + cur_idx = 0; + for (i = 0; i < user_mem_maps.n_maps; i++) { + if (!is_null_map(&user_mem_maps.maps[i])) { + struct user_mem_map *src, *dst; + + src = &user_mem_maps.maps[i]; + dst = &user_mem_maps.maps[cur_idx++]; + + if (src != dst) { + memcpy(dst, src, sizeof(*src)); + memset(src, 0, sizeof(*src)); + } + } + } + user_mem_maps.n_maps = cur_idx; + } +} + int vfio_get_group_fd(int iommu_group_no) { @@ -225,7 +407,7 @@ vfio_group_device_count(int vfio_group_fd) } int -clear_group(int vfio_group_fd) +rte_vfio_clear_group(int vfio_group_fd) { int i; int socket_fd, ret; @@ -283,7 +465,7 @@ clear_group(int vfio_group_fd) } int -vfio_setup_device(const char *sysfs_base, const char 
*dev_addr, +rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, int *vfio_dev_fd, struct vfio_device_info *device_info) { struct vfio_group_status group_status = { @@ -291,7 +473,7 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, }; int vfio_group_fd; int iommu_group_no; - int ret; + int i, ret; /* get group number */ ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no); @@ -328,12 +510,12 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, RTE_LOG(ERR, EAL, " %s cannot get group status, " "error %i (%s)\n", dev_addr, errno, strerror(errno)); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } @@ -347,7 +529,7 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " "error %i (%s)\n", dev_addr, errno, strerror(errno)); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } @@ -361,15 +543,16 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, */ if (internal_config.process_type == RTE_PROC_PRIMARY && vfio_cfg.vfio_active_groups == 1) { + const struct vfio_iommu_type *t; + /* select an IOMMU type which we will be using */ - const struct vfio_iommu_type *t = - vfio_set_iommu_type(vfio_cfg.vfio_container_fd); + t = vfio_set_iommu_type(vfio_cfg.vfio_container_fd); if (!t) { RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", dev_addr); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } ret = t->dma_map_func(vfio_cfg.vfio_container_fd); @@ -378,9 +561,41 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, " %s DMA remapping failed, error %i (%s)\n", dev_addr, errno, strerror(errno)); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } + + vfio_cfg.vfio_iommu_type = t; + + /* re-map all user-mapped segments */ + rte_spinlock_lock(&user_mem_maps.lock); + + /* this IOMMU type may not support DMA mapping, but + * if we have mappings in the list - that means we have + * previously mapped something successfully, so we can + * be sure that DMA mapping is supported. 
+ */ + for (i = 0; i < user_mem_maps.n_maps; i++) { + struct user_mem_map *map; + map = &user_mem_maps.maps[i]; + + ret = t->dma_user_map_func( + vfio_cfg.vfio_container_fd, + map->addr, map->iova, map->len, + 1); + if (ret) { + RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " + "va: 0x%" PRIx64 " " + "iova: 0x%" PRIx64 " " + "len: 0x%" PRIu64 "\n", + map->addr, map->iova, + map->len); + rte_spinlock_unlock( + &user_mem_maps.lock); + return -1; + } + } + rte_spinlock_unlock(&user_mem_maps.lock); } } @@ -394,7 +609,7 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", dev_addr); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } @@ -406,7 +621,7 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, strerror(errno)); close(*vfio_dev_fd); close(vfio_group_fd); - clear_group(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); return -1; } vfio_group_device_get(vfio_group_fd); @@ -415,7 +630,7 @@ vfio_setup_device(const char *sysfs_base, const char *dev_addr, } int -vfio_release_device(const char *sysfs_base, const char *dev_addr, +rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int vfio_dev_fd) { struct vfio_group_status group_status = { @@ -466,7 +681,7 @@ vfio_release_device(const char *sysfs_base, const char *dev_addr, return -1; } - if (clear_group(vfio_group_fd) < 0) { + if (rte_vfio_clear_group(vfio_group_fd) < 0) { RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", dev_addr); return -1; @@ -477,7 +692,7 @@ vfio_release_device(const char *sysfs_base, const char *dev_addr, } int -vfio_enable(const char *modname) +rte_vfio_enable(const char *modname) { /* initialize group list */ int i; @@ -522,9 +737,9 @@ vfio_enable(const char *modname) } int -vfio_is_enabled(const char *modname) +rte_vfio_is_enabled(const char *modname) { - const int mod_available = rte_eal_check_module(modname); + const int mod_available = rte_eal_check_module(modname) > 0; return vfio_cfg.vfio_enabled && mod_available; } @@ -693,34 +908,49 @@ vfio_get_group_no(const char *sysfs_base, } static int -vfio_type1_dma_map(int vfio_container_fd) +type1_map(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; + int *vfio_container_fd = arg; - /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} - if (ms[i].addr == NULL) - break; +static int +vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + if (do_map != 0) { memset(&dma_map, 0, sizeof(dma_map)); dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - if (rte_eal_iova_mode() == RTE_IOVA_VA) - dma_map.iova = dma_map.vaddr; - else - dma_map.iova = ms[i].phys_addr; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, - strerror(errno)); + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); return -1; } } @@ -729,24 +959,107 @@ vfio_type1_dma_map(int vfio_container_fd) } static int -vfio_spapr_dma_map(int vfio_container_fd) +vfio_type1_dma_map(int vfio_container_fd) { - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; + return rte_memseg_walk(type1_map, &vfio_container_fd); +} + +static int +vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + } else { + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; + + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; - struct vfio_iommu_spapr_register_memory reg = { - .argsz = 
sizeof(reg), - .flags = 0 + return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +}; +static int +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused, + const struct rte_memseg *ms, void *arg) +{ + struct spapr_walk_param *param = arg; + uint64_t max = ms->iova + ms->len; + + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; + } + + return 0; +} + +static int +vfio_spapr_create_new_dma_window(int vfio_container_fd, + struct vfio_iommu_spapr_tce_create *create) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), }; struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info), }; - struct vfio_iommu_spapr_tce_create create = { - .argsz = sizeof(create), - }; - struct vfio_iommu_spapr_tce_remove remove = { - .argsz = sizeof(remove), - }; + int ret; /* query spapr iommu info */ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); @@ -765,69 +1078,134 @@ vfio_spapr_dma_map(int vfio_container_fd) return -1; } - /* create DMA window from 0 to max(phys_addr + len) */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - if (ms[i].addr == NULL) - break; - - create.window_size = RTE_MAX(create.window_size, - ms[i].phys_addr + ms[i].len); - } - - /* sPAPR requires window size to be a power of 2 */ - create.window_size = rte_align64pow2(create.window_size); - create.page_shift = __builtin_ctzll(ms->hugepage_sz); - create.levels = 1; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); + /* create new DMA window */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); if (ret) { RTE_LOG(ERR, EAL, " cannot create new DMA window, " "error %i (%s)\n", errno, strerror(errno)); return -1; } - if (create.start_addr != 0) { + if (create->start_addr != 0) { RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); return -1; } - /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; + return 0; +} - if (ms[i].addr == NULL) - break; +static int +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct spapr_walk_param param; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + int i, ret = 0; - reg.vaddr = (uintptr_t) ms[i].addr; - reg.size = ms[i].len; - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); - if (ret) { - RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } + rte_spinlock_lock(&user_mem_maps.lock); - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - if (rte_eal_iova_mode() == RTE_IOVA_VA) - dma_map.iova = dma_map.vaddr; - else - dma_map.iova = ms[i].phys_addr; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; + /* check if window size needs to be adjusted */ + memset(¶m, 0, sizeof(param)); - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (rte_memseg_walk(vfio_spapr_window_size_walk, ¶m) < 0) { + RTE_LOG(ERR, EAL, "Could not get window size\n"); + ret = -1; + goto out; + } - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; + /* also check user maps */ + for (i = 0; i < user_mem_maps.n_maps; i++) { + uint64_t max = user_mem_maps.maps[i].iova + + user_mem_maps.maps[i].len; + create.window_size = RTE_MAX(create.window_size, max); + } + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (do_map) { + /* re-create window and remap the entire memory */ + if (iova > create.window_size) { + if (vfio_spapr_create_new_dma_window(vfio_container_fd, + &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + ret = -1; + goto out; + } + if (rte_memseg_walk(vfio_spapr_map_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; + } + /* remap all user maps */ + for (i = 0; i < user_mem_maps.n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps.maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 1)) { + RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); + ret = -1; + goto out; + } + } + } + + /* now that we've remapped all of the memory that was present + * before, map the segment that we were requested to map. 
+ */ + if (vfio_spapr_dma_do_map(vfio_container_fd, + vaddr, iova, len, 1) < 0) { + RTE_LOG(ERR, EAL, "Could not map segment\n"); + ret = -1; + goto out; + } + } else { + /* for unmap, check if iova within DMA window */ + if (iova > create.window_size) { + RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); + ret = -1; + goto out; } + vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); } +out: + rte_spinlock_unlock(&user_mem_maps.lock); + return ret; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct spapr_walk_param param; + + memset(¶m, 0, sizeof(param)); + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + return -1; + } + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) + return -1; return 0; } @@ -839,23 +1217,202 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) return 0; } +static int +vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, + uint64_t __rte_unused vaddr, + uint64_t __rte_unused iova, uint64_t __rte_unused len, + int __rte_unused do_map) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map) +{ + const struct vfio_iommu_type *t = vfio_cfg.vfio_iommu_type; + + if (!t) { + RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); + rte_errno = ENODEV; + return -1; + } + + if (!t->dma_user_map_func) { + RTE_LOG(ERR, EAL, + " VFIO custom DMA region maping not supported by IOMMU %s\n", + t->name); + rte_errno = ENOTSUP; + return -1; + } + + return t->dma_user_map_func(vfio_cfg.vfio_container_fd, vaddr, iova, + len, do_map); +} + +int __rte_experimental +rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + struct user_mem_map *new_map; + int ret = 0; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + rte_spinlock_lock(&user_mem_maps.lock); + if (user_mem_maps.n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + /* map the entry */ + if (vfio_dma_mem_map(vaddr, iova, len, 1)) { + /* technically, this will fail if there are currently no devices + * plugged in, even if a device were added later, this mapping + * might have succeeded. however, since we cannot verify if this + * is a valid mapping without having a device attached, consider + * this to be unsupported, because we can't just store any old + * mapping and pollute list of active mappings willy-nilly. 
+ */ + RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); + ret = -1; + goto out; + } + /* create new user mem map entry */ + new_map = &user_mem_maps.maps[user_mem_maps.n_maps++]; + new_map->addr = vaddr; + new_map->iova = iova; + new_map->len = len; + + compact_user_maps(); +out: + rte_spinlock_unlock(&user_mem_maps.lock); + return ret; +} + +int __rte_experimental +rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + struct user_mem_map *map, *new_map = NULL; + int ret = 0; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + rte_spinlock_lock(&user_mem_maps.lock); + + /* find our mapping */ + map = find_user_mem_map(vaddr, iova, len); + if (!map) { + RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); + rte_errno = EINVAL; + ret = -1; + goto out; + } + if (map->addr != vaddr || map->iova != iova || map->len != len) { + /* we're partially unmapping a previously mapped region, so we + * need to split entry into two. + */ + if (user_mem_maps.n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + new_map = &user_mem_maps.maps[user_mem_maps.n_maps++]; + } + + /* unmap the entry */ + if (vfio_dma_mem_map(vaddr, iova, len, 0)) { + /* there may not be any devices plugged in, so unmapping will + * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't + * stop us from removing the mapping, as the assumption is we + * won't be needing this memory any more and thus will want to + * prevent it from being remapped again on hotplug. so, only + * fail if we indeed failed to unmap (e.g. if the mapping was + * within our mapped range but had invalid alignment). + */ + if (rte_errno != ENODEV && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); + ret = -1; + goto out; + } else { + RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); + } + } + /* remove map from the list of active mappings */ + if (new_map != NULL) { + adjust_map(map, new_map, vaddr, len); + + /* if we've created a new map by splitting, sort everything */ + if (!is_null_map(new_map)) { + compact_user_maps(); + } else { + /* we've created a new mapping, but it was unused */ + user_mem_maps.n_maps--; + } + } else { + memset(map, 0, sizeof(*map)); + compact_user_maps(); + user_mem_maps.n_maps--; + } + +out: + rte_spinlock_unlock(&user_mem_maps.lock); + return ret; +} + int -vfio_noiommu_is_enabled(void) +rte_vfio_noiommu_is_enabled(void) { - int fd, ret, cnt __rte_unused; + int fd; + ssize_t cnt; char c; - ret = -1; fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); - if (fd < 0) - return -1; + if (fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", + errno, strerror(errno)); + return -1; + } + /* + * else the file does not exists + * i.e. noiommu is not enabled + */ + return 0; + } cnt = read(fd, &c, 1); - if (c == 'Y') - ret = 1; - close(fd); - return ret; + if (cnt != 1) { + RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " + "file %i (%s)\n", errno, strerror(errno)); + return -1; + } + + return c == 'Y'; +} + +#else + +int __rte_experimental +rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int __rte_experimental +rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, + __rte_unused uint64_t len) +{ + return -1; } #endif
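
For readers of this patch, the following is a minimal usage sketch of the new experimental rte_vfio_dma_map()/rte_vfio_dma_unmap() API introduced above. It is not part of the patch: the buffer size, the use of an anonymous mmap() region, and the assumption that the process runs in IOVA-as-VA mode (so the virtual address can be reused as the IOVA) are illustrative choices only.

/* Illustrative sketch only -- not part of the patch. Assumes rte_eal_init()
 * has already run, that the VFIO container is active, and that the process
 * is in RTE_IOVA_VA mode so the virtual address can double as the IOVA.
 * rte_vfio.h is assumed to declare the experimental map/unmap API.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#include <rte_errno.h>
#include <rte_vfio.h>

#define EXT_MEM_LEN (2 * 1024 * 1024) /* hypothetical 2 MB buffer */

static int
map_external_buffer(void)
{
	void *addr;
	uint64_t vaddr, iova;

	/* memory obtained outside of the DPDK allocator */
	addr = mmap(NULL, EXT_MEM_LEN, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED)
		return -1;

	vaddr = (uint64_t)(uintptr_t)addr;
	iova = vaddr; /* valid only because IOVA mode is VA here */

	/* make the buffer visible to VFIO-backed devices */
	if (rte_vfio_dma_map(vaddr, iova, EXT_MEM_LEN) < 0) {
		printf("DMA map failed: %s\n", strerror(rte_errno));
		munmap(addr, EXT_MEM_LEN);
		return -1;
	}

	/* ... use the buffer for DMA ... */

	/* remove the mapping before releasing the memory */
	if (rte_vfio_dma_unmap(vaddr, iova, EXT_MEM_LEN) < 0)
		printf("DMA unmap failed: %s\n", strerror(rte_errno));

	munmap(addr, EXT_MEM_LEN);
	return 0;
}

Because the patch stores such user mappings in the user_mem_maps list, they are replayed when a VFIO group is (re)attached on hotplug, and a partial unmap of a previously mapped region splits the stored entry rather than rejecting the request.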