},
};
-/* for sPAPR IOMMU, we will need to walk memseg list, but we cannot use
- * rte_memseg_walk() because by the time we enter callback we will be holding a
- * write lock, so regular rte-memseg_walk will deadlock. copying the same
- * iteration code everywhere is not ideal as well. so, use a lockless copy of
- * memseg walk here.
- */
-static int
-memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int i, ms_idx, ret = 0;
-
- for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
- struct rte_memseg_list *msl = &mcfg->memsegs[i];
- const struct rte_memseg *ms;
- struct rte_fbarray *arr;
-
- if (msl->memseg_arr.count == 0)
- continue;
-
- arr = &msl->memseg_arr;
-
- ms_idx = rte_fbarray_find_next_used(arr, 0);
- while (ms_idx >= 0) {
- ms = rte_fbarray_get(arr, ms_idx);
- ret = func(msl, ms, arg);
- if (ret < 0)
- return -1;
- if (ret > 0)
- return 1;
- ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
- }
- }
- return 0;
-}
-
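/* [editor's sketch, not part of the patch] The local walker above is removed
 * because EAL now exports rte_memseg_walk_thread_unsafe() with the same
 * semantics (see its use further down in this patch). A minimal caller,
 * assumed to run while the memory hotplug lock is already held (e.g. from a
 * mem event callback); my_count_cb and my_count_memsegs are hypothetical.
 */
#include <rte_common.h>
#include <rte_memory.h>

static int
my_count_cb(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms __rte_unused, void *arg)
{
	int *count = arg;

	(*count)++;	/* visit each used segment exactly once */
	return 0;	/* 0 = continue, >0 = stop early, <0 = abort walk */
}

static int
my_count_memsegs(void)
{
	int count = 0;

	/* safe only because the caller already holds the hotplug lock */
	if (rte_memseg_walk_thread_unsafe(my_count_cb, &count) < 0)
		return -1;
	return count;
}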
static int
is_null_map(const struct user_mem_map *map)
{
	return map->addr == 0 && map->iova == 0 && map->len == 0;
}
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return vfio_cfg;
- }
-
- return NULL;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
-{
- int i;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- if (vfio_cfgs[i].vfio_container_fd == container_fd)
- return &vfio_cfgs[i];
- }
-
- return NULL;
-}
-
-int
-rte_vfio_get_group_fd(int iommu_group_num)
+static int
+vfio_get_group_fd(struct vfio_config *vfio_cfg,
+ int iommu_group_num)
{
int i;
int vfio_group_fd;
struct vfio_group *cur_grp;
- struct vfio_config *vfio_cfg;
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
/* check if we already have the group descriptor open */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
			return vfio_cfg->vfio_groups[i].fd;

	/* ... otherwise open a new group fd, record it in cur_grp ... */

	return vfio_group_fd;
}
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++)
+ if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+ return vfio_cfg;
+ }
+
+ return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
+{
+ int i;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ if (vfio_cfgs[i].vfio_container_fd == container_fd)
+ return &vfio_cfgs[i];
+ }
+
+ return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
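/* [editor's sketch, not part of the patch] Typical use of the public
 * wrapper: resolve the device's IOMMU group number first, then request the
 * group fd; my_open_group is hypothetical and the sysfs base is the usual
 * PCI one.
 */
#include <rte_vfio.h>

static int
my_open_group(const char *pci_addr)
{
	int iommu_group_num;

	if (rte_vfio_get_group_num("/sys/bus/pci/devices", pci_addr,
			&iommu_group_num) <= 0)
		return -1; /* device has no group, or lookup failed */

	/* falls back to the default container when the group is not
	 * assigned to a user-created container
	 */
	return rte_vfio_get_group_fd(iommu_group_num);
}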
static int
get_vfio_group_idx(int vfio_group_fd)
{
}
static void
-vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len)
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
+ void *arg __rte_unused)
{
struct rte_memseg_list *msl;
struct rte_memseg *ms;
msl = rte_mem_virt2memseg_list(addr);
/* for IOVA as VA mode, no need to care for IOVA addresses */
- if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
/* memsegs are contiguous in memory */
ms = rte_mem_virt2memseg(addr, msl);
while (cur_len < len) {
+ /* some memory segments may have invalid IOVA */
+ if (ms->iova == RTE_BAD_IOVA) {
+ RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
+ ms->addr);
+ goto next;
+ }
if (type == RTE_MEM_EVENT_ALLOC)
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 1);
else
vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
ms->iova, ms->len, 0);
-
+next:
cur_len += ms->len;
++ms;
}
}
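/* [editor's sketch, not part of the patch] The callback prototype gained a
 * user argument, so out-of-tree callbacks must be updated to match. A
 * minimal user callback with the new signature; my_stats is hypothetical.
 */
#include <rte_common.h>
#include <rte_memory.h>

struct my_stats {
	size_t allocated;
	size_t freed;
};

static void
my_mem_event_cb(enum rte_mem_event type, const void *addr __rte_unused,
		size_t len, void *arg)
{
	struct my_stats *stats = arg;

	if (type == RTE_MEM_EVENT_ALLOC)
		stats->allocated += len;
	else	/* RTE_MEM_EVENT_FREE */
		stats->freed += len;
}
/* registration now carries the extra argument, mirroring the VFIO change
 * below: rte_mem_event_callback_register("my-stats", my_mem_event_cb, &stats);
 */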
-int
-rte_vfio_clear_group(int vfio_group_fd)
+static int
+vfio_sync_default_container(void)
{
- int i;
struct rte_mp_msg mp_req, *mp_rep;
struct rte_mp_reply mp_reply;
struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
- struct vfio_config *vfio_cfg;
+ int iommu_type_id;
+ unsigned int i;
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, " invalid group fd!\n");
+ /* cannot be called from primary */
+ if (rte_eal_process_type() != RTE_PROC_SECONDARY)
return -1;
- }
- if (internal_config.process_type == RTE_PROC_PRIMARY) {
-
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0)
- return -1;
- vfio_cfg->vfio_groups[i].group_num = -1;
- vfio_cfg->vfio_groups[i].fd = -1;
- vfio_cfg->vfio_groups[i].devices = 0;
- vfio_cfg->vfio_active_groups--;
- return 0;
+ /* default container fd should have been opened in rte_vfio_enable() */
+ if (!default_vfio_cfg->vfio_enabled ||
+ default_vfio_cfg->vfio_container_fd < 0) {
+ RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
+ return -1;
}
- p->req = SOCKET_CLR_GROUP;
- p->group_num = vfio_group_fd;
+ /* find default container's IOMMU type */
+ p->req = SOCKET_REQ_IOMMU_TYPE;
strcpy(mp_req.name, EAL_VFIO_MP);
mp_req.len_param = sizeof(*p);
mp_req.num_fds = 0;
+ iommu_type_id = -1;
if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
+ mp_reply.nb_received == 1) {
mp_rep = &mp_reply.msgs[0];
p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK) {
- free(mp_reply.msgs);
- return 0;
- } else if (p->result == SOCKET_NO_FD)
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- else
- RTE_LOG(ERR, EAL, " no such VFIO group fd!\n");
-
+ if (p->result == SOCKET_OK)
+ iommu_type_id = p->iommu_type_id;
free(mp_reply.msgs);
}
+ if (iommu_type_id < 0) {
+ RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
+ return -1;
+ }
+
+	/* we now have an fd for the default container, as well as its IOMMU
+	 * type. now set up the default VFIO container config to match.
+	 */
+ for (i = 0; i < RTE_DIM(iommu_types); i++) {
+ const struct vfio_iommu_type *t = &iommu_types[i];
+ if (t->type_id != iommu_type_id)
+ continue;
+
+ /* we found our IOMMU type */
+ default_vfio_cfg->vfio_iommu_type = t;
+ return 0;
+ }
+ RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
+ iommu_type_id);
return -1;
}
+int
+rte_vfio_clear_group(int vfio_group_fd)
+{
+ int i;
+ struct vfio_config *vfio_cfg;
+
+ vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, " invalid group fd!\n");
+ return -1;
+ }
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0)
+ return -1;
+ vfio_cfg->vfio_groups[i].group_num = -1;
+ vfio_cfg->vfio_groups[i].fd = -1;
+ vfio_cfg->vfio_groups[i].devices = 0;
+ vfio_cfg->vfio_active_groups--;
+
+ return 0;
+}
+
int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info)
if (vfio_cfg == default_vfio_cfg)
ret = rte_mem_event_callback_register(
VFIO_MEM_EVENT_CLB_NAME,
- vfio_mem_event_callback);
+ vfio_mem_event_callback, NULL);
else
ret = 0;
/* unlock memory hotplug */
else
RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
}
+ } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
+ vfio_cfg == default_vfio_cfg &&
+ vfio_cfg->vfio_iommu_type == NULL) {
+ /* if we're not a primary process, we do not set up the VFIO
+ * container because it's already been set up by the primary
+	 * process. instead, we simply ask the primary which VFIO type
+	 * is in use, and set up the VFIO config appropriately.
+ */
+ ret = vfio_sync_default_container();
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+ /* we have successfully initialized VFIO, notify user */
+ const struct vfio_iommu_type *t =
+ default_vfio_cfg->vfio_iommu_type;
+ RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
+ t->type_id, t->name);
}
/* get a file descriptor for the device */
/* if there are no active device groups, unregister the callback to
* avoid spurious attempts to map/unmap memory from VFIO.
*/
- if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0)
- rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME);
+ if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
+ rte_eal_process_type() != RTE_PROC_SECONDARY)
+ rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
+ NULL);
/* success */
ret = 0;
return 0;
}
- default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd();
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* open a new container */
+ default_vfio_cfg->vfio_container_fd =
+ rte_vfio_get_container_fd();
+ } else {
+ /* get the default container from the primary process */
+ default_vfio_cfg->vfio_container_fd =
+ vfio_get_default_container_fd();
+ }
/* check if we have VFIO driver enabled */
if (default_vfio_cfg->vfio_container_fd != -1) {
return default_vfio_cfg->vfio_enabled && mod_available;
}
+int
+vfio_get_default_container_fd(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ if (default_vfio_cfg->vfio_enabled)
+ return default_vfio_cfg->vfio_container_fd;
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* if we were secondary process we would try requesting
+ * container fd from the primary, but we're the primary
+ * process so just exit here
+ */
+ return -1;
+ }
+
+ p->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
+ }
+ free(mp_reply.msgs);
+ }
+
+ RTE_LOG(ERR, EAL, " cannot request default container fd\n");
+ return -1;
+}
+
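/* [editor's sketch, not part of this file] The primary-side counterpart of
 * the SOCKET_REQ_DEFAULT_CONTAINER and SOCKET_REQ_IOMMU_TYPE requests lives
 * in the VFIO mp-sync handler; filling the reply could look roughly like
 * this (my_fill_reply is hypothetical, r points into the reply message's
 * param area):
 */
static void
my_fill_reply(const struct vfio_mp_param *req, struct rte_mp_msg *reply,
		struct vfio_mp_param *r)
{
	if (req->req == SOCKET_REQ_DEFAULT_CONTAINER) {
		int fd = vfio_get_default_container_fd();

		r->req = SOCKET_REQ_DEFAULT_CONTAINER;
		if (fd < 0)
			r->result = SOCKET_ERR;
		else {
			/* the fd itself travels over the socket */
			r->result = SOCKET_OK;
			reply->num_fds = 1;
			reply->fds[0] = fd;
		}
	} else if (req->req == SOCKET_REQ_IOMMU_TYPE) {
		int iommu_type_id = vfio_get_iommu_type();

		r->req = SOCKET_REQ_IOMMU_TYPE;
		if (iommu_type_id < 0)
			r->result = SOCKET_ERR;
		else {
			r->iommu_type_id = iommu_type_id;
			r->result = SOCKET_OK;
		}
	}
}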
+int
+vfio_get_iommu_type(void)
+{
+ if (default_vfio_cfg->vfio_iommu_type == NULL)
+ return -1;
+
+ return default_vfio_cfg->vfio_iommu_type->type_id;
+}
+
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
mp_rep = &mp_reply.msgs[0];
p = (struct vfio_mp_param *)mp_rep->param;
if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_container_fd = mp_rep->fds[0];
free(mp_reply.msgs);
- return mp_rep->fds[0];
+ return vfio_container_fd;
}
free(mp_reply.msgs);
}
}
static int
-type1_map(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, void *arg)
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
{
int *vfio_container_fd = arg;
+ if (msl->external)
+ return 0;
+
return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
struct vfio_iommu_type1_dma_map dma_map;
struct vfio_iommu_type1_dma_unmap dma_unmap;
int ret;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
if (do_map != 0) {
+ ret = ioctl(vfio_container_fd,
+			VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
memset(&dma_map, 0, sizeof(dma_map));
dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
dma_map.vaddr = vaddr;
}
} else {
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .flags = 0
- };
- reg.vaddr = (uintptr_t) vaddr;
- reg.size = len;
-
ret = ioctl(vfio_container_fd,
			VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
if (ret) {
}
static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
int *vfio_container_fd = arg;
- return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ if (msl->external)
+ return 0;
+
+ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
ms->len, 1);
}
uint64_t hugepage_sz;
};
static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
const struct rte_memseg *ms, void *arg)
{
struct spapr_walk_param *param = arg;
uint64_t max = ms->iova + ms->len;
+ if (msl->external)
+ return 0;
+
if (max > param->window_size) {
param->hugepage_sz = ms->hugepage_sz;
param->window_size = max;
/* check if window size needs to be adjusted */
	memset(&param, 0, sizeof(param));
- if (memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
+ /* we're inside a callback so use thread-unsafe version */
+ if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
			&param) < 0) {
RTE_LOG(ERR, EAL, "Could not get window size\n");
ret = -1;
ret = -1;
goto out;
}
- if (memseg_walk_thread_unsafe(vfio_spapr_map_walk,
+	/* we're inside a callback, so use thread-unsafe version */
+ if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
&vfio_container_fd) < 0) {
RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
ret = -1;
return ret;
}
-int __rte_experimental
+int
rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
{
if (len == 0) {
return container_dma_map(default_vfio_cfg, vaddr, iova, len);
}
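/* [editor's sketch, not part of the patch] With the experimental tag gone
 * this is part of the stable API; mapping a user-allocated buffer for DMA
 * through the default container looks like this. my_map_external_buf is
 * hypothetical, and the buffer is assumed to stay resident (e.g. mlock()ed
 * or hugepage-backed) for the lifetime of the mapping.
 */
#include <stdint.h>
#include <rte_vfio.h>

static int
my_map_external_buf(void *buf, uint64_t iova, uint64_t len)
{
	uint64_t vaddr = (uint64_t)(uintptr_t)buf;

	if (rte_vfio_dma_map(vaddr, iova, len) < 0)
		return -1;
	/* ... device DMA to/from the buffer ... */
	return rte_vfio_dma_unmap(vaddr, iova, len);
}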
-int __rte_experimental
+int
rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
{
if (len == 0) {
return c == 'Y';
}
-int __rte_experimental
+int
rte_vfio_container_create(void)
{
int i;
return 0;
}
-int __rte_experimental
+int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
struct vfio_config *vfio_cfg;
- struct vfio_group *cur_grp;
- int vfio_group_fd;
- int i;
vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
if (vfio_cfg == NULL) {
return -1;
}
- /* Check room for new group */
- if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- return -1;
- }
-
- /* Get an index for the new group */
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num == -1) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
-
- /* This should not happen */
- if (i == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
- return -1;
- }
-
- vfio_group_fd = vfio_open_group_fd(iommu_group_num);
- if (vfio_group_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
- return -1;
- }
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- cur_grp->devices = 0;
- vfio_cfg->vfio_active_groups++;
-
- return vfio_group_fd;
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
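/* [editor's sketch, not part of the patch] The bind call now shares
 * vfio_get_group_fd() with the default-container path; a typical
 * multi-container flow (my_setup_container is hypothetical):
 */
#include <rte_vfio.h>

static int
my_setup_container(int iommu_group_num)
{
	int container_fd, group_fd;

	container_fd = rte_vfio_container_create();
	if (container_fd < 0)
		return -1;

	/* opens (or reuses) the group fd and assigns it to the container */
	group_fd = rte_vfio_container_group_bind(container_fd,
			iommu_group_num);
	if (group_fd < 0) {
		rte_vfio_container_destroy(container_fd);
		return -1;
	}
	return container_fd;
}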
-int __rte_experimental
+int
rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
{
struct vfio_config *vfio_cfg;
- struct vfio_group *cur_grp;
+ struct vfio_group *cur_grp = NULL;
int i;
vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
}
/* This should not happen */
- if (i == VFIO_MAX_GROUPS) {
+ if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
RTE_LOG(ERR, EAL, "Specified group number not found\n");
return -1;
}
return 0;
}
-int __rte_experimental
+int
rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
return container_dma_map(vfio_cfg, vaddr, iova, len);
}
-int __rte_experimental
+int
rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
uint64_t len)
{
#else
-int __rte_experimental
+int
rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
__rte_unused uint64_t len)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
__rte_unused uint64_t len)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
__rte_unused const char *dev_addr,
__rte_unused int *iommu_group_num)
return -1;
}
-int __rte_experimental
+int
rte_vfio_get_container_fd(void)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_container_create(void)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_container_destroy(__rte_unused int container_fd)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_container_group_bind(__rte_unused int container_fd,
__rte_unused int iommu_group_num)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_container_group_unbind(__rte_unused int container_fd,
__rte_unused int iommu_group_num)
{
return -1;
}
-int __rte_experimental
+int
rte_vfio_container_dma_map(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
__rte_unused uint64_t iova,
return -1;
}
-int __rte_experimental
+int
rte_vfio_container_dma_unmap(__rte_unused int container_fd,
__rte_unused uint64_t vaddr,
__rte_unused uint64_t iova,