1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2018 Intel Corporation
11 #include <rte_errno.h>
13 #include <rte_memory.h>
14 #include <rte_eal_memconfig.h>
17 #include "eal_filesystem.h"
18 #include "eal_memcfg.h"
20 #include "eal_private.h"
21 #include "eal_internal_cfg.h"
25 #define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
27 /* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
28 * recreate the mappings for DPDK segments, but we cannot do so for memory that
29 * was registered by the user themselves, so we need to store the user mappings
30 * somewhere, to recreate them later.
32 #define VFIO_MAX_USER_MEM_MAPS 256
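/* each stored mapping is an (addr, iova, len) triple, as used throughout
 * below. a minimal sketch of the recovery path that replays these entries
 * once a container is set up again (this mirrors the loop in
 * rte_vfio_setup_device()):
 *
 *	for (i = 0; i < user_mem_maps->n_maps; i++) {
 *		struct user_mem_map *m = &user_mem_maps->maps[i];
 *		t->dma_user_map_func(container_fd, m->addr, m->iova,
 *				m->len, 1);
 *	}
 */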
39 struct user_mem_maps {
40 rte_spinlock_recursive_t lock;
42 struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
47 int vfio_container_fd;
48 int vfio_active_groups;
49 const struct vfio_iommu_type *vfio_iommu_type;
50 struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
51 struct user_mem_maps mem_maps;
54 /* per-process VFIO config */
55 static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
56 static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
58 static int vfio_type1_dma_map(int);
59 static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
60 static int vfio_spapr_dma_map(int);
61 static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
62 static int vfio_noiommu_dma_map(int);
63 static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
64 static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
65 uint64_t iova, uint64_t len, int do_map);
67 /* IOMMU types we support */
68 static const struct vfio_iommu_type iommu_types[] = {
69 /* x86 IOMMU, otherwise known as type 1 */
71 .type_id = RTE_VFIO_TYPE1,
73 .partial_unmap = false,
74 .dma_map_func = &vfio_type1_dma_map,
75 .dma_user_map_func = &vfio_type1_dma_mem_map
77 /* ppc64 IOMMU, otherwise known as spapr */
79 .type_id = RTE_VFIO_SPAPR,
81 .partial_unmap = true,
82 .dma_map_func = &vfio_spapr_dma_map,
83 .dma_user_map_func = &vfio_spapr_dma_mem_map
87 .type_id = RTE_VFIO_NOIOMMU,
89 .partial_unmap = true,
90 .dma_map_func = &vfio_noiommu_dma_map,
91 .dma_user_map_func = &vfio_noiommu_dma_mem_map
96 is_null_map(const struct user_mem_map *map)
98 return map->addr == 0 && map->iova == 0 && map->len == 0;
101 /* we may need to merge user mem maps together in case of user mapping/unmapping
102 * chunks of memory, so we'll need a comparator function to sort segments.
105 user_mem_map_cmp(const void *a, const void *b)
107 const struct user_mem_map *umm_a = a;
108 const struct user_mem_map *umm_b = b;
110 /* move null entries to end */
111 if (is_null_map(umm_a))
113 if (is_null_map(umm_b))
116 /* sort by iova first */
117 if (umm_a->iova < umm_b->iova)
119 if (umm_a->iova > umm_b->iova)
122 if (umm_a->addr < umm_b->addr)
124 if (umm_a->addr > umm_b->addr)
127 if (umm_a->len < umm_b->len)
129 if (umm_a->len > umm_b->len)
135 /* adjust user map entry. this may result in shortening of existing map, or in
136 * splitting existing map in two pieces.
139 adjust_map(struct user_mem_map *src, struct user_mem_map *end,
140 uint64_t remove_va_start, uint64_t remove_len)
142 /* if VA start is the same as the map's start address, we're simply moving the start */
143 if (remove_va_start == src->addr) {
144 src->addr += remove_len;
145 src->iova += remove_len;
146 src->len -= remove_len;
147 } else if (remove_va_start + remove_len == src->addr + src->len) {
148 /* we're shrinking mapping from the end */
149 src->len -= remove_len;
151 /* we're blowing a hole in the middle */
152 struct user_mem_map tmp;
153 uint64_t total_len = src->len;
155 /* adjust source segment length */
156 src->len = remove_va_start - src->addr;
158 /* create temporary segment in the middle */
159 tmp.addr = src->addr + src->len;
160 tmp.iova = src->iova + src->len;
161 tmp.len = remove_len;
163 /* populate end segment - this one we will be keeping */
164 end->addr = tmp.addr + tmp.len;
165 end->iova = tmp.iova + tmp.len;
166 end->len = total_len - src->len - tmp.len;
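/* worked example (values are illustrative): removing [0x3000, 0x4000)
 * from {addr=0x1000, iova=0x1000, len=0x5000} yields:
 *
 *	src: {addr=0x1000, iova=0x1000, len=0x2000}
 *	end: {addr=0x4000, iova=0x4000, len=0x2000}
 *
 * i.e. the removed middle chunk is dropped and the remainder is kept as
 * two separate entries.
 */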
170 /* try merging two maps into one, return 1 if succeeded */
172 merge_map(struct user_mem_map *left, struct user_mem_map *right)
174 if (left->addr + left->len != right->addr)
176 if (left->iova + left->len != right->iova)
179 left->len += right->len;
181 memset(right, 0, sizeof(*right));
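/* e.g. (illustrative values) {addr=0x1000, iova=0x1000, len=0x2000} and
 * {addr=0x3000, iova=0x3000, len=0x1000} are VA- and IOVA-contiguous, so
 * they merge into {addr=0x1000, iova=0x1000, len=0x3000}, and the
 * right-hand entry is zeroed out for later compaction.
 */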
186 static struct user_mem_map *
187 find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
188 uint64_t iova, uint64_t len)
190 uint64_t va_end = addr + len;
191 uint64_t iova_end = iova + len;
194 for (i = 0; i < user_mem_maps->n_maps; i++) {
195 struct user_mem_map *map = &user_mem_maps->maps[i];
196 uint64_t map_va_end = map->addr + map->len;
197 uint64_t map_iova_end = map->iova + map->len;
200 if (addr < map->addr || addr >= map_va_end)
202 /* check if VA end is within boundaries */
203 if (va_end <= map->addr || va_end > map_va_end)
206 /* check start IOVA */
207 if (iova < map->iova || iova >= map_iova_end)
209 /* check if IOVA end is within boundaries */
210 if (iova_end <= map->iova || iova_end > map_iova_end)
213 /* we've found our map */
219 /* this will sort all user maps, and merge/compact any adjacent maps */
221 compact_user_maps(struct user_mem_maps *user_mem_maps)
223 int i, n_merged, cur_idx;
225 qsort(user_mem_maps->maps, user_mem_maps->n_maps,
226 sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
228 /* we'll go over the list backwards when merging */
230 for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
231 struct user_mem_map *l, *r;
233 l = &user_mem_maps->maps[i];
234 r = &user_mem_maps->maps[i + 1];
236 if (is_null_map(l) || is_null_map(r))
243 /* the entries are still sorted, but now they have holes in them, so
244 * walk through the list and remove the holes
248 for (i = 0; i < user_mem_maps->n_maps; i++) {
249 if (!is_null_map(&user_mem_maps->maps[i])) {
250 struct user_mem_map *src, *dst;
252 src = &user_mem_maps->maps[i];
253 dst = &user_mem_maps->maps[cur_idx++];
256 memcpy(dst, src, sizeof(*src));
257 memset(src, 0, sizeof(*src));
261 user_mem_maps->n_maps = cur_idx;
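/* e.g. a list that looks like [A, 0, B, 0, C] after merging is compacted
 * to [A, B, C] and n_maps is trimmed to 3 (illustrative).
 */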
266 vfio_open_group_fd(int iommu_group_num)
269 char filename[PATH_MAX];
270 struct rte_mp_msg mp_req, *mp_rep;
271 struct rte_mp_reply mp_reply = {0};
272 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
273 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
274 const struct internal_config *internal_conf =
275 eal_get_internal_configuration();
277 /* if primary, try to open the group */
278 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
279 /* try regular group format */
280 snprintf(filename, sizeof(filename),
281 VFIO_GROUP_FMT, iommu_group_num);
282 vfio_group_fd = open(filename, O_RDWR);
283 if (vfio_group_fd < 0) {
284 /* if file not found, it's not an error */
285 if (errno != ENOENT) {
286 RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
291 /* special case: try no-IOMMU path as well */
292 snprintf(filename, sizeof(filename),
293 VFIO_NOIOMMU_GROUP_FMT,
295 vfio_group_fd = open(filename, O_RDWR);
296 if (vfio_group_fd < 0) {
297 if (errno != ENOENT) {
298 RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
304 /* noiommu group found */
307 return vfio_group_fd;
309 /* if we're in a secondary process, request group fd from the primary
310 * process via mp channel.
312 p->req = SOCKET_REQ_GROUP;
313 p->group_num = iommu_group_num;
314 strcpy(mp_req.name, EAL_VFIO_MP);
315 mp_req.len_param = sizeof(*p);
319 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
320 mp_reply.nb_received == 1) {
321 mp_rep = &mp_reply.msgs[0];
322 p = (struct vfio_mp_param *)mp_rep->param;
323 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
324 vfio_group_fd = mp_rep->fds[0];
325 } else if (p->result == SOCKET_NO_FD) {
326 RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
327 vfio_group_fd = -ENOENT;
332 if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
333 RTE_LOG(ERR, EAL, " cannot request group fd\n");
334 return vfio_group_fd;
337 static struct vfio_config *
338 get_vfio_cfg_by_group_num(int iommu_group_num)
340 struct vfio_config *vfio_cfg;
343 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
344 vfio_cfg = &vfio_cfgs[i];
345 for (j = 0; j < VFIO_MAX_GROUPS; j++) {
346 if (vfio_cfg->vfio_groups[j].group_num ==
356 vfio_get_group_fd(struct vfio_config *vfio_cfg,
361 struct vfio_group *cur_grp;
363 /* check if we already have the group descriptor open */
364 for (i = 0; i < VFIO_MAX_GROUPS; i++)
365 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
366 return vfio_cfg->vfio_groups[i].fd;
368 /* Let's first see if there is room for a new group */
369 if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
370 RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
374 /* Now let's get an index for the new group */
375 for (i = 0; i < VFIO_MAX_GROUPS; i++)
376 if (vfio_cfg->vfio_groups[i].group_num == -1) {
377 cur_grp = &vfio_cfg->vfio_groups[i];
381 /* This should not happen */
382 if (i == VFIO_MAX_GROUPS) {
383 RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
387 vfio_group_fd = vfio_open_group_fd(iommu_group_num);
388 if (vfio_group_fd < 0) {
389 RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
390 return vfio_group_fd;
393 cur_grp->group_num = iommu_group_num;
394 cur_grp->fd = vfio_group_fd;
395 vfio_cfg->vfio_active_groups++;
397 return vfio_group_fd;
400 static struct vfio_config *
401 get_vfio_cfg_by_group_fd(int vfio_group_fd)
403 struct vfio_config *vfio_cfg;
406 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
407 vfio_cfg = &vfio_cfgs[i];
408 for (j = 0; j < VFIO_MAX_GROUPS; j++)
409 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
416 static struct vfio_config *
417 get_vfio_cfg_by_container_fd(int container_fd)
421 if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
422 return default_vfio_cfg;
424 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
425 if (vfio_cfgs[i].vfio_container_fd == container_fd)
426 return &vfio_cfgs[i];
433 rte_vfio_get_group_fd(int iommu_group_num)
435 struct vfio_config *vfio_cfg;
437 /* get the vfio_config it belongs to */
438 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
439 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
441 return vfio_get_group_fd(vfio_cfg, iommu_group_num);
445 get_vfio_group_idx(int vfio_group_fd)
447 struct vfio_config *vfio_cfg;
450 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
451 vfio_cfg = &vfio_cfgs[i];
452 for (j = 0; j < VFIO_MAX_GROUPS; j++)
453 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
461 vfio_group_device_get(int vfio_group_fd)
463 struct vfio_config *vfio_cfg;
466 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
467 if (vfio_cfg == NULL) {
468 RTE_LOG(ERR, EAL, " invalid group fd!\n");
472 i = get_vfio_group_idx(vfio_group_fd);
473 if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
474 RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
476 vfio_cfg->vfio_groups[i].devices++;
480 vfio_group_device_put(int vfio_group_fd)
482 struct vfio_config *vfio_cfg;
485 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
486 if (vfio_cfg == NULL) {
487 RTE_LOG(ERR, EAL, " invalid group fd!\n");
491 i = get_vfio_group_idx(vfio_group_fd);
492 if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
493 RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
495 vfio_cfg->vfio_groups[i].devices--;
499 vfio_group_device_count(int vfio_group_fd)
501 struct vfio_config *vfio_cfg;
504 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
505 if (vfio_cfg == NULL) {
506 RTE_LOG(ERR, EAL, " invalid group fd!\n");
510 i = get_vfio_group_idx(vfio_group_fd);
511 if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
512 RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
516 return vfio_cfg->vfio_groups[i].devices;
520 vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
521 void *arg __rte_unused)
523 struct rte_memseg_list *msl;
524 struct rte_memseg *ms;
527 msl = rte_mem_virt2memseg_list(addr);
529 /* in IOVA as VA mode, no need to care about IOVA addresses */
530 if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
531 uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
532 uint64_t page_sz = msl->page_sz;
534 /* Maintain granularity of DMA map/unmap to memseg size */
535 for (; cur_len < len; cur_len += page_sz) {
536 if (type == RTE_MEM_EVENT_ALLOC)
537 vfio_dma_mem_map(default_vfio_cfg, vfio_va,
538 vfio_va, page_sz, 1);
540 vfio_dma_mem_map(default_vfio_cfg, vfio_va,
541 vfio_va, page_sz, 0);
548 /* memsegs are contiguous in memory */
549 ms = rte_mem_virt2memseg(addr, msl);
550 while (cur_len < len) {
551 /* some memory segments may have invalid IOVA */
552 if (ms->iova == RTE_BAD_IOVA) {
553 RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
557 if (type == RTE_MEM_EVENT_ALLOC)
558 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
559 ms->iova, ms->len, 1);
561 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
562 ms->iova, ms->len, 0);
570 vfio_sync_default_container(void)
572 struct rte_mp_msg mp_req, *mp_rep;
573 struct rte_mp_reply mp_reply = {0};
574 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
575 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
579 /* cannot be called from primary */
580 if (rte_eal_process_type() != RTE_PROC_SECONDARY)
583 /* default container fd should have been opened in rte_vfio_enable() */
584 if (!default_vfio_cfg->vfio_enabled ||
585 default_vfio_cfg->vfio_container_fd < 0) {
586 RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
590 /* find default container's IOMMU type */
591 p->req = SOCKET_REQ_IOMMU_TYPE;
592 strcpy(mp_req.name, EAL_VFIO_MP);
593 mp_req.len_param = sizeof(*p);
597 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
598 mp_reply.nb_received == 1) {
599 mp_rep = &mp_reply.msgs[0];
600 p = (struct vfio_mp_param *)mp_rep->param;
601 if (p->result == SOCKET_OK)
602 iommu_type_id = p->iommu_type_id;
605 if (iommu_type_id < 0) {
606 RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
610 /* we now have an fd for default container, as well as its IOMMU type.
611 * now, set up default VFIO container config to match.
613 for (i = 0; i < RTE_DIM(iommu_types); i++) {
614 const struct vfio_iommu_type *t = &iommu_types[i];
615 if (t->type_id != iommu_type_id)
618 /* we found our IOMMU type */
619 default_vfio_cfg->vfio_iommu_type = t;
623 RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
629 rte_vfio_clear_group(int vfio_group_fd)
632 struct vfio_config *vfio_cfg;
634 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
635 if (vfio_cfg == NULL) {
636 RTE_LOG(ERR, EAL, " invalid group fd!\n");
640 i = get_vfio_group_idx(vfio_group_fd);
643 vfio_cfg->vfio_groups[i].group_num = -1;
644 vfio_cfg->vfio_groups[i].fd = -1;
645 vfio_cfg->vfio_groups[i].devices = 0;
646 vfio_cfg->vfio_active_groups--;
652 rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
653 int *vfio_dev_fd, struct vfio_device_info *device_info)
655 struct vfio_group_status group_status = {
656 .argsz = sizeof(group_status)
658 struct vfio_config *vfio_cfg;
659 struct user_mem_maps *user_mem_maps;
660 int vfio_container_fd;
665 const struct internal_config *internal_conf =
666 eal_get_internal_configuration();
668 /* get group number */
669 ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
671 RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
676 /* if negative, something failed */
680 /* get the actual group fd */
681 vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
682 if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
686 * if vfio_group_fd == -ENOENT, that means the device
687 * isn't managed by VFIO
689 if (vfio_group_fd == -ENOENT) {
690 RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
696 * at this point, we know the group exists; now check that it is viable
697 * (meaning, all devices in it are either bound to VFIO or not bound to anything)
700 /* check if the group is viable */
701 ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
703 RTE_LOG(ERR, EAL, " %s cannot get group status, "
704 "error %i (%s)\n", dev_addr, errno, strerror(errno));
705 close(vfio_group_fd);
706 rte_vfio_clear_group(vfio_group_fd);
708 } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
709 RTE_LOG(ERR, EAL, " %s VFIO group is not viable! "
710 "Not all devices in IOMMU group bound to VFIO or unbound\n",
712 close(vfio_group_fd);
713 rte_vfio_clear_group(vfio_group_fd);
717 /* get the vfio_config it belongs to */
718 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
719 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
720 vfio_container_fd = vfio_cfg->vfio_container_fd;
721 user_mem_maps = &vfio_cfg->mem_maps;
723 /* check if group does not have a container yet */
724 if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
726 /* add group to a container */
727 ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
730 RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, "
731 "error %i (%s)\n", dev_addr, errno, strerror(errno));
732 close(vfio_group_fd);
733 rte_vfio_clear_group(vfio_group_fd);
738 * pick an IOMMU type and set up DMA mappings for container
740 * needs to be done only once, only when first group is
741 * assigned to a container and only in primary process.
742 * Note this can happen several times with the hotplug functionality.
745 if (internal_conf->process_type == RTE_PROC_PRIMARY &&
746 vfio_cfg->vfio_active_groups == 1 &&
747 vfio_group_device_count(vfio_group_fd) == 0) {
748 const struct vfio_iommu_type *t;
750 /* select an IOMMU type which we will be using */
751 t = vfio_set_iommu_type(vfio_container_fd);
754 " %s failed to select IOMMU type\n",
756 close(vfio_group_fd);
757 rte_vfio_clear_group(vfio_group_fd);
760 /* lock memory hotplug before mapping and release it
761 * after registering callback, to prevent races
763 rte_mcfg_mem_read_lock();
764 if (vfio_cfg == default_vfio_cfg)
765 ret = t->dma_map_func(vfio_container_fd);
770 " %s DMA remapping failed, error %i (%s)\n",
771 dev_addr, errno, strerror(errno));
772 close(vfio_group_fd);
773 rte_vfio_clear_group(vfio_group_fd);
774 rte_mcfg_mem_read_unlock();
778 vfio_cfg->vfio_iommu_type = t;
780 /* re-map all user-mapped segments */
781 rte_spinlock_recursive_lock(&user_mem_maps->lock);
783 /* this IOMMU type may not support DMA mapping, but
784 * if we have mappings in the list - that means we have
785 * previously mapped something successfully, so we can
786 * be sure that DMA mapping is supported.
788 for (i = 0; i < user_mem_maps->n_maps; i++) {
789 struct user_mem_map *map;
790 map = &user_mem_maps->maps[i];
792 ret = t->dma_user_map_func(
794 map->addr, map->iova, map->len,
797 RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
799 "iova: 0x%" PRIx64 " "
800 "len: 0x%" PRIu64 "\n",
801 map->addr, map->iova,
803 rte_spinlock_recursive_unlock(
804 &user_mem_maps->lock);
805 rte_mcfg_mem_read_unlock();
809 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
811 /* register callback for mem events */
812 if (vfio_cfg == default_vfio_cfg)
813 ret = rte_mem_event_callback_register(
814 VFIO_MEM_EVENT_CLB_NAME,
815 vfio_mem_event_callback, NULL);
818 /* unlock memory hotplug */
819 rte_mcfg_mem_read_unlock();
821 if (ret && rte_errno != ENOTSUP) {
822 RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
826 RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
828 RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
830 } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
831 vfio_cfg == default_vfio_cfg &&
832 vfio_cfg->vfio_iommu_type == NULL) {
833 /* if we're not a primary process, we do not set up the VFIO
834 * container because it's already been set up by the primary
835 * process. instead, we simply ask the primary about VFIO type
836 * we are using, and set the VFIO config up appropriately.
838 ret = vfio_sync_default_container();
840 RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
841 close(vfio_group_fd);
842 rte_vfio_clear_group(vfio_group_fd);
845 /* we have successfully initialized VFIO, notify user */
846 const struct vfio_iommu_type *t =
847 default_vfio_cfg->vfio_iommu_type;
848 RTE_LOG(INFO, EAL, " using IOMMU type %d (%s)\n",
849 t->type_id, t->name);
852 rte_eal_vfio_get_vf_token(vf_token);
854 /* first, try to get a file descriptor for the device using the VF token */
855 if (!rte_uuid_is_null(vf_token)) {
856 char vf_token_str[RTE_UUID_STRLEN];
859 rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
860 snprintf(dev, sizeof(dev),
861 "%s vf_token=%s", dev_addr, vf_token_str);
863 *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
865 if (*vfio_dev_fd >= 0)
869 /* get a file descriptor for the device */
870 *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
871 if (*vfio_dev_fd < 0) {
872 /* if we cannot get a device fd, this implies a problem with
873 * the VFIO group or the container not having IOMMU configured.
876 RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
878 close(vfio_group_fd);
879 rte_vfio_clear_group(vfio_group_fd);
883 /* test and set up the device */
885 ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
887 RTE_LOG(ERR, EAL, " %s cannot get device info, "
888 "error %i (%s)\n", dev_addr, errno,
891 close(vfio_group_fd);
892 rte_vfio_clear_group(vfio_group_fd);
895 vfio_group_device_get(vfio_group_fd);
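/* typical caller flow, as a minimal sketch (the sysfs path and device
 * address are illustrative; error handling omitted):
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *	int dev_fd;
 *
 *	if (rte_vfio_setup_device("/sys/bus/pci/devices", "0000:00:01.0",
 *			&dev_fd, &info) == 0)
 *		dev_fd is usable, e.g. for VFIO_DEVICE_GET_REGION_INFO
 */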
901 rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
904 struct vfio_config *vfio_cfg;
909 /* we don't want any DMA mapping messages to come while we're detaching
910 * VFIO device, because this might be the last device and we might need
911 * to unregister the callback.
913 rte_mcfg_mem_read_lock();
915 /* get group number */
916 ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
918 RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
920 /* This is an error at this point. */
925 /* get the actual group fd */
926 vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
927 if (vfio_group_fd < 0) {
928 RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
934 /* get the vfio_config it belongs to */
935 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
936 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
938 /* At this point we have an active group. Closing it will detach it from
939 * the container. If this is the last active group, the VFIO kernel code
940 * will unset the container and tear down the IOMMU mappings.
943 /* Closing a device */
944 if (close(vfio_dev_fd) < 0) {
945 RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
951 /* A VFIO group can have several devices attached. Only when no devices
952 * remain should the group be closed.
954 vfio_group_device_put(vfio_group_fd);
955 if (!vfio_group_device_count(vfio_group_fd)) {
957 if (close(vfio_group_fd) < 0) {
958 RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
964 if (rte_vfio_clear_group(vfio_group_fd) < 0) {
965 RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
972 /* if there are no active device groups, unregister the callback to
973 * avoid spurious attempts to map/unmap memory from VFIO.
975 if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
976 rte_eal_process_type() != RTE_PROC_SECONDARY)
977 rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
984 rte_mcfg_mem_read_unlock();
989 rte_vfio_enable(const char *modname)
991 /* initialize group list */
994 const struct internal_config *internal_conf =
995 eal_get_internal_configuration();
997 rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
999 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
1000 vfio_cfgs[i].vfio_container_fd = -1;
1001 vfio_cfgs[i].vfio_active_groups = 0;
1002 vfio_cfgs[i].vfio_iommu_type = NULL;
1003 vfio_cfgs[i].mem_maps.lock = lock;
1005 for (j = 0; j < VFIO_MAX_GROUPS; j++) {
1006 vfio_cfgs[i].vfio_groups[j].fd = -1;
1007 vfio_cfgs[i].vfio_groups[j].group_num = -1;
1008 vfio_cfgs[i].vfio_groups[j].devices = 0;
1012 /* inform the user that we are probing for VFIO */
1013 RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
1015 /* check if vfio module is loaded */
1016 vfio_available = rte_eal_check_module(modname);
1018 /* return error directly */
1019 if (vfio_available == -1) {
1020 RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
1024 /* return 0 if VFIO modules not loaded */
1025 if (vfio_available == 0) {
1026 RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
1027 "skipping VFIO support...\n");
1031 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
1032 /* open a new container */
1033 default_vfio_cfg->vfio_container_fd =
1034 rte_vfio_get_container_fd();
1036 /* get the default container from the primary process */
1037 default_vfio_cfg->vfio_container_fd =
1038 vfio_get_default_container_fd();
1041 /* check if we have VFIO driver enabled */
1042 if (default_vfio_cfg->vfio_container_fd != -1) {
1043 RTE_LOG(INFO, EAL, "VFIO support initialized\n");
1044 default_vfio_cfg->vfio_enabled = 1;
1046 RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
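/* EAL initialization is the usual caller, roughly (a sketch; the module
 * name matches the kernel module being probed):
 *
 *	if (rte_vfio_enable("vfio") < 0)
 *		abort rte_eal_init()
 */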
1053 rte_vfio_is_enabled(const char *modname)
1055 const int mod_available = rte_eal_check_module(modname) > 0;
1056 return default_vfio_cfg->vfio_enabled && mod_available;
1060 vfio_get_default_container_fd(void)
1062 struct rte_mp_msg mp_req, *mp_rep;
1063 struct rte_mp_reply mp_reply = {0};
1064 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
1065 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
1067 const struct internal_config *internal_conf =
1068 eal_get_internal_configuration();
1070 if (default_vfio_cfg->vfio_enabled)
1071 return default_vfio_cfg->vfio_container_fd;
1073 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
1074 /* if we were a secondary process, we would try requesting the
1075 * container fd from the primary, but we're the primary
1076 * process, so just exit here
1081 p->req = SOCKET_REQ_DEFAULT_CONTAINER;
1082 strcpy(mp_req.name, EAL_VFIO_MP);
1083 mp_req.len_param = sizeof(*p);
1086 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
1087 mp_reply.nb_received == 1) {
1088 mp_rep = &mp_reply.msgs[0];
1089 p = (struct vfio_mp_param *)mp_rep->param;
1090 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
1091 container_fd = mp_rep->fds[0];
1092 free(mp_reply.msgs);
1093 return container_fd;
1097 free(mp_reply.msgs);
1098 RTE_LOG(ERR, EAL, " cannot request default container fd\n");
1103 vfio_get_iommu_type(void)
1105 if (default_vfio_cfg->vfio_iommu_type == NULL)
1108 return default_vfio_cfg->vfio_iommu_type->type_id;
1111 const struct vfio_iommu_type *
1112 vfio_set_iommu_type(int vfio_container_fd)
1115 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
1116 const struct vfio_iommu_type *t = &iommu_types[idx];
1118 int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
1121 RTE_LOG(INFO, EAL, " using IOMMU type %d (%s)\n",
1122 t->type_id, t->name);
1125 /* not an error, there may be more supported IOMMU types */
1126 RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, "
1127 "error %i (%s)\n", t->type_id, t->name, errno,
1130 /* if we didn't find a suitable IOMMU type, fail */
1135 vfio_has_supported_extensions(int vfio_container_fd)
1138 unsigned idx, n_extensions = 0;
1139 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
1140 const struct vfio_iommu_type *t = &iommu_types[idx];
1142 ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
1145 RTE_LOG(ERR, EAL, " could not get IOMMU type, "
1146 "error %i (%s)\n", errno,
1148 close(vfio_container_fd);
1150 } else if (ret == 1) {
1151 /* we found a supported extension */
1154 RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n",
1155 t->type_id, t->name,
1156 ret ? "supported" : "not supported");
1159 /* if we didn't find any supported IOMMU types, fail */
1160 if (!n_extensions) {
1161 close(vfio_container_fd);
1169 rte_vfio_get_container_fd(void)
1171 int ret, vfio_container_fd;
1172 struct rte_mp_msg mp_req, *mp_rep;
1173 struct rte_mp_reply mp_reply = {0};
1174 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
1175 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
1176 const struct internal_config *internal_conf =
1177 eal_get_internal_configuration();
1180 /* if we're in a primary process, try to open the container */
1181 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
1182 vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
1183 if (vfio_container_fd < 0) {
1184 RTE_LOG(ERR, EAL, " cannot open VFIO container, "
1185 "error %i (%s)\n", errno, strerror(errno));
1189 /* check VFIO API version */
1190 ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
1191 if (ret != VFIO_API_VERSION) {
1193 RTE_LOG(ERR, EAL, " could not get VFIO API version, "
1194 "error %i (%s)\n", errno, strerror(errno));
1196 RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n");
1197 close(vfio_container_fd);
1201 ret = vfio_has_supported_extensions(vfio_container_fd);
1203 RTE_LOG(ERR, EAL, " no supported IOMMU "
1204 "extensions found!\n");
1208 return vfio_container_fd;
1211 * if we're in a secondary process, request container fd from the
1212 * primary process via mp channel
1214 p->req = SOCKET_REQ_CONTAINER;
1215 strcpy(mp_req.name, EAL_VFIO_MP);
1216 mp_req.len_param = sizeof(*p);
1219 vfio_container_fd = -1;
1220 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
1221 mp_reply.nb_received == 1) {
1222 mp_rep = &mp_reply.msgs[0];
1223 p = (struct vfio_mp_param *)mp_rep->param;
1224 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
1225 vfio_container_fd = mp_rep->fds[0];
1226 free(mp_reply.msgs);
1227 return vfio_container_fd;
1231 free(mp_reply.msgs);
1232 RTE_LOG(ERR, EAL, " cannot request container fd\n");
1237 rte_vfio_get_group_num(const char *sysfs_base,
1238 const char *dev_addr, int *iommu_group_num)
1240 char linkname[PATH_MAX];
1241 char filename[PATH_MAX];
1242 char *tok[16], *group_tok, *end;
1245 memset(linkname, 0, sizeof(linkname));
1246 memset(filename, 0, sizeof(filename));
1248 /* try to find out IOMMU group for this device */
1249 snprintf(linkname, sizeof(linkname),
1250 "%s/%s/iommu_group", sysfs_base, dev_addr);
1252 ret = readlink(linkname, filename, sizeof(filename));
1254 /* if the link doesn't exist, no VFIO for us */
1258 ret = rte_strsplit(filename, sizeof(filename),
1259 tok, RTE_DIM(tok), '/');
1262 RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr);
1266 /* IOMMU group is always the last token */
1268 group_tok = tok[ret - 1];
1270 *iommu_group_num = strtol(group_tok, &end, 10);
1271 if ((end != group_tok && *end != '\0') || errno != 0) {
1272 RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr);
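/* example (illustrative): for sysfs_base "/sys/bus/pci/devices" and
 * dev_addr "0000:00:01.0", the iommu_group symlink resolves to something
 * like "../../../kernel/iommu_groups/42", and the trailing "42" is what
 * gets parsed into *iommu_group_num above.
 */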
1280 type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
1281 size_t len, void *arg)
1283 int *vfio_container_fd = arg;
1288 return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
1293 type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
1296 int *vfio_container_fd = arg;
1298 /* skip external memory that isn't a heap */
1299 if (msl->external && !msl->heap)
1302 /* skip any segments with invalid IOVA addresses */
1303 if (ms->iova == RTE_BAD_IOVA)
1306 /* if IOVA mode is VA, we've already mapped the internal segments */
1307 if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
1310 return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
1315 vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1316 uint64_t len, int do_map)
1318 struct vfio_iommu_type1_dma_map dma_map;
1319 struct vfio_iommu_type1_dma_unmap dma_unmap;
1323 memset(&dma_map, 0, sizeof(dma_map));
1324 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
1325 dma_map.vaddr = vaddr;
1327 dma_map.iova = iova;
1328 dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
1329 VFIO_DMA_MAP_FLAG_WRITE;
1331 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
1334 * In case the mapping was already done EEXIST will be
1335 * returned from kernel.
1337 if (errno == EEXIST) {
1339 " Memory segment is already mapped,"
1343 " cannot set up DMA remapping,"
1345 errno, strerror(errno));
1350 memset(&dma_unmap, 0, sizeof(dma_unmap));
1351 dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
1352 dma_unmap.size = len;
1353 dma_unmap.iova = iova;
1355 ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
1358 RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
1359 errno, strerror(errno));
1361 } else if (dma_unmap.size != len) {
1362 RTE_LOG(ERR, EAL, " unexpected size %"PRIu64" of DMA "
1363 "remapping cleared instead of %"PRIu64"\n",
1364 (uint64_t)dma_unmap.size, len);
1374 vfio_type1_dma_map(int vfio_container_fd)
1376 if (rte_eal_iova_mode() == RTE_IOVA_VA) {
1377 /* with IOVA as VA mode, we can get away with mapping contiguous
1378 * chunks rather than going page-by-page.
1380 int ret = rte_memseg_contig_walk(type1_map_contig,
1381 &vfio_container_fd);
1384 /* we have to continue the walk because we've skipped the
1385 * external segments during the contig walk.
1388 return rte_memseg_walk(type1_map, &vfio_container_fd);
1391 /* Track the size of the statically allocated DMA window for SPAPR */
1392 uint64_t spapr_dma_win_len;
1393 uint64_t spapr_dma_win_page_sz;
1396 vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1397 uint64_t len, int do_map)
1399 struct vfio_iommu_spapr_register_memory reg = {
1400 .argsz = sizeof(reg),
1401 .vaddr = (uintptr_t) vaddr,
1408 struct vfio_iommu_type1_dma_map dma_map;
1410 if (iova + len > spapr_dma_win_len) {
1411 RTE_LOG(ERR, EAL, " dma map attempt outside DMA window\n");
1415 ret = ioctl(vfio_container_fd,
1416 VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
1418 RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
1419 "error %i (%s)\n", errno, strerror(errno));
1423 memset(&dma_map, 0, sizeof(dma_map));
1424 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
1425 dma_map.vaddr = vaddr;
1427 dma_map.iova = iova;
1428 dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
1429 VFIO_DMA_MAP_FLAG_WRITE;
1431 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
1433 RTE_LOG(ERR, EAL, " cannot map vaddr for IOMMU, error %i (%s)\n",
1434 errno, strerror(errno));
1439 struct vfio_iommu_type1_dma_unmap dma_unmap;
1441 memset(&dma_unmap, 0, sizeof(dma_unmap));
1442 dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
1443 dma_unmap.size = len;
1444 dma_unmap.iova = iova;
1446 ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
1449 RTE_LOG(ERR, EAL, " cannot unmap vaddr for IOMMU, error %i (%s)\n",
1450 errno, strerror(errno));
1454 ret = ioctl(vfio_container_fd,
1455 VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
1457 RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n",
1458 errno, strerror(errno));
1467 vfio_spapr_map_walk(const struct rte_memseg_list *msl,
1468 const struct rte_memseg *ms, void *arg)
1470 int *vfio_container_fd = arg;
1472 /* skip external memory that isn't a heap */
1473 if (msl->external && !msl->heap)
1476 /* skip any segments with invalid IOVA addresses */
1477 if (ms->iova == RTE_BAD_IOVA)
1480 return vfio_spapr_dma_do_map(*vfio_container_fd,
1481 ms->addr_64, ms->iova, ms->len, 1);
1484 struct spapr_size_walk_param {
1487 bool is_user_managed;
1491 * In order to set the DMA window size required for the SPAPR IOMMU
1492 * we need to walk the existing virtual memory allocations as well as
1493 * find the hugepage size used.
1496 vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
1498 struct spapr_size_walk_param *param = arg;
1499 uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;
1501 if (msl->external && !msl->heap) {
1502 /* ignore user managed external memory */
1503 param->is_user_managed = true;
1507 if (max > param->max_va) {
1508 param->page_sz = msl->page_sz;
1509 param->max_va = max;
1516 * Find the highest memory address used in physical or virtual address
1517 * space and use that as the top of the DMA window.
1520 find_highest_mem_addr(struct spapr_size_walk_param *param)
1522 /* find the maximum IOVA address for setting the DMA window size */
1523 if (rte_eal_iova_mode() == RTE_IOVA_PA) {
1524 static const char proc_iomem[] = "/proc/iomem";
1525 static const char str_sysram[] = "System RAM";
1526 uint64_t start, end, max = 0;
1532 * Example "System RAM" in /proc/iomem:
1533 * 00000000-1fffffffff : System RAM
1534 * 200000000000-201fffffffff : System RAM
1536 FILE *fd = fopen(proc_iomem, "r");
1538 RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem);
1541 /* Scan /proc/iomem for the highest PA in the system */
1542 while (getline(&line, &line_len, fd) != -1) {
1543 if (strstr(line, str_sysram) == NULL)
1546 space = strstr(line, " ");
1547 dash = strstr(line, "-");
1549 /* Validate the format of the memory string */
1550 if (space == NULL || dash == NULL || space < dash) {
1551 RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in file %s\n",
1556 start = strtoull(line, NULL, 16);
1557 end = strtoull(dash + 1, NULL, 16);
1558 RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%" PRIx64
1559 " to 0x%" PRIx64 "\n", start, end);
1567 RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" "
1568 "entry in file %s\n", proc_iomem);
1572 spapr_dma_win_len = rte_align64pow2(max + 1);
1574 } else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
1575 RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%"
1576 PRIx64 "\n", param->max_va);
1577 spapr_dma_win_len = rte_align64pow2(param->max_va);
1581 spapr_dma_win_len = 0;
1582 RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n");
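/* for instance (sketch), given the /proc/iomem sample above, the last
 * "System RAM" range ends at 0x201fffffffff, so in PA mode the window
 * becomes rte_align64pow2(0x201fffffffff + 1) = 0x400000000000.
 */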
1588 * The SPAPRv2 IOMMU supports 2 DMA windows with starting
1589 * address at 0 or 1<<59. By default, a DMA window is set
1590 * at address 0, 2GB long, with a 4KB page. For DPDK we
1591 * must remove the default window and setup a new DMA window
1592 * based on the hugepage size and memory requirements of
1593 * the application before we can map memory for DMA.
1596 spapr_dma_win_size(void)
1598 struct spapr_size_walk_param param;
1600 /* only create DMA window once */
1601 if (spapr_dma_win_len > 0)
1604 /* walk the memseg list to find the page size/max VA address */
1605 memset(&param, 0, sizeof(param));
1606 if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
1607 RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA window size\n");
1611 /* we can't be sure if DMA window covers external memory */
1612 if (param.is_user_managed)
1613 RTE_LOG(WARNING, EAL, "Detected user managed external memory which may not be managed by the IOMMU\n");
1615 /* check physical/virtual memory size */
1616 if (find_highest_mem_addr(¶m) < 0)
1618 RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%" PRIx64 "\n",
1620 spapr_dma_win_page_sz = param.page_sz;
1621 rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len));
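/* e.g. (illustrative) with 2MB hugepages and a highest VA of 0x980000000,
 * the window rounds up to 0x1000000000 and the DMA mask is set to 36 bits
 * (__builtin_ctzll(0x1000000000) == 36).
 */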
1626 vfio_spapr_create_dma_window(int vfio_container_fd)
1628 struct vfio_iommu_spapr_tce_create create = {
1629 .argsz = sizeof(create), };
1630 struct vfio_iommu_spapr_tce_remove remove = {
1631 .argsz = sizeof(remove), };
1632 struct vfio_iommu_spapr_tce_info info = {
1633 .argsz = sizeof(info), };
1636 ret = spapr_dma_win_size();
1640 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
1642 RTE_LOG(ERR, EAL, " can't get iommu info, error %i (%s)\n",
1643 errno, strerror(errno));
1648 * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
1649 * can't be changed for v1 but it can be changed for v2. Since DPDK only
1650 * supports v2, remove the default DMA window so it can be resized.
1652 remove.start_addr = info.dma32_window_start;
1653 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
1657 /* create a new DMA window (start address is not selectable) */
1658 create.window_size = spapr_dma_win_len;
1659 create.page_shift = __builtin_ctzll(spapr_dma_win_page_sz);
1661 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
1662 #ifdef VFIO_IOMMU_SPAPR_INFO_DDW
1664 * The vfio_iommu_spapr_tce_info structure was modified in
1665 * Linux kernel 4.2.0 to add support for the
1666 * vfio_iommu_spapr_tce_ddw_info structure needed to try
1667 * multiple table levels. Skip the attempt if running with an older kernel.
1671 /* if at first we don't succeed, try more levels */
1674 for (levels = create.levels + 1;
1675 ret && levels <= info.ddw.levels; levels++) {
1676 create.levels = levels;
1677 ret = ioctl(vfio_container_fd,
1678 VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
1681 #endif /* VFIO_IOMMU_SPAPR_INFO_DDW */
1683 RTE_LOG(ERR, EAL, " cannot create new DMA window, error %i (%s)\n",
1684 errno, strerror(errno));
1685 RTE_LOG(ERR, EAL, " consider using a larger hugepage size "
1686 "if supported by the system\n");
1690 /* verify the start address */
1691 if (create.start_addr != 0) {
1692 RTE_LOG(ERR, EAL, " received unsupported start address 0x%"
1693 PRIx64 "\n", (uint64_t)create.start_addr);
1700 vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
1701 uint64_t iova, uint64_t len, int do_map)
1706 if (vfio_spapr_dma_do_map(vfio_container_fd,
1707 vaddr, iova, len, 1)) {
1708 RTE_LOG(ERR, EAL, "Failed to map DMA\n");
1712 if (vfio_spapr_dma_do_map(vfio_container_fd,
1713 vaddr, iova, len, 0)) {
1714 RTE_LOG(ERR, EAL, "Failed to unmap DMA\n");
1723 vfio_spapr_dma_map(int vfio_container_fd)
1725 if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) {
1726 RTE_LOG(ERR, EAL, "Could not create new DMA window!\n");
1730 /* map all existing DPDK segments for DMA */
1731 if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
1738 vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
1740 /* No-IOMMU mode does not need DMA mapping */
1745 vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
1746 uint64_t __rte_unused vaddr,
1747 uint64_t __rte_unused iova, uint64_t __rte_unused len,
1748 int __rte_unused do_map)
1750 /* No-IOMMU mode does not need DMA mapping */
1755 vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1756 uint64_t len, int do_map)
1758 const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
1761 RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
1766 if (!t->dma_user_map_func) {
1768 " VFIO custom DMA region maping not supported by IOMMU %s\n",
1770 rte_errno = ENOTSUP;
1774 return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
1779 container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1782 struct user_mem_map *new_map;
1783 struct user_mem_maps *user_mem_maps;
1786 user_mem_maps = &vfio_cfg->mem_maps;
1787 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1788 if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
1789 RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
1795 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
1796 /* technically, this will fail if there are currently no devices
1797 * plugged in, even if a device were added later, this mapping
1798 * might have succeeded. however, since we cannot verify if this
1799 * is a valid mapping without having a device attached, consider
1800 * this to be unsupported, because we can't just store any old
1801 * mapping and pollute list of active mappings willy-nilly.
1803 RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
1807 /* create new user mem map entry */
1808 new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
1809 new_map->addr = vaddr;
1810 new_map->iova = iova;
1813 compact_user_maps(user_mem_maps);
1815 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
1820 container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1823 struct user_mem_map *map, *new_map = NULL;
1824 struct user_mem_maps *user_mem_maps;
1827 user_mem_maps = &vfio_cfg->mem_maps;
1828 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1830 /* find our mapping */
1831 map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
1833 RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
1838 if (map->addr != vaddr || map->iova != iova || map->len != len) {
1839 /* we're partially unmapping a previously mapped region, so we
1840 * need to split entry into two.
1842 if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
1843 RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
1844 rte_errno = ENOTSUP;
1848 if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
1849 RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
1854 new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
1857 /* unmap the entry */
1858 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
1859 /* there may not be any devices plugged in, so unmapping will
1860 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
1861 * stop us from removing the mapping, as the assumption is we
1862 * won't be needing this memory any more and thus will want to
1863 * prevent it from being remapped again on hotplug. so, only
1864 * fail if we indeed failed to unmap (e.g. if the mapping was
1865 * within our mapped range but had invalid alignment).
1867 if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
1868 RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
1872 RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
1875 /* remove map from the list of active mappings */
1876 if (new_map != NULL) {
1877 adjust_map(map, new_map, vaddr, len);
1879 /* if we've created a new map by splitting, sort everything */
1880 if (!is_null_map(new_map)) {
1881 compact_user_maps(user_mem_maps);
1883 /* we've created a new mapping, but it was unused */
1884 user_mem_maps->n_maps--;
1887 memset(map, 0, sizeof(*map));
1888 compact_user_maps(user_mem_maps);
1889 user_mem_maps->n_maps--;
1893 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
1898 rte_vfio_noiommu_is_enabled(void)
1904 fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
1906 if (errno != ENOENT) {
1907 RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n",
1908 errno, strerror(errno));
1912 * else the file does not exist,
1913 * i.e. noiommu is not enabled
1918 cnt = read(fd, &c, 1);
1921 RTE_LOG(ERR, EAL, " unable to read from vfio noiommu "
1922 "file %i (%s)\n", errno, strerror(errno));
1930 rte_vfio_container_create(void)
1934 /* Find an empty slot to store new vfio config */
1935 for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
1936 if (vfio_cfgs[i].vfio_container_fd == -1)
1940 if (i == VFIO_MAX_CONTAINERS) {
1941 RTE_LOG(ERR, EAL, "exceeded max VFIO container limit\n");
1945 vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
1946 if (vfio_cfgs[i].vfio_container_fd < 0) {
1947 RTE_LOG(NOTICE, EAL, "failed to create a new container\n");
1951 return vfio_cfgs[i].vfio_container_fd;
1955 rte_vfio_container_destroy(int container_fd)
1957 struct vfio_config *vfio_cfg;
1960 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1961 if (vfio_cfg == NULL) {
1962 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1966 for (i = 0; i < VFIO_MAX_GROUPS; i++)
1967 if (vfio_cfg->vfio_groups[i].group_num != -1)
1968 rte_vfio_container_group_unbind(container_fd,
1969 vfio_cfg->vfio_groups[i].group_num);
1971 close(container_fd);
1972 vfio_cfg->vfio_container_fd = -1;
1973 vfio_cfg->vfio_active_groups = 0;
1974 vfio_cfg->vfio_iommu_type = NULL;
1980 rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
1982 struct vfio_config *vfio_cfg;
1984 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1985 if (vfio_cfg == NULL) {
1986 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1990 return vfio_get_group_fd(vfio_cfg, iommu_group_num);
1994 rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
1996 struct vfio_config *vfio_cfg;
1997 struct vfio_group *cur_grp = NULL;
2000 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2001 if (vfio_cfg == NULL) {
2002 RTE_LOG(ERR, EAL, "Invalid container fd\n");
2006 for (i = 0; i < VFIO_MAX_GROUPS; i++) {
2007 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
2008 cur_grp = &vfio_cfg->vfio_groups[i];
2013 /* This should not happen */
2014 if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
2015 RTE_LOG(ERR, EAL, "Specified group number not found\n");
2019 if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
2020 RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
2021 " iommu_group_num %d\n", iommu_group_num);
2024 cur_grp->group_num = -1;
2026 cur_grp->devices = 0;
2027 vfio_cfg->vfio_active_groups--;
2033 rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
2036 struct vfio_config *vfio_cfg;
2043 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2044 if (vfio_cfg == NULL) {
2045 RTE_LOG(ERR, EAL, "Invalid container fd\n");
2049 return container_dma_map(vfio_cfg, vaddr, iova, len);
2053 rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
2056 struct vfio_config *vfio_cfg;
2063 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2064 if (vfio_cfg == NULL) {
2065 RTE_LOG(ERR, EAL, "Invalid container fd\n");
2069 return container_dma_unmap(vfio_cfg, vaddr, iova, len);
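/* putting the external container API together, as a minimal sketch (all
 * addresses and sizes are illustrative; error handling omitted):
 *
 *	int cfd = rte_vfio_container_create();
 *	int group_num;
 *
 *	rte_vfio_get_group_num("/sys/bus/pci/devices", "0000:00:01.0",
 *			&group_num);
 *	rte_vfio_container_group_bind(cfd, group_num);
 *	rte_vfio_container_dma_map(cfd, (uint64_t)(uintptr_t)va, iova, len);
 *	...
 *	rte_vfio_container_dma_unmap(cfd, (uint64_t)(uintptr_t)va, iova, len);
 *	rte_vfio_container_destroy(cfd);
 */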
2075 rte_vfio_setup_device(__rte_unused const char *sysfs_base,
2076 __rte_unused const char *dev_addr,
2077 __rte_unused int *vfio_dev_fd,
2078 __rte_unused struct vfio_device_info *device_info)
2084 rte_vfio_release_device(__rte_unused const char *sysfs_base,
2085 __rte_unused const char *dev_addr, __rte_unused int fd)
2091 rte_vfio_enable(__rte_unused const char *modname)
2097 rte_vfio_is_enabled(__rte_unused const char *modname)
2103 rte_vfio_noiommu_is_enabled(void)
2109 rte_vfio_clear_group(__rte_unused int vfio_group_fd)
2115 rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
2116 __rte_unused const char *dev_addr,
2117 __rte_unused int *iommu_group_num)
2123 rte_vfio_get_container_fd(void)
2129 rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
2135 rte_vfio_container_create(void)
2141 rte_vfio_container_destroy(__rte_unused int container_fd)
2147 rte_vfio_container_group_bind(__rte_unused int container_fd,
2148 __rte_unused int iommu_group_num)
2154 rte_vfio_container_group_unbind(__rte_unused int container_fd,
2155 __rte_unused int iommu_group_num)
2161 rte_vfio_container_dma_map(__rte_unused int container_fd,
2162 __rte_unused uint64_t vaddr,
2163 __rte_unused uint64_t iova,
2164 __rte_unused uint64_t len)
2170 rte_vfio_container_dma_unmap(__rte_unused int container_fd,
2171 __rte_unused uint64_t vaddr,
2172 __rte_unused uint64_t iova,
2173 __rte_unused uint64_t len)
2178 #endif /* VFIO_PRESENT */