1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2018 Intel Corporation
11 #include <rte_errno.h>
13 #include <rte_memory.h>
14 #include <rte_eal_memconfig.h>
17 #include "eal_filesystem.h"
18 #include "eal_memcfg.h"
20 #include "eal_private.h"
21 #include "eal_internal_cfg.h"
25 #define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
27 /* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
28 * recreate the mappings for DPDK segments, but we cannot do so for memory that
29 * was registered by the user themselves, so we need to store the user mappings
30 * somewhere, to recreate them later.
32 #define VFIO_MAX_USER_MEM_MAPS 256
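/* Mappings stored here are replayed in rte_vfio_setup_device() when an IOMMU
 * type is set up for a container, which is how user maps survive the group
 * hot plug/unplug scenario described above.
 */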
34 uint64_t addr; /**< start VA */
35 uint64_t iova; /**< start IOVA */
36 uint64_t len; /**< total length of the mapping */
37 uint64_t chunk; /**< this mapping can be split in chunks of this size */
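	/* For example, an entry created with chunk == len can only be unmapped as
	 * a whole, while chunk == 0 means the IOMMU type supports arbitrary
	 * partial unmaps and no chunking constraint applies; see
	 * container_dma_map() and addr_is_chunk_aligned().
	 */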
40 struct user_mem_maps {
41 rte_spinlock_recursive_t lock;
43 struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
48 int vfio_container_fd;
49 int vfio_active_groups;
50 const struct vfio_iommu_type *vfio_iommu_type;
51 struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
52 struct user_mem_maps mem_maps;
55 /* per-process VFIO config */
56 static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
57 static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
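/* Slot 0 is the default container used by the rte_vfio_* device APIs; extra
 * containers created via rte_vfio_container_create() occupy slots 1 and up.
 */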
59 static int vfio_type1_dma_map(int);
60 static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
61 static int vfio_spapr_dma_map(int);
62 static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
63 static int vfio_noiommu_dma_map(int);
64 static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
65 static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
66 uint64_t iova, uint64_t len, int do_map);
68 /* IOMMU types we support */
69 static const struct vfio_iommu_type iommu_types[] = {
70 /* x86 IOMMU, otherwise known as type 1 */
72 .type_id = RTE_VFIO_TYPE1,
74 .partial_unmap = false,
75 .dma_map_func = &vfio_type1_dma_map,
76 .dma_user_map_func = &vfio_type1_dma_mem_map
78 /* ppc64 IOMMU, otherwise known as spapr */
80 .type_id = RTE_VFIO_SPAPR,
82 .partial_unmap = true,
83 .dma_map_func = &vfio_spapr_dma_map,
84 .dma_user_map_func = &vfio_spapr_dma_mem_map
88 .type_id = RTE_VFIO_NOIOMMU,
90 .partial_unmap = true,
91 .dma_map_func = &vfio_noiommu_dma_map,
92 .dma_user_map_func = &vfio_noiommu_dma_mem_map
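/* vfio_set_iommu_type() tries these entries in array order and uses the first
 * type that the kernel accepts via the VFIO_SET_IOMMU ioctl.
 */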
97 is_null_map(const struct user_mem_map *map)
99 return map->addr == 0 && map->iova == 0 &&
100 map->len == 0 && map->chunk == 0;
103 /* we may need to merge user mem maps together in case of user mapping/unmapping
104 * chunks of memory, so we'll need a comparator function to sort segments.
107 user_mem_map_cmp(const void *a, const void *b)
109 const struct user_mem_map *umm_a = a;
110 const struct user_mem_map *umm_b = b;
112 /* move null entries to end */
113 if (is_null_map(umm_a))
115 if (is_null_map(umm_b))
118 /* sort by iova first */
119 if (umm_a->iova < umm_b->iova)
121 if (umm_a->iova > umm_b->iova)
124 if (umm_a->addr < umm_b->addr)
126 if (umm_a->addr > umm_b->addr)
129 if (umm_a->len < umm_b->len)
131 if (umm_a->len > umm_b->len)
134 if (umm_a->chunk < umm_b->chunk)
136 if (umm_a->chunk > umm_b->chunk)
143 * Take in an address range and list of current mappings, and produce a list of
144 * mappings that will be kept.
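 *
 * For example, unmapping the middle of a stored mapping keeps two pieces:
 * newmap[0] covers the part before the unmapped range and newmap[1] the part
 * after it. Unmapping from the very start (or up to the very end) keeps only
 * one piece.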
147 process_maps(struct user_mem_map *src, size_t src_len,
148 struct user_mem_map newmap[2], uint64_t vaddr, uint64_t len)
150 struct user_mem_map *src_first = &src[0];
151 struct user_mem_map *src_last = &src[src_len - 1];
152 struct user_mem_map *dst_first = &newmap[0];
153 /* we can get at most two new segments */
154 struct user_mem_map *dst_last = &newmap[1];
155 uint64_t first_off = vaddr - src_first->addr;
156 uint64_t last_off = (src_last->addr + src_last->len) - (vaddr + len);
159 if (first_off != 0) {
160 dst_first->addr = src_first->addr;
161 dst_first->iova = src_first->iova;
162 dst_first->len = first_off;
163 dst_first->chunk = src_first->chunk;
168 /* if we had start offset, we have two segments */
169 struct user_mem_map *last =
170 first_off == 0 ? dst_first : dst_last;
171 last->addr = (src_last->addr + src_last->len) - last_off;
172 last->iova = (src_last->iova + src_last->len) - last_off;
173 last->len = last_off;
174 last->chunk = src_last->chunk;
181 /* erase certain maps from the list */
183 delete_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *del_maps,
189 for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_del; i++) {
190 struct user_mem_map *left = &user_mem_maps->maps[i];
191 struct user_mem_map *right = &del_maps[j];
193 if (user_mem_map_cmp(left, right) == 0) {
194 memset(left, 0, sizeof(*left));
196 user_mem_maps->n_maps--;
202 copy_maps(struct user_mem_maps *user_mem_maps, struct user_mem_map *add_maps,
208 for (i = 0, j = 0; i < VFIO_MAX_USER_MEM_MAPS && j < n_add; i++) {
209 struct user_mem_map *left = &user_mem_maps->maps[i];
210 struct user_mem_map *right = &add_maps[j];
212 /* insert into empty space */
213 if (is_null_map(left)) {
214 memcpy(left, right, sizeof(*left));
216 user_mem_maps->n_maps++;
221 /* try merging two maps into one, return 1 if succeeded */
223 merge_map(struct user_mem_map *left, struct user_mem_map *right)
225 /* merge the same maps into one */
226 if (memcmp(left, right, sizeof(struct user_mem_map)) == 0)
229 if (left->addr + left->len != right->addr)
231 if (left->iova + left->len != right->iova)
233 if (left->chunk != right->chunk)
235 left->len += right->len;
238 memset(right, 0, sizeof(*right));
244 addr_is_chunk_aligned(struct user_mem_map *maps, size_t n_maps,
245 uint64_t vaddr, uint64_t iova)
249 for (i = 0; i < n_maps; i++) {
250 struct user_mem_map *map = &maps[i];
251 uint64_t map_va_end = map->addr + map->len;
252 uint64_t map_iova_end = map->iova + map->len;
253 uint64_t map_va_off = vaddr - map->addr;
254 uint64_t map_iova_off = iova - map->iova;
256 /* we include end of the segment in comparison as well */
257 bool addr_in_map = (vaddr >= map->addr) && (vaddr <= map_va_end);
258 bool iova_in_map = (iova >= map->iova) && (iova <= map_iova_end);
259 /* chunk may not be a power of two, so use modulo */
260 bool addr_is_aligned = (map_va_off % map->chunk) == 0;
261 bool iova_is_aligned = (map_iova_off % map->chunk) == 0;
263 if (addr_in_map && iova_in_map &&
264 addr_is_aligned && iova_is_aligned)
271 find_user_mem_maps(struct user_mem_maps *user_mem_maps, uint64_t addr,
272 uint64_t iova, uint64_t len, struct user_mem_map *dst,
275 uint64_t va_end = addr + len;
276 uint64_t iova_end = iova + len;
281 for (i = 0, j = 0; i < user_mem_maps->n_maps; i++) {
282 struct user_mem_map *map = &user_mem_maps->maps[i];
283 uint64_t map_va_end = map->addr + map->len;
284 uint64_t map_iova_end = map->iova + map->len;
286 bool start_addr_in_map = (addr >= map->addr) &&
288 bool end_addr_in_map = (va_end > map->addr) &&
289 (va_end <= map_va_end);
290 bool start_iova_in_map = (iova >= map->iova) &&
291 (iova < map_iova_end);
292 bool end_iova_in_map = (iova_end > map->iova) &&
293 (iova_end <= map_iova_end);
295 /* do we have space in temporary map? */
300 /* check if current map is start of our segment */
301 if (!found && start_addr_in_map && start_iova_in_map)
303 /* if we have previously found a segment, add it to the map */
305 /* copy the segment into our temporary map */
306 memcpy(&dst[j++], map, sizeof(*map));
308 /* if we match end of segment, quit */
309 if (end_addr_in_map && end_iova_in_map)
313 /* we didn't find anything */
316 memset(dst, 0, sizeof(*dst) * dst_len);
320 /* this will sort all user maps, and merge/compact any adjacent maps */
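/* For example, two entries covering [va, va+len1) and [va+len1, va+len1+len2)
 * with contiguous IOVAs and the same chunk size collapse into a single entry,
 * per the conditions checked in merge_map().
 */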
322 compact_user_maps(struct user_mem_maps *user_mem_maps)
326 qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
327 sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
329 /* we'll go over the list backwards when merging */
330 for (i = VFIO_MAX_USER_MEM_MAPS - 2; i >= 0; i--) {
331 struct user_mem_map *l, *r;
333 l = &user_mem_maps->maps[i];
334 r = &user_mem_maps->maps[i + 1];
336 if (is_null_map(l) || is_null_map(r))
339 /* try and merge the maps */
341 user_mem_maps->n_maps--;
344 /* the entries are still sorted, but now they have holes in them, so
345 * sort the list again.
347 qsort(user_mem_maps->maps, VFIO_MAX_USER_MEM_MAPS,
348 sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
352 vfio_open_group_fd(int iommu_group_num)
355 char filename[PATH_MAX];
356 struct rte_mp_msg mp_req, *mp_rep;
357 struct rte_mp_reply mp_reply = {0};
358 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
359 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
360 const struct internal_config *internal_conf =
361 eal_get_internal_configuration();
363 /* if primary, try to open the group */
364 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
365 /* try regular group format */
366 snprintf(filename, sizeof(filename),
367 VFIO_GROUP_FMT, iommu_group_num);
368 vfio_group_fd = open(filename, O_RDWR);
369 if (vfio_group_fd < 0) {
370 /* if file not found, it's not an error */
371 if (errno != ENOENT) {
372 RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
373 filename, strerror(errno));
377 /* special case: try no-IOMMU path as well */
378 snprintf(filename, sizeof(filename),
379 VFIO_NOIOMMU_GROUP_FMT,
381 vfio_group_fd = open(filename, O_RDWR);
382 if (vfio_group_fd < 0) {
383 if (errno != ENOENT) {
385 "Cannot open %s: %s\n",
386 filename, strerror(errno));
391 /* noiommu group found */
394 return vfio_group_fd;
396 /* if we're in a secondary process, request group fd from the primary
397 * process via mp channel.
399 p->req = SOCKET_REQ_GROUP;
400 p->group_num = iommu_group_num;
401 strcpy(mp_req.name, EAL_VFIO_MP);
402 mp_req.len_param = sizeof(*p);
406 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
407 mp_reply.nb_received == 1) {
408 mp_rep = &mp_reply.msgs[0];
409 p = (struct vfio_mp_param *)mp_rep->param;
410 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
411 vfio_group_fd = mp_rep->fds[0];
412 } else if (p->result == SOCKET_NO_FD) {
413 RTE_LOG(ERR, EAL, "Bad VFIO group fd\n");
414 vfio_group_fd = -ENOENT;
419 if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
420 RTE_LOG(ERR, EAL, "Cannot request VFIO group fd\n");
421 return vfio_group_fd;
424 static struct vfio_config *
425 get_vfio_cfg_by_group_num(int iommu_group_num)
427 struct vfio_config *vfio_cfg;
430 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
431 vfio_cfg = &vfio_cfgs[i];
432 for (j = 0; j < VFIO_MAX_GROUPS; j++) {
433 if (vfio_cfg->vfio_groups[j].group_num ==
443 vfio_get_group_fd(struct vfio_config *vfio_cfg,
448 struct vfio_group *cur_grp;
450 /* check if we already have the group descriptor open */
451 for (i = 0; i < VFIO_MAX_GROUPS; i++)
452 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
453 return vfio_cfg->vfio_groups[i].fd;
455 /* Let's first see if there is room for a new group */
456 if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
457 RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
461 /* Now let's get an index for the new group */
462 for (i = 0; i < VFIO_MAX_GROUPS; i++)
463 if (vfio_cfg->vfio_groups[i].group_num == -1) {
464 cur_grp = &vfio_cfg->vfio_groups[i];
468 /* This should not happen */
469 if (i == VFIO_MAX_GROUPS) {
470 RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
474 vfio_group_fd = vfio_open_group_fd(iommu_group_num);
475 if (vfio_group_fd < 0) {
476 RTE_LOG(ERR, EAL, "Failed to open VFIO group %d\n",
478 return vfio_group_fd;
481 cur_grp->group_num = iommu_group_num;
482 cur_grp->fd = vfio_group_fd;
483 vfio_cfg->vfio_active_groups++;
485 return vfio_group_fd;
488 static struct vfio_config *
489 get_vfio_cfg_by_group_fd(int vfio_group_fd)
491 struct vfio_config *vfio_cfg;
494 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
495 vfio_cfg = &vfio_cfgs[i];
496 for (j = 0; j < VFIO_MAX_GROUPS; j++)
497 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
504 static struct vfio_config *
505 get_vfio_cfg_by_container_fd(int container_fd)
509 if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
510 return default_vfio_cfg;
512 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
513 if (vfio_cfgs[i].vfio_container_fd == container_fd)
514 return &vfio_cfgs[i];
521 rte_vfio_get_group_fd(int iommu_group_num)
523 struct vfio_config *vfio_cfg;
525 /* get the vfio_config it belongs to */
526 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
527 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
529 return vfio_get_group_fd(vfio_cfg, iommu_group_num);
533 get_vfio_group_idx(int vfio_group_fd)
535 struct vfio_config *vfio_cfg;
538 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
539 vfio_cfg = &vfio_cfgs[i];
540 for (j = 0; j < VFIO_MAX_GROUPS; j++)
541 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
549 vfio_group_device_get(int vfio_group_fd)
551 struct vfio_config *vfio_cfg;
554 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
555 if (vfio_cfg == NULL) {
556 RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
560 i = get_vfio_group_idx(vfio_group_fd);
561 if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
562 RTE_LOG(ERR, EAL, "Wrong VFIO group index (%d)\n", i);
564 vfio_cfg->vfio_groups[i].devices++;
568 vfio_group_device_put(int vfio_group_fd)
570 struct vfio_config *vfio_cfg;
573 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
574 if (vfio_cfg == NULL) {
575 RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
579 i = get_vfio_group_idx(vfio_group_fd);
580 if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
581 RTE_LOG(ERR, EAL, "Wrong VFIO group index (%d)\n", i);
583 vfio_cfg->vfio_groups[i].devices--;
587 vfio_group_device_count(int vfio_group_fd)
589 struct vfio_config *vfio_cfg;
592 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
593 if (vfio_cfg == NULL) {
594 RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
598 i = get_vfio_group_idx(vfio_group_fd);
599 if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
600 RTE_LOG(ERR, EAL, "Wrong VFIO group index (%d)\n", i);
604 return vfio_cfg->vfio_groups[i].devices;
608 vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
609 void *arg __rte_unused)
611 struct rte_memseg_list *msl;
612 struct rte_memseg *ms;
615 msl = rte_mem_virt2memseg_list(addr);
617 /* for IOVA as VA mode, no need to care about IOVA addresses */
618 if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
619 uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
620 uint64_t page_sz = msl->page_sz;
622 /* Maintain granularity of DMA map/unmap to memseg size */
623 for (; cur_len < len; cur_len += page_sz) {
624 if (type == RTE_MEM_EVENT_ALLOC)
625 vfio_dma_mem_map(default_vfio_cfg, vfio_va,
626 vfio_va, page_sz, 1);
628 vfio_dma_mem_map(default_vfio_cfg, vfio_va,
629 vfio_va, page_sz, 0);
636 /* memsegs are contiguous in memory */
637 ms = rte_mem_virt2memseg(addr, msl);
638 while (cur_len < len) {
639 /* some memory segments may have invalid IOVA */
640 if (ms->iova == RTE_BAD_IOVA) {
642 "Memory segment at %p has bad IOVA, skipping\n",
646 if (type == RTE_MEM_EVENT_ALLOC)
647 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
648 ms->iova, ms->len, 1);
650 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
651 ms->iova, ms->len, 0);
659 vfio_sync_default_container(void)
661 struct rte_mp_msg mp_req, *mp_rep;
662 struct rte_mp_reply mp_reply = {0};
663 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
664 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
668 /* cannot be called from primary */
669 if (rte_eal_process_type() != RTE_PROC_SECONDARY)
672 /* default container fd should have been opened in rte_vfio_enable() */
673 if (!default_vfio_cfg->vfio_enabled ||
674 default_vfio_cfg->vfio_container_fd < 0) {
675 RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
679 /* find default container's IOMMU type */
680 p->req = SOCKET_REQ_IOMMU_TYPE;
681 strcpy(mp_req.name, EAL_VFIO_MP);
682 mp_req.len_param = sizeof(*p);
686 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
687 mp_reply.nb_received == 1) {
688 mp_rep = &mp_reply.msgs[0];
689 p = (struct vfio_mp_param *)mp_rep->param;
690 if (p->result == SOCKET_OK)
691 iommu_type_id = p->iommu_type_id;
694 if (iommu_type_id < 0) {
696 "Could not get IOMMU type for default container\n");
700 /* we now have an fd for default container, as well as its IOMMU type.
701 * now, set up default VFIO container config to match.
703 for (i = 0; i < RTE_DIM(iommu_types); i++) {
704 const struct vfio_iommu_type *t = &iommu_types[i];
705 if (t->type_id != iommu_type_id)
708 /* we found our IOMMU type */
709 default_vfio_cfg->vfio_iommu_type = t;
713 RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
719 rte_vfio_clear_group(int vfio_group_fd)
722 struct vfio_config *vfio_cfg;
724 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
725 if (vfio_cfg == NULL) {
726 RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
730 i = get_vfio_group_idx(vfio_group_fd);
733 vfio_cfg->vfio_groups[i].group_num = -1;
734 vfio_cfg->vfio_groups[i].fd = -1;
735 vfio_cfg->vfio_groups[i].devices = 0;
736 vfio_cfg->vfio_active_groups--;
742 rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
743 int *vfio_dev_fd, struct vfio_device_info *device_info)
745 struct vfio_group_status group_status = {
746 .argsz = sizeof(group_status)
748 struct vfio_config *vfio_cfg;
749 struct user_mem_maps *user_mem_maps;
750 int vfio_container_fd;
755 const struct internal_config *internal_conf =
756 eal_get_internal_configuration();
758 /* get group number */
759 ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
762 "%s not managed by VFIO driver, skipping\n",
767 /* if negative, something failed */
771 /* get the actual group fd */
772 vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
773 if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
777 * if vfio_group_fd == -ENOENT, that means the device
778 * isn't managed by VFIO
780 if (vfio_group_fd == -ENOENT) {
782 "%s not managed by VFIO driver, skipping\n",
788 * at this point we have a group fd, but the group is only viable if all
789 * devices in the IOMMU group are either bound to VFIO or not bound to anything
792 /* check if the group is viable */
793 ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
795 RTE_LOG(ERR, EAL, "%s cannot get VFIO group status, "
796 "error %i (%s)\n", dev_addr, errno, strerror(errno));
797 close(vfio_group_fd);
798 rte_vfio_clear_group(vfio_group_fd);
800 } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
801 RTE_LOG(ERR, EAL, "%s VFIO group is not viable! "
802 "Not all devices in IOMMU group bound to VFIO or unbound\n",
804 close(vfio_group_fd);
805 rte_vfio_clear_group(vfio_group_fd);
809 /* get the vfio_config it belongs to */
810 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
811 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
812 vfio_container_fd = vfio_cfg->vfio_container_fd;
813 user_mem_maps = &vfio_cfg->mem_maps;
815 /* check if group does not have a container yet */
816 if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
818 /* add group to a container */
819 ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
823 "%s cannot add VFIO group to container, error "
824 "%i (%s)\n", dev_addr, errno, strerror(errno));
825 close(vfio_group_fd);
826 rte_vfio_clear_group(vfio_group_fd);
831 * pick an IOMMU type and set up DMA mappings for container
833 * needs to be done only once, only when first group is
834 * assigned to a container and only in primary process.
835 * Note this can happen several times with the hotplug
838 if (internal_conf->process_type == RTE_PROC_PRIMARY &&
839 vfio_cfg->vfio_active_groups == 1 &&
840 vfio_group_device_count(vfio_group_fd) == 0) {
841 const struct vfio_iommu_type *t;
843 /* select an IOMMU type which we will be using */
844 t = vfio_set_iommu_type(vfio_container_fd);
847 "%s failed to select IOMMU type\n",
849 close(vfio_group_fd);
850 rte_vfio_clear_group(vfio_group_fd);
853 /* lock memory hotplug before mapping and release it
854 * after registering callback, to prevent races
856 rte_mcfg_mem_read_lock();
857 if (vfio_cfg == default_vfio_cfg)
858 ret = t->dma_map_func(vfio_container_fd);
863 "%s DMA remapping failed, error "
865 dev_addr, errno, strerror(errno));
866 close(vfio_group_fd);
867 rte_vfio_clear_group(vfio_group_fd);
868 rte_mcfg_mem_read_unlock();
872 vfio_cfg->vfio_iommu_type = t;
874 /* re-map all user-mapped segments */
875 rte_spinlock_recursive_lock(&user_mem_maps->lock);
877 /* this IOMMU type may not support DMA mapping, but
878 * if we have mappings in the list - that means we have
879 * previously mapped something successfully, so we can
880 * be sure that DMA mapping is supported.
882 for (i = 0; i < user_mem_maps->n_maps; i++) {
883 struct user_mem_map *map;
884 map = &user_mem_maps->maps[i];
886 ret = t->dma_user_map_func(
888 map->addr, map->iova, map->len,
891 RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
893 "iova: 0x%" PRIx64 " "
894 "len: 0x%" PRIu64 "\n",
895 map->addr, map->iova,
897 rte_spinlock_recursive_unlock(
898 &user_mem_maps->lock);
899 rte_mcfg_mem_read_unlock();
903 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
905 /* register callback for mem events */
906 if (vfio_cfg == default_vfio_cfg)
907 ret = rte_mem_event_callback_register(
908 VFIO_MEM_EVENT_CLB_NAME,
909 vfio_mem_event_callback, NULL);
912 /* unlock memory hotplug */
913 rte_mcfg_mem_read_unlock();
915 if (ret && rte_errno != ENOTSUP) {
916 RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
920 RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
922 RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
924 } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
925 vfio_cfg == default_vfio_cfg &&
926 vfio_cfg->vfio_iommu_type == NULL) {
927 /* if we're not a primary process, we do not set up the VFIO
928 * container because it's already been set up by the primary
929 * process. instead, we simply ask the primary about VFIO type
930 * we are using, and set the VFIO config up appropriately.
932 ret = vfio_sync_default_container();
934 RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
935 close(vfio_group_fd);
936 rte_vfio_clear_group(vfio_group_fd);
939 /* we have successfully initialized VFIO, notify user */
940 const struct vfio_iommu_type *t =
941 default_vfio_cfg->vfio_iommu_type;
942 RTE_LOG(INFO, EAL, "Using IOMMU type %d (%s)\n",
943 t->type_id, t->name);
946 rte_eal_vfio_get_vf_token(vf_token);
948 /* first try to get a file descriptor for the device using the VF token */
949 if (!rte_uuid_is_null(vf_token)) {
950 char vf_token_str[RTE_UUID_STRLEN];
953 rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
954 snprintf(dev, sizeof(dev),
955 "%s vf_token=%s", dev_addr, vf_token_str);
957 *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
959 if (*vfio_dev_fd >= 0)
963 /* get a file descriptor for the device */
964 *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
965 if (*vfio_dev_fd < 0) {
966 /* if we cannot get a device fd, this implies a problem with
967 * the VFIO group or the container not having IOMMU configured.
970 RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
972 close(vfio_group_fd);
973 rte_vfio_clear_group(vfio_group_fd);
977 /* test and setup the device */
979 ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
981 RTE_LOG(ERR, EAL, "%s cannot get device info, "
982 "error %i (%s)\n", dev_addr, errno,
985 close(vfio_group_fd);
986 rte_vfio_clear_group(vfio_group_fd);
989 vfio_group_device_get(vfio_group_fd);
995 rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
998 struct vfio_config *vfio_cfg;
1000 int iommu_group_num;
1003 /* we don't want any DMA map/unmap requests to arrive while we're detaching
1004 * the VFIO device, because this might be the last device and we might need
1005 * to unregister the callback.
1007 rte_mcfg_mem_read_lock();
1009 /* get group number */
1010 ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
1012 RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n",
1014 /* This is an error at this point. */
1019 /* get the actual group fd */
1020 vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
1021 if (vfio_group_fd < 0) {
1022 RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
1024 ret = vfio_group_fd;
1028 /* get the vfio_config it belongs to */
1029 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
1030 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
1032 /* At this point we got an active group. Closing it will make the
1033 * container detachment. If this is the last active group, VFIO kernel
1034 * code will unset the container and the IOMMU mappings.
1037 /* Closing a device */
1038 if (close(vfio_dev_fd) < 0) {
1039 RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
1045 /* A VFIO group can have several devices attached. Only when no devices
1046 * remain should the group be closed.
1048 vfio_group_device_put(vfio_group_fd);
1049 if (!vfio_group_device_count(vfio_group_fd)) {
1051 if (close(vfio_group_fd) < 0) {
1052 RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
1058 if (rte_vfio_clear_group(vfio_group_fd) < 0) {
1059 RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
1066 /* if there are no active device groups, unregister the callback to
1067 * avoid spurious attempts to map/unmap memory from VFIO.
1069 if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
1070 rte_eal_process_type() != RTE_PROC_SECONDARY)
1071 rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
1078 rte_mcfg_mem_read_unlock();
1083 rte_vfio_enable(const char *modname)
1085 /* initialize group list */
1088 const struct internal_config *internal_conf =
1089 eal_get_internal_configuration();
1091 rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
1093 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
1094 vfio_cfgs[i].vfio_container_fd = -1;
1095 vfio_cfgs[i].vfio_active_groups = 0;
1096 vfio_cfgs[i].vfio_iommu_type = NULL;
1097 vfio_cfgs[i].mem_maps.lock = lock;
1099 for (j = 0; j < VFIO_MAX_GROUPS; j++) {
1100 vfio_cfgs[i].vfio_groups[j].fd = -1;
1101 vfio_cfgs[i].vfio_groups[j].group_num = -1;
1102 vfio_cfgs[i].vfio_groups[j].devices = 0;
1106 RTE_LOG(DEBUG, EAL, "Probing VFIO support...\n");
1108 /* check if vfio module is loaded */
1109 vfio_available = rte_eal_check_module(modname);
1111 /* return error directly */
1112 if (vfio_available == -1) {
1113 RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
1117 /* return 0 if VFIO modules not loaded */
1118 if (vfio_available == 0) {
1120 "VFIO modules not loaded, skipping VFIO support...\n");
1124 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
1125 /* open a new container */
1126 default_vfio_cfg->vfio_container_fd =
1127 rte_vfio_get_container_fd();
1129 /* get the default container from the primary process */
1130 default_vfio_cfg->vfio_container_fd =
1131 vfio_get_default_container_fd();
1134 /* check if we have VFIO driver enabled */
1135 if (default_vfio_cfg->vfio_container_fd != -1) {
1136 RTE_LOG(INFO, EAL, "VFIO support initialized\n");
1137 default_vfio_cfg->vfio_enabled = 1;
1139 RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
1146 rte_vfio_is_enabled(const char *modname)
1148 const int mod_available = rte_eal_check_module(modname) > 0;
1149 return default_vfio_cfg->vfio_enabled && mod_available;
1153 vfio_get_default_container_fd(void)
1155 struct rte_mp_msg mp_req, *mp_rep;
1156 struct rte_mp_reply mp_reply = {0};
1157 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
1158 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
1160 const struct internal_config *internal_conf =
1161 eal_get_internal_configuration();
1163 if (default_vfio_cfg->vfio_enabled)
1164 return default_vfio_cfg->vfio_container_fd;
1166 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
1167 /* if we were a secondary process we would try requesting the
1168 * container fd from the primary, but we are the primary
1169 * process, so just exit here
1174 p->req = SOCKET_REQ_DEFAULT_CONTAINER;
1175 strcpy(mp_req.name, EAL_VFIO_MP);
1176 mp_req.len_param = sizeof(*p);
1179 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
1180 mp_reply.nb_received == 1) {
1181 mp_rep = &mp_reply.msgs[0];
1182 p = (struct vfio_mp_param *)mp_rep->param;
1183 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
1184 container_fd = mp_rep->fds[0];
1185 free(mp_reply.msgs);
1186 return container_fd;
1190 free(mp_reply.msgs);
1191 RTE_LOG(ERR, EAL, "Cannot request default VFIO container fd\n");
1196 vfio_get_iommu_type(void)
1198 if (default_vfio_cfg->vfio_iommu_type == NULL)
1201 return default_vfio_cfg->vfio_iommu_type->type_id;
1204 const struct vfio_iommu_type *
1205 vfio_set_iommu_type(int vfio_container_fd)
1208 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
1209 const struct vfio_iommu_type *t = &iommu_types[idx];
1211 int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
1214 RTE_LOG(INFO, EAL, "Using IOMMU type %d (%s)\n",
1215 t->type_id, t->name);
1218 /* not an error, there may be more supported IOMMU types */
1219 RTE_LOG(DEBUG, EAL, "Set IOMMU type %d (%s) failed, error "
1220 "%i (%s)\n", t->type_id, t->name, errno,
1223 /* if we didn't find a suitable IOMMU type, fail */
1228 vfio_has_supported_extensions(int vfio_container_fd)
1231 unsigned idx, n_extensions = 0;
1232 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
1233 const struct vfio_iommu_type *t = &iommu_types[idx];
1235 ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
1238 RTE_LOG(ERR, EAL, "Could not get IOMMU type, error "
1239 "%i (%s)\n", errno, strerror(errno));
1240 close(vfio_container_fd);
1242 } else if (ret == 1) {
1243 /* we found a supported extension */
1246 RTE_LOG(DEBUG, EAL, "IOMMU type %d (%s) is %s\n",
1247 t->type_id, t->name,
1248 ret ? "supported" : "not supported");
1251 /* if we didn't find any supported IOMMU types, fail */
1252 if (!n_extensions) {
1253 close(vfio_container_fd);
1261 rte_vfio_get_container_fd(void)
1263 int ret, vfio_container_fd;
1264 struct rte_mp_msg mp_req, *mp_rep;
1265 struct rte_mp_reply mp_reply = {0};
1266 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
1267 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
1268 const struct internal_config *internal_conf =
1269 eal_get_internal_configuration();
1272 /* if we're in a primary process, try to open the container */
1273 if (internal_conf->process_type == RTE_PROC_PRIMARY) {
1274 vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
1275 if (vfio_container_fd < 0) {
1277 "Cannot open VFIO container %s, error "
1278 "%i (%s)\n", VFIO_CONTAINER_PATH,
1279 errno, strerror(errno));
1283 /* check VFIO API version */
1284 ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
1285 if (ret != VFIO_API_VERSION) {
1288 "Could not get VFIO API version, error "
1289 "%i (%s)\n", errno, strerror(errno));
1291 RTE_LOG(ERR, EAL, "Unsupported VFIO API version!\n");
1292 close(vfio_container_fd);
1296 ret = vfio_has_supported_extensions(vfio_container_fd);
1299 "No supported IOMMU extensions found!\n");
1303 return vfio_container_fd;
1306 * if we're in a secondary process, request container fd from the
1307 * primary process via mp channel
1309 p->req = SOCKET_REQ_CONTAINER;
1310 strcpy(mp_req.name, EAL_VFIO_MP);
1311 mp_req.len_param = sizeof(*p);
1314 vfio_container_fd = -1;
1315 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
1316 mp_reply.nb_received == 1) {
1317 mp_rep = &mp_reply.msgs[0];
1318 p = (struct vfio_mp_param *)mp_rep->param;
1319 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
1320 vfio_container_fd = mp_rep->fds[0];
1321 free(mp_reply.msgs);
1322 return vfio_container_fd;
1326 free(mp_reply.msgs);
1327 RTE_LOG(ERR, EAL, "Cannot request VFIO container fd\n");
1332 rte_vfio_get_group_num(const char *sysfs_base,
1333 const char *dev_addr, int *iommu_group_num)
1335 char linkname[PATH_MAX];
1336 char filename[PATH_MAX];
1337 char *tok[16], *group_tok, *end;
1340 memset(linkname, 0, sizeof(linkname));
1341 memset(filename, 0, sizeof(filename));
1343 /* try to find out IOMMU group for this device */
1344 snprintf(linkname, sizeof(linkname),
1345 "%s/%s/iommu_group", sysfs_base, dev_addr);
1347 ret = readlink(linkname, filename, sizeof(filename));
1349 /* if the link doesn't exist, no VFIO for us */
1353 ret = rte_strsplit(filename, sizeof(filename),
1354 tok, RTE_DIM(tok), '/');
1357 RTE_LOG(ERR, EAL, "%s cannot get IOMMU group\n", dev_addr);
1361 /* IOMMU group is always the last token */
1363 group_tok = tok[ret - 1];
1365 *iommu_group_num = strtol(group_tok, &end, 10);
1366 if ((end != group_tok && *end != '\0') || errno != 0) {
1367 RTE_LOG(ERR, EAL, "%s error parsing IOMMU number!\n", dev_addr);
1375 type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
1376 size_t len, void *arg)
1378 int *vfio_container_fd = arg;
1383 return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
1388 type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
1391 int *vfio_container_fd = arg;
1393 /* skip external memory that isn't a heap */
1394 if (msl->external && !msl->heap)
1397 /* skip any segments with invalid IOVA addresses */
1398 if (ms->iova == RTE_BAD_IOVA)
1401 /* if IOVA mode is VA, we've already mapped the internal segments */
1402 if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
1405 return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
1410 vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1411 uint64_t len, int do_map)
1413 struct vfio_iommu_type1_dma_map dma_map;
1414 struct vfio_iommu_type1_dma_unmap dma_unmap;
1418 memset(&dma_map, 0, sizeof(dma_map));
1419 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
1420 dma_map.vaddr = vaddr;
1422 dma_map.iova = iova;
1423 dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
1424 VFIO_DMA_MAP_FLAG_WRITE;
1426 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
1429 * In case the mapping was already done EEXIST will be
1430 * returned from kernel.
1432 if (errno == EEXIST) {
1434 "Memory segment is already mapped, skipping");
1437 "Cannot set up DMA remapping, error "
1438 "%i (%s)\n", errno, strerror(errno));
1443 memset(&dma_unmap, 0, sizeof(dma_unmap));
1444 dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
1445 dma_unmap.size = len;
1446 dma_unmap.iova = iova;
1448 ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
1451 RTE_LOG(ERR, EAL, "Cannot clear DMA remapping, error "
1452 "%i (%s)\n", errno, strerror(errno));
1454 } else if (dma_unmap.size != len) {
1455 RTE_LOG(ERR, EAL, "Unexpected size %"PRIu64
1456 " of DMA remapping cleared instead of %"PRIu64"\n",
1457 (uint64_t)dma_unmap.size, len);
1467 vfio_type1_dma_map(int vfio_container_fd)
1469 if (rte_eal_iova_mode() == RTE_IOVA_VA) {
1470 /* with IOVA as VA mode, we can get away with mapping contiguous
1471 * chunks rather than going page-by-page.
1473 int ret = rte_memseg_contig_walk(type1_map_contig,
1474 &vfio_container_fd);
1477 /* we have to continue the walk because we've skipped the
1478 * external segments during the config walk.
1481 return rte_memseg_walk(type1_map, &vfio_container_fd);
1484 /* Track the size of the statically allocated DMA window for SPAPR */
1485 uint64_t spapr_dma_win_len;
1486 uint64_t spapr_dma_win_page_sz;
1489 vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1490 uint64_t len, int do_map)
1492 struct vfio_iommu_spapr_register_memory reg = {
1493 .argsz = sizeof(reg),
1494 .vaddr = (uintptr_t) vaddr,
1501 struct vfio_iommu_type1_dma_map dma_map;
1503 if (iova + len > spapr_dma_win_len) {
1504 RTE_LOG(ERR, EAL, "DMA map attempt outside DMA window\n");
1508 ret = ioctl(vfio_container_fd,
1509 VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
1512 "Cannot register vaddr for IOMMU, error "
1513 "%i (%s)\n", errno, strerror(errno));
1517 memset(&dma_map, 0, sizeof(dma_map));
1518 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
1519 dma_map.vaddr = vaddr;
1521 dma_map.iova = iova;
1522 dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
1523 VFIO_DMA_MAP_FLAG_WRITE;
1525 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
1527 RTE_LOG(ERR, EAL, "Cannot map vaddr for IOMMU, error "
1528 "%i (%s)\n", errno, strerror(errno));
1533 struct vfio_iommu_type1_dma_unmap dma_unmap;
1535 memset(&dma_unmap, 0, sizeof(dma_unmap));
1536 dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
1537 dma_unmap.size = len;
1538 dma_unmap.iova = iova;
1540 ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
1543 RTE_LOG(ERR, EAL, "Cannot unmap vaddr for IOMMU, error "
1544 "%i (%s)\n", errno, strerror(errno));
1548 ret = ioctl(vfio_container_fd,
1549 VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
1552 "Cannot unregister vaddr for IOMMU, error "
1553 "%i (%s)\n", errno, strerror(errno));
1562 vfio_spapr_map_walk(const struct rte_memseg_list *msl,
1563 const struct rte_memseg *ms, void *arg)
1565 int *vfio_container_fd = arg;
1567 /* skip external memory that isn't a heap */
1568 if (msl->external && !msl->heap)
1571 /* skip any segments with invalid IOVA addresses */
1572 if (ms->iova == RTE_BAD_IOVA)
1575 return vfio_spapr_dma_do_map(*vfio_container_fd,
1576 ms->addr_64, ms->iova, ms->len, 1);
1579 struct spapr_size_walk_param {
1582 bool is_user_managed;
1586 * In order to set the DMA window size required for the SPAPR IOMMU
1587 * we need to walk the existing virtual memory allocations as well as
1588 * find the hugepage size used.
1591 vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
1593 struct spapr_size_walk_param *param = arg;
1594 uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;
1596 if (msl->external && !msl->heap) {
1597 /* ignore user managed external memory */
1598 param->is_user_managed = true;
1602 if (max > param->max_va) {
1603 param->page_sz = msl->page_sz;
1604 param->max_va = max;
1611 * Find the highest memory address used in physical or virtual address
1612 * space and use that as the top of the DMA window.
1615 find_highest_mem_addr(struct spapr_size_walk_param *param)
1617 /* find the maximum IOVA address for setting the DMA window size */
1618 if (rte_eal_iova_mode() == RTE_IOVA_PA) {
1619 static const char proc_iomem[] = "/proc/iomem";
1620 static const char str_sysram[] = "System RAM";
1621 uint64_t start, end, max = 0;
1627 * Example "System RAM" in /proc/iomem:
1628 * 00000000-1fffffffff : System RAM
1629 * 200000000000-201fffffffff : System RAM
1631 FILE *fd = fopen(proc_iomem, "r");
1633 RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem);
1636 /* Scan /proc/iomem for the highest PA in the system */
1637 while (getline(&line, &line_len, fd) != -1) {
1638 if (strstr(line, str_sysram) == NULL)
1641 space = strstr(line, " ");
1642 dash = strstr(line, "-");
1644 /* Validate the format of the memory string */
1645 if (space == NULL || dash == NULL || space < dash) {
1646 RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in file %s\n",
1651 start = strtoull(line, NULL, 16);
1652 end = strtoull(dash + 1, NULL, 16);
1653 RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%" PRIx64
1654 " to 0x%" PRIx64 "\n", start, end);
1662 RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" "
1663 "entry in file %s\n", proc_iomem);
1667 spapr_dma_win_len = rte_align64pow2(max + 1);
1669 } else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
1670 RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%"
1671 PRIx64 "\n", param->max_va);
1672 spapr_dma_win_len = rte_align64pow2(param->max_va);
1676 spapr_dma_win_len = 0;
1677 RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n");
1683 * The SPAPRv2 IOMMU supports 2 DMA windows with starting
1684 * address at 0 or 1<<59. By default, a DMA window is set
1685 * at address 0, 2GB long, with a 4KB page. For DPDK we
1686 * must remove the default window and setup a new DMA window
1687 * based on the hugepage size and memory requirements of
1688 * the application before we can map memory for DMA.
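 *
 * For example, if the highest relevant address found below is just under
 * 8 GiB, spapr_dma_win_len is rounded up to 8 GiB by rte_align64pow2(), and
 * the window's page_shift is derived from the hugepage size (e.g. 24 for
 * 16 MiB pages) via __builtin_ctzll().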
1691 spapr_dma_win_size(void)
1693 struct spapr_size_walk_param param;
1695 /* only create DMA window once */
1696 if (spapr_dma_win_len > 0)
1699 /* walk the memseg list to find the page size/max VA address */
1700 memset(&param, 0, sizeof(param));
1701 if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
1702 RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA window size\n");
1706 /* we can't be sure if DMA window covers external memory */
1707 if (param.is_user_managed)
1708 RTE_LOG(WARNING, EAL, "Detected user managed external memory which may not be managed by the IOMMU\n");
1710 /* check physical/virtual memory size */
1711 if (find_highest_mem_addr(¶m) < 0)
1713 RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%" PRIx64 "\n",
1715 spapr_dma_win_page_sz = param.page_sz;
1716 rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len));
1721 vfio_spapr_create_dma_window(int vfio_container_fd)
1723 struct vfio_iommu_spapr_tce_create create = {
1724 .argsz = sizeof(create), };
1725 struct vfio_iommu_spapr_tce_remove remove = {
1726 .argsz = sizeof(remove), };
1727 struct vfio_iommu_spapr_tce_info info = {
1728 .argsz = sizeof(info), };
1731 ret = spapr_dma_win_size();
1735 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
1737 RTE_LOG(ERR, EAL, "Cannot get IOMMU info, error %i (%s)\n",
1738 errno, strerror(errno));
1743 * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
1744 * can't be changed for v1 but it can be changed for v2. Since DPDK only
1745 * supports v2, remove the default DMA window so it can be resized.
1747 remove.start_addr = info.dma32_window_start;
1748 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
1752 /* create a new DMA window (start address is not selectable) */
1753 create.window_size = spapr_dma_win_len;
1754 create.page_shift = __builtin_ctzll(spapr_dma_win_page_sz);
1756 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
1757 #ifdef VFIO_IOMMU_SPAPR_INFO_DDW
1759 * The vfio_iommu_spapr_tce_info structure was modified in
1760 * Linux kernel 4.2.0 to add support for the
1761 * vfio_iommu_spapr_tce_ddw_info structure needed to try
1762 * multiple table levels. Skip the attempt if running with
1766 /* if at first we don't succeed, try more levels */
1769 for (levels = create.levels + 1;
1770 ret && levels <= info.ddw.levels; levels++) {
1771 create.levels = levels;
1772 ret = ioctl(vfio_container_fd,
1773 VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
1776 #endif /* VFIO_IOMMU_SPAPR_INFO_DDW */
1778 RTE_LOG(ERR, EAL, "Cannot create new DMA window, error "
1779 "%i (%s)\n", errno, strerror(errno));
1781 "Consider using a larger hugepage size if supported by the system\n");
1785 /* verify the start address */
1786 if (create.start_addr != 0) {
1787 RTE_LOG(ERR, EAL, "Received unsupported start address 0x%"
1788 PRIx64 "\n", (uint64_t)create.start_addr);
1795 vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
1796 uint64_t iova, uint64_t len, int do_map)
1801 if (vfio_spapr_dma_do_map(vfio_container_fd,
1802 vaddr, iova, len, 1)) {
1803 RTE_LOG(ERR, EAL, "Failed to map DMA\n");
1807 if (vfio_spapr_dma_do_map(vfio_container_fd,
1808 vaddr, iova, len, 0)) {
1809 RTE_LOG(ERR, EAL, "Failed to unmap DMA\n");
1818 vfio_spapr_dma_map(int vfio_container_fd)
1820 if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) {
1821 RTE_LOG(ERR, EAL, "Could not create new DMA window!\n");
1825 /* map all existing DPDK segments for DMA */
1826 if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
1833 vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
1835 /* No-IOMMU mode does not need DMA mapping */
1840 vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
1841 uint64_t __rte_unused vaddr,
1842 uint64_t __rte_unused iova, uint64_t __rte_unused len,
1843 int __rte_unused do_map)
1845 /* No-IOMMU mode does not need DMA mapping */
1850 vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1851 uint64_t len, int do_map)
1853 const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
1856 RTE_LOG(ERR, EAL, "VFIO support not initialized\n");
1861 if (!t->dma_user_map_func) {
1863 "VFIO custom DMA region mapping not supported by IOMMU %s\n",
1865 rte_errno = ENOTSUP;
1869 return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
1874 container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1877 struct user_mem_map *new_map;
1878 struct user_mem_maps *user_mem_maps;
1879 bool has_partial_unmap;
1882 user_mem_maps = &vfio_cfg->mem_maps;
1883 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1884 if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
1885 RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
1891 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
1892 /* technically, this will fail if there are currently no devices
1893 * plugged in, even though the mapping might have succeeded had a
1894 * device been added later. however, since we cannot verify that the
1895 * mapping is valid without a device attached, consider it
1896 * unsupported, because we can't just store any old mapping and
1897 * pollute the list of active mappings willy-nilly.
1899 RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
1903 /* do we have partial unmap support? */
1904 has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
1906 /* create new user mem map entry */
1907 new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
1908 new_map->addr = vaddr;
1909 new_map->iova = iova;
1911 /* for IOMMU types supporting partial unmap, we don't need chunking */
1912 new_map->chunk = has_partial_unmap ? 0 : len;
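	/* chunk == 0 never reaches addr_is_chunk_aligned(): container_dma_unmap()
	 * only performs that alignment check when the IOMMU type lacks partial
	 * unmap support, in which case chunk == len here.
	 */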
1914 compact_user_maps(user_mem_maps);
1916 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
1921 container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1924 struct user_mem_map orig_maps[VFIO_MAX_USER_MEM_MAPS];
1925 struct user_mem_map new_maps[2]; /* can be at most 2 */
1926 struct user_mem_maps *user_mem_maps;
1927 int n_orig, n_new, newlen, ret = 0;
1928 bool has_partial_unmap;
1930 user_mem_maps = &vfio_cfg->mem_maps;
1931 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1934 * Previously, we had adjacent mappings entirely contained within one
1935 * mapping entry. Since we now store original mapping length in some
1936 * cases, this is no longer the case, so unmapping can potentially go
1937 * over multiple segments and split them in any number of ways.
1939 * To complicate things further, some IOMMU types support arbitrary
1940 * partial unmapping, while others will only support unmapping along the
1941 * chunk size, so there are a lot of cases we need to handle. To make
1942 * things easier code wise, instead of trying to adjust existing
1943 * mappings, let's just rebuild them using information we have.
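	 *
	 * For instance, unmapping the middle of one stored mapping removes that
	 * entry and inserts up to two new ones (the head and tail produced by
	 * process_maps()), while unmapping a range that exactly covers whole
	 * entries simply deletes them.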
1947 * first thing to do is check if there exists a mapping that includes
1948 * the start and the end of our requested unmap. We need to collect all
1949 * maps that include our unmapped region.
1951 n_orig = find_user_mem_maps(user_mem_maps, vaddr, iova, len,
1952 orig_maps, RTE_DIM(orig_maps));
1953 /* did we find anything? */
1955 RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
1961 /* do we have partial unmap capability? */
1962 has_partial_unmap = vfio_cfg->vfio_iommu_type->partial_unmap;
1965 * if we don't support partial unmap, we must check if start and end of
1966 * current unmap region are chunk-aligned.
1968 if (!has_partial_unmap) {
1969 bool start_aligned, end_aligned;
1971 start_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
1973 end_aligned = addr_is_chunk_aligned(orig_maps, n_orig,
1974 vaddr + len, iova + len);
1976 if (!start_aligned || !end_aligned) {
1977 RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
1978 rte_errno = ENOTSUP;
1985 * now we know we can potentially unmap the region, but we still have to
1986 * figure out if there is enough space in our list to store remaining
1987 * maps. for this, we will figure out how many segments we are going to
1988 * remove, and how many new segments we are going to create.
1990 n_new = process_maps(orig_maps, n_orig, new_maps, vaddr, len);
1992 /* can we store the new maps in our list? */
1993 newlen = (user_mem_maps->n_maps - n_orig) + n_new;
1994 if (newlen >= VFIO_MAX_USER_MEM_MAPS) {
1995 RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
2001 /* unmap the entry */
2002 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
2003 /* there may not be any devices plugged in, so unmapping will
2004 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
2005 * stop us from removing the mapping, as the assumption is we
2006 * won't be needing this memory any more and thus will want to
2007 * prevent it from being remapped again on hotplug. so, only
2008 * fail if we indeed failed to unmap (e.g. if the mapping was
2009 * within our mapped range but had invalid alignment).
2011 if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
2012 RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
2016 RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
2020 /* we have unmapped the region, so now update the maps */
2021 delete_maps(user_mem_maps, orig_maps, n_orig);
2022 copy_maps(user_mem_maps, new_maps, n_new);
2023 compact_user_maps(user_mem_maps);
2025 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
2030 rte_vfio_noiommu_is_enabled(void)
2036 fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
2038 if (errno != ENOENT) {
2039 RTE_LOG(ERR, EAL, "Cannot open VFIO noiommu file "
2040 "%i (%s)\n", errno, strerror(errno));
2044 * else the file does not exist,
2045 * i.e. noiommu is not enabled
2050 cnt = read(fd, &c, 1);
2053 RTE_LOG(ERR, EAL, "Unable to read from VFIO noiommu file "
2054 "%i (%s)\n", errno, strerror(errno));
2062 rte_vfio_container_create(void)
2066 /* Find an empty slot to store new vfio config */
2067 for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
2068 if (vfio_cfgs[i].vfio_container_fd == -1)
2072 if (i == VFIO_MAX_CONTAINERS) {
2073 RTE_LOG(ERR, EAL, "Exceed max VFIO container limit\n");
2077 vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
2078 if (vfio_cfgs[i].vfio_container_fd < 0) {
2079 RTE_LOG(NOTICE, EAL, "Fail to create a new VFIO container\n");
2083 return vfio_cfgs[i].vfio_container_fd;
2087 rte_vfio_container_destroy(int container_fd)
2089 struct vfio_config *vfio_cfg;
2092 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2093 if (vfio_cfg == NULL) {
2094 RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
2098 for (i = 0; i < VFIO_MAX_GROUPS; i++)
2099 if (vfio_cfg->vfio_groups[i].group_num != -1)
2100 rte_vfio_container_group_unbind(container_fd,
2101 vfio_cfg->vfio_groups[i].group_num);
2103 close(container_fd);
2104 vfio_cfg->vfio_container_fd = -1;
2105 vfio_cfg->vfio_active_groups = 0;
2106 vfio_cfg->vfio_iommu_type = NULL;
2112 rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
2114 struct vfio_config *vfio_cfg;
2116 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2117 if (vfio_cfg == NULL) {
2118 RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
2122 return vfio_get_group_fd(vfio_cfg, iommu_group_num);
2126 rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
2128 struct vfio_config *vfio_cfg;
2129 struct vfio_group *cur_grp = NULL;
2132 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2133 if (vfio_cfg == NULL) {
2134 RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
2138 for (i = 0; i < VFIO_MAX_GROUPS; i++) {
2139 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
2140 cur_grp = &vfio_cfg->vfio_groups[i];
2145 /* This should not happen */
2146 if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
2147 RTE_LOG(ERR, EAL, "Specified VFIO group number not found\n");
2151 if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
2153 "Error when closing vfio_group_fd for iommu_group_num "
2154 "%d\n", iommu_group_num);
2157 cur_grp->group_num = -1;
2159 cur_grp->devices = 0;
2160 vfio_cfg->vfio_active_groups--;
2166 rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
2169 struct vfio_config *vfio_cfg;
2176 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2177 if (vfio_cfg == NULL) {
2178 RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
2182 return container_dma_map(vfio_cfg, vaddr, iova, len);
2186 rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
2189 struct vfio_config *vfio_cfg;
2196 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
2197 if (vfio_cfg == NULL) {
2198 RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
2202 return container_dma_unmap(vfio_cfg, vaddr, iova, len);
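/* Illustrative use of the container API by an application (error handling
 * omitted):
 *
 *	int cfd = rte_vfio_container_create();
 *	rte_vfio_container_group_bind(cfd, iommu_group_num);
 *	rte_vfio_container_dma_map(cfd, vaddr, iova, len);
 *	...
 *	rte_vfio_container_dma_unmap(cfd, vaddr, iova, len);
 *	rte_vfio_container_destroy(cfd);
 */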
2208 rte_vfio_setup_device(__rte_unused const char *sysfs_base,
2209 __rte_unused const char *dev_addr,
2210 __rte_unused int *vfio_dev_fd,
2211 __rte_unused struct vfio_device_info *device_info)
2217 rte_vfio_release_device(__rte_unused const char *sysfs_base,
2218 __rte_unused const char *dev_addr, __rte_unused int fd)
2224 rte_vfio_enable(__rte_unused const char *modname)
2230 rte_vfio_is_enabled(__rte_unused const char *modname)
2236 rte_vfio_noiommu_is_enabled(void)
2242 rte_vfio_clear_group(__rte_unused int vfio_group_fd)
2248 rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
2249 __rte_unused const char *dev_addr,
2250 __rte_unused int *iommu_group_num)
2256 rte_vfio_get_container_fd(void)
2262 rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
2268 rte_vfio_container_create(void)
2274 rte_vfio_container_destroy(__rte_unused int container_fd)
2280 rte_vfio_container_group_bind(__rte_unused int container_fd,
2281 __rte_unused int iommu_group_num)
2287 rte_vfio_container_group_unbind(__rte_unused int container_fd,
2288 __rte_unused int iommu_group_num)
2294 rte_vfio_container_dma_map(__rte_unused int container_fd,
2295 __rte_unused uint64_t vaddr,
2296 __rte_unused uint64_t iova,
2297 __rte_unused uint64_t len)
2303 rte_vfio_container_dma_unmap(__rte_unused int container_fd,
2304 __rte_unused uint64_t vaddr,
2305 __rte_unused uint64_t iova,
2306 __rte_unused uint64_t len)
2311 #endif /* VFIO_PRESENT */