/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include <rte_errno.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>
#include <rte_spinlock.h>
#include <rte_uuid.h>
#include <rte_vfio.h>

#include "eal_filesystem.h"
#include "eal_memcfg.h"
#include "eal_vfio.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

#ifdef VFIO_PRESENT

#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"

/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
 * recreate the mappings for DPDK segments, but we cannot do so for memory that
 * was registered by the user themselves, so we need to store the user mappings
 * somewhere, to recreate them later.
 */
#define VFIO_MAX_USER_MEM_MAPS 256
struct user_mem_map {
	uint64_t addr;
	uint64_t iova;
	uint64_t len;
};

struct user_mem_maps {
	rte_spinlock_recursive_t lock;
	int n_maps;
	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
};

struct vfio_config {
	int vfio_enabled;
	int vfio_container_fd;
	int vfio_active_groups;
	const struct vfio_iommu_type *vfio_iommu_type;
	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
	struct user_mem_maps mem_maps;
};

/* per-process VFIO config */
static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];

static int vfio_type1_dma_map(int);
static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
		uint64_t iova, uint64_t len, int do_map);
/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
	/* x86 IOMMU, otherwise known as type 1 */
	{
		.type_id = RTE_VFIO_TYPE1,
		.name = "Type 1",
		.partial_unmap = false,
		.dma_map_func = &vfio_type1_dma_map,
		.dma_user_map_func = &vfio_type1_dma_mem_map
	},
	/* ppc64 IOMMU, otherwise known as spapr */
	{
		.type_id = RTE_VFIO_SPAPR,
		.name = "sPAPR",
		.partial_unmap = true,
		.dma_map_func = &vfio_spapr_dma_map,
		.dma_user_map_func = &vfio_spapr_dma_mem_map
	},
	/* IOMMU-less mode */
	{
		.type_id = RTE_VFIO_NOIOMMU,
		.name = "No-IOMMU",
		.partial_unmap = true,
		.dma_map_func = &vfio_noiommu_dma_map,
		.dma_user_map_func = &vfio_noiommu_dma_mem_map
	},
};
static int
is_null_map(const struct user_mem_map *map)
{
	return map->addr == 0 && map->iova == 0 && map->len == 0;
}

/* we may need to merge user mem maps together in case of user mapping/unmapping
 * chunks of memory, so we'll need a comparator function to sort segments.
 */
static int
user_mem_map_cmp(const void *a, const void *b)
{
	const struct user_mem_map *umm_a = a;
	const struct user_mem_map *umm_b = b;

	/* move null entries to end */
	if (is_null_map(umm_a))
		return 1;
	if (is_null_map(umm_b))
		return -1;

	/* sort by iova first */
	if (umm_a->iova < umm_b->iova)
		return -1;
	if (umm_a->iova > umm_b->iova)
		return 1;

	if (umm_a->addr < umm_b->addr)
		return -1;
	if (umm_a->addr > umm_b->addr)
		return 1;

	if (umm_a->len < umm_b->len)
		return -1;
	if (umm_a->len > umm_b->len)
		return 1;

	return 0;
}

/* adjust user map entry. this may result in shortening of existing map, or in
 * splitting existing map in two pieces.
 */
static void
adjust_map(struct user_mem_map *src, struct user_mem_map *end,
		uint64_t remove_va_start, uint64_t remove_len)
{
	/* if va start is same as start address, we're simply moving start */
	if (remove_va_start == src->addr) {
		src->addr += remove_len;
		src->iova += remove_len;
		src->len -= remove_len;
	} else if (remove_va_start + remove_len == src->addr + src->len) {
		/* we're shrinking mapping from the end */
		src->len -= remove_len;
	} else {
		/* we're blowing a hole in the middle */
		struct user_mem_map tmp;
		uint64_t total_len = src->len;

		/* adjust source segment length */
		src->len = remove_va_start - src->addr;

		/* create temporary segment in the middle */
		tmp.addr = src->addr + src->len;
		tmp.iova = src->iova + src->len;
		tmp.len = remove_len;

		/* populate end segment - this one we will be keeping */
		end->addr = tmp.addr + tmp.len;
		end->iova = tmp.iova + tmp.len;
		end->len = total_len - src->len - tmp.len;
	}
}
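/*
 * Illustrative example (not part of the original file): unmapping the middle
 * of an existing mapping splits it in two. Assuming a map covering
 * VA/IOVA 0x100000..0x500000 and a removal of 0x200000..0x300000:
 *
 *	struct user_mem_map src = { .addr = 0x100000, .iova = 0x100000,
 *			.len = 0x400000 };
 *	struct user_mem_map end;
 *	adjust_map(&src, &end, 0x200000, 0x100000);
 *	// src: addr/iova 0x100000, len 0x100000 (head is kept)
 *	// end: addr/iova 0x300000, len 0x200000 (tail is kept)
 */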
/* try merging two maps into one, return 1 if succeeded */
static int
merge_map(struct user_mem_map *left, struct user_mem_map *right)
{
	/* merge the same maps into one */
	if (memcmp(left, right, sizeof(struct user_mem_map)) == 0)
		goto out;

	if (left->addr + left->len != right->addr)
		return 0;
	if (left->iova + left->len != right->iova)
		return 0;

	left->len += right->len;

out:
	memset(right, 0, sizeof(*right));

	return 1;
}

static struct user_mem_map *
find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
		uint64_t iova, uint64_t len)
{
	uint64_t va_end = addr + len;
	uint64_t iova_end = iova + len;
	int i;

	for (i = 0; i < user_mem_maps->n_maps; i++) {
		struct user_mem_map *map = &user_mem_maps->maps[i];
		uint64_t map_va_end = map->addr + map->len;
		uint64_t map_iova_end = map->iova + map->len;

		/* check start VA */
		if (addr < map->addr || addr >= map_va_end)
			continue;
		/* check if VA end is within boundaries */
		if (va_end <= map->addr || va_end > map_va_end)
			continue;

		/* check start IOVA */
		if (iova < map->iova || iova >= map_iova_end)
			continue;
		/* check if IOVA end is within boundaries */
		if (iova_end <= map->iova || iova_end > map_iova_end)
			continue;

		/* we've found our map */
		return map;
	}
	return NULL;
}

/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
	int i, n_merged, cur_idx;

	qsort(user_mem_maps->maps, user_mem_maps->n_maps,
			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);

	/* we'll go over the list backwards when merging */
	n_merged = 0;
	for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
		struct user_mem_map *l, *r;

		l = &user_mem_maps->maps[i];
		r = &user_mem_maps->maps[i + 1];

		if (is_null_map(l) || is_null_map(r))
			continue;

		if (merge_map(l, r))
			n_merged++;
	}

	/* the entries are still sorted, but now they have holes in them, so
	 * walk through the list and remove the holes
	 */
	if (n_merged > 0) {
		cur_idx = 0;
		for (i = 0; i < user_mem_maps->n_maps; i++) {
			if (!is_null_map(&user_mem_maps->maps[i])) {
				struct user_mem_map *src, *dst;

				src = &user_mem_maps->maps[i];
				dst = &user_mem_maps->maps[cur_idx++];

				if (src != dst) {
					memcpy(dst, src, sizeof(*src));
					memset(src, 0, sizeof(*src));
				}
			}
		}
		user_mem_maps->n_maps = cur_idx;
	}
}
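/*
 * Illustrative example (not part of the original file): assuming two maps
 * A = {addr/iova 0x100000, len 0x100000} and B = {addr/iova 0x200000,
 * len 0x100000}, qsort() places them next to each other and merge_map()
 * folds B into A, leaving A = {addr/iova 0x100000, len 0x200000}; B is
 * zeroed out and compacted away, so n_maps drops by one.
 */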
static int
vfio_open_group_fd(int iommu_group_num)
{
	int vfio_group_fd;
	char filename[PATH_MAX];
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* if primary, try to open the group */
	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		/* try regular group format */
		snprintf(filename, sizeof(filename),
				VFIO_GROUP_FMT, iommu_group_num);
		vfio_group_fd = open(filename, O_RDWR);
		if (vfio_group_fd < 0) {
			/* if file not found, it's not an error */
			if (errno != ENOENT) {
				RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
						filename, strerror(errno));
				return -1;
			}

			/* special case: try no-IOMMU path as well */
			snprintf(filename, sizeof(filename),
					VFIO_NOIOMMU_GROUP_FMT,
					iommu_group_num);
			vfio_group_fd = open(filename, O_RDWR);
			if (vfio_group_fd < 0) {
				if (errno != ENOENT) {
					RTE_LOG(ERR, EAL,
						"Cannot open %s: %s\n",
						filename, strerror(errno));
					return -1;
				}
				return -ENOENT;
			}
			/* noiommu group found */
		}

		return vfio_group_fd;
	}
	/* if we're in a secondary process, request group fd from the primary
	 * process via mp channel.
	 */
	p->req = SOCKET_REQ_GROUP;
	p->group_num = iommu_group_num;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	vfio_group_fd = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
	    mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			vfio_group_fd = mp_rep->fds[0];
		} else if (p->result == SOCKET_NO_FD) {
			RTE_LOG(ERR, EAL, "Bad VFIO group fd\n");
			vfio_group_fd = -ENOENT;
		}
	}

	free(mp_reply.msgs);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		RTE_LOG(ERR, EAL, "Cannot request VFIO group fd\n");
	return vfio_group_fd;
}
static struct vfio_config *
get_vfio_cfg_by_group_num(int iommu_group_num)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
			if (vfio_cfg->vfio_groups[j].group_num ==
					iommu_group_num)
				return vfio_cfg;
		}
	}

	return NULL;
}

static int
vfio_get_group_fd(struct vfio_config *vfio_cfg,
		int iommu_group_num)
{
	int i;
	int vfio_group_fd;
	struct vfio_group *cur_grp;

	/* check if we already have the group descriptor open */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
			return vfio_cfg->vfio_groups[i].fd;

	/* Lets see first if there is room for a new group */
	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
		return -1;
	}

	/* Now lets get an index for the new group */
	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num == -1) {
			cur_grp = &vfio_cfg->vfio_groups[i];
			break;
		}

	/* This should not happen */
	if (i == VFIO_MAX_GROUPS) {
		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
		return -1;
	}

	vfio_group_fd = vfio_open_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		RTE_LOG(ERR, EAL, "Failed to open VFIO group %d\n",
				iommu_group_num);
		return vfio_group_fd;
	}

	cur_grp->group_num = iommu_group_num;
	cur_grp->fd = vfio_group_fd;
	vfio_cfg->vfio_active_groups++;

	return vfio_group_fd;
}

static struct vfio_config *
get_vfio_cfg_by_group_fd(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++)
			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
				return vfio_cfg;
	}

	return NULL;
}

static struct vfio_config *
get_vfio_cfg_by_container_fd(int container_fd)
{
	int i;

	if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
		return default_vfio_cfg;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		if (vfio_cfgs[i].vfio_container_fd == container_fd)
			return &vfio_cfgs[i];
	}

	return NULL;
}

int
rte_vfio_get_group_fd(int iommu_group_num)
{
	struct vfio_config *vfio_cfg;

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
static int
get_vfio_group_idx(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i, j;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfg = &vfio_cfgs[i];
		for (j = 0; j < VFIO_MAX_GROUPS; j++)
			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
				return j;
	}

	return -1;
}

static void
vfio_group_device_get(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
		return;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
		RTE_LOG(ERR, EAL, "Wrong VFIO group index (%d)\n", i);
	else
		vfio_cfg->vfio_groups[i].devices++;
}

static void
vfio_group_device_put(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
		return;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
		RTE_LOG(ERR, EAL, "Wrong VFIO group index (%d)\n", i);
	else
		vfio_cfg->vfio_groups[i].devices--;
}

static int
vfio_group_device_count(int vfio_group_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
		RTE_LOG(ERR, EAL, "Wrong VFIO group index (%d)\n", i);
		return -1;
	}

	return vfio_cfg->vfio_groups[i].devices;
}
static void
vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
		void *arg __rte_unused)
{
	struct rte_memseg_list *msl;
	struct rte_memseg *ms;
	size_t cur_len = 0;

	msl = rte_mem_virt2memseg_list(addr);

	/* for IOVA as VA mode, no need to care for IOVA addresses */
	if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
		uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
		uint64_t page_sz = msl->page_sz;

		/* Maintain granularity of DMA map/unmap to memseg size */
		for (; cur_len < len; cur_len += page_sz) {
			if (type == RTE_MEM_EVENT_ALLOC)
				vfio_dma_mem_map(default_vfio_cfg, vfio_va,
						vfio_va, page_sz, 1);
			else
				vfio_dma_mem_map(default_vfio_cfg, vfio_va,
						vfio_va, page_sz, 0);
			vfio_va += page_sz;
		}

		return;
	}

	/* memsegs are contiguous in memory */
	ms = rte_mem_virt2memseg(addr, msl);
	while (cur_len < len) {
		/* some memory segments may have invalid IOVA */
		if (ms->iova == RTE_BAD_IOVA) {
			RTE_LOG(DEBUG, EAL,
				"Memory segment at %p has bad IOVA, skipping\n",
				ms->addr);
			goto next;
		}
		if (type == RTE_MEM_EVENT_ALLOC)
			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
					ms->iova, ms->len, 1);
		else
			vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
					ms->iova, ms->len, 0);
next:
		cur_len += ms->len;
		++ms;
	}
}
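/*
 * Illustrative note (not part of the original file): with IOVA as VA and a
 * 2 MB page size, a 6 MB RTE_MEM_EVENT_ALLOC event turns into three
 * page-sized calls, e.g. for a virtual address va:
 *
 *	vfio_dma_mem_map(default_vfio_cfg, va, va, 0x200000, 1);
 *	vfio_dma_mem_map(default_vfio_cfg, va + 0x200000, va + 0x200000,
 *			0x200000, 1);
 *	vfio_dma_mem_map(default_vfio_cfg, va + 0x400000, va + 0x400000,
 *			0x200000, 1);
 *
 * so that a later free event can unmap any subset at memseg granularity.
 */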
static int
vfio_sync_default_container(void)
{
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	int iommu_type_id;
	unsigned int i;

	/* cannot be called from primary */
	if (rte_eal_process_type() != RTE_PROC_SECONDARY)
		return -1;

	/* default container fd should have been opened in rte_vfio_enable() */
	if (!default_vfio_cfg->vfio_enabled ||
			default_vfio_cfg->vfio_container_fd < 0) {
		RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
		return -1;
	}

	/* find default container's IOMMU type */
	p->req = SOCKET_REQ_IOMMU_TYPE;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	iommu_type_id = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
	    mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK)
			iommu_type_id = p->iommu_type_id;
	}
	free(mp_reply.msgs);
	if (iommu_type_id < 0) {
		RTE_LOG(ERR, EAL,
			"Could not get IOMMU type for default container\n");
		return -1;
	}

	/* we now have an fd for default container, as well as its IOMMU type.
	 * now, set up default VFIO container config to match.
	 */
	for (i = 0; i < RTE_DIM(iommu_types); i++) {
		const struct vfio_iommu_type *t = &iommu_types[i];
		if (t->type_id != iommu_type_id)
			continue;

		/* we found our IOMMU type */
		default_vfio_cfg->vfio_iommu_type = t;

		return 0;
	}
	RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
			iommu_type_id);
	return -1;
}

int
rte_vfio_clear_group(int vfio_group_fd)
{
	int i;
	struct vfio_config *vfio_cfg;

	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO group fd!\n");
		return -1;
	}

	i = get_vfio_group_idx(vfio_group_fd);
	if (i < 0)
		return -1;
	vfio_cfg->vfio_groups[i].group_num = -1;
	vfio_cfg->vfio_groups[i].fd = -1;
	vfio_cfg->vfio_groups[i].devices = 0;
	vfio_cfg->vfio_active_groups--;

	return 0;
}
int
rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
		int *vfio_dev_fd, struct vfio_device_info *device_info)
{
	struct vfio_group_status group_status = {
			.argsz = sizeof(group_status)
	};
	struct vfio_config *vfio_cfg;
	struct user_mem_maps *user_mem_maps;
	int vfio_container_fd;
	int vfio_group_fd;
	int iommu_group_num;
	rte_uuid_t vf_token;
	int i, ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret == 0) {
		RTE_LOG(NOTICE, EAL,
				"%s not managed by VFIO driver, skipping\n",
				dev_addr);
		return 1;
	}

	/* if negative, something failed */
	if (ret < 0)
		return -1;

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0 && vfio_group_fd != -ENOENT)
		return -1;

	/*
	 * if vfio_group_fd == -ENOENT, that means the device
	 * isn't managed by VFIO
	 */
	if (vfio_group_fd == -ENOENT) {
		RTE_LOG(NOTICE, EAL,
				"%s not managed by VFIO driver, skipping\n",
				dev_addr);
		return 1;
	}

	/*
	 * at this point, we know that this group is viable (meaning, all devices
	 * are either bound to VFIO or not bound to anything)
	 */

	/* check if the group is viable */
	ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
	if (ret) {
		RTE_LOG(ERR, EAL, "%s cannot get VFIO group status, "
				"error %i (%s)\n", dev_addr, errno, strerror(errno));
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		RTE_LOG(ERR, EAL, "%s VFIO group is not viable! "
				"Not all devices in IOMMU group bound to VFIO or unbound\n",
				dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
	vfio_container_fd = vfio_cfg->vfio_container_fd;
	user_mem_maps = &vfio_cfg->mem_maps;

	/* check if group does not have a container yet */
	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {

		/* add group to a container */
		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
				&vfio_container_fd);
		if (ret) {
			RTE_LOG(ERR, EAL,
					"%s cannot add VFIO group to container, error "
					"%i (%s)\n", dev_addr, errno, strerror(errno));
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}

		/*
		 * pick an IOMMU type and set up DMA mappings for container
		 *
		 * needs to be done only once, only when first group is
		 * assigned to a container and only in primary process.
		 * Note this can happen several times with the hotplug
		 * functionality.
		 */
		if (internal_conf->process_type == RTE_PROC_PRIMARY &&
				vfio_cfg->vfio_active_groups == 1 &&
				vfio_group_device_count(vfio_group_fd) == 0) {
			const struct vfio_iommu_type *t;

			/* select an IOMMU type which we will be using */
			t = vfio_set_iommu_type(vfio_container_fd);
			if (!t) {
				RTE_LOG(ERR, EAL,
					"%s failed to select IOMMU type\n",
					dev_addr);
				close(vfio_group_fd);
				rte_vfio_clear_group(vfio_group_fd);
				return -1;
			}
			/* lock memory hotplug before mapping and release it
			 * after registering callback, to prevent races
			 */
			rte_mcfg_mem_read_lock();
			if (vfio_cfg == default_vfio_cfg)
				ret = t->dma_map_func(vfio_container_fd);
			else
				ret = 0;
			if (ret) {
				RTE_LOG(ERR, EAL,
					"%s DMA remapping failed, error "
					"%i (%s)\n",
					dev_addr, errno, strerror(errno));
				close(vfio_group_fd);
				rte_vfio_clear_group(vfio_group_fd);
				rte_mcfg_mem_read_unlock();
				return -1;
			}

			vfio_cfg->vfio_iommu_type = t;

			/* re-map all user-mapped segments */
			rte_spinlock_recursive_lock(&user_mem_maps->lock);

			/* this IOMMU type may not support DMA mapping, but
			 * if we have mappings in the list - that means we have
			 * previously mapped something successfully, so we can
			 * be sure that DMA mapping is supported.
			 */
			for (i = 0; i < user_mem_maps->n_maps; i++) {
				struct user_mem_map *map;
				map = &user_mem_maps->maps[i];

				ret = t->dma_user_map_func(
						vfio_container_fd,
						map->addr, map->iova, map->len,
						1);
				if (ret) {
					RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
							"va: 0x%" PRIx64 " "
							"iova: 0x%" PRIx64 " "
							"len: 0x%" PRIx64 "\n",
							map->addr, map->iova,
							map->len);
					rte_spinlock_recursive_unlock(
							&user_mem_maps->lock);
					rte_mcfg_mem_read_unlock();
					return -1;
				}
			}
			rte_spinlock_recursive_unlock(&user_mem_maps->lock);

			/* register callback for mem events */
			if (vfio_cfg == default_vfio_cfg)
				ret = rte_mem_event_callback_register(
					VFIO_MEM_EVENT_CLB_NAME,
					vfio_mem_event_callback, NULL);
			else
				ret = 0;
			/* unlock memory hotplug */
			rte_mcfg_mem_read_unlock();

			if (ret && rte_errno != ENOTSUP) {
				RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
				return -1;
			}
			if (ret)
				RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
			else
				RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
		}
	} else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
			vfio_cfg == default_vfio_cfg &&
			vfio_cfg->vfio_iommu_type == NULL) {
		/* if we're not a primary process, we do not set up the VFIO
		 * container because it's already been set up by the primary
		 * process. instead, we simply ask the primary about VFIO type
		 * we are using, and set the VFIO config up appropriately.
		 */
		ret = vfio_sync_default_container();
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
			close(vfio_group_fd);
			rte_vfio_clear_group(vfio_group_fd);
			return -1;
		}
		/* we have successfully initialized VFIO, notify user */
		const struct vfio_iommu_type *t =
				default_vfio_cfg->vfio_iommu_type;
		RTE_LOG(INFO, EAL, "Using IOMMU type %d (%s)\n",
				t->type_id, t->name);
	}

	rte_eal_vfio_get_vf_token(vf_token);

	/* get a file descriptor for the device with VF token firstly */
	if (!rte_uuid_is_null(vf_token)) {
		char vf_token_str[RTE_UUID_STRLEN];
		char dev[PATH_MAX];

		rte_uuid_unparse(vf_token, vf_token_str, sizeof(vf_token_str));
		snprintf(dev, sizeof(dev),
			 "%s vf_token=%s", dev_addr, vf_token_str);

		*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD,
				     dev);
		if (*vfio_dev_fd >= 0)
			goto dev_get_info;
	}

	/* get a file descriptor for the device */
	*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
	if (*vfio_dev_fd < 0) {
		/* if we cannot get a device fd, this implies a problem with
		 * the VFIO group or the container not having IOMMU configured.
		 */
		RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
				dev_addr);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}

	/* test and setup the device */
dev_get_info:
	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
	if (ret) {
		RTE_LOG(ERR, EAL, "%s cannot get device info, "
				"error %i (%s)\n", dev_addr, errno,
				strerror(errno));
		close(*vfio_dev_fd);
		close(vfio_group_fd);
		rte_vfio_clear_group(vfio_group_fd);
		return -1;
	}
	vfio_group_device_get(vfio_group_fd);

	return 0;
}
int
rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
		    int vfio_dev_fd)
{
	struct vfio_config *vfio_cfg;
	int vfio_group_fd;
	int iommu_group_num;
	int ret;

	/* we don't want any DMA mapping messages to come while we're detaching
	 * VFIO device, because this might be the last device and we might need
	 * to unregister the callback.
	 */
	rte_mcfg_mem_read_lock();

	/* get group number */
	ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
	if (ret <= 0) {
		RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n",
			dev_addr);
		/* This is an error at this point. */
		ret = -1;
		goto out;
	}

	/* get the actual group fd */
	vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
	if (vfio_group_fd < 0) {
		RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
				   dev_addr);
		ret = vfio_group_fd;
		goto out;
	}

	/* get the vfio_config it belongs to */
	vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
	vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

	/* At this point we got an active group. Closing it will make the
	 * container detachment. If this is the last active group, VFIO kernel
	 * code will unset the container and the IOMMU mappings.
	 */

	/* Closing a device */
	if (close(vfio_dev_fd) < 0) {
		RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
				   dev_addr);
		ret = -1;
		goto out;
	}

	/* A VFIO group can have several devices attached. Just when there is
	 * no devices remaining should the group be closed.
	 */
	vfio_group_device_put(vfio_group_fd);
	if (!vfio_group_device_count(vfio_group_fd)) {

		if (close(vfio_group_fd) < 0) {
			RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
				dev_addr);
			ret = -1;
			goto out;
		}

		if (rte_vfio_clear_group(vfio_group_fd) < 0) {
			RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
					   dev_addr);
			ret = -1;
			goto out;
		}
	}

	/* if there are no active device groups, unregister the callback to
	 * avoid spurious attempts to map/unmap memory from VFIO.
	 */
	if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
			rte_eal_process_type() != RTE_PROC_SECONDARY)
		rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
				NULL);

	/* success */
	ret = 0;

out:
	rte_mcfg_mem_read_unlock();
	return ret;
}
int
rte_vfio_enable(const char *modname)
{
	/* initialize group list */
	int i, j;
	int vfio_available;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;

	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
		vfio_cfgs[i].vfio_container_fd = -1;
		vfio_cfgs[i].vfio_active_groups = 0;
		vfio_cfgs[i].vfio_iommu_type = NULL;
		vfio_cfgs[i].mem_maps.lock = lock;

		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
			vfio_cfgs[i].vfio_groups[j].fd = -1;
			vfio_cfgs[i].vfio_groups[j].group_num = -1;
			vfio_cfgs[i].vfio_groups[j].devices = 0;
		}
	}

	RTE_LOG(DEBUG, EAL, "Probing VFIO support...\n");

	/* check if vfio module is loaded */
	vfio_available = rte_eal_check_module(modname);

	/* return error directly */
	if (vfio_available == -1) {
		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
		return -1;
	}

	/* return 0 if VFIO modules not loaded */
	if (vfio_available == 0) {
		RTE_LOG(DEBUG, EAL,
			"VFIO modules not loaded, skipping VFIO support...\n");
		return 0;
	}

	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		/* open a new container */
		default_vfio_cfg->vfio_container_fd =
				rte_vfio_get_container_fd();
	} else {
		/* get the default container from the primary process */
		default_vfio_cfg->vfio_container_fd =
				vfio_get_default_container_fd();
	}

	/* check if we have VFIO driver enabled */
	if (default_vfio_cfg->vfio_container_fd != -1) {
		RTE_LOG(INFO, EAL, "VFIO support initialized\n");
		default_vfio_cfg->vfio_enabled = 1;
	} else {
		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
	}

	return 0;
}

int
rte_vfio_is_enabled(const char *modname)
{
	const int mod_available = rte_eal_check_module(modname) > 0;
	return default_vfio_cfg->vfio_enabled && mod_available;
}
int
vfio_get_default_container_fd(void)
{
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	int container_fd;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (default_vfio_cfg->vfio_enabled)
		return default_vfio_cfg->vfio_container_fd;

	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		/* if we were secondary process we would try requesting
		 * container fd from the primary, but we're the primary
		 * process so just exit here
		 */
		return -1;
	}

	p->req = SOCKET_REQ_DEFAULT_CONTAINER;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
	    mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			container_fd = mp_rep->fds[0];
			free(mp_reply.msgs);
			return container_fd;
		}
	}

	free(mp_reply.msgs);
	RTE_LOG(ERR, EAL, "Cannot request default VFIO container fd\n");
	return -1;
}

int
vfio_get_iommu_type(void)
{
	if (default_vfio_cfg->vfio_iommu_type == NULL)
		return -1;

	return default_vfio_cfg->vfio_iommu_type->type_id;
}
const struct vfio_iommu_type *
vfio_set_iommu_type(int vfio_container_fd)
{
	unsigned int idx;
	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
		const struct vfio_iommu_type *t = &iommu_types[idx];

		int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
				t->type_id);
		if (!ret) {
			RTE_LOG(INFO, EAL, "Using IOMMU type %d (%s)\n",
					t->type_id, t->name);
			return t;
		}
		/* not an error, there may be more supported IOMMU types */
		RTE_LOG(DEBUG, EAL, "Set IOMMU type %d (%s) failed, error "
				"%i (%s)\n", t->type_id, t->name, errno,
				strerror(errno));
	}
	/* if we didn't find a suitable IOMMU type, fail */
	return NULL;
}

int
vfio_has_supported_extensions(int vfio_container_fd)
{
	int ret;
	unsigned int idx, n_extensions = 0;
	for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
		const struct vfio_iommu_type *t = &iommu_types[idx];

		ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
				t->type_id);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Could not get IOMMU type, error "
					"%i (%s)\n", errno, strerror(errno));
			close(vfio_container_fd);
			return -1;
		} else if (ret == 1) {
			/* we found a supported extension */
			n_extensions++;
		}
		RTE_LOG(DEBUG, EAL, "IOMMU type %d (%s) is %s\n",
				t->type_id, t->name,
				ret ? "supported" : "not supported");
	}

	/* if we didn't find any supported IOMMU types, fail */
	if (!n_extensions) {
		close(vfio_container_fd);
		return -1;
	}

	return 0;
}
int
rte_vfio_get_container_fd(void)
{
	int ret, vfio_container_fd;
	struct rte_mp_msg mp_req, *mp_rep;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	/* if we're in a primary process, try to open the container */
	if (internal_conf->process_type == RTE_PROC_PRIMARY) {
		vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
		if (vfio_container_fd < 0) {
			RTE_LOG(ERR, EAL,
					"Cannot open VFIO container %s, error "
					"%i (%s)\n", VFIO_CONTAINER_PATH,
					errno, strerror(errno));
			return -1;
		}

		/* check VFIO API version */
		ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
		if (ret != VFIO_API_VERSION) {
			if (ret < 0)
				RTE_LOG(ERR, EAL,
					"Could not get VFIO API version, error "
					"%i (%s)\n", errno, strerror(errno));
			else
				RTE_LOG(ERR, EAL, "Unsupported VFIO API version!\n");
			close(vfio_container_fd);
			return -1;
		}

		ret = vfio_has_supported_extensions(vfio_container_fd);
		if (ret) {
			RTE_LOG(ERR, EAL,
				"No supported IOMMU extensions found!\n");
			return -1;
		}

		return vfio_container_fd;
	}
	/*
	 * if we're in a secondary process, request container fd from the
	 * primary process via mp channel
	 */
	p->req = SOCKET_REQ_CONTAINER;
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	mp_req.num_fds = 0;

	vfio_container_fd = -1;
	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
	    mp_reply.nb_received == 1) {
		mp_rep = &mp_reply.msgs[0];
		p = (struct vfio_mp_param *)mp_rep->param;
		if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
			vfio_container_fd = mp_rep->fds[0];
			free(mp_reply.msgs);
			return vfio_container_fd;
		}
	}

	free(mp_reply.msgs);
	RTE_LOG(ERR, EAL, "Cannot request VFIO container fd\n");
	return -1;
}
int
rte_vfio_get_group_num(const char *sysfs_base,
		const char *dev_addr, int *iommu_group_num)
{
	char linkname[PATH_MAX];
	char filename[PATH_MAX];
	char *tok[16], *group_tok, *end;
	int ret;

	memset(linkname, 0, sizeof(linkname));
	memset(filename, 0, sizeof(filename));

	/* try to find out IOMMU group for this device */
	snprintf(linkname, sizeof(linkname),
			"%s/%s/iommu_group", sysfs_base, dev_addr);

	ret = readlink(linkname, filename, sizeof(filename));

	/* if the link doesn't exist, no VFIO for us */
	if (ret < 0)
		return 0;

	ret = rte_strsplit(filename, sizeof(filename),
			tok, RTE_DIM(tok), '/');

	if (ret <= 0) {
		RTE_LOG(ERR, EAL, "%s cannot get IOMMU group\n", dev_addr);
		return -1;
	}

	/* IOMMU group is always the last token */
	errno = 0;
	group_tok = tok[ret - 1];
	end = group_tok;
	*iommu_group_num = strtol(group_tok, &end, 10);
	if ((end != group_tok && *end != '\0') || errno != 0) {
		RTE_LOG(ERR, EAL, "%s error parsing IOMMU number!\n", dev_addr);
		return -1;
	}

	return 1;
}
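/*
 * Illustrative example (not part of the original file): for a PCI device,
 * the sysfs link typically resolves as follows (assuming IOMMU group 42):
 *
 *	// linkname: /sys/bus/pci/devices/0000:00:01.0/iommu_group
 *	// readlink: ../../../kernel/iommu_groups/42
 *	int group_num;
 *	rte_vfio_get_group_num("/sys/bus/pci/devices", "0000:00:01.0",
 *			&group_num);	// sets group_num to 42, returns 1
 */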
static int
type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		size_t len, void *arg)
{
	int *vfio_container_fd = arg;

	/* skip external memory that isn't a heap */
	if (msl->external && !msl->heap)
		return 0;

	return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
			len, 1);
}

static int
type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	int *vfio_container_fd = arg;

	/* skip external memory that isn't a heap */
	if (msl->external && !msl->heap)
		return 0;

	/* skip any segments with invalid IOVA addresses */
	if (ms->iova == RTE_BAD_IOVA)
		return 0;

	/* if IOVA mode is VA, we've already mapped the internal segments */
	if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
		return 0;

	return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
			ms->len, 1);
}

static int
vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	struct vfio_iommu_type1_dma_map dma_map;
	struct vfio_iommu_type1_dma_unmap dma_unmap;
	int ret;

	if (do_map != 0) {
		memset(&dma_map, 0, sizeof(dma_map));
		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
		dma_map.vaddr = vaddr;
		dma_map.size = len;
		dma_map.iova = iova;
		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
		if (ret) {
			/**
			 * In case the mapping was already done EEXIST will be
			 * returned from kernel.
			 */
			if (errno == EEXIST) {
				RTE_LOG(DEBUG, EAL,
					"Memory segment is already mapped, skipping\n");
			} else {
				RTE_LOG(ERR, EAL,
					"Cannot set up DMA remapping, error "
					"%i (%s)\n", errno, strerror(errno));
				return -1;
			}
		}
	} else {
		memset(&dma_unmap, 0, sizeof(dma_unmap));
		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
		dma_unmap.size = len;
		dma_unmap.iova = iova;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
				&dma_unmap);
		if (ret) {
			RTE_LOG(ERR, EAL, "Cannot clear DMA remapping, error "
					"%i (%s)\n", errno, strerror(errno));
			return -1;
		} else if (dma_unmap.size != len) {
			RTE_LOG(ERR, EAL, "Unexpected size %"PRIu64
				" of DMA remapping cleared instead of %"PRIu64"\n",
				(uint64_t)dma_unmap.size, len);
			rte_errno = EIO;
			return -1;
		}
	}

	return 0;
}

static int
vfio_type1_dma_map(int vfio_container_fd)
{
	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* with IOVA as VA mode, we can get away with mapping contiguous
		 * chunks rather than going page-by-page.
		 */
		int ret = rte_memseg_contig_walk(type1_map_contig,
				&vfio_container_fd);
		if (ret)
			return ret;
	}

	/* we have to continue the walk because we've skipped the
	 * external segments during the config walk.
	 */
	return rte_memseg_walk(type1_map, &vfio_container_fd);
}
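/*
 * Illustrative sketch (not part of the original file): the type1 path boils
 * down to one VFIO_IOMMU_MAP_DMA ioctl per region. Assuming a container fd
 * and a hugepage-backed buffer va/iova/len:
 *
 *	struct vfio_iommu_type1_dma_map dma_map = {
 *		.argsz = sizeof(dma_map),
 *		.vaddr = (uint64_t)(uintptr_t)va,
 *		.iova = iova,
 *		.size = len,
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *	};
 *	ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
 */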
/* Track the size of the statically allocated DMA window for SPAPR */
uint64_t spapr_dma_win_len;
uint64_t spapr_dma_win_page_sz;

static int
vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.vaddr = (uintptr_t) vaddr,
		.size = len,
	};
	int ret;

	if (do_map != 0) {
		struct vfio_iommu_type1_dma_map dma_map;

		if (iova + len > spapr_dma_win_len) {
			RTE_LOG(ERR, EAL, "DMA map attempt outside DMA window\n");
			return -1;
		}

		ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
		if (ret) {
			RTE_LOG(ERR, EAL,
				"Cannot register vaddr for IOMMU, error "
				"%i (%s)\n", errno, strerror(errno));
			return -1;
		}

		memset(&dma_map, 0, sizeof(dma_map));
		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
		dma_map.vaddr = vaddr;
		dma_map.size = len;
		dma_map.iova = iova;
		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
		if (ret) {
			RTE_LOG(ERR, EAL, "Cannot map vaddr for IOMMU, error "
					"%i (%s)\n", errno, strerror(errno));
			return -1;
		}
	} else {
		struct vfio_iommu_type1_dma_unmap dma_unmap;

		memset(&dma_unmap, 0, sizeof(dma_unmap));
		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
		dma_unmap.size = len;
		dma_unmap.iova = iova;

		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
				&dma_unmap);
		if (ret) {
			RTE_LOG(ERR, EAL, "Cannot unmap vaddr for IOMMU, error "
					"%i (%s)\n", errno, strerror(errno));
			return -1;
		}

		ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
		if (ret) {
			RTE_LOG(ERR, EAL,
				"Cannot unregister vaddr for IOMMU, error "
				"%i (%s)\n", errno, strerror(errno));
			return -1;
		}
	}

	return ret;
}
static int
vfio_spapr_map_walk(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, void *arg)
{
	int *vfio_container_fd = arg;

	/* skip external memory that isn't a heap */
	if (msl->external && !msl->heap)
		return 0;

	/* skip any segments with invalid IOVA addresses */
	if (ms->iova == RTE_BAD_IOVA)
		return 0;

	return vfio_spapr_dma_do_map(*vfio_container_fd,
			ms->addr_64, ms->iova, ms->len, 1);
}

struct spapr_size_walk_param {
	uint64_t max_va;
	uint64_t page_sz;
	bool is_user_managed;
};

/*
 * In order to set the DMA window size required for the SPAPR IOMMU
 * we need to walk the existing virtual memory allocations as well as
 * find the hugepage size used.
 */
static int
vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct spapr_size_walk_param *param = arg;
	uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;

	if (msl->external && !msl->heap) {
		/* ignore user managed external memory */
		param->is_user_managed = true;
		return 0;
	}

	if (max > param->max_va) {
		param->page_sz = msl->page_sz;
		param->max_va = max;
	}

	return 0;
}
/*
 * Find the highest memory address used in physical or virtual address
 * space and use that as the top of the DMA window.
 */
static int
find_highest_mem_addr(struct spapr_size_walk_param *param)
{
	/* find the maximum IOVA address for setting the DMA window size */
	if (rte_eal_iova_mode() == RTE_IOVA_PA) {
		static const char proc_iomem[] = "/proc/iomem";
		static const char str_sysram[] = "System RAM";
		uint64_t start, end, max = 0;
		char *line = NULL;
		char *dash, *space;
		size_t line_len;

		/*
		 * Example "System RAM" in /proc/iomem:
		 * 00000000-1fffffffff : System RAM
		 * 200000000000-201fffffffff : System RAM
		 */
		FILE *fd = fopen(proc_iomem, "r");
		if (fd == NULL) {
			RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem);
			return -1;
		}
		/* Scan /proc/iomem for the highest PA in the system */
		while (getline(&line, &line_len, fd) != -1) {
			if (strstr(line, str_sysram) == NULL)
				continue;

			space = strstr(line, " ");
			dash = strstr(line, "-");

			/* Validate the format of the memory string */
			if (space == NULL || dash == NULL || space < dash) {
				RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in file %s\n",
					line, proc_iomem);
				continue;
			}

			start = strtoull(line, NULL, 16);
			end = strtoull(dash + 1, NULL, 16);
			RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%" PRIx64
				" to 0x%" PRIx64 "\n", start, end);
			if (end > max)
				max = end;
		}
		free(line);
		fclose(fd);

		if (max == 0) {
			RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" "
				"entry in file %s\n", proc_iomem);
			return -1;
		}

		spapr_dma_win_len = rte_align64pow2(max + 1);
		return 0;
	} else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%"
			PRIx64 "\n", param->max_va);
		spapr_dma_win_len = rte_align64pow2(param->max_va);
		return 0;
	}

	spapr_dma_win_len = 0;
	RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n");
	return -1;
}
/*
 * The SPAPRv2 IOMMU supports 2 DMA windows with starting
 * address at 0 or 1<<59. By default, a DMA window is set
 * at address 0, 2GB long, with a 4KB page. For DPDK we
 * must remove the default window and setup a new DMA window
 * based on the hugepage size and memory requirements of
 * the application before we can map memory for DMA.
 */
static int
spapr_dma_win_size(void)
{
	struct spapr_size_walk_param param;

	/* only create DMA window once */
	if (spapr_dma_win_len > 0)
		return 0;

	/* walk the memseg list to find the page size/max VA address */
	memset(&param, 0, sizeof(param));
	if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
		RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA window size\n");
		return -1;
	}

	/* we can't be sure if DMA window covers external memory */
	if (param.is_user_managed)
		RTE_LOG(WARNING, EAL, "Detected user managed external memory which may not be managed by the IOMMU\n");

	/* check physical/virtual memory size */
	if (find_highest_mem_addr(&param) < 0)
		return -1;
	RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%" PRIx64 "\n",
		spapr_dma_win_len);
	spapr_dma_win_page_sz = param.page_sz;
	rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len));
	return 0;
}
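/*
 * Illustrative arithmetic (not part of the original file): with IOVA as VA
 * and a highest VA of 0x1780000000 (94 GB), the window is rounded up to the
 * next power of two:
 *
 *	spapr_dma_win_len = rte_align64pow2(0x1780000000ULL);
 *	// == 0x2000000000 (128 GB); the DMA mask is then
 *	// __builtin_ctzll(0x2000000000ULL) == 37 bits
 */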
static int
vfio_spapr_create_dma_window(int vfio_container_fd)
{
	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create), };
	struct vfio_iommu_spapr_tce_remove remove = {
		.argsz = sizeof(remove), };
	struct vfio_iommu_spapr_tce_info info = {
		.argsz = sizeof(info), };
	int ret;

	ret = spapr_dma_win_size();
	if (ret < 0)
		return ret;

	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
	if (ret) {
		RTE_LOG(ERR, EAL, "Cannot get IOMMU info, error %i (%s)\n",
			errno, strerror(errno));
		return -1;
	}

	/*
	 * sPAPR v1/v2 IOMMU always has a default 1G DMA window set. The window
	 * can't be changed for v1 but it can be changed for v2. Since DPDK only
	 * supports v2, remove the default DMA window so it can be resized.
	 */
	remove.start_addr = info.dma32_window_start;
	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
	if (ret)
		return -1;

	/* create a new DMA window (start address is not selectable) */
	create.window_size = spapr_dma_win_len;
	create.page_shift = __builtin_ctzll(spapr_dma_win_page_sz);
	create.levels = 1;
	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
	/*
	 * The vfio_iommu_spapr_tce_info structure was modified in
	 * Linux kernel 4.2.0 to add support for the
	 * vfio_iommu_spapr_tce_ddw_info structure needed to try
	 * multiple table levels. Skip the attempt if running with
	 * an older kernel.
	 */
	if (ret) {
		/* if at first we don't succeed, try more levels */
		uint32_t levels;

		for (levels = create.levels + 1;
			ret && levels <= info.ddw.levels; levels++) {
			create.levels = levels;
			ret = ioctl(vfio_container_fd,
				VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
		}
	}
#endif /* VFIO_IOMMU_SPAPR_INFO_DDW */
	if (ret) {
		RTE_LOG(ERR, EAL, "Cannot create new DMA window, error "
				"%i (%s)\n", errno, strerror(errno));
		RTE_LOG(ERR, EAL,
			"Consider using a larger hugepage size if supported by the system\n");
		return -1;
	}

	/* verify the start address */
	if (create.start_addr != 0) {
		RTE_LOG(ERR, EAL, "Received unsupported start address 0x%"
			PRIx64 "\n", (uint64_t)create.start_addr);
		return -1;
	}
	return ret;
}
static int
vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
		uint64_t iova, uint64_t len, int do_map)
{
	int ret = 0;

	if (do_map) {
		if (vfio_spapr_dma_do_map(vfio_container_fd,
			vaddr, iova, len, 1)) {
			RTE_LOG(ERR, EAL, "Failed to map DMA\n");
			ret = -1;
		}
	} else {
		if (vfio_spapr_dma_do_map(vfio_container_fd,
			vaddr, iova, len, 0)) {
			RTE_LOG(ERR, EAL, "Failed to unmap DMA\n");
			ret = -1;
		}
	}

	return ret;
}

static int
vfio_spapr_dma_map(int vfio_container_fd)
{
	if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) {
		RTE_LOG(ERR, EAL, "Could not create new DMA window!\n");
		return -1;
	}

	/* map all existing DPDK segments for DMA */
	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
		return -1;

	return 0;
}

static int
vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
{
	/* No-IOMMU mode does not need DMA mapping */
	return 0;
}

static int
vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
		uint64_t __rte_unused vaddr,
		uint64_t __rte_unused iova, uint64_t __rte_unused len,
		int __rte_unused do_map)
{
	/* No-IOMMU mode does not need DMA mapping */
	return 0;
}

static int
vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;

	if (!t) {
		RTE_LOG(ERR, EAL, "VFIO support not initialized\n");
		rte_errno = ENODEV;
		return -1;
	}

	if (!t->dma_user_map_func) {
		RTE_LOG(ERR, EAL,
			"VFIO custom DMA region mapping not supported by IOMMU %s\n",
			t->name);
		rte_errno = ENOTSUP;
		return -1;
	}

	return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
			len, do_map);
}
static int
container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct user_mem_map *new_map;
	struct user_mem_maps *user_mem_maps;
	int ret = 0;

	user_mem_maps = &vfio_cfg->mem_maps;
	rte_spinlock_recursive_lock(&user_mem_maps->lock);
	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
		rte_errno = ENOMEM;
		ret = -1;
		goto out;
	}
	/* map the entry */
	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
		/* technically, this will fail if there are currently no devices
		 * plugged in, even if a device were added later, this mapping
		 * might have succeeded. however, since we cannot verify if this
		 * is a valid mapping without having a device attached, consider
		 * this to be unsupported, because we can't just store any old
		 * mapping and pollute list of active mappings willy-nilly.
		 */
		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
		ret = -1;
		goto out;
	}
	/* create new user mem map entry */
	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
	new_map->addr = vaddr;
	new_map->iova = iova;
	new_map->len = len;

	compact_user_maps(user_mem_maps);
out:
	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
	return ret;
}
static int
container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct user_mem_map *map, *new_map = NULL;
	struct user_mem_maps *user_mem_maps;
	int ret = 0;

	user_mem_maps = &vfio_cfg->mem_maps;
	rte_spinlock_recursive_lock(&user_mem_maps->lock);

	/* find our mapping */
	map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
	if (!map) {
		RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
		rte_errno = EINVAL;
		ret = -1;
		goto out;
	}
	if (map->addr != vaddr || map->iova != iova || map->len != len) {
		/* we're partially unmapping a previously mapped region, so we
		 * need to split entry into two.
		 */
		if (!vfio_cfg->vfio_iommu_type->partial_unmap) {
			RTE_LOG(DEBUG, EAL, "DMA partial unmap unsupported\n");
			rte_errno = ENOTSUP;
			ret = -1;
			goto out;
		}
		if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
			RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
			rte_errno = ENOMEM;
			ret = -1;
			goto out;
		}
		new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
	}

	/* unmap the entry */
	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
		/* there may not be any devices plugged in, so unmapping will
		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
		 * stop us from removing the mapping, as the assumption is we
		 * won't be needing this memory any more and thus will want to
		 * prevent it from being remapped again on hotplug. so, only
		 * fail if we indeed failed to unmap (e.g. if the mapping was
		 * within our mapped range but had invalid alignment).
		 */
		if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
			RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
			ret = -1;
			goto out;
		} else {
			RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
		}
	}
	/* remove map from the list of active mappings */
	if (new_map != NULL) {
		adjust_map(map, new_map, vaddr, len);

		/* if we've created a new map by splitting, sort everything */
		if (!is_null_map(new_map)) {
			compact_user_maps(user_mem_maps);
		} else {
			/* we've created a new mapping, but it was unused */
			user_mem_maps->n_maps--;
		}
	} else {
		memset(map, 0, sizeof(*map));
		compact_user_maps(user_mem_maps);
		user_mem_maps->n_maps--;
	}

out:
	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
	return ret;
}
int
rte_vfio_noiommu_is_enabled(void)
{
	int fd;
	ssize_t cnt;
	char c;

	fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
	if (fd < 0) {
		if (errno != ENOENT) {
			RTE_LOG(ERR, EAL, "Cannot open VFIO noiommu file "
					"%i (%s)\n", errno, strerror(errno));
			return -1;
		}
		/*
		 * else the file does not exist
		 * i.e. noiommu is not enabled
		 */
		return 0;
	}

	cnt = read(fd, &c, 1);
	close(fd);
	if (cnt != 1) {
		RTE_LOG(ERR, EAL, "Unable to read from VFIO noiommu file "
				"%i (%s)\n", errno, strerror(errno));
		return -1;
	}

	return c == 'Y';
}

int
rte_vfio_container_create(void)
{
	int i;

	/* Find an empty slot to store new vfio config */
	for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
		if (vfio_cfgs[i].vfio_container_fd == -1)
			break;
	}

	if (i == VFIO_MAX_CONTAINERS) {
		RTE_LOG(ERR, EAL, "Exceeded max VFIO container limit\n");
		return -1;
	}

	vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
	if (vfio_cfgs[i].vfio_container_fd < 0) {
		RTE_LOG(NOTICE, EAL, "Failed to create a new VFIO container\n");
		return -1;
	}

	return vfio_cfgs[i].vfio_container_fd;
}
int
rte_vfio_container_destroy(int container_fd)
{
	struct vfio_config *vfio_cfg;
	int i;

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
		return -1;
	}

	for (i = 0; i < VFIO_MAX_GROUPS; i++)
		if (vfio_cfg->vfio_groups[i].group_num != -1)
			rte_vfio_container_group_unbind(container_fd,
				vfio_cfg->vfio_groups[i].group_num);

	close(container_fd);
	vfio_cfg->vfio_container_fd = -1;
	vfio_cfg->vfio_active_groups = 0;
	vfio_cfg->vfio_iommu_type = NULL;

	return 0;
}

int
rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
{
	struct vfio_config *vfio_cfg;

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
		return -1;
	}

	return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}

int
rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
{
	struct vfio_config *vfio_cfg;
	struct vfio_group *cur_grp = NULL;
	int i;

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
		return -1;
	}

	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
		if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
			cur_grp = &vfio_cfg->vfio_groups[i];
			break;
		}
	}

	/* This should not happen */
	if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
		RTE_LOG(ERR, EAL, "Specified VFIO group number not found\n");
		return -1;
	}

	if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
		RTE_LOG(ERR, EAL,
			"Error when closing vfio_group_fd for iommu_group_num "
			"%d\n", iommu_group_num);
		return -1;
	}
	cur_grp->group_num = -1;
	cur_grp->fd = -1;
	cur_grp->devices = 0;
	vfio_cfg->vfio_active_groups--;

	return 0;
}
int
rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct vfio_config *vfio_cfg;

	if (len == 0) {
		rte_errno = EINVAL;
		return -1;
	}

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
		return -1;
	}

	return container_dma_map(vfio_cfg, vaddr, iova, len);
}
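/*
 * Illustrative usage sketch (not part of the original file): mapping an
 * externally allocated, page-aligned buffer for DMA in a custom container.
 * Group binding and device setup are assumed to have succeeded already.
 *
 *	int cfd = rte_vfio_container_create();
 *	rte_vfio_container_group_bind(cfd, iommu_group_num);
 *	// ... bind device, rte_vfio_setup_device(), etc. ...
 *	if (rte_vfio_container_dma_map(cfd, (uint64_t)(uintptr_t)buf,
 *			buf_iova, buf_len) < 0)
 *		handle_error();	// rte_errno: ENOMEM, ENODEV, ENOTSUP, ...
 */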
int
rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct vfio_config *vfio_cfg;

	if (len == 0) {
		rte_errno = EINVAL;
		return -1;
	}

	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
	if (vfio_cfg == NULL) {
		RTE_LOG(ERR, EAL, "Invalid VFIO container fd\n");
		return -1;
	}

	return container_dma_unmap(vfio_cfg, vaddr, iova, len);
}
#else /* not VFIO_PRESENT */

int
rte_vfio_setup_device(__rte_unused const char *sysfs_base,
		__rte_unused const char *dev_addr,
		__rte_unused int *vfio_dev_fd,
		__rte_unused struct vfio_device_info *device_info)
{
	return -1;
}

int
rte_vfio_release_device(__rte_unused const char *sysfs_base,
		__rte_unused const char *dev_addr, __rte_unused int fd)
{
	return -1;
}

int
rte_vfio_enable(__rte_unused const char *modname)
{
	return -1;
}

int
rte_vfio_is_enabled(__rte_unused const char *modname)
{
	return 0;
}

int
rte_vfio_noiommu_is_enabled(void)
{
	return 0;
}

int
rte_vfio_clear_group(__rte_unused int vfio_group_fd)
{
	return -1;
}

int
rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
		__rte_unused const char *dev_addr,
		__rte_unused int *iommu_group_num)
{
	return -1;
}

int
rte_vfio_get_container_fd(void)
{
	return -1;
}

int
rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
{
	return -1;
}

int
rte_vfio_container_create(void)
{
	return -1;
}

int
rte_vfio_container_destroy(__rte_unused int container_fd)
{
	return -1;
}

int
rte_vfio_container_group_bind(__rte_unused int container_fd,
		__rte_unused int iommu_group_num)
{
	return -1;
}

int
rte_vfio_container_group_unbind(__rte_unused int container_fd,
		__rte_unused int iommu_group_num)
{
	return -1;
}

int
rte_vfio_container_dma_map(__rte_unused int container_fd,
		__rte_unused uint64_t vaddr,
		__rte_unused uint64_t iova,
		__rte_unused uint64_t len)
{
	return -1;
}

int
rte_vfio_container_dma_unmap(__rte_unused int container_fd,
		__rte_unused uint64_t vaddr,
		__rte_unused uint64_t iova,
		__rte_unused uint64_t len)
{
	return -1;
}

#endif /* VFIO_PRESENT */