X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_eal%2Flinuxapp%2Feal%2Feal_pci_vfio.c;h=ffa2dd05bf02ee4b912f006e34069815f6bb36fb;hb=756ce64b1ecdf107acfa45fc3f31359ca338649e;hp=c776ddc442be9eb00ca61cb33db36a5077737eae;hpb=6f41fe75e2dd8dd38f7bea7b9501edd4f9b72fa5;p=dpdk.git diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index c776ddc442..ffa2dd05bf 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -37,12 +37,13 @@ #include #include #include +#include #include #include -#include #include #include +#include #include "eal_filesystem.h" #include "eal_pci_init.h" @@ -60,20 +61,109 @@ #ifdef VFIO_PRESENT +#define PAGE_SIZE (sysconf(_SC_PAGESIZE)) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +static struct rte_tailq_elem rte_vfio_tailq = { + .name = "VFIO_RESOURCE_LIST", +}; +EAL_REGISTER_TAILQ(rte_vfio_tailq) + #define VFIO_DIR "/dev/vfio" #define VFIO_CONTAINER_PATH "/dev/vfio/vfio" #define VFIO_GROUP_FMT "/dev/vfio/%u" +#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" #define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) /* per-process VFIO config */ static struct vfio_config vfio_cfg; +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_func_t dma_map_func; +}; + +static int vfio_type1_dma_map(int); +static int vfio_noiommu_dma_map(int); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, + /* IOMMU-less mode */ + { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, +}; + +int +vfio_type1_dma_map(int vfio_container_fd) +{ + const struct rte_memseg *ms = rte_eal_get_physmem_layout(); + int i, ret; + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + for (i = 0; i < RTE_MAX_MEMSEG; i++) { + struct vfio_iommu_type1_dma_map dma_map; + + if (ms[i].addr == NULL) + break; + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = ms[i].addr_64; + dma_map.size = ms[i].len; + dma_map.iova = ms[i].phys_addr; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +int +pci_vfio_read_config(const struct rte_intr_handle *intr_handle, + void *buf, size_t len, off_t offs) +{ + return pread64(intr_handle->vfio_dev_fd, buf, len, + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); +} + +int +pci_vfio_write_config(const struct rte_intr_handle *intr_handle, + const void *buf, size_t len, off_t offs) +{ + return pwrite64(intr_handle->vfio_dev_fd, buf, len, + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); +} + /* get PCI BAR number where MSI-X interrupts are */ static int -pci_vfio_get_msix_bar(int fd, int *msix_bar) +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, + uint32_t *msix_table_size) { int ret; uint32_t reg; + uint16_t flags; uint8_t cap_id, cap_offset; /* read PCI capability pointer from config space */ @@ -132,7 +222,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar) return -1; } + ret = pread64(fd, &flags, sizeof(flags), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 2); + if (ret != sizeof(flags)) { + RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " + "space!\n"); + return -1; + } + *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; + *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; + *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); return 0; } @@ -170,42 +271,58 @@ pci_vfio_set_bus_master(int dev_fd) return 0; } -/* set up DMA mappings */ -static int -pci_vfio_setup_dma_maps(int vfio_container_fd) -{ - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; - - ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - VFIO_TYPE1_IOMMU); - if (ret) { - RTE_LOG(ERR, EAL, " cannot set IOMMU type, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +static const struct vfio_iommu_type * +pci_vfio_set_iommu_type(int vfio_container_fd) { + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} - /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; - - if (ms[i].addr == NULL) - break; - - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - dma_map.iova = ms[i].phys_addr; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); +/* check if we have any supported extensions */ +static int +pci_vfio_has_supported_extensions(int vfio_container_fd) { + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, strerror(errno)); + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? "supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; } return 0; @@ -272,7 +389,7 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) } /* set up an eventfd for interrupts */ - fd = eventfd(0, 0); + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); if (fd < 0) { RTE_LOG(ERR, EAL, " cannot set up eventfd, " "error %i (%s)\n", errno, strerror(errno)); @@ -334,15 +451,10 @@ pci_vfio_get_container_fd(void) return -1; } - /* check if we support IOMMU type 1 */ - ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); - if (ret != 1) { - if (ret < 0) - RTE_LOG(ERR, EAL, " could not get IOMMU type, " - "error %i (%s)\n", errno, strerror(errno)); - else - RTE_LOG(ERR, EAL, " unsupported IOMMU type!\n"); - close(vfio_container_fd); + ret = pci_vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); return -1; } @@ -392,6 +504,7 @@ pci_vfio_get_group_fd(int iommu_group_no) /* if primary, try to open the group */ if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ snprintf(filename, sizeof(filename), VFIO_GROUP_FMT, iommu_group_no); vfio_group_fd = open(filename, O_RDWR); @@ -402,7 +515,20 @@ pci_vfio_get_group_fd(int iommu_group_no) strerror(errno)); return -1; } - return 0; + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ } /* if the fd is valid, create a new group for it */ @@ -459,14 +585,15 @@ pci_vfio_get_group_fd(int iommu_group_no) } /* parse IOMMU group number for a PCI device - * returns -1 for errors, 0 for non-existent group */ + * returns 1 on success, -1 for errors, 0 for non-existent group + */ static int -pci_vfio_get_group_no(const char *pci_addr) +pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no) { char linkname[PATH_MAX]; char filename[PATH_MAX]; char *tok[16], *group_tok, *end; - int ret, iommu_group_no; + int ret; memset(linkname, 0, sizeof(linkname)); memset(filename, 0, sizeof(filename)); @@ -493,13 +620,13 @@ pci_vfio_get_group_no(const char *pci_addr) errno = 0; group_tok = tok[ret - 1]; end = group_tok; - iommu_group_no = strtol(group_tok, &end, 10); + *iommu_group_no = strtol(group_tok, &end, 10); if ((end != group_tok && *end != '\0') || errno != 0) { RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", pci_addr); return -1; } - return iommu_group_no; + return 1; } static void @@ -527,7 +654,11 @@ pci_vfio_map_resource(struct rte_pci_device *dev) struct rte_pci_addr *loc = &dev->addr; int i, ret, msix_bar; struct mapped_pci_resource *vfio_res = NULL; + struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + struct pci_map *maps; + uint32_t msix_table_offset = 0; + uint32_t msix_table_size = 0; dev->intr_handle.fd = -1; dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; @@ -537,16 +668,15 @@ pci_vfio_map_resource(struct rte_pci_device *dev) loc->domain, loc->bus, loc->devid, loc->function); /* get group number */ - iommu_group_no = pci_vfio_get_group_no(pci_addr); - - /* if 0, group doesn't exist */ - if (iommu_group_no == 0) { + ret = pci_vfio_get_group_no(pci_addr, &iommu_group_no); + if (ret == 0) { RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); + pci_addr); return 1; } + /* if negative, something failed */ - else if (iommu_group_no < 0) + if (ret < 0) return -1; /* get the actual group fd */ @@ -616,14 +746,21 @@ pci_vfio_map_resource(struct rte_pci_device *dev) } /* - * set up DMA mappings for container + * pick an IOMMU type and set up DMA mappings for container * * needs to be done only once, only when at least one group is assigned to * a container and only in primary process */ if (internal_config.process_type == RTE_PROC_PRIMARY && vfio_cfg.vfio_container_has_dma == 0) { - ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd); + /* select an IOMMU type which we will be using */ + const struct vfio_iommu_type *t = + pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", pci_addr); + return -1; + } + ret = t->dma_map_func(vfio_cfg.vfio_container_fd); if (ret) { RTE_LOG(ERR, EAL, " %s DMA remapping failed, " "error %i (%s)\n", pci_addr, errno, strerror(errno)); @@ -653,9 +790,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev) } /* get MSI-X BAR, if any (we have to know where it is because we can't - * mmap it when using VFIO) */ + * easily mmap it when using VFIO) */ msix_bar = -1; - ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar); + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar, + &msix_table_offset, &msix_table_size); if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); close(vfio_dev_fd); @@ -678,7 +816,7 @@ pci_vfio_map_resource(struct rte_pci_device *dev) VFIO_PCI_BAR5_REGION_INDEX + 1); } else { /* if we're in a secondary process, just find our tailq entry */ - TAILQ_FOREACH(vfio_res, pci_res_list, next) { + TAILQ_FOREACH(vfio_res, vfio_res_list, next) { if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr))) continue; break; @@ -698,6 +836,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev) for (i = 0; i < (int) vfio_res->nb_maps; i++) { struct vfio_region_info reg = { .argsz = sizeof(reg) }; void *bar_addr; + struct memreg { + unsigned long offset, size; + } memreg[2] = {}; reg.index = i; @@ -716,14 +857,82 @@ pci_vfio_map_resource(struct rte_pci_device *dev) if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) continue; - /* skip MSI-X BAR */ - if (i == msix_bar) - continue; + if (i == msix_bar) { + /* + * VFIO will not let us map the MSI-X table, + * but we can map around it. + */ + uint32_t table_start = msix_table_offset; + uint32_t table_end = table_start + msix_table_size; + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; + table_start &= PAGE_MASK; + + if (table_start == 0 && table_end >= reg.size) { + /* Cannot map this BAR */ + RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i); + continue; + } else { + memreg[0].offset = reg.offset; + memreg[0].size = table_start; + memreg[1].offset = table_end; + memreg[1].size = reg.size - table_end; + + RTE_LOG(DEBUG, EAL, + "Trying to map BAR %d that contains the MSI-X " + "table. Trying offsets: " + "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i, + memreg[0].offset, memreg[0].size, + memreg[1].offset, memreg[1].size); + } + } else { + memreg[0].offset = reg.offset; + memreg[0].size = reg.size; + } - bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset, - reg.size); + /* try to figure out an address */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try mapping somewhere close to the end of hugepages */ + if (pci_map_addr == NULL) + pci_map_addr = pci_find_max_end_va(); - if (bar_addr == NULL) { + bar_addr = pci_map_addr; + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); + } else { + bar_addr = maps[i].addr; + } + + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (bar_addr != MAP_FAILED) { + void *map_addr = NULL; + if (memreg[0].size) { + /* actual map of first part */ + map_addr = pci_map_resource(bar_addr, vfio_dev_fd, + memreg[0].offset, + memreg[0].size, + MAP_FIXED); + } + + /* if there's a second part, try to map it */ + if (map_addr != MAP_FAILED + && memreg[1].offset && memreg[1].size) { + void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset); + map_addr = pci_map_resource(second_addr, + vfio_dev_fd, memreg[1].offset, + memreg[1].size, + MAP_FIXED); + } + + if (map_addr == MAP_FAILED || !map_addr) { + munmap(bar_addr, reg.size); + bar_addr = MAP_FAILED; + } + } + + if (bar_addr == MAP_FAILED || + (internal_config.process_type == RTE_PROC_SECONDARY && + bar_addr != maps[i].addr)) { RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", pci_addr, i, strerror(errno)); close(vfio_dev_fd); @@ -735,6 +944,7 @@ pci_vfio_map_resource(struct rte_pci_device *dev) maps[i].addr = bar_addr; maps[i].offset = reg.offset; maps[i].size = reg.size; + maps[i].path = NULL; /* vfio doesn't have per-resource paths */ dev->mem_resource[i].addr = bar_addr; } @@ -760,28 +970,88 @@ pci_vfio_map_resource(struct rte_pci_device *dev) } if (internal_config.process_type == RTE_PROC_PRIMARY) - TAILQ_INSERT_TAIL(pci_res_list, vfio_res, next); + TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); return 0; } +int +pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, + struct rte_pci_ioport *p) +{ + RTE_SET_USED(dev); + RTE_SET_USED(bar); + RTE_SET_USED(p); + return -1; +} + +void +pci_vfio_ioport_read(struct rte_pci_ioport *p, + void *data, size_t len, off_t offset) +{ + RTE_SET_USED(p); + RTE_SET_USED(data); + RTE_SET_USED(len); + RTE_SET_USED(offset); +} + +void +pci_vfio_ioport_write(struct rte_pci_ioport *p, + const void *data, size_t len, off_t offset) +{ + RTE_SET_USED(p); + RTE_SET_USED(data); + RTE_SET_USED(len); + RTE_SET_USED(offset); +} + +int +pci_vfio_ioport_unmap(struct rte_pci_ioport *p) +{ + RTE_SET_USED(p); + return -1; +} + int pci_vfio_enable(void) { /* initialize group list */ int i; + int vfio_available; for (i = 0; i < VFIO_MAX_GROUPS; i++) { vfio_cfg.vfio_groups[i].fd = -1; vfio_cfg.vfio_groups[i].group_no = -1; } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio-pci module is loaded */ + vfio_available = rte_eal_check_module("vfio_pci"); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(INFO, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd(); /* check if we have VFIO driver enabled */ - if (vfio_cfg.vfio_container_fd != -1) + if (vfio_cfg.vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); vfio_cfg.vfio_enabled = 1; - else - RTE_LOG(INFO, EAL, "VFIO support could not be initialized\n"); + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } return 0; }