X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_eal%2Flinuxapp%2Feal%2Feal_pci_vfio.c;h=10a75da6cefc313f5339ca79b232f2814953a380;hb=84cc318424d49372dd2a5fbf3cf84426bf95acce;hp=4d29a0d7d926a69865c03d19a1bc90b0d5e47f81;hpb=c316ed45bde280e54618ff6df2a670e25bd1a308;p=dpdk.git diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index 4d29a0d7d9..10a75da6ce 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -38,16 +38,17 @@ #include #include #include +#include #include #include #include #include -#include #include "eal_filesystem.h" #include "eal_pci_init.h" #include "eal_vfio.h" +#include "eal_private.h" /** * @file @@ -69,78 +70,6 @@ static struct rte_tailq_elem rte_vfio_tailq = { }; EAL_REGISTER_TAILQ(rte_vfio_tailq) -#define VFIO_DIR "/dev/vfio" -#define VFIO_CONTAINER_PATH "/dev/vfio/vfio" -#define VFIO_GROUP_FMT "/dev/vfio/%u" -#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u" -#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL) -#define VFIO_GET_REGION_IDX(x) (x >> 40) - -/* per-process VFIO config */ -static struct vfio_config vfio_cfg; - -/* DMA mapping function prototype. - * Takes VFIO container fd as a parameter. - * Returns 0 on success, -1 on error. - * */ -typedef int (*vfio_dma_func_t)(int); - -struct vfio_iommu_type { - int type_id; - const char *name; - vfio_dma_func_t dma_map_func; -}; - -static int vfio_type1_dma_map(int); -static int vfio_noiommu_dma_map(int); - -/* IOMMU types we support */ -static const struct vfio_iommu_type iommu_types[] = { - /* x86 IOMMU, otherwise known as type 1 */ - { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map}, - /* IOMMU-less mode */ - { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map}, -}; - -int -vfio_type1_dma_map(int vfio_container_fd) -{ - const struct rte_memseg *ms = rte_eal_get_physmem_layout(); - int i, ret; - - /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ - for (i = 0; i < RTE_MAX_MEMSEG; i++) { - struct vfio_iommu_type1_dma_map dma_map; - - if (ms[i].addr == NULL) - break; - - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = ms[i].addr_64; - dma_map.size = ms[i].len; - dma_map.iova = ms[i].phys_addr; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); - - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - } - - return 0; -} - -int -vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) -{ - /* No-IOMMU mode does not need DMA mapping */ - return 0; -} - int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, void *buf, size_t len, off_t offs) @@ -159,8 +88,7 @@ pci_vfio_write_config(const struct rte_intr_handle *intr_handle, /* get PCI BAR number where MSI-X interrupts are */ static int -pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, - uint32_t *msix_table_size) +pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) { int ret; uint32_t reg; @@ -232,9 +160,10 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, return -1; } - *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; - *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; - *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); + msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR; + msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; + msix_table->size = + 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); return 0; } @@ -244,7 +173,7 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, /* set PCI bus mastering */ static int -pci_vfio_set_bus_master(int dev_fd) +pci_vfio_set_bus_master(int dev_fd, bool op) { uint16_t reg; int ret; @@ -257,8 +186,11 @@ pci_vfio_set_bus_master(int dev_fd) return -1; } - /* set the master bit */ - reg |= PCI_COMMAND_MASTER; + if (op) + /* set the master bit */ + reg |= PCI_COMMAND_MASTER; + else + reg &= ~(PCI_COMMAND_MASTER); ret = pwrite64(dev_fd, ®, sizeof(reg), VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + @@ -272,63 +204,6 @@ pci_vfio_set_bus_master(int dev_fd) return 0; } -/* pick IOMMU type. 
returns a pointer to vfio_iommu_type or NULL for error */ -static const struct vfio_iommu_type * -pci_vfio_set_iommu_type(int vfio_container_fd) { - unsigned idx; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - t->type_id); - if (!ret) { - RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", - t->type_id, t->name); - return t; - } - /* not an error, there may be more supported IOMMU types */ - RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " - "error %i (%s)\n", t->type_id, t->name, errno, - strerror(errno)); - } - /* if we didn't find a suitable IOMMU type, fail */ - return NULL; -} - -/* check if we have any supported extensions */ -static int -pci_vfio_has_supported_extensions(int vfio_container_fd) { - int ret; - unsigned idx, n_extensions = 0; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, - t->type_id); - if (ret < 0) { - RTE_LOG(ERR, EAL, " could not get IOMMU type, " - "error %i (%s)\n", errno, - strerror(errno)); - close(vfio_container_fd); - return -1; - } else if (ret == 1) { - /* we found a supported extension */ - n_extensions++; - } - RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", - t->type_id, t->name, - ret ? "supported" : "not supported"); - } - - /* if we didn't find any supported IOMMU types, fail */ - if (!n_extensions) { - close(vfio_container_fd); - return -1; - } - - return 0; -} - /* set up interrupt support (but not enable interrupts) */ static int pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) @@ -339,7 +214,7 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) intr_idx = VFIO_PCI_NUM_IRQS; /* get interrupt type from internal config (MSI-X by default, can be - * overriden from the command line + * overridden from the command line */ switch (internal_config.vfio_intr_mode) { case RTE_INTR_MODE_MSIX: @@ -425,242 +300,150 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) return -1; } -/* open container fd or get an existing one */ -int -pci_vfio_get_container_fd(void) +static int +pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) { - int ret, vfio_container_fd; - - /* if we're in a primary process, try to open the container */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot open VFIO container, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* check VFIO API version */ - ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); - if (ret != VFIO_API_VERSION) { - if (ret < 0) - RTE_LOG(ERR, EAL, " could not get VFIO API version, " - "error %i (%s)\n", errno, strerror(errno)); - else - RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); - close(vfio_container_fd); - return -1; - } - - ret = pci_vfio_has_supported_extensions(vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " no supported IOMMU " - "extensions found!\n"); - return -1; - } - - return vfio_container_fd; - } else { - /* - * if we're in a secondary process, request container fd from the - * primary process via our socket - */ - int socket_fd; + uint32_t ioport_bar; + int ret; - socket_fd = vfio_mp_sync_connect_to_primary(); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - if 
(vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } - vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot get container fd!\n"); - close(socket_fd); - return -1; - } - close(socket_fd); - return vfio_container_fd; + ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + PCI_BASE_ADDRESS_0 + bar_index*4); + if (ret != sizeof(ioport_bar)) { + RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n", + PCI_BASE_ADDRESS_0 + bar_index*4); + return -1; } - return -1; + return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0; } -/* open group fd or get an existing one */ -int -pci_vfio_get_group_fd(int iommu_group_no) +static int +pci_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) { - int i; - int vfio_group_fd; - char filename[PATH_MAX]; - - /* check if we already have the group descriptor open */ - for (i = 0; i < vfio_cfg.vfio_group_idx; i++) - if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no) - return vfio_cfg.vfio_groups[i].fd; - - /* if primary, try to open the group */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* try regular group format */ - snprintf(filename, sizeof(filename), - VFIO_GROUP_FMT, iommu_group_no); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - /* if file not found, it's not an error */ - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - - /* special case: try no-IOMMU path as well */ - snprintf(filename, sizeof(filename), - VFIO_NOIOMMU_GROUP_FMT, iommu_group_no); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - return 0; - } - /* noiommu group found */ - } + if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { + RTE_LOG(ERR, EAL, "Error setting up interrupts!\n"); + return -1; + } - /* if the fd is valid, create a new group for it */ - if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; - return vfio_group_fd; + /* set bus mastering for the device */ + if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { + RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); + return -1; } - /* if we're in a secondary process, request group fd from the primary - * process via our socket - */ - else { - int socket_fd, ret; - socket_fd = vfio_mp_sync_connect_to_primary(); + /* Reset the device */ + ioctl(vfio_dev_fd, VFIO_DEVICE_RESET); - if (socket_fd < 0) { - RTE_LOG(ERR, EAL, " cannot connect to primary process!\n"); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) { - RTE_LOG(ERR, EAL, " cannot request container fd!\n"); - close(socket_fd); - return -1; - } - if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) { - RTE_LOG(ERR, EAL, " cannot send group number!\n"); - close(socket_fd); - return -1; - } - ret = vfio_mp_sync_receive_request(socket_fd); - switch (ret) { - case SOCKET_NO_FD: - close(socket_fd); - return 0; - case SOCKET_OK: - vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd); - /* if we got the fd, return it */ - if 
(vfio_group_fd > 0) { - close(socket_fd); - return vfio_group_fd; - } - /* fall-through on error */ - default: - RTE_LOG(ERR, EAL, " cannot get container fd!\n"); - close(socket_fd); - return -1; - } - } - return -1; + return 0; } -/* parse IOMMU group number for a PCI device - * returns 1 on success, -1 for errors, 0 for non-existent group - */ static int -pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no) +pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, + int bar_index, int additional_flags) { - char linkname[PATH_MAX]; - char filename[PATH_MAX]; - char *tok[16], *group_tok, *end; - int ret; - - memset(linkname, 0, sizeof(linkname)); - memset(filename, 0, sizeof(filename)); - - /* try to find out IOMMU group for this device */ - snprintf(linkname, sizeof(linkname), - SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr); - - ret = readlink(linkname, filename, sizeof(filename)); - - /* if the link doesn't exist, no VFIO for us */ - if (ret < 0) + struct memreg { + unsigned long offset, size; + } memreg[2] = {}; + void *bar_addr; + struct pci_msix_table *msix_table = &vfio_res->msix_table; + struct pci_map *bar = &vfio_res->maps[bar_index]; + + if (bar->size == 0) + /* Skip this BAR */ return 0; - ret = rte_strsplit(filename, sizeof(filename), - tok, RTE_DIM(tok), '/'); + if (msix_table->bar_index == bar_index) { + /* + * VFIO will not let us map the MSI-X table, + * but we can map around it. + */ + uint32_t table_start = msix_table->offset; + uint32_t table_end = table_start + msix_table->size; + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; + table_start &= PAGE_MASK; + + if (table_start == 0 && table_end >= bar->size) { + /* Cannot map this BAR */ + RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index); + bar->size = 0; + bar->addr = 0; + return 0; + } + + memreg[0].offset = bar->offset; + memreg[0].size = table_start; + memreg[1].offset = bar->offset + table_end; + memreg[1].size = bar->size - table_end; - if (ret <= 0) { - RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", pci_addr); - return -1; + RTE_LOG(DEBUG, EAL, + "Trying to map BAR%d that contains the MSI-X " + "table. 
Trying offsets: " + "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index, + memreg[0].offset, memreg[0].size, + memreg[1].offset, memreg[1].size); + } else { + memreg[0].offset = bar->offset; + memreg[0].size = bar->size; } - /* IOMMU group is always the last token */ - errno = 0; - group_tok = tok[ret - 1]; - end = group_tok; - *iommu_group_no = strtol(group_tok, &end, 10); - if ((end != group_tok && *end != '\0') || errno != 0) { - RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", pci_addr); + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE | + MAP_ANONYMOUS | additional_flags, -1, 0); + if (bar_addr != MAP_FAILED) { + void *map_addr = NULL; + if (memreg[0].size) { + /* actual map of first part */ + map_addr = pci_map_resource(bar_addr, vfio_dev_fd, + memreg[0].offset, + memreg[0].size, + MAP_FIXED); + } + + /* if there's a second part, try to map it */ + if (map_addr != MAP_FAILED + && memreg[1].offset && memreg[1].size) { + void *second_addr = RTE_PTR_ADD(bar_addr, + memreg[1].offset - + (uintptr_t)bar->offset); + map_addr = pci_map_resource(second_addr, + vfio_dev_fd, + memreg[1].offset, + memreg[1].size, + MAP_FIXED); + } + + if (map_addr == MAP_FAILED || !map_addr) { + munmap(bar_addr, bar->size); + bar_addr = MAP_FAILED; + RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n", + bar_index); + return -1; + } + } else { + RTE_LOG(ERR, EAL, + "Failed to create inaccessible mapping for BAR%d\n", + bar_index); return -1; } - return 1; -} - -static void -clear_current_group(void) -{ - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0; - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1; + bar->addr = bar_addr; + return 0; } - -/* - * map the PCI resources of a PCI device in virtual memory (VFIO version). 
- * primary and secondary processes follow almost exactly the same path - */ -int -pci_vfio_map_resource(struct rte_pci_device *dev) +static int +pci_vfio_map_resource_primary(struct rte_pci_device *dev) { - struct vfio_group_status group_status = { - .argsz = sizeof(group_status) - }; struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; - int vfio_group_fd, vfio_dev_fd; - int iommu_group_no; char pci_addr[PATH_MAX] = {0}; + int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; - int i, ret, msix_bar; + int i, ret; struct mapped_pci_resource *vfio_res = NULL; - struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + struct mapped_pci_res_list *vfio_res_list = + RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); struct pci_map *maps; - uint32_t msix_table_offset = 0; - uint32_t msix_table_size = 0; - uint32_t ioport_bar; dev->intr_handle.fd = -1; dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; @@ -669,329 +452,244 @@ pci_vfio_map_resource(struct rte_pci_device *dev) snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); - /* get group number */ - ret = pci_vfio_get_group_no(pci_addr, &iommu_group_no); - if (ret == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); - return 1; - } - - /* if negative, something failed */ - if (ret < 0) - return -1; + ret = vfio_setup_device(pci_get_sysfs_path(), pci_addr, + &vfio_dev_fd, &device_info); + if (ret) + return ret; - /* get the actual group fd */ - vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no); - if (vfio_group_fd < 0) - return -1; + /* allocate vfio_res and get region info */ + vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, + "%s(): cannot store uio mmap details\n", __func__); + goto err_vfio_dev_fd; + } + memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); - /* store group fd */ - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no; - vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd; + /* get number of registers (up to BAR5) */ + vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, + VFIO_PCI_BAR5_REGION_INDEX + 1); - /* if group_fd == 0, that means the device isn't managed by VFIO */ - if (vfio_group_fd == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); - /* we store 0 as group fd to distinguish between existing but - * unbound VFIO groups, and groups that don't exist at all. 
- */ - vfio_cfg.vfio_group_idx++; - return 1; - } + /* map BARs */ + maps = vfio_res->maps; - /* - * at this point, we know at least one port on this device is bound to VFIO, - * so we can proceed to try and set this particular port up + vfio_res->msix_table.bar_index = -1; + /* get MSI-X BAR, if any (we have to know where it is because we can't + * easily mmap it when using VFIO) */ - - /* check if the group is viable */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get group status, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_group_fd); - clear_current_group(); - return -1; - } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { - RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", pci_addr); - close(vfio_group_fd); - clear_current_group(); - return -1; + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table); + if (ret < 0) { + RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", + pci_addr); + goto err_vfio_dev_fd; } - /* - * at this point, we know that this group is viable (meaning, all devices - * are either bound to VFIO or not bound to anything) - */ + for (i = 0; i < (int) vfio_res->nb_maps; i++) { + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + void *bar_addr; - /* check if group does not have a container yet */ - if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + reg.index = i; - /* add group to a container */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, - &vfio_cfg.vfio_container_fd); + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); if (ret) { - RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + RTE_LOG(ERR, EAL, " %s cannot get device region info " "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_group_fd); - clear_current_group(); - return -1; + goto err_vfio_res; } - /* - * at this point we know that this group has been successfully - * initialized, so we increment vfio_group_idx to indicate that we can - * add new groups. 
- */ - vfio_cfg.vfio_group_idx++; - } - /* - * pick an IOMMU type and set up DMA mappings for container - * - * needs to be done only once, only when at least one group is assigned to - * a container and only in primary process - */ - if (internal_config.process_type == RTE_PROC_PRIMARY && - vfio_cfg.vfio_container_has_dma == 0) { - /* select an IOMMU type which we will be using */ - const struct vfio_iommu_type *t = - pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd); - if (!t) { - RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", pci_addr); - return -1; + /* chk for io port region */ + ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); + if (ret < 0) + goto err_vfio_res; + else if (ret) { + RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", + i); + continue; } - ret = t->dma_map_func(vfio_cfg.vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " %s DMA remapping failed, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - return -1; + + /* skip non-mmapable BARs */ + if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) + continue; + + /* try mapping somewhere close to the end of hugepages */ + if (pci_map_addr == NULL) + pci_map_addr = pci_find_max_end_va(); + + bar_addr = pci_map_addr; + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); + + maps[i].addr = bar_addr; + maps[i].offset = reg.offset; + maps[i].size = reg.size; + maps[i].path = NULL; /* vfio doesn't have per-resource paths */ + + ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); + if (ret < 0) { + RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + goto err_vfio_res; } - vfio_cfg.vfio_container_has_dma = 1; - } - /* get a file descriptor for the device */ - vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr); - if (vfio_dev_fd < 0) { - /* if we cannot get a device fd, this simply means that this - * particular port is not bound to VFIO - */ - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - pci_addr); - return 1; + dev->mem_resource[i].addr = maps[i].addr; } - /* test and setup the device */ - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get device info, " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_dev_fd); - return -1; + if (pci_vfio_setup_device(dev, vfio_dev_fd) < 0) { + RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr); + goto err_vfio_res; } - /* get MSI-X BAR, if any (we have to know where it is because we can't - * easily mmap it when using VFIO) */ - msix_bar = -1; - ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar, - &msix_table_offset, &msix_table_size); - if (ret < 0) { - RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); - close(vfio_dev_fd); - return -1; - } + TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); - /* if we're in a primary process, allocate vfio_res and get region info */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); - if (vfio_res == NULL) { - RTE_LOG(ERR, EAL, - "%s(): cannot store uio mmap details\n", __func__); - close(vfio_dev_fd); - return -1; - } - memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); + return 0; +err_vfio_res: + rte_free(vfio_res); +err_vfio_dev_fd: + close(vfio_dev_fd); + return -1; +} - /* get number of registers (up to BAR5) */ - vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, - VFIO_PCI_BAR5_REGION_INDEX + 1); - } else { - /* if we're in a secondary process, just find 
our tailq entry */ - TAILQ_FOREACH(vfio_res, vfio_res_list, next) { - if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr))) - continue; - break; - } - /* if we haven't found our tailq entry, something's wrong */ - if (vfio_res == NULL) { - RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", - pci_addr); - close(vfio_dev_fd); - return -1; - } +static int +pci_vfio_map_resource_secondary(struct rte_pci_device *dev) +{ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + char pci_addr[PATH_MAX] = {0}; + int vfio_dev_fd; + struct rte_pci_addr *loc = &dev->addr; + int i, ret; + struct mapped_pci_resource *vfio_res = NULL; + struct mapped_pci_res_list *vfio_res_list = + RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + + struct pci_map *maps; + + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + + /* store PCI address string */ + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + loc->domain, loc->bus, loc->devid, loc->function); + + ret = vfio_setup_device(pci_get_sysfs_path(), pci_addr, + &vfio_dev_fd, &device_info); + if (ret) + return ret; + + /* if we're in a secondary process, just find our tailq entry */ + TAILQ_FOREACH(vfio_res, vfio_res_list, next) { + if (rte_eal_compare_pci_addr(&vfio_res->pci_addr, + &dev->addr)) + continue; + break; + } + /* if we haven't found our tailq entry, something's wrong */ + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", + pci_addr); + goto err_vfio_dev_fd; } /* map BARs */ maps = vfio_res->maps; for (i = 0; i < (int) vfio_res->nb_maps; i++) { - struct vfio_region_info reg = { .argsz = sizeof(reg) }; - void *bar_addr; - struct memreg { - unsigned long offset, size; - } memreg[2] = {}; - - reg.index = i; - - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); - - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get device region info " - "error %i (%s)\n", pci_addr, errno, strerror(errno)); - close(vfio_dev_fd); - if (internal_config.process_type == RTE_PROC_PRIMARY) - rte_free(vfio_res); - return -1; + ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); + if (ret < 0) { + RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + goto err_vfio_dev_fd; } - /* chk for io port region */ - ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) - + PCI_BASE_ADDRESS_0 + i*4); - - if (ret != sizeof(ioport_bar)) { - RTE_LOG(ERR, EAL, - "Cannot read command (%x) from config space!\n", - PCI_BASE_ADDRESS_0 + i*4); - return -1; - } + dev->mem_resource[i].addr = maps[i].addr; + } - if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) { - RTE_LOG(INFO, EAL, - "Ignore mapping IO port bar(%d) addr: %x\n", - i, ioport_bar); - continue; - } + return 0; +err_vfio_dev_fd: + close(vfio_dev_fd); + return -1; +} - /* skip non-mmapable BARs */ - if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) - continue; +/* + * map the PCI resources of a PCI device in virtual memory (VFIO version). + * primary and secondary processes follow almost exactly the same path + */ +int +pci_vfio_map_resource(struct rte_pci_device *dev) +{ + if (internal_config.process_type == RTE_PROC_PRIMARY) + return pci_vfio_map_resource_primary(dev); + else + return pci_vfio_map_resource_secondary(dev); +} - if (i == msix_bar) { - /* - * VFIO will not let us map the MSI-X table, - * but we can map around it. 
- */ - uint32_t table_start = msix_table_offset; - uint32_t table_end = table_start + msix_table_size; - table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; - table_start &= PAGE_MASK; - - if (table_start == 0 && table_end >= reg.size) { - /* Cannot map this BAR */ - RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i); - continue; - } else { - memreg[0].offset = reg.offset; - memreg[0].size = table_start; - memreg[1].offset = table_end; - memreg[1].size = reg.size - table_end; - - RTE_LOG(DEBUG, EAL, - "Trying to map BAR %d that contains the MSI-X " - "table. Trying offsets: " - "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i, - memreg[0].offset, memreg[0].size, - memreg[1].offset, memreg[1].size); - } - } else { - memreg[0].offset = reg.offset; - memreg[0].size = reg.size; - } +int +pci_vfio_unmap_resource(struct rte_pci_device *dev) +{ + char pci_addr[PATH_MAX] = {0}; + struct rte_pci_addr *loc = &dev->addr; + int i, ret; + struct mapped_pci_resource *vfio_res = NULL; + struct mapped_pci_res_list *vfio_res_list; - /* try to figure out an address */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* try mapping somewhere close to the end of hugepages */ - if (pci_map_addr == NULL) - pci_map_addr = pci_find_max_end_va(); + struct pci_map *maps; - bar_addr = pci_map_addr; - pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); - } else { - bar_addr = maps[i].addr; - } + /* store PCI address string */ + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + loc->domain, loc->bus, loc->devid, loc->function); - /* reserve the address using an inaccessible mapping */ - bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE | - MAP_ANONYMOUS, -1, 0); - if (bar_addr != MAP_FAILED) { - void *map_addr = NULL; - if (memreg[0].size) { - /* actual map of first part */ - map_addr = pci_map_resource(bar_addr, vfio_dev_fd, - memreg[0].offset, - memreg[0].size, - MAP_FIXED); - } - /* if there's a second part, try to map it */ - if (map_addr != MAP_FAILED - && memreg[1].offset && memreg[1].size) { - void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset); - map_addr = pci_map_resource(second_addr, - vfio_dev_fd, memreg[1].offset, - memreg[1].size, - MAP_FIXED); - } + if (close(dev->intr_handle.fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n", + pci_addr); + return -1; + } - if (map_addr == MAP_FAILED || !map_addr) { - munmap(bar_addr, reg.size); - bar_addr = MAP_FAILED; - } - } + if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) { + RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n", + pci_addr); + return -1; + } - if (bar_addr == MAP_FAILED || - (internal_config.process_type == RTE_PROC_SECONDARY && - bar_addr != maps[i].addr)) { - RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", pci_addr, i, - strerror(errno)); - close(vfio_dev_fd); - if (internal_config.process_type == RTE_PROC_PRIMARY) - rte_free(vfio_res); - return -1; - } + ret = vfio_release_device(pci_get_sysfs_path(), pci_addr, + dev->intr_handle.vfio_dev_fd); + if (ret < 0) { + RTE_LOG(ERR, EAL, + "%s(): cannot release device\n", __func__); + return ret; + } - maps[i].addr = bar_addr; - maps[i].offset = reg.offset; - maps[i].size = reg.size; - maps[i].path = NULL; /* vfio doesn't have per-resource paths */ - dev->mem_resource[i].addr = bar_addr; + vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); + /* Get vfio_res */ + TAILQ_FOREACH(vfio_res, vfio_res_list, next) { + if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr))) + continue; + break; + } + 
/* if we haven't found our tailq entry, something's wrong */ + if (vfio_res == NULL) { + RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", + pci_addr); + return -1; } - /* if secondary process, do not set up interrupts */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { - RTE_LOG(ERR, EAL, " %s error setting up interrupts!\n", pci_addr); - close(vfio_dev_fd); - rte_free(vfio_res); - return -1; - } + /* unmap BARs */ + maps = vfio_res->maps; - /* set bus mastering for the device */ - if (pci_vfio_set_bus_master(vfio_dev_fd)) { - RTE_LOG(ERR, EAL, " %s cannot set up bus mastering!\n", pci_addr); - close(vfio_dev_fd); - rte_free(vfio_res); - return -1; - } + RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n", + pci_addr); + for (i = 0; i < (int) vfio_res->nb_maps; i++) { - /* Reset the device */ - ioctl(vfio_dev_fd, VFIO_DEVICE_RESET); + /* + * We do not need to be aware of MSI-X table BAR mappings as + * when mapping. Just using current maps array is enough + */ + if (maps[i].addr) { + RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n", + pci_addr, maps[i].addr); + pci_unmap_resource(maps[i].addr, maps[i].size); + } } - if (internal_config.process_type == RTE_PROC_PRIMARY) - TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next); + TAILQ_REMOVE(vfio_res_list, vfio_res, next); return 0; } @@ -1047,50 +745,12 @@ pci_vfio_ioport_unmap(struct rte_pci_ioport *p) int pci_vfio_enable(void) { - /* initialize group list */ - int i; - int vfio_available; - - for (i = 0; i < VFIO_MAX_GROUPS; i++) { - vfio_cfg.vfio_groups[i].fd = -1; - vfio_cfg.vfio_groups[i].group_no = -1; - } - - /* inform the user that we are probing for VFIO */ - RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); - - /* check if vfio-pci module is loaded */ - vfio_available = rte_eal_check_module("vfio_pci"); - - /* return error directly */ - if (vfio_available == -1) { - RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); - return -1; - } - - /* return 0 if VFIO modules not loaded */ - if (vfio_available == 0) { - RTE_LOG(INFO, EAL, "VFIO modules not loaded, " - "skipping VFIO support...\n"); - return 0; - } - - vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd(); - - /* check if we have VFIO driver enabled */ - if (vfio_cfg.vfio_container_fd != -1) { - RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); - vfio_cfg.vfio_enabled = 1; - } else { - RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); - } - - return 0; + return vfio_enable("vfio_pci"); } int pci_vfio_is_enabled(void) { - return vfio_cfg.vfio_enabled; + return vfio_is_enabled("vfio_pci"); } #endif
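
The short sketch below is editorial illustration only and is not part of the patch above. It isolates the arithmetic the new pci_vfio_mmap_bar() uses to work around the MSI-X table: the table boundaries are rounded out to whole pages, and the BAR is then mapped as two chunks on either side of that page-aligned hole. The page-size constant and the example BAR/table offsets are assumptions chosen for the demonstration, not values taken from the driver.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* assumed 4 KiB pages; PAGE_MASK follows the same convention as the patch */
#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

int main(void)
{
	/* hypothetical BAR of 64 KiB whose 1 KiB MSI-X table sits at offset 0x2000 */
	uint32_t bar_size = 0x10000;
	uint32_t table_start = 0x2000;
	uint32_t table_end = table_start + 0x400;

	/* round the table region out to whole pages, as pci_vfio_mmap_bar() does */
	table_end = (table_end + ~EX_PAGE_MASK) & EX_PAGE_MASK;
	table_start &= EX_PAGE_MASK;

	if (table_start == 0 && table_end >= bar_size) {
		/* nothing mappable is left outside the table: the BAR is skipped */
		printf("BAR is skipped entirely\n");
		return 0;
	}

	/* first chunk: start of BAR up to the table; second chunk: after it */
	printf("chunk0: 0x0-0x%" PRIx32 ", chunk1: 0x%" PRIx32 "-0x%" PRIx32 "\n",
	       table_start, table_end, bar_size);
	return 0;
}

With the example values this prints "chunk0: 0x0-0x2000, chunk1: 0x3000-0x10000", i.e. the two regions that would be handed to pci_map_resource() with MAP_FIXED while the page containing the MSI-X table is left unmapped.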