From 90a1633b2347e0c806faf0eed4edcb9e302e4247 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Thu, 29 Jan 2015 00:04:53 +0200 Subject: [PATCH] eal/linux: allow to map BARs with MSI-X tables While VFIO doesn't allow us to map complete BARs with MSI-X tables, it does allow us to map around them in PAGE_SIZE granularity. There might be adapters that provide their registers in the same BAR but on a different page. For example, Intel's NVME adapter, though not a network adapter, provides only one MMIO BAR that contains the MSI-X table. Signed-off-by: Dan Aloni Acked-by: Anatoly Burakov --- lib/librte_eal/linuxapp/eal/eal_pci.c | 5 +- lib/librte_eal/linuxapp/eal/eal_pci_init.h | 2 +- lib/librte_eal/linuxapp/eal/eal_pci_uio.c | 4 +- lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 98 +++++++++++++++++++--- lib/librte_eal/linuxapp/eal/eal_vfio.h | 8 +- 5 files changed, 100 insertions(+), 17 deletions(-) diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c index 63bcbceb1d..a4fd5f5a10 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c @@ -118,13 +118,14 @@ pci_find_max_end_va(void) /* map a particular resource from a file */ void * -pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size) +pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size, + int additional_flags) { void *mapaddr; /* Map the PCI memory resource of device */ mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, offset); + MAP_SHARED | additional_flags, fd, offset); if (mapaddr == MAP_FAILED) { RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n", __func__, fd, requested_addr, diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h index 1070eb88fe..0a0853d4c4 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h +++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h @@ -66,7 +66,7 @@ extern void *pci_map_addr; void *pci_find_max_end_va(void); void *pci_map_resource(void *requested_addr, int fd, off_t offset, - size_t size); + size_t size, int additional_flags); /* map IGB_UIO resource prototype */ int pci_uio_map_resource(struct rte_pci_device *dev); diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c index 2b16fcb544..ff903d287e 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c @@ -146,7 +146,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev) if (pci_map_resource(uio_res->maps[i].addr, fd, (off_t)uio_res->maps[i].offset, - (size_t)uio_res->maps[i].size) + (size_t)uio_res->maps[i].size, 0) != uio_res->maps[i].addr) { RTE_LOG(ERR, EAL, "Cannot mmap device resource\n"); @@ -409,7 +409,7 @@ pci_uio_map_resource(struct rte_pci_device *dev) pci_map_addr = pci_find_max_end_va(); mapaddr = pci_map_resource(pci_map_addr, fd, (off_t)offset, - (size_t)maps[j].size); + (size_t)maps[j].size, 0); if (mapaddr == MAP_FAILED) fail = 1; diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c index 20e097727f..041917231f 100644 --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c @@ -62,6 +62,9 @@ #ifdef VFIO_PRESENT +#define PAGE_SIZE (sysconf(_SC_PAGESIZE)) +#define PAGE_MASK (~(PAGE_SIZE - 1)) + #define VFIO_DIR "/dev/vfio" #define VFIO_CONTAINER_PATH "/dev/vfio/vfio" #define VFIO_GROUP_FMT "/dev/vfio/%u" @@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg; /* get PCI BAR number where MSI-X interrupts are */ static int -pci_vfio_get_msix_bar(int fd, int *msix_bar) +pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset, + uint32_t *msix_table_size) { int ret; uint32_t reg; + uint16_t flags; uint8_t cap_id, cap_offset; /* read PCI capability pointer from config space */ @@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar) return -1; } + ret = pread64(fd, &flags, sizeof(flags), + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + + cap_offset + 2); + if (ret != sizeof(flags)) { + RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " + "space!\n"); + return -1; + } + *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR; + *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; + *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); return 0; } @@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev) int i, ret, msix_bar; struct mapped_pci_resource *vfio_res = NULL; struct pci_map *maps; + uint32_t msix_table_offset = 0; + uint32_t msix_table_size = 0; dev->intr_handle.fd = -1; dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; @@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev) } /* get MSI-X BAR, if any (we have to know where it is because we can't - * mmap it when using VFIO) */ + * easily mmap it when using VFIO) */ msix_bar = -1; - ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar); + ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar, + &msix_table_offset, &msix_table_size); if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); close(vfio_dev_fd); @@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev) for (i = 0; i < (int) vfio_res->nb_maps; i++) { struct vfio_region_info reg = { .argsz = sizeof(reg) }; void *bar_addr; + struct memreg { + uint32_t offset, size; + } memreg[2] = {}; reg.index = i; @@ -720,21 +742,77 @@ pci_vfio_map_resource(struct rte_pci_device *dev) if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) continue; - /* skip MSI-X BAR */ - if (i == msix_bar) - continue; + if (i == msix_bar) { + /* + * VFIO will not let us map the MSI-X table, + * but we can map around it. + */ + uint32_t table_start = msix_table_offset; + uint32_t table_end = table_start + msix_table_size; + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK; + table_start &= PAGE_MASK; + + if (table_start == 0 && table_end >= reg.size) { + /* Cannot map this BAR */ + RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i); + continue; + } else { + memreg[0].offset = reg.offset; + memreg[0].size = table_start; + memreg[1].offset = table_end; + memreg[1].size = reg.size - table_end; + + RTE_LOG(DEBUG, EAL, + "Trying to map BAR %d that contains the MSI-X " + "table. Trying offsets: " + "%04x:%04x, %04x:%04x\n", i, + memreg[0].offset, memreg[0].size, + memreg[1].offset, memreg[1].size); + } + } else { + memreg[0].offset = reg.offset; + memreg[0].size = reg.size; + } + /* try to figure out an address */ if (internal_config.process_type == RTE_PROC_PRIMARY) { /* try mapping somewhere close to the end of hugepages */ if (pci_map_addr == NULL) pci_map_addr = pci_find_max_end_va(); - bar_addr = pci_map_resource(pci_map_addr, vfio_dev_fd, reg.offset, - reg.size); + bar_addr = pci_map_addr; pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); } else { - bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset, - reg.size); + bar_addr = maps[i].addr; + } + + /* reserve the address using an inaccessible mapping */ + bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (bar_addr != MAP_FAILED) { + void *map_addr = NULL; + if (memreg[0].size) { + /* actual map of first part */ + map_addr = pci_map_resource(bar_addr, vfio_dev_fd, + memreg[0].offset, + memreg[0].size, + MAP_FIXED); + } + + /* if there's a second part, try to map it */ + if (map_addr != MAP_FAILED + && memreg[1].offset && memreg[1].size) { + void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset); + map_addr = pci_map_resource(second_addr, + vfio_dev_fd, memreg[1].offset, + memreg[1].size, + MAP_FIXED); + } + + if (map_addr == MAP_FAILED || !map_addr) { + munmap(bar_addr, reg.size); + bar_addr = MAP_FAILED; + } } if (bar_addr == MAP_FAILED || diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h index 03e693e01b..72ec3f62a3 100644 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h @@ -43,9 +43,13 @@ #include #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) -#define RTE_PCI_MSIX_TABLE_BIR 0x7 +#define RTE_PCI_MSIX_TABLE_BIR 0x7 +#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8 +#define RTE_PCI_MSIX_FLAGS_QSIZE 0x07ff #else -#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR +#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR +#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET +#define RTE_PCI_MSIX_FLAGS_QSIZE PCI_MSIX_FLAGS_QSIZE #endif #define VFIO_PRESENT -- 2.20.1