eal/linux: allow to map BARs with MSI-X tables
author Dan Aloni <dan@kernelim.com>
Wed, 28 Jan 2015 22:04:53 +0000 (00:04 +0200)
committer Thomas Monjalon <thomas.monjalon@6wind.com>
Mon, 23 Feb 2015 20:57:31 +0000 (21:57 +0100)
While VFIO doesn't allow us to map complete BARs with MSI-X tables,
it does allow us to map around them in PAGE_SIZE granularity. There
might be adapters that provide their registers in the same BAR
but on a different page. For example, Intel's NVMe adapter, though
not a network adapter, provides only one MMIO BAR, and that BAR also
contains the MSI-X table.

Signed-off-by: Dan Aloni <dan@kernelim.com>
Acked-by: Anatoly Burakov <anatoly.burakov@intel.com>
lib/librte_eal/linuxapp/eal/eal_pci.c
lib/librte_eal/linuxapp/eal/eal_pci_init.h
lib/librte_eal/linuxapp/eal/eal_pci_uio.c
lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
lib/librte_eal/linuxapp/eal/eal_vfio.h

index 63bcbce..a4fd5f5 100644 (file)
@@ -118,13 +118,14 @@ pci_find_max_end_va(void)
 
 /* map a particular resource from a file */
 void *
-pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
+pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size,
+                int additional_flags)
 {
        void *mapaddr;
 
        /* Map the PCI memory resource of device */
        mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
-                       MAP_SHARED, fd, offset);
+                       MAP_SHARED | additional_flags, fd, offset);
        if (mapaddr == MAP_FAILED) {
                RTE_LOG(ERR, EAL, "%s(): cannot mmap(%d, %p, 0x%lx, 0x%lx): %s (%p)\n",
                        __func__, fd, requested_addr,
index 1070eb8..0a0853d 100644 (file)
@@ -66,7 +66,7 @@ extern void *pci_map_addr;
 void *pci_find_max_end_va(void);
 
 void *pci_map_resource(void *requested_addr, int fd, off_t offset,
-               size_t size);
+              size_t size, int additional_flags);
 
 /* map IGB_UIO resource prototype */
 int pci_uio_map_resource(struct rte_pci_device *dev);
index 2b16fcb..ff903d2 100644 (file)
@@ -146,7 +146,7 @@ pci_uio_map_secondary(struct rte_pci_device *dev)
 
                        if (pci_map_resource(uio_res->maps[i].addr, fd,
                                             (off_t)uio_res->maps[i].offset,
-                                            (size_t)uio_res->maps[i].size)
+                                            (size_t)uio_res->maps[i].size, 0)
                            != uio_res->maps[i].addr) {
                                RTE_LOG(ERR, EAL,
                                        "Cannot mmap device resource\n");
@@ -409,7 +409,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
                                        pci_map_addr = pci_find_max_end_va();
 
                                mapaddr = pci_map_resource(pci_map_addr, fd, (off_t)offset,
-                                               (size_t)maps[j].size);
+                                               (size_t)maps[j].size, 0);
                                if (mapaddr == MAP_FAILED)
                                        fail = 1;
 
index 20e0977..0419172 100644 (file)
@@ -62,6 +62,9 @@
 
 #ifdef VFIO_PRESENT
 
+#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
+#define PAGE_MASK   (~(PAGE_SIZE - 1))
+
 #define VFIO_DIR "/dev/vfio"
 #define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
 #define VFIO_GROUP_FMT "/dev/vfio/%u"
@@ -72,10 +75,12 @@ static struct vfio_config vfio_cfg;
 
 /* get PCI BAR number where MSI-X interrupts are */
 static int
-pci_vfio_get_msix_bar(int fd, int *msix_bar)
+pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
+                     uint32_t *msix_table_size)
 {
        int ret;
        uint32_t reg;
+       uint16_t flags;
        uint8_t cap_id, cap_offset;
 
        /* read PCI capability pointer from config space */
@@ -134,7 +139,18 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar)
                                return -1;
                        }
 
+                       ret = pread64(fd, &flags, sizeof(flags),
+                                       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                                       cap_offset + 2);
+                       if (ret != sizeof(flags)) {
+                               RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
+                                               "space!\n");
+                               return -1;
+                       }
+
                        *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+                       *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
+                       *msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
 
                        return 0;
                }
@@ -532,6 +548,8 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
        int i, ret, msix_bar;
        struct mapped_pci_resource *vfio_res = NULL;
        struct pci_map *maps;
+       uint32_t msix_table_offset = 0;
+       uint32_t msix_table_size = 0;
 
        dev->intr_handle.fd = -1;
        dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
@@ -657,9 +675,10 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
        }
 
        /* get MSI-X BAR, if any (we have to know where it is because we can't
-        * mmap it when using VFIO) */
+        * easily mmap it when using VFIO) */
        msix_bar = -1;
-       ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
+       ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
+                                   &msix_table_offset, &msix_table_size);
        if (ret < 0) {
                RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
                close(vfio_dev_fd);
@@ -702,6 +721,9 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
        for (i = 0; i < (int) vfio_res->nb_maps; i++) {
                struct vfio_region_info reg = { .argsz = sizeof(reg) };
                void *bar_addr;
+               struct memreg {
+                       uint32_t offset, size;
+               } memreg[2] = {};
 
                reg.index = i;
 
@@ -720,21 +742,77 @@ pci_vfio_map_resource(struct rte_pci_device *dev)
                if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
                        continue;
 
-               /* skip MSI-X BAR */
-               if (i == msix_bar)
-                       continue;
+               if (i == msix_bar) {
+                       /*
+                        * VFIO will not let us map the MSI-X table,
+                        * but we can map around it.
+                        */
+                       uint32_t table_start = msix_table_offset;
+                       uint32_t table_end = table_start + msix_table_size;
+                       table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
+                       table_start &= PAGE_MASK;
+
+                       if (table_start == 0 && table_end >= reg.size) {
+                               /* Cannot map this BAR */
+                               RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i);
+                               continue;
+                       } else {
+                               memreg[0].offset = reg.offset;
+                               memreg[0].size = table_start;
+                               memreg[1].offset = table_end;
+                               memreg[1].size = reg.size - table_end;
+
+                               RTE_LOG(DEBUG, EAL,
+                                       "Trying to map BAR %d that contains the MSI-X "
+                                       "table. Trying offsets: "
+                                       "%04x:%04x, %04x:%04x\n", i,
+                                       memreg[0].offset, memreg[0].size,
+                                       memreg[1].offset, memreg[1].size);
+                       }
+               } else {
+                       memreg[0].offset = reg.offset;
+                       memreg[0].size = reg.size;
+               }
 
+               /* try to figure out an address */
                if (internal_config.process_type == RTE_PROC_PRIMARY) {
                        /* try mapping somewhere close to the end of hugepages */
                        if (pci_map_addr == NULL)
                                pci_map_addr = pci_find_max_end_va();
 
-                       bar_addr = pci_map_resource(pci_map_addr, vfio_dev_fd, reg.offset,
-                                       reg.size);
+                       bar_addr = pci_map_addr;
                        pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
                } else {
-                       bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset,
-                                       reg.size);
+                       bar_addr = maps[i].addr;
+               }
+
+               /* reserve the address using an inaccessible mapping */
+               bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
+                               MAP_ANONYMOUS, -1, 0);
+               if (bar_addr != MAP_FAILED) {
+                       void *map_addr = NULL;
+                       if (memreg[0].size) {
+                               /* actual map of first part */
+                               map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
+                                                           memreg[0].offset,
+                                                           memreg[0].size,
+                                                           MAP_FIXED);
+                       }
+
+                       /* if there's a second part, try to map it */
+                       if (map_addr != MAP_FAILED
+                           && memreg[1].offset && memreg[1].size) {
+                               void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset);
+                               map_addr = pci_map_resource(second_addr,
+                                                           vfio_dev_fd, memreg[1].offset,
+                                                           memreg[1].size,
+                                                           MAP_FIXED);
+                       }
+
+                       if (map_addr == MAP_FAILED || !map_addr) {
+                               munmap(bar_addr, reg.size);
+                               bar_addr = MAP_FAILED;
+                       }
                }
 
                if (bar_addr == MAP_FAILED ||
index 03e693e..72ec3f6 100644 (file)
 #include <linux/vfio.h>
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
-#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#define RTE_PCI_MSIX_TABLE_BIR    0x7
+#define RTE_PCI_MSIX_TABLE_OFFSET 0xfffffff8
+#define RTE_PCI_MSIX_FLAGS_QSIZE  0x07ff
 #else
-#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_BIR    PCI_MSIX_TABLE_BIR
+#define RTE_PCI_MSIX_TABLE_OFFSET PCI_MSIX_TABLE_OFFSET
+#define RTE_PCI_MSIX_FLAGS_QSIZE  PCI_MSIX_FLAGS_QSIZE
 #endif
 
 #define VFIO_PRESENT