#define PAGE_SIZE (sysconf(_SC_PAGESIZE))
#define PAGE_MASK (~(PAGE_SIZE - 1))
+static struct rte_tailq_elem rte_vfio_tailq = {
+ .name = "VFIO_RESOURCE_LIST",
+};
+EAL_REGISTER_TAILQ(rte_vfio_tailq)
+
#define VFIO_DIR "/dev/vfio"
#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
#define VFIO_GROUP_FMT "/dev/vfio/%u"
+#define VFIO_NOIOMMU_GROUP_FMT "/dev/vfio/noiommu-%u"
#define VFIO_GET_REGION_ADDR(x) ((uint64_t) x << 40ULL)
+#define VFIO_GET_REGION_IDX(x) (x >> 40)
/* per-process VFIO config */
static struct vfio_config vfio_cfg;
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ * */
+typedef int (*vfio_dma_func_t)(int);
+
+struct vfio_iommu_type {
+ int type_id;
+ const char *name;
+ vfio_dma_func_t dma_map_func;
+};
+
+static int vfio_type1_dma_map(int);
+static int vfio_noiommu_dma_map(int);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+ /* x86 IOMMU, otherwise known as type 1 */
+ { RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+ /* IOMMU-less mode */
+ { RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+};
+
+int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+ const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ int i, ret;
+
+ /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+ for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ struct vfio_iommu_type1_dma_map dma_map;
+
+ if (ms[i].addr == NULL)
+ break;
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = ms[i].addr_64;
+ dma_map.size = ms[i].len;
+ dma_map.iova = ms[i].phys_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int
+vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+int
+pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
+ void *buf, size_t len, off_t offs)
+{
+ return pread64(intr_handle->vfio_dev_fd, buf, len,
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
+}
+
+int
+pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
+ const void *buf, size_t len, off_t offs)
+{
+ return pwrite64(intr_handle->vfio_dev_fd, buf, len,
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
+}
+
/* get PCI BAR number where MSI-X interrupts are */
static int
pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
return 0;
}
-/* set up DMA mappings */
-static int
-pci_vfio_setup_dma_maps(int vfio_container_fd)
-{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- int i, ret;
-
- ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
- VFIO_TYPE1_IOMMU);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set IOMMU type, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+static const struct vfio_iommu_type *
+pci_vfio_set_iommu_type(int vfio_container_fd) {
+ unsigned idx;
+ for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+ const struct vfio_iommu_type *t = &iommu_types[idx];
+
+ int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+ t->type_id);
+ if (!ret) {
+ RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
+ t->type_id, t->name);
+ return t;
+ }
+ /* not an error, there may be more supported IOMMU types */
+ RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, "
+ "error %i (%s)\n", t->type_id, t->name, errno,
+ strerror(errno));
}
+ /* if we didn't find a suitable IOMMU type, fail */
+ return NULL;
+}
- /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- struct vfio_iommu_type1_dma_map dma_map;
-
- if (ms[i].addr == NULL)
- break;
-
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- dma_map.iova = ms[i].phys_addr;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+/* check if we have any supported extensions */
+static int
+pci_vfio_has_supported_extensions(int vfio_container_fd) {
+ int ret;
+ unsigned idx, n_extensions = 0;
+ for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+ const struct vfio_iommu_type *t = &iommu_types[idx];
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
+ ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+ t->type_id);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " could not get IOMMU type, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ close(vfio_container_fd);
return -1;
+ } else if (ret == 1) {
+ /* we found a supported extension */
+ n_extensions++;
}
+ RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n",
+ t->type_id, t->name,
+ ret ? "supported" : "not supported");
+ }
+
+ /* if we didn't find any supported IOMMU types, fail */
+ if (!n_extensions) {
+ close(vfio_container_fd);
+ return -1;
}
return 0;
}
/* set up an eventfd for interrupts */
- fd = eventfd(0, 0);
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
if (fd < 0) {
RTE_LOG(ERR, EAL, " cannot set up eventfd, "
"error %i (%s)\n", errno, strerror(errno));
return -1;
}
- /* check if we support IOMMU type 1 */
- ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
- if (ret != 1) {
- if (ret < 0)
- RTE_LOG(ERR, EAL, " could not get IOMMU type, "
- "error %i (%s)\n", errno,
- strerror(errno));
- else
- RTE_LOG(ERR, EAL, " unsupported IOMMU type "
- "detected in VFIO\n");
- close(vfio_container_fd);
+ ret = pci_vfio_has_supported_extensions(vfio_container_fd);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " no supported IOMMU "
+ "extensions found!\n");
return -1;
}
/* if primary, try to open the group */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* try regular group format */
snprintf(filename, sizeof(filename),
VFIO_GROUP_FMT, iommu_group_no);
vfio_group_fd = open(filename, O_RDWR);
strerror(errno));
return -1;
}
- return 0;
+
+ /* special case: try no-IOMMU path as well */
+ snprintf(filename, sizeof(filename),
+ VFIO_NOIOMMU_GROUP_FMT, iommu_group_no);
+ vfio_group_fd = open(filename, O_RDWR);
+ if (vfio_group_fd < 0) {
+ if (errno != ENOENT) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+ strerror(errno));
+ return -1;
+ }
+ return 0;
+ }
+ /* noiommu group found */
}
/* if the fd is valid, create a new group for it */
if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+ close(vfio_group_fd);
return -1;
}
vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
}
/* parse IOMMU group number for a PCI device
- * returns -1 for errors, 0 for non-existent group */
+ * returns 1 on success, -1 for errors, 0 for non-existent group
+ */
static int
-pci_vfio_get_group_no(const char *pci_addr)
+pci_vfio_get_group_no(const char *pci_addr, int *iommu_group_no)
{
char linkname[PATH_MAX];
char filename[PATH_MAX];
char *tok[16], *group_tok, *end;
- int ret, iommu_group_no;
+ int ret;
memset(linkname, 0, sizeof(linkname));
memset(filename, 0, sizeof(filename));
errno = 0;
group_tok = tok[ret - 1];
end = group_tok;
- iommu_group_no = strtol(group_tok, &end, 10);
+ *iommu_group_no = strtol(group_tok, &end, 10);
if ((end != group_tok && *end != '\0') || errno != 0) {
RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", pci_addr);
return -1;
}
- return iommu_group_no;
+ return 1;
}
static void
struct rte_pci_addr *loc = &dev->addr;
int i, ret, msix_bar;
struct mapped_pci_resource *vfio_res = NULL;
+ struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+
struct pci_map *maps;
uint32_t msix_table_offset = 0;
uint32_t msix_table_size = 0;
+ uint32_t ioport_bar;
dev->intr_handle.fd = -1;
dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
loc->domain, loc->bus, loc->devid, loc->function);
/* get group number */
- iommu_group_no = pci_vfio_get_group_no(pci_addr);
-
- /* if 0, group doesn't exist */
- if (iommu_group_no == 0) {
+ ret = pci_vfio_get_group_no(pci_addr, &iommu_group_no);
+ if (ret == 0) {
RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
- pci_addr);
+ pci_addr);
return 1;
}
+
/* if negative, something failed */
- else if (iommu_group_no < 0)
+ if (ret < 0)
return -1;
/* get the actual group fd */
}
/*
- * set up DMA mappings for container
+ * pick an IOMMU type and set up DMA mappings for container
*
* needs to be done only once, only when at least one group is assigned to
* a container and only in primary process
*/
if (internal_config.process_type == RTE_PROC_PRIMARY &&
vfio_cfg.vfio_container_has_dma == 0) {
- ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+ /* select an IOMMU type which we will be using */
+ const struct vfio_iommu_type *t =
+ pci_vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+ if (!t) {
+ RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", pci_addr);
+ return -1;
+ }
+ ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
if (ret) {
RTE_LOG(ERR, EAL, " %s DMA remapping failed, "
"error %i (%s)\n", pci_addr, errno, strerror(errno));
VFIO_PCI_BAR5_REGION_INDEX + 1);
} else {
/* if we're in a secondary process, just find our tailq entry */
- TAILQ_FOREACH(vfio_res, pci_res_list, next) {
+ TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
continue;
break;
struct vfio_region_info reg = { .argsz = sizeof(reg) };
void *bar_addr;
struct memreg {
- uint32_t offset, size;
+ unsigned long offset, size;
} memreg[2] = {};
reg.index = i;
return -1;
}
+ /* chk for io port region */
+ ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
+ + PCI_BASE_ADDRESS_0 + i*4);
+
+ if (ret != sizeof(ioport_bar)) {
+ RTE_LOG(ERR, EAL,
+ "Cannot read command (%x) from config space!\n",
+ PCI_BASE_ADDRESS_0 + i*4);
+ return -1;
+ }
+
+ if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) {
+ RTE_LOG(INFO, EAL,
+ "Ignore mapping IO port bar(%d) addr: %x\n",
+ i, ioport_bar);
+ continue;
+ }
+
/* skip non-mmapable BARs */
if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
continue;
RTE_LOG(DEBUG, EAL,
"Trying to map BAR %d that contains the MSI-X "
"table. Trying offsets: "
- "%04x:%04x, %04x:%04x\n", i,
+ "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i,
memreg[0].offset, memreg[0].size,
memreg[1].offset, memreg[1].size);
}
}
if (internal_config.process_type == RTE_PROC_PRIMARY)
- TAILQ_INSERT_TAIL(pci_res_list, vfio_res, next);
+ TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
+
+ return 0;
+}
+
+int
+pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
+ struct rte_pci_ioport *p)
+{
+ if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
+ bar > VFIO_PCI_BAR5_REGION_INDEX) {
+ RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
+ return -1;
+ }
+ p->dev = dev;
+ p->base = VFIO_GET_REGION_ADDR(bar);
return 0;
}
+void
+pci_vfio_ioport_read(struct rte_pci_ioport *p,
+ void *data, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
+
+ if (pread64(intr_handle->vfio_dev_fd, data,
+ len, p->base + offset) <= 0)
+ RTE_LOG(ERR, EAL,
+ "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
+ VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+void
+pci_vfio_ioport_write(struct rte_pci_ioport *p,
+ const void *data, size_t len, off_t offset)
+{
+ const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
+
+ if (pwrite64(intr_handle->vfio_dev_fd, data,
+ len, p->base + offset) <= 0)
+ RTE_LOG(ERR, EAL,
+ "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
+ VFIO_GET_REGION_IDX(p->base), (int)offset);
+}
+
+int
+pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
+{
+ RTE_SET_USED(p);
+ return -1;
+}
+
int
pci_vfio_enable(void)
{
/* initialize group list */
int i;
- int module_vfio_type1;
+ int vfio_available;
for (i = 0; i < VFIO_MAX_GROUPS; i++) {
vfio_cfg.vfio_groups[i].fd = -1;
vfio_cfg.vfio_groups[i].group_no = -1;
}
- module_vfio_type1 = rte_eal_check_module("vfio_iommu_type1");
+ /* inform the user that we are probing for VFIO */
+ RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+ /* check if vfio-pci module is loaded */
+ vfio_available = rte_eal_check_module("vfio_pci");
/* return error directly */
- if (module_vfio_type1 == -1) {
+ if (vfio_available == -1) {
RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
return -1;
}
/* return 0 if VFIO modules not loaded */
- if (module_vfio_type1 == 0) {
- RTE_LOG(INFO, EAL, "VFIO modules not all loaded, "
- "skip VFIO support...\n");
+ if (vfio_available == 0) {
+ RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
+ "skipping VFIO support...\n");
return 0;
}
vfio_cfg.vfio_container_fd = pci_vfio_get_container_fd();
/* check if we have VFIO driver enabled */
- if (vfio_cfg.vfio_container_fd != -1)
+ if (vfio_cfg.vfio_container_fd != -1) {
+ RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
vfio_cfg.vfio_enabled = 1;
- else
- RTE_LOG(INFO, EAL, "VFIO support could not be initialized\n");
+ } else {
+ RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+ }
return 0;
}