bus/pci: clear driver on detach
[dpdk.git] / drivers / bus / pci / pci_common.c
index b3879a9..3f55420 100644 (file)
@@ -6,6 +6,7 @@
 #include <string.h>
 #include <inttypes.h>
 #include <stdint.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <sys/queue.h>
 #include <rte_string_fns.h>
 #include <rte_common.h>
 #include <rte_devargs.h>
+#include <rte_vfio.h>
 
 #include "private.h"
 
 
-static void rte_pci_remove_device(struct rte_pci_device *pci_device);
-
-extern struct rte_pci_bus rte_pci_bus;
-
 #define SYSFS_PCI_DEVICES "/sys/bus/pci/devices"
 
 const char *rte_pci_get_sysfs_path(void)
@@ -125,6 +123,7 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
                         struct rte_pci_device *dev)
 {
        int ret;
+       bool already_probed;
        struct rte_pci_addr *loc;
 
        if ((dr == NULL) || (dev == NULL))
@@ -155,6 +154,13 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
                dev->device.numa_node = 0;
        }
 
+       already_probed = rte_dev_is_probed(&dev->device);
+       if (already_probed && !(dr->drv_flags & RTE_PCI_DRV_PROBE_AGAIN)) {
+               RTE_LOG(DEBUG, EAL, "Device %s is already probed\n",
+                               dev->device.name);
+               return -EEXIST;
+       }
+
        RTE_LOG(INFO, EAL, "  probe driver: %x:%x %s\n", dev->id.vendor_id,
                dev->id.device_id, dr->driver.name);
 
@@ -163,24 +169,38 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
         * This needs to be before rte_pci_map_device(), as it enables to use
         * driver flags for adjusting configuration.
         */
-       dev->driver = dr;
-       dev->device.driver = &dr->driver;
+       if (!already_probed) {
+               enum rte_iova_mode dev_iova_mode;
+               enum rte_iova_mode iova_mode;
+
+               dev_iova_mode = pci_device_iova_mode(dr, dev);
+               iova_mode = rte_eal_iova_mode();
+               if (dev_iova_mode != RTE_IOVA_DC &&
+                   dev_iova_mode != iova_mode) {
+                       RTE_LOG(ERR, EAL, "  Expecting '%s' IOVA mode but current mode is '%s', not initializing\n",
+                               dev_iova_mode == RTE_IOVA_PA ? "PA" : "VA",
+                               iova_mode == RTE_IOVA_PA ? "PA" : "VA");
+                       return -EINVAL;
+               }
 
-       if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) {
+               dev->driver = dr;
+       }
+
+       if (!already_probed && (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)) {
                /* map resources for devices that use igb_uio */
                ret = rte_pci_map_device(dev);
                if (ret != 0) {
                        dev->driver = NULL;
-                       dev->device.driver = NULL;
                        return ret;
                }
        }
 
        /* call the driver probe() function */
        ret = dr->probe(dr, dev);
+       if (already_probed)
+               return ret; /* no rollback if already succeeded earlier */
        if (ret) {
                dev->driver = NULL;
-               dev->device.driver = NULL;
                if ((dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) &&
                        /* Don't unmap if device is unsupported and
                         * driver needs mapped resources.
@@ -188,6 +208,8 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
                        !(ret > 0 &&
                                (dr->drv_flags & RTE_PCI_DRV_KEEP_MAPPED_RES)))
                        rte_pci_unmap_device(dev);
+       } else {
+               dev->device.driver = &dr->driver;
        }
 
        return ret;
@@ -225,6 +247,7 @@ rte_pci_detach_dev(struct rte_pci_device *dev)
 
        /* clear driver structure */
        dev->driver = NULL;
+       dev->device.driver = NULL;
 
        if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)
                /* unmap resources for devices that use igb_uio */
@@ -235,7 +258,7 @@ rte_pci_detach_dev(struct rte_pci_device *dev)
 
 /*
  * If vendor/device ID match, call the probe() function of all
- * registered driver for the given device. Return -1 if initialization
+ * registered driver for the given device. Return < 0 if initialization
  * failed, return 1 if no driver is found for this device.
  */
 static int
@@ -245,17 +268,13 @@ pci_probe_all_drivers(struct rte_pci_device *dev)
        int rc = 0;
 
        if (dev == NULL)
-               return -1;
-
-       /* Check if a driver is already loaded */
-       if (dev->driver != NULL)
-               return 0;
+               return -EINVAL;
 
        FOREACH_DRIVER_ON_PCIBUS(dr) {
                rc = rte_pci_probe_one_driver(dr, dev);
                if (rc < 0)
                        /* negative value is an error */
-                       return -1;
+                       return rc;
                if (rc > 0)
                        /* positive value means driver doesn't support it */
                        continue;
@@ -292,11 +311,14 @@ rte_pci_probe(void)
                        devargs->policy == RTE_DEV_WHITELISTED)
                        ret = pci_probe_all_drivers(dev);
                if (ret < 0) {
-                       RTE_LOG(ERR, EAL, "Requested device " PCI_PRI_FMT
-                                " cannot be used\n", dev->addr.domain, dev->addr.bus,
-                                dev->addr.devid, dev->addr.function);
-                       rte_errno = errno;
-                       failed++;
+                       if (ret != -EEXIST) {
+                               RTE_LOG(ERR, EAL, "Requested device "
+                                       PCI_PRI_FMT " cannot be used\n",
+                                       dev->addr.domain, dev->addr.bus,
+                                       dev->addr.devid, dev->addr.function);
+                               rte_errno = errno;
+                               failed++;
+                       }
                        ret = 0;
                }
        }
@@ -407,6 +429,98 @@ pci_find_device(const struct rte_device *start, rte_dev_cmp_t cmp,
        return NULL;
 }
 
+/*
+ * Find the PCI device that owns a faulting memory address.
+ *
+ * Every device on the PCI bus is visited and the address is compared with
+ * each entry of its mem_resource[] array, using the half-open range
+ * [addr, addr + len).
+ *
+ * Returns the first device with a matching resource, or NULL when no
+ * resource on the bus covers the address.
+ */
+static struct rte_pci_device *
+pci_find_device_by_addr(const void *failure_addr)
+{
+       const uint64_t fault = (uint64_t)(uintptr_t)failure_addr;
+       struct rte_pci_device *it = NULL;
+       int bar;
+
+       FOREACH_DEVICE_ON_PCIBUS(it) {
+               for (bar = 0; bar != RTE_DIM(it->mem_resource); bar++) {
+                       uint64_t base, span, limit;
+
+                       base = (uint64_t)(uintptr_t)it->mem_resource[bar].addr;
+                       span = it->mem_resource[bar].len;
+                       limit = base + span;
+
+                       /* Address outside this resource: try the next one. */
+                       if (fault < base || fault >= limit)
+                               continue;
+
+                       RTE_LOG(DEBUG, EAL, "Failure address %16.16"
+                               PRIx64" belongs to device %s!\n",
+                               fault, it->device.name);
+                       return it;
+               }
+       }
+       return NULL;
+}
+
+/*
+ * Bus-level hot-unplug handler for a PCI device.
+ *
+ * The action depends on the kernel driver the device is bound to
+ * (pdev->kdrv):
+ *  - VFIO, only when built with HAVE_VFIO_DEV_REQ_INTERFACE: calls
+ *    rte_dev_event_callback_process() with RTE_DEV_EVENT_REMOVE.
+ *  - igb_uio / uio_generic / nic_uio: remaps the BAR resources through
+ *    pci_uio_remap_resource() and returns its result.
+ *  - anything else: logs a debug message and fails.
+ *
+ * Returns 0 on success, -1 when dev is NULL or the kernel driver is not
+ * handled here, or the value returned by pci_uio_remap_resource().
+ */
+static int
+pci_hot_unplug_handler(struct rte_device *dev)
+{
+       struct rte_pci_device *pdev;
+       int ret = 0;
+
+       /*
+        * Validate the generic handle before converting it.
+        * NOTE(review): RTE_DEV_TO_PCI() is presumably a container_of()-style
+        * cast (confirm in the bus header); if so, its result is not NULL for
+        * a NULL input, so testing the converted pointer would not catch it.
+        */
+       if (dev == NULL)
+               return -1;
+       pdev = RTE_DEV_TO_PCI(dev);
+
+       switch (pdev->kdrv) {
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+       case RTE_KDRV_VFIO:
+               /*
+                * The vfio kernel module guarantees that the PCI device is
+                * not deleted until user space releases the resource, so
+                * there is no need to remap the BAR resources here; just
+                * notify user space of the request event so it can handle it.
+                */
+               rte_dev_event_callback_process(dev->name,
+                                              RTE_DEV_EVENT_REMOVE);
+               break;
+#endif
+       case RTE_KDRV_IGB_UIO:
+       case RTE_KDRV_UIO_GENERIC:
+       case RTE_KDRV_NIC_UIO:
+               /* The BAR resources are invalid; remap them to be safe. */
+               ret = pci_uio_remap_resource(pdev);
+               break;
+       default:
+               RTE_LOG(DEBUG, EAL,
+                       "Not managed by a supported kernel driver, skipped\n");
+               ret = -1;
+               break;
+       }
+
+       return ret;
+}
+
+/*
+ * Bus-level SIGBUS handler.
+ *
+ * Looks up the PCI device whose memory resources contain failure_addr and,
+ * if one is found, runs the hot-unplug handler on it.
+ *
+ * Returns 1 when no PCI device owns the address (generic SIGBUS, not handled
+ * by this bus), 0 when the hot-unplug was handled, and -1 when the
+ * hot-unplug handler failed.
+ */
+static int
+pci_sigbus_handler(const void *failure_addr)
+{
+       struct rte_pci_device *pdev;
+
+       pdev = pci_find_device_by_addr(failure_addr);
+       if (pdev == NULL) {
+               /* It is a generic sigbus error, no bus would handle it. */
+               return 1;
+       }
+
+       /* The sigbus error is caused by a hot-unplug. */
+       if (pci_hot_unplug_handler(&pdev->device) != 0) {
+               /* Terminate the message with a newline like other logs. */
+               RTE_LOG(ERR, EAL,
+                       "Failed to handle hot-unplug for device %s\n",
+                       pdev->name);
+               return -1;
+       }
+       return 0;
+}
+
 static int
 pci_plug(struct rte_device *dev)
 {
@@ -423,11 +537,141 @@ pci_unplug(struct rte_device *dev)
        ret = rte_pci_detach_dev(pdev);
        if (ret == 0) {
                rte_pci_remove_device(pdev);
+               rte_devargs_remove(dev->devargs);
                free(pdev);
        }
        return ret;
 }
 
+/*
+ * Bus dma_map callback for PCI devices.
+ *
+ * Uses the dma_map callback of the attached PCI driver when it provides one;
+ * otherwise falls back to the default VFIO container when the device is
+ * bound to VFIO.
+ *
+ * Returns the result of the selected mapping function, or -1 with rte_errno
+ * set to EINVAL (NULL device or no driver attached) or ENOTSUP (no mapping
+ * method available).
+ */
+static int
+pci_dma_map(struct rte_device *dev, void *addr, uint64_t iova, size_t len)
+{
+       struct rte_pci_device *pdev;
+
+       /*
+        * Check dev itself before converting it.
+        * NOTE(review): RTE_DEV_TO_PCI() is presumably a container_of()-style
+        * cast (confirm in the bus header), so its result would not be NULL
+        * for a NULL input.
+        */
+       pdev = (dev != NULL) ? RTE_DEV_TO_PCI(dev) : NULL;
+       if (pdev == NULL || pdev->driver == NULL) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+       if (pdev->driver->dma_map)
+               return pdev->driver->dma_map(pdev, addr, iova, len);
+       /*
+        * The driver does not provide a specific mapping function:
+        * try to fall back to VFIO.
+        */
+       if (pdev->kdrv == RTE_KDRV_VFIO)
+               return rte_vfio_container_dma_map
+                               (RTE_VFIO_DEFAULT_CONTAINER_FD, (uintptr_t)addr,
+                                iova, len);
+       rte_errno = ENOTSUP;
+       return -1;
+}
+
+/*
+ * Bus dma_unmap callback for PCI devices.
+ *
+ * Uses the dma_unmap callback of the attached PCI driver when it provides
+ * one; otherwise falls back to the default VFIO container when the device is
+ * bound to VFIO.
+ *
+ * Returns the result of the selected unmapping function, or -1 with
+ * rte_errno set to EINVAL (NULL device or no driver attached) or ENOTSUP (no
+ * unmapping method available).
+ */
+static int
+pci_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, size_t len)
+{
+       struct rte_pci_device *pdev;
+
+       /*
+        * Check dev itself before converting it.
+        * NOTE(review): RTE_DEV_TO_PCI() is presumably a container_of()-style
+        * cast (confirm in the bus header), so its result would not be NULL
+        * for a NULL input.
+        */
+       pdev = (dev != NULL) ? RTE_DEV_TO_PCI(dev) : NULL;
+       if (pdev == NULL || pdev->driver == NULL) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+       if (pdev->driver->dma_unmap)
+               return pdev->driver->dma_unmap(pdev, addr, iova, len);
+       /*
+        * The driver does not provide a specific unmapping function:
+        * try to fall back to VFIO.
+        */
+       if (pdev->kdrv == RTE_KDRV_VFIO)
+               return rte_vfio_container_dma_unmap
+                               (RTE_VFIO_DEFAULT_CONTAINER_FD, (uintptr_t)addr,
+                                iova, len);
+       rte_errno = ENOTSUP;
+       return -1;
+}
+
+/*
+ * Tell whether a device must be skipped given the bus scan mode.
+ *
+ * - Whitelist mode: only devices whose devargs policy is
+ *   RTE_DEV_WHITELISTED are kept.
+ * - Undefined or blacklist mode: every device is kept unless its devargs
+ *   policy is RTE_DEV_BLACKLISTED.
+ * - Any other scan mode value: the device is ignored.
+ *
+ * Returns true when the device is to be ignored, false otherwise.
+ */
+static bool
+pci_ignore_device(const struct rte_pci_device *dev)
+{
+       const struct rte_devargs *args = dev->device.devargs;
+
+       switch (rte_pci_bus.bus.conf.scan_mode) {
+       case RTE_BUS_SCAN_WHITELIST:
+               /* Ignored unless explicitly whitelisted. */
+               return !(args != NULL &&
+                        args->policy == RTE_DEV_WHITELISTED);
+       case RTE_BUS_SCAN_UNDEFINED:
+       case RTE_BUS_SCAN_BLACKLIST:
+               /* Ignored only when explicitly blacklisted. */
+               return args != NULL &&
+                      args->policy == RTE_DEV_BLACKLISTED;
+       }
+       return true;
+}
+
+/*
+ * Compute the IOVA mode preferred by the PCI bus.
+ *
+ * Each device that is not ignored and is bound to a known kernel driver is
+ * matched against every registered PCI driver, and the IOVA mode reported by
+ * pci_device_iova_mode() for each matching pair is recorded.
+ *
+ * Result:
+ *  - RTE_IOVA_PA when the IOMMU does not support VA (a warning is logged if
+ *    some device wanted VA);
+ *  - RTE_IOVA_VA when only VA was requested;
+ *  - RTE_IOVA_PA when only PA was requested;
+ *  - RTE_IOVA_DC otherwise (a warning is logged when both were requested).
+ */
+enum rte_iova_mode
+rte_pci_get_iommu_class(void)
+{
+       const struct rte_pci_device *pdev;
+       const struct rte_pci_driver *pdrv;
+       bool want_va = false;
+       bool want_pa = false;
+       int no_va = -1; /* -1: not probed yet, 0: VA supported, 1: no VA */
+
+       FOREACH_DEVICE_ON_PCIBUS(pdev) {
+               /*
+                * Checked on the first device only: the IOMMU hardware is
+                * the same for all of them.
+                */
+               if (no_va == -1)
+                       no_va = !pci_device_iommu_support_va(pdev);
+
+               if (pci_ignore_device(pdev) ||
+                   pdev->kdrv == RTE_KDRV_UNKNOWN ||
+                   pdev->kdrv == RTE_KDRV_NONE)
+                       continue;
+
+               FOREACH_DRIVER_ON_PCIBUS(pdrv) {
+                       enum rte_iova_mode wanted;
+                       const char *wanted_str;
+
+                       if (!rte_pci_match(pdrv, pdev))
+                               continue;
+
+                       wanted = pci_device_iova_mode(pdrv, pdev);
+                       if (wanted == RTE_IOVA_DC)
+                               wanted_str = "DC";
+                       else if (wanted == RTE_IOVA_PA)
+                               wanted_str = "PA";
+                       else
+                               wanted_str = "VA";
+
+                       RTE_LOG(DEBUG, EAL, "PCI driver %s for device "
+                               PCI_PRI_FMT " wants IOVA as '%s'\n",
+                               pdrv->driver.name,
+                               pdev->addr.domain, pdev->addr.bus,
+                               pdev->addr.devid, pdev->addr.function,
+                               wanted_str);
+
+                       if (wanted == RTE_IOVA_PA)
+                               want_pa = true;
+                       else if (wanted == RTE_IOVA_VA)
+                               want_va = true;
+               }
+       }
+
+       /* Without VA support in the IOMMU, PA is the only option. */
+       if (no_va == 1) {
+               if (want_va) {
+                       RTE_LOG(WARNING, EAL, "Some devices want 'VA' but IOMMU does not support 'VA'.\n");
+                       RTE_LOG(WARNING, EAL, "The devices that want 'VA' won't initialize.\n");
+               }
+               return RTE_IOVA_PA;
+       }
+
+       /* Exactly one of the two modes was requested: honour it. */
+       if (want_va != want_pa)
+               return want_va ? RTE_IOVA_VA : RTE_IOVA_PA;
+
+       /* Either nothing was requested or the requests conflict. */
+       if (want_va) {
+               RTE_LOG(WARNING, EAL, "Some devices want 'VA' but forcing 'DC' because other devices want 'PA'.\n");
+               RTE_LOG(WARNING, EAL, "Depending on the final decision by the EAL, not all devices may be able to initialize.\n");
+       }
+       return RTE_IOVA_DC;
+}
+
 struct rte_pci_bus rte_pci_bus = {
        .bus = {
                .scan = rte_pci_scan,
@@ -436,7 +680,12 @@ struct rte_pci_bus rte_pci_bus = {
                .plug = pci_plug,
                .unplug = pci_unplug,
                .parse = pci_parse,
+               .dma_map = pci_dma_map,
+               .dma_unmap = pci_dma_unmap,
                .get_iommu_class = rte_pci_get_iommu_class,
+               .dev_iterate = rte_pci_dev_iterate,
+               .hot_unplug_handler = pci_hot_unplug_handler,
+               .sigbus_handler = pci_sigbus_handler,
        },
        .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
        .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),