Listen to INTR_RMV events issued by slaves.
Add atomic flags on slave queues to detect use of slave burst functions.
If a removal is detected, set the recollection flag on this slave.
During a slave upkeep round, if its recollection flag is set and its
burst functions are not in use by any thread, remove that slave.
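
The guard works roughly as sketched below. This is an illustration of the
mechanism only, not the patch code; slave_rx_burst() and remove_slave() are
placeholder names:

    #include <stdint.h>

    #include <rte_atomic.h>
    #include <rte_mbuf.h>

    /* Placeholders for the actual slave burst and removal helpers. */
    uint16_t slave_rx_burst(void *q, struct rte_mbuf **pkts, uint16_t n);
    void remove_slave(void);

    /* Per-slave, per-queue usage counter, bumped around each burst call. */
    rte_atomic64_t queue_refcnt;
    /* Recollection flag, set from the INTR_RMV event callback. */
    volatile unsigned int remove_requested;

    uint16_t
    guarded_rx_burst(void *sub_rxq, struct rte_mbuf **pkts, uint16_t n)
    {
            uint16_t nb_rx;

            rte_atomic64_add(&queue_refcnt, 1);       /* enter slave burst */
            nb_rx = slave_rx_burst(sub_rxq, pkts, n);
            rte_atomic64_sub(&queue_refcnt, 1);       /* leave slave burst */
            return nb_rx;
    }

    void
    upkeep_round(void)
    {
            /* Recollect only once no thread is inside the slave bursts. */
            if (remove_requested && rte_atomic64_read(&queue_refcnt) == 0)
                    remove_slave();
    }
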
Signed-off-by: Gaetan Rivet <gaetan.rivet@6wind.com>
Acked-by: Olga Shern <olgas@mellanox.com>
 device underneath the Fail-safe PMD with a specific feature, this feature must
 be supported by the Fail-safe PMD to avoid throwing any error.
 
+A notable exception is the device removal feature. The fail-safe PMD is a
+virtual device and cannot currently be removed in the sense of a specific bus
+hotplug, as a PCI device can be for example. It will however enable this
+feature automatically for its sub-devices, detecting those that support it
+and registering the relevant callback for such an event.
+
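+In pseudo-code, the per sub-device detection amounts to the following sketch
+(simplified from the driver; ``ETH()`` and ``PORT_ID()`` are internal
+helpers)::
+
+   if (ETH(sdev)->data->dev_flags & RTE_ETH_DEV_INTR_RMV) {
+           dev->data->dev_conf.intr_conf.rmv = 1;
+           rte_eth_dev_callback_register(PORT_ID(sdev),
+                                         RTE_ETH_EVENT_INTR_RMV,
+                                         failsafe_eth_rmv_event_callback,
+                                         sdev);
+   }
+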
 Check the feature matrix for the complete set of supported features.
 
 Compilation option
 pass, the new sub-device will be synchronized with other sub-devices, i.e. be
 started if the fail-safe PMD has been started by the user before.
 
+Plug-out feature
+----------------
+
+A sub-device supporting the device removal event can be removed from its bus
+at any time. The fail-safe PMD registers a callback for this event and reacts
+accordingly. It will try to safely stop, close and uninit the sub-device
+having emitted this event, allowing it to free any resources it holds.
+
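+In terms of the public ethdev API, the recovery performed on the removed
+sub-device roughly follows this sequence (a sketch with placeholder names,
+not the exact driver code)::
+
+   rte_eth_dev_stop(sub_port_id);
+   rte_eth_dev_close(sub_port_id);
+   rte_eal_hotplug_remove(sub_bus_name, sub_dev_name);
+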
 Fail-safe glossary
 ------------------
 
 Upkeep round
     Periodical process when slaves are serviced. Each device having a state
     different from that of the fail-safe device itself is synchronized with it.
+    Additionally, each slave having the remove flag set is cleaned up.
 
 Slave
     In the context of the fail-safe PMD, synonymous to sub-device.
 
                if (ret)
                        ERROR("Unable to synchronize sub_device state");
        }
+       failsafe_dev_remove(dev);
        ret = failsafe_hotplug_alarm_install(dev);
        if (ret)
                ERROR("Unable to set up next alarm");
 
                                    dev, params);
 }
 
+static int
+fs_parse_sub_device(struct sub_device *sdev)
+{
+       struct rte_devargs *da;
+       char devstr[DEVARGS_MAXLEN] = "";
+
+       da = &sdev->devargs;
+       snprintf(devstr, sizeof(devstr), "%s,%s", da->name, da->args);
+       return fs_parse_device(sdev, devstr);
+}
+
 int
 failsafe_args_parse_subs(struct rte_eth_dev *dev)
 {
                        continue;
                if (sdev->cmdline)
                        ret = fs_execute_cmd(sdev, sdev->cmdline);
+               else
+                       ret = fs_parse_sub_device(sdev);
                if (ret == 0)
                        sdev->state = DEV_PARSED;
        }
 
                        return -ENODEV;
                }
                SUB_ID(sdev) = i;
+               sdev->fs_dev = dev;
                sdev->dev = ETH(sdev)->device;
                ETH(sdev)->state = RTE_ETH_DEV_DEFERRED;
                sdev->state = DEV_PROBED;
                return ret;
        if (PRIV(dev)->state < DEV_PROBED)
                PRIV(dev)->state = DEV_PROBED;
-       fs_switch_dev(dev);
+       fs_switch_dev(dev, NULL);
        return 0;
 }
 
 
        return 0;
 }
 
+static void
+fs_dev_remove(struct sub_device *sdev)
+{
+       int ret;
+
+       if (sdev == NULL)
+               return;
+       switch (sdev->state) {
+       case DEV_STARTED:
+               rte_eth_dev_stop(PORT_ID(sdev));
+               sdev->state = DEV_ACTIVE;
+               /* fallthrough */
+       case DEV_ACTIVE:
+               rte_eth_dev_close(PORT_ID(sdev));
+               sdev->state = DEV_PROBED;
+               /* fallthrough */
+       case DEV_PROBED:
+               ret = rte_eal_hotplug_remove(sdev->bus->name,
+                                            sdev->dev->name);
+               if (ret) {
+                       ERROR("Bus detach failed for sub_device %u",
+                             SUB_ID(sdev));
+               } else {
+                       ETH(sdev)->state = RTE_ETH_DEV_UNUSED;
+               }
+               sdev->state = DEV_PARSED;
+               /* fallthrough */
+       case DEV_PARSED:
+       case DEV_UNDEFINED:
+               sdev->state = DEV_UNDEFINED;
+               /* the end */
+               break;
+       }
+       failsafe_hotplug_alarm_install(sdev->fs_dev);
+}
+
+static inline int
+fs_rxtx_clean(struct sub_device *sdev)
+{
+       uint16_t i;
+
+       for (i = 0; i < ETH(sdev)->data->nb_rx_queues; i++)
+               if (FS_ATOMIC_RX(sdev, i))
+                       return 0;
+       for (i = 0; i < ETH(sdev)->data->nb_tx_queues; i++)
+               if (FS_ATOMIC_TX(sdev, i))
+                       return 0;
+       return 1;
+}
+
+void
+failsafe_dev_remove(struct rte_eth_dev *dev)
+{
+       struct sub_device *sdev;
+       uint8_t i;
+
+       FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
+               if (sdev->remove && fs_rxtx_clean(sdev))
+                       fs_dev_remove(sdev);
+}
+
 int
 failsafe_eth_dev_state_sync(struct rte_eth_dev *dev)
 {
 
        ret = failsafe_args_parse_subs(dev);
        if (ret)
-               return ret;
+               goto err_remove;
 
        if (PRIV(dev)->state < DEV_PROBED)
                return 0;
        ret = failsafe_eal_init(dev);
        if (ret)
-               return ret;
+               goto err_remove;
        if (PRIV(dev)->state < DEV_ACTIVE)
                return 0;
        inactive = 0;
                        inactive |= UINT32_C(1) << i;
        ret = dev->dev_ops->dev_configure(dev);
        if (ret)
-               return ret;
+               goto err_remove;
        FOREACH_SUBDEV(sdev, i, dev) {
                if (inactive & (UINT32_C(1) << i)) {
                        ret = fs_eth_dev_conf_apply(dev, sdev);
                        if (ret) {
                                ERROR("Could not apply configuration to sub_device %d",
                                      i);
-                               /* TODO: disable device */
-                               return ret;
+                               goto err_remove;
                        }
                }
        }
                return 0;
        ret = dev->dev_ops->dev_start(dev);
        if (ret)
-               return ret;
+               goto err_remove;
+       return 0;
+err_remove:
+       FOREACH_SUBDEV(sdev, i, dev)
+               if (sdev->state != PRIV(dev)->state)
+                       sdev->remove = 1;
+       return ret;
+}
+
+int
+failsafe_eth_rmv_event_callback(uint8_t port_id __rte_unused,
+                               enum rte_eth_event_type event __rte_unused,
+                               void *cb_arg, void *out __rte_unused)
+{
+       struct sub_device *sdev = cb_arg;
+
+       /* Switch the tx_dev as soon as possible. */
+       fs_switch_dev(sdev->fs_dev, sdev);
+       /* Use safe bursts in any case. */
+       set_burst_fn(sdev->fs_dev, 1);
+       /*
+        * Async removal: the sub-PMD will try to unregister
+        * the callback at the source of the current thread context,
+        * so the actual removal is deferred to the upkeep round.
+        */
+       sdev->remove = 1;
        return 0;
 }
 
 #include <stdint.h>
 
 #include <rte_debug.h>
+#include <rte_atomic.h>
 #include <rte_ethdev.h>
 #include <rte_malloc.h>
 #include <rte_flow.h>
                }
        }
        FOREACH_SUBDEV(sdev, i, dev) {
+               int rmv_interrupt = 0;
+
                if (sdev->state != DEV_PROBED)
                        continue;
+
+               rmv_interrupt = ETH(sdev)->data->dev_flags &
+                               RTE_ETH_DEV_INTR_RMV;
+               if (rmv_interrupt) {
+                       DEBUG("Enabling RMV interrupts for sub_device %d", i);
+                       dev->data->dev_conf.intr_conf.rmv = 1;
+               } else {
+                       DEBUG("sub_device %d does not support RMV event", i);
+               }
                DEBUG("Configuring sub-device %d", i);
+               sdev->remove = 0;
                ret = rte_eth_dev_configure(PORT_ID(sdev),
                                        dev->data->nb_rx_queues,
                                        dev->data->nb_tx_queues,
                        ERROR("Could not configure sub_device %d", i);
                        return ret;
                }
+               if (rmv_interrupt) {
+                       ret = rte_eth_dev_callback_register(PORT_ID(sdev),
+                                       RTE_ETH_EVENT_INTR_RMV,
+                                       failsafe_eth_rmv_event_callback,
+                                       sdev);
+                       if (ret)
+                               WARN("Failed to register RMV callback for sub_device %d",
+                                    SUB_ID(sdev));
+               }
+               dev->data->dev_conf.intr_conf.rmv = 0;
                sdev->state = DEV_ACTIVE;
        }
        if (PRIV(dev)->state < DEV_ACTIVE)
        }
        if (PRIV(dev)->state < DEV_STARTED)
                PRIV(dev)->state = DEV_STARTED;
-       fs_switch_dev(dev);
+       fs_switch_dev(dev, NULL);
        return 0;
 }
 
                fs_rx_queue_release(rxq);
                dev->data->rx_queues[rx_queue_id] = NULL;
        }
-       rxq = rte_zmalloc(NULL, sizeof(*rxq),
+       rxq = rte_zmalloc(NULL,
+                         sizeof(*rxq) +
+                         sizeof(rte_atomic64_t) * PRIV(dev)->subs_tail,
                          RTE_CACHE_LINE_SIZE);
        if (rxq == NULL)
                return -ENOMEM;
+       FOREACH_SUBDEV(sdev, i, dev)
+               rte_atomic64_init(&rxq->refcnt[i]);
        rxq->qid = rx_queue_id;
        rxq->socket_id = socket_id;
        rxq->info.mp = mb_pool;
                fs_tx_queue_release(txq);
                dev->data->tx_queues[tx_queue_id] = NULL;
        }
-       txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
+       txq = rte_zmalloc("ethdev TX queue",
+                         sizeof(*txq) +
+                         sizeof(rte_atomic64_t) * PRIV(dev)->subs_tail,
                          RTE_CACHE_LINE_SIZE);
        if (txq == NULL)
                return -ENOMEM;
+       FOREACH_SUBDEV(sdev, i, dev)
+               rte_atomic64_init(&txq->refcnt[i]);
        txq->qid = tx_queue_id;
        txq->socket_id = socket_id;
        txq->info.conf = *tx_conf;
 
 
 #include <sys/queue.h>
 
+#include <rte_atomic.h>
 #include <rte_dev.h>
 #include <rte_ethdev.h>
 #include <rte_devargs.h>
        uint8_t last_polled;
        unsigned int socket_id;
        struct rte_eth_rxq_info info;
+       rte_atomic64_t refcnt[];
 };
 
 struct txq {
        uint16_t qid;
        unsigned int socket_id;
        struct rte_eth_txq_info info;
+       rte_atomic64_t refcnt[];
 };
 
 struct rte_flow {
        enum dev_state state;
        /* Some device are defined as a command line */
        char *cmdline;
+       /* fail-safe device backreference */
+       struct rte_eth_dev *fs_dev;
+       /* flag calling for recollection */
+       volatile unsigned int remove:1;
 };
 
 struct fs_priv {
 /* ETH_DEV */
 
 int failsafe_eth_dev_state_sync(struct rte_eth_dev *dev);
+void failsafe_dev_remove(struct rte_eth_dev *dev);
+int failsafe_eth_rmv_event_callback(uint8_t port_id,
+                                   enum rte_eth_event_type type,
+                                   void *arg, void *out);
 
 /* GLOBALS */
 
 #define SUBOPS(s, ops) \
        (ETH(s)->dev_ops->ops)
 
+/**
+ * Atomic guards
+ * These track threads currently inside a sub-device burst function, so that
+ * a removed sub-device is only recollected once its bursts are not in use.
+ */
+
+/**
+ * Enter a burst function (increment the usage counter).
+ *
+ * a: (rte_atomic64_t)
+ */
+#define FS_ATOMIC_P(a) \
+       rte_atomic64_add(&(a), 1)
+
+/**
+ * Leave a burst function (decrement the usage counter).
+ *
+ * a: (rte_atomic64_t)
+ */
+#define FS_ATOMIC_V(a) \
+       rte_atomic64_sub(&(a), 1)
+
+/**
+ * Read the Rx usage counter of sub-device s on queue i.
+ *
+ * s: (struct sub_device *)
+ * i: uint16_t qid
+ */
+#define FS_ATOMIC_RX(s, i) \
+       rte_atomic64_read( \
+        &((struct rxq *)((s)->fs_dev->data->rx_queues[i]))->refcnt[(s)->sid] \
+       )
+/**
+ * Read the Tx usage counter of sub-device s on queue i.
+ *
+ * s: (struct sub_device *)
+ * i: uint16_t qid
+ */
+#define FS_ATOMIC_TX(s, i) \
+       rte_atomic64_read( \
+        &((struct txq *)((s)->fs_dev->data->tx_queues[i]))->refcnt[(s)->sid] \
+       )
+
 #define LOG__(level, m, ...) \
        RTE_LOG(level, PMD, "net_failsafe: " m "%c", __VA_ARGS__)
 #define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n')
        return sid;
 }
 
+/*
+ * Switch emitting device.
+ * If banned is set, banned must not be considered for
+ * the role of emitting device.
+ */
 static inline void
-fs_switch_dev(struct rte_eth_dev *dev)
+fs_switch_dev(struct rte_eth_dev *dev,
+             struct sub_device *banned)
 {
+       struct sub_device *txd;
        enum dev_state req_state;
 
        req_state = PRIV(dev)->state;
-       if (PREFERRED_SUBDEV(dev)->state >= req_state) {
-               if (TX_SUBDEV(dev) != PREFERRED_SUBDEV(dev) &&
-                   (TX_SUBDEV(dev) == NULL ||
+       txd = TX_SUBDEV(dev);
+       if (PREFERRED_SUBDEV(dev)->state >= req_state &&
+           PREFERRED_SUBDEV(dev) != banned) {
+               if (txd != PREFERRED_SUBDEV(dev) &&
+                   (txd == NULL ||
                     (req_state == DEV_STARTED) ||
-                    (TX_SUBDEV(dev) && TX_SUBDEV(dev)->state < DEV_STARTED))) {
+                    (txd && txd->state < DEV_STARTED))) {
                        DEBUG("Switching tx_dev to preferred sub_device");
                        PRIV(dev)->subs_tx = 0;
                }
-       } else if ((TX_SUBDEV(dev) && TX_SUBDEV(dev)->state < req_state) ||
-                  TX_SUBDEV(dev) == NULL) {
+       } else if ((txd && txd->state < req_state) ||
+                  txd == NULL ||
+                  txd == banned) {
                struct sub_device *sdev;
                uint8_t i;
 
                /* Using acceptable device */
                FOREACH_SUBDEV_STATE(sdev, i, dev, req_state) {
+                       if (sdev == banned)
+                               continue;
                        DEBUG("Switching tx_dev to sub_device %d",
                              i);
                        PRIV(dev)->subs_tx = i;
                        break;
                }
-       } else if (TX_SUBDEV(dev) && TX_SUBDEV(dev)->state < req_state) {
+       } else if (txd && txd->state < req_state) {
                DEBUG("No device ready, deactivating tx_dev");
                PRIV(dev)->subs_tx = PRIV(dev)->subs_tail;
        } else {
 
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <rte_atomic.h>
 #include <rte_debug.h>
 #include <rte_mbuf.h>
 #include <rte_ethdev.h>
                if (unlikely(fs_rx_unsafe(sdev)))
                        continue;
                sub_rxq = ETH(sdev)->data->rx_queues[rxq->qid];
+               FS_ATOMIC_P(rxq->refcnt[sdev->sid]);
                nb_rx = ETH(sdev)->
                        rx_pkt_burst(sub_rxq, rx_pkts, nb_pkts);
+               FS_ATOMIC_V(rxq->refcnt[sdev->sid]);
                if (nb_rx) {
                        rxq->last_polled = i;
                        return nb_rx;
                sdev = &priv->subs[i];
                RTE_ASSERT(!fs_rx_unsafe(sdev));
                sub_rxq = ETH(sdev)->data->rx_queues[rxq->qid];
+               FS_ATOMIC_P(rxq->refcnt[sdev->sid]);
                nb_rx = ETH(sdev)->
                        rx_pkt_burst(sub_rxq, rx_pkts, nb_pkts);
+               FS_ATOMIC_V(rxq->refcnt[sdev->sid]);
                if (nb_rx) {
                        rxq->last_polled = i;
                        return nb_rx;
        struct sub_device *sdev;
        struct txq *txq;
        void *sub_txq;
+       uint16_t nb_tx;
 
        txq = queue;
        sdev = TX_SUBDEV(txq->priv->dev);
        if (unlikely(fs_tx_unsafe(sdev)))
                return 0;
        sub_txq = ETH(sdev)->data->tx_queues[txq->qid];
-       return ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+       FS_ATOMIC_P(txq->refcnt[sdev->sid]);
+       nb_tx = ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+       FS_ATOMIC_V(txq->refcnt[sdev->sid]);
+       return nb_tx;
 }
 
 uint16_t
        struct sub_device *sdev;
        struct txq *txq;
        void *sub_txq;
+       uint16_t nb_tx;
 
        txq = queue;
        sdev = TX_SUBDEV(txq->priv->dev);
        RTE_ASSERT(!fs_tx_unsafe(sdev));
        sub_txq = ETH(sdev)->data->tx_queues[txq->qid];
-       return ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+       FS_ATOMIC_P(txq->refcnt[sdev->sid]);
+       nb_tx = ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+       FS_ATOMIC_V(txq->refcnt[sdev->sid]);
+       return nb_tx;
 }