Integrate accelerated networking support into netvsc PMD.
This allows netvsc to manage VF without using failsafe or vdev_netvsc.
For the exception vswitch path some tests like transmit
get a 22% increase in packets/sec.
For the VF path, the code is slightly shorter but has no
real change in performance.
Pro:
   * using netvsc is more like other DPDK NICs
   * the exception packet uses less CPU
   * much smaller code size
   * no locking required on VF transmit/receive path
   * no legacy Linux network device to get mangled by userspace
   * much simpler (1K vs 9K) LOC
   * unified extended statistics
Con:
   * using netvsc has a more complex startup model
   * no bifurcated driver support
   * no flow support (since host does not have flow API).
   * no tunnel offload support
   * no receive interrupt support
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
 *   The maximum number of queues is limited by the host (currently 64).
     When used with 4.16 kernel only a single queue is available.
 
-.. note::
-   This driver is intended for use with **Hyper-V only** and is
-   not recommended for use on Azure because accelerated Networking
-   (SR-IOV) is not supported.
-
-   On Azure, use the :doc:`vdev_netvsc` which
-   automatically configures the necessary TAP and failsave drivers.
-
+*   This driver supports SR-IOV network acceleration.
+    If SR-IOV is enabled then the driver will transparently manage the interface,
+    and send and receive packets using the VF path.
+    The VDEV_NETVSC and FAILSAFE drivers are *not* used when using netvsc PMD.
 
 Installation
 ------------
 
   * Match items: destination MAC address.
   * Action items: push/pop/rewrite vlan header.
 
+* **Added support for SR-IOV in netvsc PMD.**
+
+  The ``netvsc`` poll mode driver now supports the Accelerated Networking
+  SR-IOV option in Hyper-V and Azure. This is an alternative to the previous
+  vdev_netvsc, tap, and failsafe drivers combination.
+
 
 API Changes
 -----------
      librte_bus_fslmc.so.1
      librte_bus_pci.so.1
      librte_bus_vdev.so.1
-     librte_bus_vmbus.so.1
+   + librte_bus_vmbus.so.1
      librte_cfgfile.so.2
      librte_cmdline.so.2
      librte_common_octeontx.so.1
      librte_pmd_ring.so.2
      librte_pmd_softnic.so.1
      librte_pmd_vhost.so.2
+   + librte_pmd_netvsc.so.1
      librte_port.so.3
      librte_power.so.1
      librte_rawdev.so.1
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* When using SR-IOV (VF) support with netvsc PMD and the Mellanox mlx5 bifurcated
+  driver, the Linux netvsc device must be brought up before the netvsc device is
+  unbound and passed to the DPDK.
+
 
 Tested Platforms
 ----------------
    This section is a comment. Do not overwrite or remove it.
    Also, make sure to start the actual text at the margin.
    =========================================================
-
 
 SRCS-$(CONFIG_RTE_LIBRTE_NETVSC_PMD) += hn_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_NETVSC_PMD) += hn_rndis.c
 SRCS-$(CONFIG_RTE_LIBRTE_NETVSC_PMD) += hn_nvs.c
+SRCS-$(CONFIG_RTE_LIBRTE_NETVSC_PMD) += hn_vf.c
 
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
 
  */
 int
 hn_dev_link_update(struct rte_eth_dev *dev,
-                  __rte_unused int wait_to_complete)
+                  int wait_to_complete)
 {
        struct hn_data *hv = dev->data->dev_private;
        struct rte_eth_link link, old;
 
        hn_rndis_get_linkspeed(hv);
 
+       hn_vf_link_update(dev, wait_to_complete);
+
        link = (struct rte_eth_link) {
                .link_duplex = ETH_LINK_FULL_DUPLEX,
                .link_autoneg = ETH_LINK_SPEED_FIXED,
        dev_info->max_tx_queues = hv->max_queues;
 
        hn_rndis_get_offload(hv, dev_info);
+       hn_vf_info_get(hv, dev_info);
 }
 
 static void
                }
        }
 
-       return 0;
+       return hn_vf_configure(dev, dev_conf);
 }
 
 static int hn_dev_stats_get(struct rte_eth_dev *dev,
 {
        unsigned int i;
 
+       hn_vf_stats_get(dev, stats);
+
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                const struct hn_tx_queue *txq = dev->data->tx_queues[i];
 
        }
 }
 
+/*
+ * xstats_reset handler.
+ * Calls hn_dev_stats_reset() for this device and then
+ * hn_vf_xstats_reset() so the VF side is reset as well.
+ */
+static void
+hn_dev_xstats_reset(struct rte_eth_dev *dev)
+{
+       hn_dev_stats_reset(dev);
+       hn_vf_xstats_reset(dev);
+}
+
+/*
+ * Total number of extended statistics for this port:
+ * one set of hn_stat_strings per Tx queue and per Rx queue, plus
+ * whatever hn_vf_xstats_get_names() reports when asked for a count only.
+ *
+ * Returns the total, or the negative value returned by the VF query.
+ */
+static int
+hn_dev_xstats_count(struct rte_eth_dev *dev)
+{
+       int own_count;
+       int vf_count;
+
+       own_count = dev->data->nb_tx_queues * RTE_DIM(hn_stat_strings);
+       own_count += dev->data->nb_rx_queues * RTE_DIM(hn_stat_strings);
+
+       /* NULL table with size 0 asks only for the number of entries */
+       vf_count = hn_vf_xstats_get_names(dev, NULL, 0);
+
+       return (vf_count < 0) ? vf_count : own_count + vf_count;
+}
+
 static int
 hn_dev_xstats_get_names(struct rte_eth_dev *dev,
                        struct rte_eth_xstat_name *xstats_names,
-                       __rte_unused unsigned int limit)
+                       unsigned int limit)
 {
        unsigned int i, t, count = 0;
-
-       PMD_INIT_FUNC_TRACE();
+       int ret;
 
        if (!xstats_names)
-               return dev->data->nb_tx_queues * RTE_DIM(hn_stat_strings)
-                       + dev->data->nb_rx_queues * RTE_DIM(hn_stat_strings);
+               return hn_dev_xstats_count(dev);
 
        /* Note: limit checked in rte_eth_xstats_names() */
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                if (!txq)
                        continue;
 
+               if (count >= limit)
+                       break;
+
                for (t = 0; t < RTE_DIM(hn_stat_strings); t++)
                        snprintf(xstats_names[count++].name,
                                 RTE_ETH_XSTATS_NAME_SIZE,
                if (!rxq)
                        continue;
 
+               if (count >= limit)
+                       break;
+
                for (t = 0; t < RTE_DIM(hn_stat_strings); t++)
                        snprintf(xstats_names[count++].name,
                                 RTE_ETH_XSTATS_NAME_SIZE,
                                 hn_stat_strings[t].name);
        }
 
-       return count;
+       ret = hn_vf_xstats_get_names(dev, xstats_names + count,
+                                    limit - count);
+       if (ret < 0)
+               return ret;
+
+       return count + ret;
 }
 
 static int
                  unsigned int n)
 {
        unsigned int i, t, count = 0;
-
-       const unsigned int nstats =
-               dev->data->nb_tx_queues * RTE_DIM(hn_stat_strings)
-               + dev->data->nb_rx_queues * RTE_DIM(hn_stat_strings);
+       const unsigned int nstats = hn_dev_xstats_count(dev);
        const char *stats;
+       int ret;
 
        PMD_INIT_FUNC_TRACE();
 
                                (stats + hn_stat_strings[t].offset);
        }
 
-       return count;
+       ret = hn_vf_xstats_get(dev, xstats + count, n - count);
+       if (ret < 0)
+               return ret;
+
+       return count + ret;
 }
 
 static int
 hn_dev_start(struct rte_eth_dev *dev)
 {
        struct hn_data *hv = dev->data->dev_private;
+       int error;
 
        PMD_INIT_FUNC_TRACE();
 
-       return hn_rndis_set_rxfilter(hv,
-                                    NDIS_PACKET_TYPE_BROADCAST |
-                                    NDIS_PACKET_TYPE_ALL_MULTICAST |
-                                    NDIS_PACKET_TYPE_DIRECTED);
+       error = hn_rndis_set_rxfilter(hv,
+                                     NDIS_PACKET_TYPE_BROADCAST |
+                                     NDIS_PACKET_TYPE_ALL_MULTICAST |
+                                     NDIS_PACKET_TYPE_DIRECTED);
+       if (error)
+               return error;
+
+       error = hn_vf_start(dev);
+       if (error)
+               hn_rndis_set_rxfilter(hv, 0);
+
+       return error;
 }
 
 static void
        PMD_INIT_FUNC_TRACE();
 
        hn_rndis_set_rxfilter(hv, 0);
+       hn_vf_stop(dev);
 }
 
 static void
 hn_dev_close(struct rte_eth_dev *dev __rte_unused)
 {
        PMD_INIT_LOG(DEBUG, "close");
+
+       hn_vf_close(dev);
 }
 
 static const struct eth_dev_ops hn_eth_dev_ops = {
        .dev_stop               = hn_dev_stop,
        .dev_close              = hn_dev_close,
        .dev_infos_get          = hn_dev_info_get,
-       .txq_info_get           = hn_dev_tx_queue_info,
-       .rxq_info_get           = hn_dev_rx_queue_info,
+       .dev_supported_ptypes_get = hn_vf_supported_ptypes,
        .promiscuous_enable     = hn_dev_promiscuous_enable,
        .promiscuous_disable    = hn_dev_promiscuous_disable,
        .allmulticast_enable    = hn_dev_allmulticast_enable,
        .rx_queue_release       = hn_dev_rx_queue_release,
        .link_update            = hn_dev_link_update,
        .stats_get              = hn_dev_stats_get,
+       .stats_reset            = hn_dev_stats_reset,
        .xstats_get             = hn_dev_xstats_get,
        .xstats_get_names       = hn_dev_xstats_get_names,
-       .stats_reset            = hn_dev_stats_reset,
-       .xstats_reset           = hn_dev_stats_reset,
+       .xstats_reset           = hn_dev_xstats_reset,
 };
 
 /*
        if (err)
                return err;
 
+       strlcpy(hv->owner.name, eth_dev->device->name,
+               RTE_ETH_MAX_OWNER_NAME_LEN);
+       err = rte_eth_dev_owner_new(&hv->owner.id);
+       if (err) {
+               PMD_INIT_LOG(ERR, "Can not get owner id");
+               return err;
+       }
+
        /* Initialize primary channel input for control operations */
        err = rte_vmbus_chan_open(vmbus, &hv->channels[0]);
        if (err)
 
        hv->max_queues = RTE_MIN(rxr_cnt, (unsigned int)max_chan);
 
+       /* If VF was reported but not added, do it now */
+       if (hv->vf_present && !hv->vf_dev) {
+               PMD_INIT_LOG(DEBUG, "Adding VF device");
+
+               err = hn_vf_add(eth_dev, hv);
+               if (err)
+                       goto failed;
+       }
+
        return 0;
 
 failed:
        hn_detach(hv);
        rte_vmbus_chan_close(hv->primary->chan);
        rte_free(hv->primary);
+       rte_eth_dev_owner_delete(hv->owner.id);
 
        eth_dev->data->mac_addrs = NULL;
 
 
 hn_nvs_set_datapath(struct hn_data *hv, uint32_t path)
 {
        struct hn_nvs_datapath dp;
+       int error;
+
+       PMD_DRV_LOG(DEBUG, "set datapath %s",
+                   path ? "VF" : "Synthetic");
 
        memset(&dp, 0, sizeof(dp));
        dp.type = NVS_TYPE_SET_DATAPATH;
        dp.active_path = path;
 
-       hn_nvs_req_send(hv, &dp, sizeof(dp));
+       error = hn_nvs_req_send(hv, &dp, sizeof(dp));
+       if (error) {
+               PMD_DRV_LOG(ERR,
+                           "send set datapath failed: %d",
+                           error);
+       }
 }
 
        uint8_t         rsvd[28];
 } __rte_packed;
 
+/*
+ * NVS message carrying a VF association change (NVS_TYPE_VFASSOC_NOTE).
+ * Interpreted by hn_nvs_handle_vfassoc().
+ */
+struct hn_nvs_vf_association {
+       uint32_t        type;   /* NVS_TYPE_VFASSOC_NOTE */
+       uint32_t        allocated;      /* non-zero: VF added, zero: VF removed */
+       uint32_t        serial; /* VF serial, reported in the debug log */
+} __rte_packed;
+
 #define NVS_DATAPATH_SYNTHETIC 0
 #define NVS_DATAPATH_VF                1
 
 void   hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);
 int    hn_nvs_alloc_subchans(struct hn_data *hv, uint32_t *nsubch);
 void   hn_nvs_set_datapath(struct hn_data *hv, uint32_t path);
+void   hn_nvs_handle_vfassoc(struct rte_eth_dev *dev,
+                             const struct vmbus_chanpkt_hdr *hdr,
+                             const void *data);
 
 static inline int
 hn_nvs_send(struct vmbus_channel *chan, uint16_t flags,
 
        return 0;
 }
 
+/*
+ * Build a packet type mask from the receive checksum capabilities
+ * returned by hn_rndis_query_hwcaps().
+ *
+ * RTE_PTYPE_L2_ETHER is always set; it is the only bit returned when
+ * the capability query fails. L3 IPv4, L4 TCP and L4 UDP are added when
+ * the matching NDIS_RXCSUM_CAP_* bits are present.
+ */
+uint32_t
+hn_rndis_get_ptypes(struct hn_data *hv)
+{
+       uint32_t mask = RTE_PTYPE_L2_ETHER;
+       struct ndis_offload caps;
+       int rc;
+
+       memset(&caps, 0, sizeof(caps));
+
+       rc = hn_rndis_query_hwcaps(hv, &caps);
+       if (rc != 0) {
+               PMD_DRV_LOG(ERR, "hwcaps query failed: %d", rc);
+               return mask;
+       }
+
+       if (caps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4)
+               mask |= RTE_PTYPE_L3_IPV4;
+
+       /* L4 types are reported if either address family supports them */
+       if ((caps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) ||
+           (caps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6))
+               mask |= RTE_PTYPE_L4_TCP;
+
+       if ((caps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) ||
+           (caps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6))
+               mask |= RTE_PTYPE_L4_UDP;
+
+       return mask;
+}
+
 int
 hn_rndis_set_rxfilter(struct hn_data *hv, uint32_t filter)
 {
 
                               unsigned int *rxr_cnt0);
 int    hn_rndis_conf_rss(struct hn_data *hv,
                          const struct rte_eth_rss_conf *rss_conf);
+uint32_t hn_rndis_get_ptypes(struct hn_data *hv);
 
 #ifdef RTE_LIBRTE_NETVSC_DEBUG_DUMP
 void hn_rndis_dump(const void *buf);
 
        struct hn_data *hv = dev->data->dev_private;
        struct hn_tx_queue *txq;
        uint32_t tx_free_thresh;
+       int err;
 
        PMD_INIT_FUNC_TRACE();
 
 
        hn_reset_txagg(txq);
 
-       dev->data->tx_queues[queue_idx] = txq;
+       err = hn_vf_tx_queue_setup(dev, queue_idx, nb_desc,
+                                    socket_id, tx_conf);
+       if (err) {
+               rte_free(txq);
+               return err;
+       }
 
+       dev->data->tx_queues[queue_idx] = txq;
        return 0;
 }
 
        rte_free(txq);
 }
 
-void
-hn_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_idx,
-                    struct rte_eth_txq_info *qinfo)
-{
-       struct hn_data *hv = dev->data->dev_private;
-       struct hn_tx_queue *txq = dev->data->rx_queues[queue_idx];
-
-       qinfo->conf.tx_free_thresh = txq->free_thresh;
-       qinfo->nb_desc = hv->tx_pool->size;
-}
-
 static void
 hn_nvs_send_completed(struct rte_eth_dev *dev, uint16_t queue_id,
                      unsigned long xactid, const struct hn_nvs_rndis_ack *ack)
        hn_rx_buf_release(rxb);
 }
 
+/*
+ * Called when an NVS inband notification is received on a channel.
+ *
+ * Validates that the packet is large enough to hold an NVS header and
+ * then dispatches on the NVS message type. VF association changes are
+ * passed to hn_nvs_handle_vfassoc(); other types are only logged.
+ */
+static void hn_nvs_handle_notify(struct rte_eth_dev *dev,
+                                const struct vmbus_chanpkt_hdr *pkt,
+                                const void *data)
+{
+       const struct hn_nvs_hdr *hdr = data;
+
+       /* Do not read hdr->type from a truncated packet */
+       if (unlikely(vmbus_chanpkt_datalen(pkt) < sizeof(*hdr))) {
+               PMD_DRV_LOG(ERR, "invalid nvs notify");
+               return;
+       }
+
+       switch (hdr->type) {
+       case NVS_TYPE_TXTBL_NOTE:
+               /* Transmit indirection table has locking problems
+                * in DPDK and therefore not implemented
+                */
+               PMD_DRV_LOG(DEBUG, "host notify of transmit indirection table");
+               break;
+
+       case NVS_TYPE_VFASSOC_NOTE:
+               hn_nvs_handle_vfassoc(dev, pkt, data);
+               break;
+
+       default:
+               PMD_DRV_LOG(INFO,
+                           "got notify, nvs type %u", hdr->type);
+               break;
+       }
+}
+
 struct hn_rx_queue *hn_rx_queue_alloc(struct hn_data *hv,
                                      uint16_t queue_id,
                                      unsigned int socket_id)
 hn_dev_rx_queue_setup(struct rte_eth_dev *dev,
                      uint16_t queue_idx, uint16_t nb_desc,
                      unsigned int socket_id,
-                     const struct rte_eth_rxconf *rx_conf __rte_unused,
+                     const struct rte_eth_rxconf *rx_conf,
                      struct rte_mempool *mp)
 {
        struct hn_data *hv = dev->data->dev_private;
        char ring_name[RTE_RING_NAMESIZE];
        struct hn_rx_queue *rxq;
        unsigned int count;
+       int error = -ENOMEM;
 
        PMD_INIT_FUNC_TRACE();
 
        if (!rxq->rx_ring)
                goto fail;
 
+       error = hn_vf_rx_queue_setup(dev, queue_idx, nb_desc,
+                                    socket_id, rx_conf, mp);
+       if (error)
+               goto fail;
+
        dev->data->rx_queues[queue_idx] = rxq;
        return 0;
 
        rte_ring_free(rxq->rx_ring);
        rte_free(rxq->event_buf);
        rte_free(rxq);
-       return -ENOMEM;
+       return error;
 }
 
 void
        rxq->rx_ring = NULL;
        rxq->mb_pool = NULL;
 
+       hn_vf_rx_queue_release(rxq->hv, rxq->queue_id);
+
+       /* Keep primary queue to allow for control operations */
        if (rxq != rxq->hv->primary) {
                rte_free(rxq->event_buf);
                rte_free(rxq);
        return hn_process_events(txq->hv, txq->queue_id, free_cnt);
 }
 
-void
-hn_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_idx,
-                    struct rte_eth_rxq_info *qinfo)
-{
-       struct hn_rx_queue *rxq = dev->data->rx_queues[queue_idx];
-
-       qinfo->mp = rxq->mb_pool;
-       qinfo->scattered_rx = 1;
-       qinfo->nb_desc = rte_ring_get_capacity(rxq->rx_ring);
-}
-
-static void
-hn_nvs_handle_notify(const struct vmbus_chanpkt_hdr *pkthdr,
-                    const void *data)
-{
-       const struct hn_nvs_hdr *hdr = data;
-
-       if (unlikely(vmbus_chanpkt_datalen(pkthdr) < sizeof(*hdr))) {
-               PMD_DRV_LOG(ERR, "invalid nvs notify");
-               return;
-       }
-
-       PMD_DRV_LOG(INFO,
-                   "got notify, nvs type %u", hdr->type);
-}
-
 /*
  * Process pending events on the channel.
  * Called from both Rx queue poll and Tx cleanup
                        break;
 
                case VMBUS_CHANPKT_TYPE_INBAND:
-                       hn_nvs_handle_notify(pkt, data);
+                       hn_nvs_handle_notify(dev, pkt, data);
                        break;
 
                default:
 hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 {
        struct hn_tx_queue *txq = ptxq;
+       uint16_t queue_id = txq->queue_id;
        struct hn_data *hv = txq->hv;
+       struct rte_eth_dev *vf_dev;
        bool need_sig = false;
        uint16_t nb_tx;
        int ret;
        if (unlikely(hv->closed))
                return 0;
 
+       /* Transmit over VF if present and up */
+       vf_dev = hv->vf_dev;
+       rte_compiler_barrier();
+       if (vf_dev && vf_dev->data->dev_started) {
+               void *sub_q = vf_dev->data->tx_queues[queue_id];
+
+               return (*vf_dev->tx_pkt_burst)(sub_q, tx_pkts, nb_pkts);
+       }
+
        if (rte_mempool_avail_count(hv->tx_pool) <= txq->free_thresh)
                hn_process_events(hv, txq->queue_id, 0);
 
                        if (unlikely(!pkt))
                                break;
 
-                       hn_encap(pkt, txq->queue_id, m);
+                       hn_encap(pkt, queue_id, m);
                        hn_append_to_chim(txq, pkt, m);
 
                        rte_pktmbuf_free(m);
                        txd->data_size += m->pkt_len;
                        ++txd->packets;
 
-                       hn_encap(pkt, txq->queue_id, m);
+                       hn_encap(pkt, queue_id, m);
 
                        ret = hn_xmit_sg(txq, txd, m, &need_sig);
                        if (unlikely(ret != 0)) {
 {
        struct hn_rx_queue *rxq = prxq;
        struct hn_data *hv = rxq->hv;
+       struct rte_eth_dev *vf_dev;
+       uint16_t nb_rcv;
 
        if (unlikely(hv->closed))
                return 0;
 
-       /* If ring is empty then process more */
-       if (rte_ring_count(rxq->rx_ring) < nb_pkts)
+       vf_dev = hv->vf_dev;
+       rte_compiler_barrier();
+
+       if (vf_dev && vf_dev->data->dev_started) {
+               /* Normally, with SR-IOV the ring buffer will be empty */
                hn_process_events(hv, rxq->queue_id, 0);
 
-       /* Get mbufs off staging ring */
-       return rte_ring_sc_dequeue_burst(rxq->rx_ring, (void **)rx_pkts,
-                                        nb_pkts, NULL);
+               /* Get mbufs some bufs off of staging ring */
+               nb_rcv = rte_ring_sc_dequeue_burst(rxq->rx_ring,
+                                                  (void **)rx_pkts,
+                                                  nb_pkts / 2, NULL);
+               /* And rest off of VF */
+               nb_rcv += rte_eth_rx_burst(vf_dev->data->port_id,
+                                          rxq->queue_id,
+                                          rx_pkts + nb_rcv, nb_pkts - nb_rcv);
+       } else {
+               /* If receive ring is not full then get more */
+               if (rte_ring_count(rxq->rx_ring) < nb_pkts)
+                       hn_process_events(hv, rxq->queue_id, 0);
+
+               nb_rcv = rte_ring_sc_dequeue_burst(rxq->rx_ring,
+                                                  (void **)rx_pkts,
+                                                  nb_pkts, NULL);
+       }
+
+       return nb_rcv;
 }
 
 struct hn_data {
        struct rte_vmbus_device *vmbus;
        struct hn_rx_queue *primary;
+       struct rte_eth_dev *vf_dev;             /* Subordinate device */
+       rte_spinlock_t  vf_lock;
        uint16_t        port_id;
        bool            closed;
+       bool            vf_present;
        uint32_t        link_status;
        uint32_t        link_speed;
 
        uint8_t         rndis_resp[256];
 
        struct ether_addr mac_addr;
+
+       struct rte_eth_dev_owner owner;
+       struct rte_intr_handle vf_intr;
+
        struct vmbus_channel *channels[HN_MAX_CHANNELS];
 };
 
                              const struct rte_eth_rxconf *rx_conf,
                              struct rte_mempool *mp);
 void   hn_dev_rx_queue_release(void *arg);
-void   hn_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_idx,
-                            struct rte_eth_rxq_info *qinfo);
+
+void   hn_vf_info_get(struct hn_data *hv,
+                      struct rte_eth_dev_info *info);
+int    hn_vf_add(struct rte_eth_dev *dev, struct hn_data *hv);
+int    hn_vf_configure(struct rte_eth_dev *dev,
+                       const struct rte_eth_conf *dev_conf);
+const uint32_t *hn_vf_supported_ptypes(struct rte_eth_dev *dev);
+int    hn_vf_start(struct rte_eth_dev *dev);
+void   hn_vf_reset(struct rte_eth_dev *dev);
+void   hn_vf_stop(struct rte_eth_dev *dev);
+void   hn_vf_close(struct rte_eth_dev *dev);
+int    hn_vf_link_update(struct rte_eth_dev *dev,
+                         int wait_to_complete);
+int    hn_vf_tx_queue_setup(struct rte_eth_dev *dev,
+                            uint16_t queue_idx, uint16_t nb_desc,
+                            unsigned int socket_id,
+                            const struct rte_eth_txconf *tx_conf);
+void   hn_vf_tx_queue_release(struct hn_data *hv, uint16_t queue_id);
+int    hn_vf_rx_queue_setup(struct rte_eth_dev *dev,
+                            uint16_t queue_idx, uint16_t nb_desc,
+                            unsigned int socket_id,
+                            const struct rte_eth_rxconf *rx_conf,
+                            struct rte_mempool *mp);
+void   hn_vf_rx_queue_release(struct hn_data *hv, uint16_t queue_id);
+
+int    hn_vf_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats);
+void   hn_vf_stats_reset(struct rte_eth_dev *dev);
+int    hn_vf_xstats_get_names(struct rte_eth_dev *dev,
+                              struct rte_eth_xstat_name *xstats_names,
+                              unsigned int size);
+int    hn_vf_xstats_get(struct rte_eth_dev *dev,
+                        struct rte_eth_xstat *xstats,
+                        unsigned int n);
+void   hn_vf_xstats_reset(struct rte_eth_dev *dev);
 
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018 Microsoft Corp.
+ * All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/uio.h>
+
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ethdev_driver.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+#include <rte_bus_vmbus.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+
+#include "hn_logs.h"
+#include "hn_var.h"
+#include "hn_nvs.h"
+
+/*
+ * Search for a VF with a matching MAC address.
+ *
+ * Iterates over the ports yielded by RTE_ETH_FOREACH_DEV, skipping the
+ * synthetic device itself, and compares each port's mac_addrs entry
+ * with the one of dev.
+ *
+ * Returns the port id of the first match, or -ENOENT if there is none.
+ */
+static int hn_vf_match(const struct rte_eth_dev *dev)
+{
+       const struct ether_addr *mac = dev->data->mac_addrs;
+       int i;
+
+       RTE_ETH_FOREACH_DEV(i) {
+               const struct rte_eth_dev *vf_dev = &rte_eth_devices[i];
+               const struct ether_addr *vf_mac = vf_dev->data->mac_addrs;
+
+               if (vf_dev == dev)
+                       continue;
+
+               if (is_same_ether_addr(mac, vf_mac))
+                       return i;
+       }
+       return -ENOENT;
+}
+
+/*
+ * Take ownership of the VF port and publish its ethdev.
+ *
+ * The port must currently be unowned; it is then assigned to hv->owner
+ * and the address of its rte_eth_dev is stored through vf_dev.
+ *
+ * Returns 0 on success, -EBUSY if the port already has an owner, or
+ * the negative value returned by the ownership API.
+ */
+static int hn_vf_attach(struct hn_data *hv, uint16_t port_id,
+                       struct rte_eth_dev **vf_dev)
+{
+       struct rte_eth_dev_owner owner = { .id = RTE_ETH_DEV_NO_OWNER };
+       int ret;
+
+       ret = rte_eth_dev_owner_get(port_id, &owner);
+       if (ret < 0) {
+               PMD_DRV_LOG(ERR, "Can not find owner for port %d", port_id);
+               return ret;
+       }
+
+       if (owner.id != RTE_ETH_DEV_NO_OWNER) {
+               PMD_DRV_LOG(ERR, "Port %u already owned by other device %s",
+                           port_id, owner.name);
+               return -EBUSY;
+       }
+
+       ret = rte_eth_dev_owner_set(port_id, &hv->owner);
+       if (ret < 0) {
+               PMD_DRV_LOG(ERR, "Can not set owner for port %d", port_id);
+               return ret;
+       }
+
+       PMD_DRV_LOG(DEBUG, "Attach VF device %u", port_id);
+       /* Order the preceding stores before the pointer is published */
+       rte_smp_wmb();
+       *vf_dev = &rte_eth_devices[port_id];
+       return 0;
+}
+
+/*
+ * Add new VF device to synthetic device.
+ *
+ * Looks up the VF by MAC address, attaches it under vf_lock and, on
+ * success, flags the synthetic port as LSC capable, installs an external
+ * interrupt handle and switches the host datapath to the VF.
+ *
+ * Returns 0 on success, the negative lookup result if no port matches,
+ * -EBUSY if a VF is already attached, or the error from hn_vf_attach().
+ */
+int hn_vf_add(struct rte_eth_dev *dev, struct hn_data *hv)
+{
+       int vf_port, rc;
+
+       vf_port = hn_vf_match(dev);
+       if (vf_port < 0) {
+               PMD_DRV_LOG(NOTICE, "No matching MAC found");
+               return vf_port;
+       }
+
+       rte_spinlock_lock(&hv->vf_lock);
+
+       if (hv->vf_dev != NULL) {
+               PMD_DRV_LOG(ERR, "VF already attached");
+               rc = -EBUSY;
+               goto unlock;
+       }
+
+       rc = hn_vf_attach(hv, vf_port, &hv->vf_dev);
+       if (rc != 0)
+               goto unlock;
+
+       dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC;
+       hv->vf_intr = (struct rte_intr_handle) {
+               .fd = -1,
+               .type = RTE_INTR_HANDLE_EXT,
+       };
+       dev->intr_handle = &hv->vf_intr;
+       hn_nvs_set_datapath(hv, NVS_DATAPATH_VF);
+
+unlock:
+       rte_spinlock_unlock(&hv->vf_lock);
+       return rc;
+}
+
+/*
+ * Detach the VF from the synthetic device.
+ * Under vf_lock: switch the datapath back to synthetic, clear
+ * hv->vf_dev and release ownership of the VF port. Logs an error and
+ * does nothing else if no VF is attached.
+ */
+static void hn_vf_remove(struct hn_data *hv)
+{
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+
+       vf = hv->vf_dev;
+       if (vf != NULL) {
+               /* Stop incoming packets from arriving on VF */
+               hn_nvs_set_datapath(hv, NVS_DATAPATH_SYNTHETIC);
+               hv->vf_dev = NULL;
+
+               /* Give back ownership */
+               rte_eth_dev_owner_unset(vf->data->port_id, hv->owner.id);
+       } else {
+               PMD_DRV_LOG(ERR, "VF path not active");
+       }
+
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/*
+ * Handle VF association message from host.
+ *
+ * Records whether a VF is present and, if the synthetic port is already
+ * attached, adds or removes the VF right away. Otherwise only
+ * hv->vf_present is updated.
+ */
+void
+hn_nvs_handle_vfassoc(struct rte_eth_dev *dev,
+                     const struct vmbus_chanpkt_hdr *hdr,
+                     const void *data)
+{
+       const struct hn_nvs_vf_association *msg = data;
+       struct hn_data *hv = dev->data->dev_private;
+
+       /* Reject packets too short to hold the association message */
+       if (unlikely(vmbus_chanpkt_datalen(hdr) < sizeof(*msg))) {
+               PMD_DRV_LOG(ERR, "invalid vf association NVS");
+               return;
+       }
+
+       PMD_DRV_LOG(DEBUG, "VF serial %u %s port %u",
+                   msg->serial,
+                   msg->allocated ? "add to" : "remove from",
+                   dev->data->port_id);
+
+       hv->vf_present = msg->allocated;
+
+       if (dev->state == RTE_ETH_DEV_ATTACHED) {
+               if (msg->allocated)
+                       hn_vf_add(dev, hv);
+               else
+                       hn_vf_remove(hv);
+       }
+}
+
+/*
+ * Combine the device info of the VF with that of the synthetic path.
+ *
+ * Speed capabilities and default port configuration are taken from the
+ * VF. Queue counts are limited to the smaller of the two, offload and
+ * RSS capabilities to the common subset, and min_rx_bufsize to the
+ * larger of the two.
+ *
+ * NOTE(review): max_rx_pktlen also takes the larger of the two values;
+ * confirm that this, rather than the smaller value, is intended.
+ */
+static void hn_vf_info_merge(struct rte_eth_dev *vf_dev,
+                            struct rte_eth_dev_info *info)
+{
+       struct rte_eth_dev_info vf;
+
+       rte_eth_dev_info_get(vf_dev->data->port_id, &vf);
+
+       /* Values taken over from the VF unchanged */
+       info->speed_capa = vf.speed_capa;
+       info->default_rxportconf = vf.default_rxportconf;
+       info->default_txportconf = vf.default_txportconf;
+
+       /* Receive side */
+       info->max_rx_queues = RTE_MIN(vf.max_rx_queues,
+                                     info->max_rx_queues);
+       info->rx_offload_capa &= vf.rx_offload_capa;
+       info->rx_queue_offload_capa &= vf.rx_queue_offload_capa;
+       info->flow_type_rss_offloads &= vf.flow_type_rss_offloads;
+
+       /* Transmit side */
+       info->max_tx_queues = RTE_MIN(vf.max_tx_queues,
+                                     info->max_tx_queues);
+       info->tx_offload_capa &= vf.tx_offload_capa;
+       info->tx_queue_offload_capa &= vf.tx_queue_offload_capa;
+
+       /* Buffer and packet sizes */
+       info->min_rx_bufsize = RTE_MAX(vf.min_rx_bufsize,
+                                      info->min_rx_bufsize);
+       info->max_rx_pktlen  = RTE_MAX(vf.max_rx_pktlen,
+                                      info->max_rx_pktlen);
+}
+
+/*
+ * Merge VF device info into info when a VF is attached.
+ * vf_lock is held while the VF is queried; info is left untouched
+ * if there is no VF.
+ */
+void hn_vf_info_get(struct hn_data *hv, struct rte_eth_dev_info *info)
+{
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               hn_vf_info_merge(vf, info);
+
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/*
+ * Ask the VF driver to refresh its link state.
+ *
+ * If a VF is attached and its driver provides a link_update op, that op
+ * is invoked on the VF device while vf_lock is held.
+ *
+ * Returns the value of the VF's link_update op, or 0 when there is no
+ * VF or no such op.
+ */
+int hn_vf_link_update(struct rte_eth_dev *dev,
+                     int wait_to_complete)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf_dev;
+       int ret = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf_dev = hv->vf_dev;
+       /* The op belongs to the VF driver and must be given the VF device */
+       if (vf_dev && vf_dev->dev_ops->link_update)
+               ret = (*vf_dev->dev_ops->link_update)(vf_dev, wait_to_complete);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return ret;
+}
+
+/*
+ * Callback invoked for events on the VF port; used when the VF has
+ * link state interrupts enabled.
+ *
+ * cb_arg is the synthetic rte_eth_dev. For a link state event the
+ * synthetic link is refreshed and, if that reports a change (non-zero
+ * result), the event is passed on to the synthetic port's callbacks.
+ * Every other case returns 0.
+ */
+static int hn_vf_lsc_event(uint16_t port_id __rte_unused,
+                          enum rte_eth_event_type event,
+                          void *cb_arg, void *out __rte_unused)
+{
+       struct rte_eth_dev *dev = cb_arg;
+
+       /* link update is evaluated only for LSC events (short-circuit) */
+       if (event == RTE_ETH_EVENT_INTR_LSC &&
+           hn_dev_link_update(dev, 0) != 0)
+               return _rte_eth_dev_callback_process(dev,
+                                                    RTE_ETH_EVENT_INTR_LSC,
+                                                    NULL);
+
+       return 0;
+}
+
+/*
+ * Configure the VF port with a copy of the synthetic device's
+ * configuration and the same number of Rx/Tx queues.
+ *
+ * Link state interrupts are enabled on the VF only if the application
+ * asked for them and the VF advertises RTE_ETH_DEV_INTR_LSC. In that
+ * case hn_vf_lsc_event() is registered for the VF's LSC event with the
+ * synthetic device as its argument.
+ *
+ * Returns 0 on success or the error from rte_eth_dev_configure() /
+ * rte_eth_dev_callback_register().
+ */
+static int _hn_vf_configure(struct rte_eth_dev *dev,
+                           struct rte_eth_dev *vf_dev,
+                           const struct rte_eth_conf *dev_conf)
+{
+       struct rte_eth_conf vf_conf = *dev_conf;
+       uint16_t vf_port = vf_dev->data->port_id;
+       int ret;
+
+       if (dev_conf->intr_conf.lsc &&
+           (vf_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
+               PMD_DRV_LOG(DEBUG, "enabling LSC for VF %u",
+                           vf_port);
+               vf_conf.intr_conf.lsc = 1;
+       } else {
+               PMD_DRV_LOG(DEBUG, "disabling LSC for VF %u",
+                           vf_port);
+               vf_conf.intr_conf.lsc = 0;
+       }
+
+       ret = rte_eth_dev_configure(vf_port,
+                                   dev->data->nb_rx_queues,
+                                   dev->data->nb_tx_queues,
+                                   &vf_conf);
+       if (ret) {
+               PMD_DRV_LOG(ERR,
+                           "VF configuration failed: %d", ret);
+       } else if (vf_conf.intr_conf.lsc) {
+               /* Register for the event type, not the device flag */
+               ret = rte_eth_dev_callback_register(vf_port,
+                                                   RTE_ETH_EVENT_INTR_LSC,
+                                                   hn_vf_lsc_event, dev);
+               if (ret)
+                       PMD_DRV_LOG(ERR,
+                                   "Failed to register LSC callback for VF %u",
+                                   vf_port);
+       }
+       return ret;
+}
+
+/*
+ * Configure VF if present.
+ * Force VF to have same number of queues as synthetic device.
+ *
+ * Runs under vf_lock. Returns 0 when no VF is attached, otherwise the
+ * result of _hn_vf_configure().
+ */
+int hn_vf_configure(struct rte_eth_dev *dev,
+                   const struct rte_eth_conf *dev_conf)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+       int rc = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rc = _hn_vf_configure(dev, vf, dev_conf);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return rc;
+}
+
+/*
+ * Ask the VF driver which packet types it supports.
+ * Returns NULL when no VF is attached or the VF driver has no
+ * dev_supported_ptypes_get handler.
+ */
+const uint32_t *hn_vf_supported_ptypes(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       const uint32_t *result = NULL;
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL && vf->dev_ops->dev_supported_ptypes_get != NULL)
+               result = (*vf->dev_ops->dev_supported_ptypes_get)(vf);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return result;
+}
+
+/*
+ * Start the VF port, if a VF is attached; done under vf_lock.
+ * Returns 0 when there is no VF, otherwise the result of
+ * rte_eth_dev_start().
+ */
+int hn_vf_start(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+       int rc = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rc = rte_eth_dev_start(vf->data->port_id);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return rc;
+}
+
+/* Stop the VF port, if a VF is attached; done under vf_lock. */
+void hn_vf_stop(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rte_eth_dev_stop(vf->data->port_id);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/* If VF is present, then cascade configuration down */
+#define VF_ETHDEV_FUNC(dev, func)                              \
+       {                                                       \
+               struct hn_data *hv = (dev)->data->dev_private;  \
+               struct rte_eth_dev *vf_dev;                     \
+               rte_spinlock_lock(&hv->vf_lock);                \
+               vf_dev = hv->vf_dev;                            \
+               if (vf_dev)                                     \
+                       func(vf_dev->data->port_id);            \
+               rte_spinlock_unlock(&hv->vf_lock);              \
+       }
+
+/*
+ * Reset the VF port, if a VF is attached; done under vf_lock.
+ * The result of rte_eth_dev_reset() is not reported to the caller.
+ */
+void hn_vf_reset(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rte_eth_dev_reset(vf->data->port_id);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/* Close the VF port, if a VF is attached; done under vf_lock. */
+void hn_vf_close(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rte_eth_dev_close(vf->data->port_id);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/* Reset basic statistics of the VF port, if a VF is attached. */
+void hn_vf_stats_reset(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rte_eth_stats_reset(vf->data->port_id);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/*
+ * Set up transmit queue queue_idx on the VF port with the given
+ * parameters, if a VF is attached; done under vf_lock.
+ * Returns 0 when there is no VF, otherwise the result of
+ * rte_eth_tx_queue_setup().
+ */
+int hn_vf_tx_queue_setup(struct rte_eth_dev *dev,
+                        uint16_t queue_idx, uint16_t nb_desc,
+                        unsigned int socket_id,
+                        const struct rte_eth_txconf *tx_conf)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+       int rc = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL) {
+               uint16_t vf_port = vf->data->port_id;
+
+               rc = rte_eth_tx_queue_setup(vf_port, queue_idx, nb_desc,
+                                           socket_id, tx_conf);
+       }
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return rc;
+}
+
+/*
+ * Release transmit queue queue_id of the VF through the VF driver's
+ * tx_queue_release handler. Nothing is done when no VF is attached or
+ * the handler is missing.
+ */
+void hn_vf_tx_queue_release(struct hn_data *hv, uint16_t queue_id)
+{
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL && vf->dev_ops->tx_queue_release != NULL)
+               (*vf->dev_ops->tx_queue_release)(vf->data->tx_queues[queue_id]);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/*
+ * Set up receive queue queue_idx on the VF port with the given
+ * parameters and mempool, if a VF is attached; done under vf_lock.
+ * Returns 0 when there is no VF, otherwise the result of
+ * rte_eth_rx_queue_setup().
+ */
+int hn_vf_rx_queue_setup(struct rte_eth_dev *dev,
+                        uint16_t queue_idx, uint16_t nb_desc,
+                        unsigned int socket_id,
+                        const struct rte_eth_rxconf *rx_conf,
+                        struct rte_mempool *mp)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+       int rc = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL) {
+               uint16_t vf_port = vf->data->port_id;
+
+               rc = rte_eth_rx_queue_setup(vf_port, queue_idx, nb_desc,
+                                           socket_id, rx_conf, mp);
+       }
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return rc;
+}
+
+/*
+ * Release receive queue queue_id of the VF through the VF driver's
+ * rx_queue_release handler. Nothing is done when no VF is attached or
+ * the handler is missing.
+ */
+void hn_vf_rx_queue_release(struct hn_data *hv, uint16_t queue_id)
+{
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL && vf->dev_ops->rx_queue_release != NULL)
+               (*vf->dev_ops->rx_queue_release)(vf->data->rx_queues[queue_id]);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
+
+/*
+ * Fetch basic statistics of the VF port into stats, if a VF is attached.
+ * Returns 0 when there is no VF (stats is then left untouched),
+ * otherwise the result of rte_eth_stats_get().
+ */
+int hn_vf_stats_get(struct rte_eth_dev *dev,
+                   struct rte_eth_stats *stats)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+       int rc = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL)
+               rc = rte_eth_stats_get(vf->data->port_id, stats);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return rc;
+}
+
+/*
+ * Retrieve extended statistic names from the VF and prefix each with
+ * "vf_".
+ *
+ * names may be NULL, in which case only the count reported by the VF
+ * driver is returned. n is the capacity of names[].
+ *
+ * Returns the value reported by the VF driver's xstats_get_names
+ * handler, or 0 when no VF is attached or the handler is missing.
+ */
+int hn_vf_xstats_get_names(struct rte_eth_dev *dev,
+                          struct rte_eth_xstat_name *names,
+                          unsigned int n)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf_dev;
+       unsigned int i, filled;
+       int count = 0;
+       char tmp[RTE_ETH_XSTATS_NAME_SIZE];
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf_dev = hv->vf_dev;
+       if (vf_dev && vf_dev->dev_ops->xstats_get_names)
+               count = vf_dev->dev_ops->xstats_get_names(vf_dev, names, n);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       if (names == NULL || count <= 0)
+               return count;
+
+       /*
+        * The handler may report more entries than the caller's array
+        * holds (presumably the required size - confirm against the VF
+        * driver); never touch entries beyond names[n - 1].
+        */
+       filled = (unsigned int)count < n ? (unsigned int)count : n;
+
+       /* add vf_ prefix to xstat names */
+       for (i = 0; i < filled; i++) {
+               snprintf(tmp, sizeof(tmp), "vf_%s", names[i].name);
+               strlcpy(names[i].name, tmp, sizeof(names[i].name));
+       }
+
+       return count;
+}
+
+/*
+ * Retrieve extended statistics from the VF driver into xstats[0..n).
+ * Returns 0 when no VF is attached or the VF driver has no xstats_get
+ * handler, otherwise the handler's return value.
+ */
+int hn_vf_xstats_get(struct rte_eth_dev *dev,
+                    struct rte_eth_xstat *xstats,
+                    unsigned int n)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+       int nstats = 0;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL && vf->dev_ops->xstats_get != NULL)
+               nstats = vf->dev_ops->xstats_get(vf, xstats, n);
+       rte_spinlock_unlock(&hv->vf_lock);
+
+       return nstats;
+}
+
+/*
+ * Reset extended statistics of the VF through the VF driver's
+ * xstats_reset handler. Nothing is done when no VF is attached or the
+ * handler is missing.
+ */
+void hn_vf_xstats_reset(struct rte_eth_dev *dev)
+{
+       struct hn_data *hv = dev->data->dev_private;
+       struct rte_eth_dev *vf;
+
+       rte_spinlock_lock(&hv->vf_lock);
+       vf = hv->vf_dev;
+       if (vf != NULL && vf->dev_ops->xstats_reset != NULL)
+               vf->dev_ops->xstats_reset(vf);
+       rte_spinlock_unlock(&hv->vf_lock);
+}
 
 
 build = dpdk_conf.has('RTE_LIBRTE_VMBUS_BUS')
 version = 2
-sources = files('hn_ethdev.c', 'hn_rxtx.c', 'hn_rndis.c', 'hn_nvs.c')
+sources = files('hn_ethdev.c', 'hn_rxtx.c', 'hn_rndis.c', 'hn_nvs.c', 'hn_vf.c')
 
 deps += ['bus_vmbus' ]