xen: import xenvirt pmd and vhost_xen
author Bruce Richardson <bruce.richardson@intel.com>
Wed, 12 Feb 2014 15:50:11 +0000 (15:50 +0000)
committer David Marchand <david.marchand@6wind.com>
Tue, 25 Feb 2014 20:29:19 +0000 (21:29 +0100)
This provides a para-virtualized packet switching solution based on the
Xen hypervisor's grant table mechanism, offering simple and fast packet
switching between guest domains and the host domain based on
MAC address or VLAN tag.

The solution comprises two components: a Poll Mode Driver (PMD)
acting as the front end in the guest domain, and a switching back end
in the host domain.  XenStore is used to exchange configuration
information between the PMD front end and the switching back end,
including grant reference IDs for the shared virtio RX/TX rings, the MAC
address, device state, and so on.

The front-end PMD can be found in the DPDK directory
lib/librte_pmd_xenvirt and the back-end example in examples/vhost_xen.
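
As an illustration only (the device name and arguments below are
assumptions for this sketch, not taken from this patch), the front-end
PMD would typically be instantiated in the guest as a virtual device on
the EAL command line, for example:

    ./testpmd -c 0x3 -n 4 --vdev="eth_xenvirt0,mac=00:00:00:00:00:01" -- -i

The back-end vhost-switch application then picks up the grant references
and MAC address published via XenStore and switches traffic for that
guest.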

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
20 files changed:
examples/vhost_xen/Makefile [new file with mode: 0644]
examples/vhost_xen/main.c [new file with mode: 0644]
examples/vhost_xen/main.h [new file with mode: 0644]
examples/vhost_xen/vhost_monitor.c [new file with mode: 0644]
examples/vhost_xen/virtio-net.h [new file with mode: 0644]
examples/vhost_xen/xen_vhost.h [new file with mode: 0644]
examples/vhost_xen/xenstore_parse.c [new file with mode: 0644]
lib/Makefile
lib/librte_eal/common/eal_common_nonpci_devs.c
lib/librte_eal/common/eal_common_whitelist.c
lib/librte_eal/linuxapp/eal/Makefile
lib/librte_pmd_xenvirt/Makefile [new file with mode: 0644]
lib/librte_pmd_xenvirt/rte_eth_xenvirt.c [new file with mode: 0644]
lib/librte_pmd_xenvirt/rte_eth_xenvirt.h [new file with mode: 0644]
lib/librte_pmd_xenvirt/rte_mempool_gntalloc.c [new file with mode: 0644]
lib/librte_pmd_xenvirt/rte_xen_lib.c [new file with mode: 0644]
lib/librte_pmd_xenvirt/rte_xen_lib.h [new file with mode: 0644]
lib/librte_pmd_xenvirt/virtio_logs.h [new file with mode: 0644]
lib/librte_pmd_xenvirt/virtqueue.h [new file with mode: 0644]
mk/rte.app.mk

diff --git a/examples/vhost_xen/Makefile b/examples/vhost_xen/Makefile
new file mode 100644 (file)
index 0000000..cfc4cc6
--- /dev/null
@@ -0,0 +1,51 @@
+#   BSD LICENSE
+# 
+#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+#   All rights reserved.
+# 
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+# 
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+# 
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+# Default target, can be overridden by command line or environment
+RTE_TARGET ?= x86_64-default-linuxapp-gcc
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = vhost-switch
+
+# all source are stored in SRCS-y
+SRCS-y := main.c vhost_monitor.c xenstore_parse.c
+
+CFLAGS += -O2 -I/usr/local/include -D_FILE_OFFSET_BITS=64 -Wno-unused-parameter
+CFLAGS += $(WERROR_FLAGS)
+LDFLAGS += -lxenstore
+
+include $(RTE_SDK)/mk/rte.extapp.mk
diff --git a/examples/vhost_xen/main.c b/examples/vhost_xen/main.c
new file mode 100644 (file)
index 0000000..eafc0aa
--- /dev/null
@@ -0,0 +1,1541 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arpa/inet.h>
+#include <getopt.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_ring.h>
+#include <signal.h>
+#include <stdint.h>
+#include <sys/eventfd.h>
+#include <sys/param.h>
+#include <unistd.h>
+
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+
+#include "main.h"
+#include "virtio-net.h"
+#include "xen_vhost.h"
+
+#define MAX_QUEUES 128
+
+/* the maximum number of external ports supported */
+#define MAX_SUP_PORTS 1
+
+/*
+ * Calculate the number of buffers needed per port
+ */
+#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +            \
+                                                       (num_switching_cores*MAX_PKT_BURST) +                   \
+                                                       (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
+                                                       (num_switching_cores*MBUF_CACHE_SIZE))
+
+#define MBUF_CACHE_SIZE 64
+#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
+
+/*
+ * RX and TX Prefetch, Host, and Write-back threshold values should be
+ * carefully set for optimal performance. Consult the network
+ * controller's datasheet and supporting DPDK documentation for guidance
+ * on how these parameters should be set.
+ */
+#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
+#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
+#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
+
+/*
+ * These default values are optimized for use with the Intel(R) 82599 10 GbE
+ * Controller and the DPDK ixgbe PMD. Consider using other values for other
+ * network controllers and/or network drivers.
+ */
+#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
+#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
+#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
+
+#define MAX_PKT_BURST 32               /* Max burst size for RX/TX */
+#define MAX_MRG_PKT_BURST 16   /* Max burst for merge buffers. Set to 1 due to performance issue. */
+#define BURST_TX_DRAIN_US 100  /* TX drain every ~100us */
+
+/* State of virtio device. */
+#define DEVICE_NOT_READY     0
+#define DEVICE_READY         1
+#define DEVICE_SAFE_REMOVE   2
+
+/* Config_core_flag status definitions. */
+#define REQUEST_DEV_REMOVAL 1
+#define ACK_DEV_REMOVAL 0
+
+/* Configurable number of RX/TX ring descriptors */
+#define RTE_TEST_RX_DESC_DEFAULT 128
+#define RTE_TEST_TX_DESC_DEFAULT 512
+
+#define INVALID_PORT_ID 0xFF
+
+/* Max number of devices. Limited by vmdq. */
+#define MAX_DEVICES 64
+
+/* Size of buffers used for rte_snprintfs. */
+#define MAX_PRINT_BUFF 6072
+
+
+/* Maximum long option length for option parsing. */
+#define MAX_LONG_OPT_SZ 64
+
+/* Used to compare MAC addresses. */
+#define MAC_ADDR_CMP 0xFFFFFFFFFFFF
+
+/* mask of enabled ports */
+static uint32_t enabled_port_mask = 0;
+
+/*Number of switching cores enabled*/
+static uint32_t num_switching_cores = 0;
+
+/* number of devices/queues to support*/
+static uint32_t num_queues = 0;
+uint32_t num_devices = 0;
+
+/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
+static uint32_t enable_vm2vm = 1;
+/* Enable stats. */
+static uint32_t enable_stats = 0;
+
+/* Default configuration for rx and tx thresholds etc. */
+static const struct rte_eth_rxconf rx_conf_default = {
+       .rx_thresh = {
+               .pthresh = RX_PTHRESH,
+               .hthresh = RX_HTHRESH,
+               .wthresh = RX_WTHRESH,
+       },
+       .rx_drop_en = 1,
+};
+
+/*
+ * These default values are optimized for use with the Intel(R) 82599 10 GbE
+ * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
+ * network controllers and/or network drivers.
+ */
+static const struct rte_eth_txconf tx_conf_default = {
+       .tx_thresh = {
+               .pthresh = TX_PTHRESH,
+               .hthresh = TX_HTHRESH,
+               .wthresh = TX_WTHRESH,
+       },
+       .tx_free_thresh = 0, /* Use PMD default values */
+       .tx_rs_thresh = 0, /* Use PMD default values */
+};
+
+/* Empty VMDQ configuration structure. Filled in programmatically. */
+static const struct rte_eth_conf vmdq_conf_default = {
+       .rxmode = {
+               .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
+               .split_hdr_size = 0,
+               .header_split   = 0, /**< Header Split disabled */
+               .hw_ip_checksum = 0, /**< IP checksum offload disabled */
+               .hw_vlan_filter = 0, /**< VLAN filtering disabled */
+               /*
+                * VLAN stripping is necessary for 1G NICs such as the I350;
+                * it fixes a bug where IPv4 forwarding in the guest could not
+                * forward packets from one virtio device to another.
+                */
+               .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
+               .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
+               .hw_strip_crc   = 0, /**< CRC stripped by hardware */
+       },
+
+       .txmode = {
+               .mq_mode = ETH_MQ_TX_NONE,
+       },
+       .rx_adv_conf = {
+               /*
+                * should be overridden separately in code with
+                * appropriate values
+                */
+               .vmdq_rx_conf = {
+                       .nb_queue_pools = ETH_8_POOLS,
+                       .enable_default_pool = 0,
+                       .default_pool = 0,
+                       .nb_pool_maps = 0,
+                       .pool_map = {{0, 0},},
+               },
+       },
+};
+
+static unsigned lcore_ids[RTE_MAX_LCORE];
+static uint8_t ports[RTE_MAX_ETHPORTS];
+static unsigned num_ports = 0; /**< The number of ports specified in command line */
+
+const uint16_t vlan_tags[] = {
+       1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
+       1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
+       1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+       1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
+       1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
+       1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
+       1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
+       1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
+};
+
+/* ethernet addresses of ports */
+static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
+
+/* heads for the main used and free linked lists for the data path. */
+static struct virtio_net_data_ll *ll_root_used = NULL;
+static struct virtio_net_data_ll *ll_root_free = NULL;
+
+/* Array of data core structures containing information on individual core linked lists. */
+static struct lcore_info lcore_info[RTE_MAX_LCORE];
+
+/* Used for queueing bursts of TX packets. */
+struct mbuf_table {
+       unsigned len;
+       unsigned txq_id;
+       struct rte_mbuf *m_table[MAX_PKT_BURST];
+};
+
+/* TX queue for each data core. */
+struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
+
+/* Vlan header struct used to insert vlan tags on TX. */
+struct vlan_ethhdr {
+       unsigned char   h_dest[ETH_ALEN];
+       unsigned char   h_source[ETH_ALEN];
+       __be16          h_vlan_proto;
+       __be16          h_vlan_TCI;
+       __be16          h_vlan_encapsulated_proto;
+};
+
+/* Header lengths. */
+#define VLAN_HLEN       4
+#define VLAN_ETH_HLEN   18
+
+/* Per-device statistics struct */
+struct device_statistics {
+       uint64_t tx_total;
+       rte_atomic64_t rx_total;
+       uint64_t tx;
+       rte_atomic64_t rx;
+} __rte_cache_aligned;
+struct device_statistics dev_statistics[MAX_DEVICES];
+
+/*
+ * Builds up the correct configuration for VMDQ VLAN pool map
+ * according to the pool & queue limits.
+ */
+static inline int
+get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
+{
+       struct rte_eth_vmdq_rx_conf conf;
+       unsigned i;
+
+       memset(&conf, 0, sizeof(conf));
+       conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
+       conf.nb_pool_maps = num_devices;
+
+       for (i = 0; i < conf.nb_pool_maps; i++) {
+               conf.pool_map[i].vlan_id = vlan_tags[ i ];
+               conf.pool_map[i].pools = (1UL << i);
+       }
+
+       (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
+       (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
+                  sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
+       return 0;
+}
+
+/*
+ * Validate the device number against the maximum pool number obtained from dev_info.
+ * If the device number is invalid, print an error message and return -1.
+ * Each device must have its own pool.
+ */
+static inline int
+validate_num_devices(uint32_t max_nb_devices)
+{
+       if (num_devices > max_nb_devices) {
+               RTE_LOG(ERR, PORT, "invalid number of devices\n");
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Initialises a given port using global settings and with the RX buffers
+ * coming from the mbuf_pool passed as a parameter.
+ */
+static inline int
+port_init(uint8_t port, struct rte_mempool *mbuf_pool)
+{
+       struct rte_eth_dev_info dev_info;
+       struct rte_eth_conf port_conf;
+       uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
+       const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT, tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
+       int retval;
+       uint16_t q;
+
+       /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
+       rte_eth_dev_info_get (port, &dev_info);
+
+       /*configure the number of supported virtio devices based on VMDQ limits */
+       num_devices = dev_info.max_vmdq_pools;
+       num_queues = dev_info.max_rx_queues;
+
+       retval = validate_num_devices(MAX_DEVICES);
+       if (retval < 0)
+               return retval;
+
+       /* Get port configuration. */
+       retval = get_eth_conf(&port_conf, num_devices);
+       if (retval < 0)
+               return retval;
+
+       if (port >= rte_eth_dev_count()) return -1;
+
+       rx_rings = (uint16_t)num_queues;
+       /* Configure ethernet device. */
+       retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
+       if (retval != 0)
+               return retval;
+
+       /* Setup the queues. */
+       for (q = 0; q < rx_rings; q ++) {
+               retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
+                                               rte_eth_dev_socket_id(port), &rx_conf_default,
+                                               mbuf_pool);
+               if (retval < 0)
+                       return retval;
+       }
+       for (q = 0; q < tx_rings; q ++) {
+               retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
+                                               rte_eth_dev_socket_id(port), &tx_conf_default);
+               if (retval < 0)
+                       return retval;
+       }
+
+       /* Start the device. */
+       retval  = rte_eth_dev_start(port);
+       if (retval < 0)
+               return retval;
+
+       rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
+       RTE_LOG(INFO, PORT, "Max virtio devices supported: %u\n", num_devices);
+       RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
+                       " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
+                       (unsigned)port,
+                       vmdq_ports_eth_addr[port].addr_bytes[0],
+                       vmdq_ports_eth_addr[port].addr_bytes[1],
+                       vmdq_ports_eth_addr[port].addr_bytes[2],
+                       vmdq_ports_eth_addr[port].addr_bytes[3],
+                       vmdq_ports_eth_addr[port].addr_bytes[4],
+                       vmdq_ports_eth_addr[port].addr_bytes[5]);
+
+       return 0;
+}
+
+/*
+ * Parse the portmask provided at run time.
+ */
+static int
+parse_portmask(const char *portmask)
+{
+       char *end = NULL;
+       unsigned long pm;
+
+       errno = 0;
+
+       /* parse hexadecimal string */
+       pm = strtoul(portmask, &end, 16);
+       if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
+               return -1;
+
+       if (pm == 0)
+               return -1;
+
+       return pm;
+
+}
+
+/*
+ * Parse num options at run time.
+ */
+static int
+parse_num_opt(const char *q_arg, uint32_t max_valid_value)
+{
+       char *end = NULL;
+       unsigned long num;
+
+       errno = 0;
+
+       /* parse unsigned int string */
+       num = strtoul(q_arg, &end, 10);
+       if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
+               return -1;
+
+       if (num > max_valid_value)
+               return -1;
+
+       return num;
+
+}
+
+/*
+ * Display usage
+ */
+static void
+us_vhost_usage(const char *prgname)
+{
+       RTE_LOG(INFO, CONFIG, "%s [EAL options] -- -p PORTMASK --vm2vm [0|1] --stats [0-N] --nb-devices ND\n"
+       "               -p PORTMASK: Set mask for ports to be used by application\n"
+       "               --vm2vm [0|1]: disable/enable(default) vm2vm comms\n"
+       "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n",
+              prgname);
+}
+
+/*
+ * Parse the arguments given in the command line of the application.
+ */
+static int
+us_vhost_parse_args(int argc, char **argv)
+{
+       int opt, ret;
+       int option_index;
+       unsigned i;
+       const char *prgname = argv[0];
+       static struct option long_option[] = {
+               {"vm2vm", required_argument, NULL, 0},
+               {"stats", required_argument, NULL, 0},
+               {NULL, 0, 0, 0}
+       };
+
+       /* Parse command line */
+       while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
+               switch (opt) {
+               /* Portmask */
+               case 'p':
+                       enabled_port_mask = parse_portmask(optarg);
+                       if (enabled_port_mask == 0) {
+                               RTE_LOG(INFO, CONFIG, "Invalid portmask\n");
+                               us_vhost_usage(prgname);
+                               return -1;
+                       }
+                       break;
+
+               case 0:
+                       /* Enable/disable vm2vm comms. */
+                       if (!strncmp(long_option[option_index].name, "vm2vm", MAX_LONG_OPT_SZ)) {
+                               ret = parse_num_opt(optarg, 1);
+                               if (ret == -1) {
+                                       RTE_LOG(INFO, CONFIG, "Invalid argument for vm2vm [0|1]\n");
+                                       us_vhost_usage(prgname);
+                                       return -1;
+                               } else {
+                                       enable_vm2vm = ret;
+                               }
+                       }
+
+                       /* Enable/disable stats. */
+                       if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
+                               ret = parse_num_opt(optarg, INT32_MAX);
+                               if (ret == -1) {
+                                       RTE_LOG(INFO, CONFIG, "Invalid argument for stats [0..N]\n");
+                                       us_vhost_usage(prgname);
+                                       return -1;
+                               } else {
+                                       enable_stats = ret;
+                               }
+                       }
+                       break;
+
+                       /* Invalid option - print options. */
+               default:
+                       us_vhost_usage(prgname);
+                       return -1;
+               }
+       }
+
+       for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
+               if (enabled_port_mask & (1 << i))
+                       ports[num_ports++] = (uint8_t)i;
+       }
+
+       if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
+               RTE_LOG(INFO, PORT, "Current enabled port number is %u,"
+                       "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Update the global variable num_ports and the ports array according to the number
+ * of system ports, and return the number of valid ports.
+ */
+static unsigned check_ports_num(unsigned nb_ports)
+{
+       unsigned valid_num_ports = num_ports;
+       unsigned portid;
+
+       if (num_ports > nb_ports) {
+               RTE_LOG(INFO, PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
+                       num_ports, nb_ports);
+               num_ports = nb_ports;
+       }
+
+       for (portid = 0; portid < num_ports; portid ++) {
+               if (ports[portid] >= nb_ports) {
+                       RTE_LOG(INFO, PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
+                               ports[portid], (nb_ports - 1));
+                       ports[portid] = INVALID_PORT_ID;
+                       valid_num_ports--;
+               }
+       }
+       return valid_num_ports;
+}
+
+/*
+ * Macro to print out packet contents. Wrapped in debug define so that the
+ * data path is not affected when debug is disabled.
+ */
+#ifdef DEBUG
+#define PRINT_PACKET(device, addr, size, header) do {                                                                                                                          \
+       char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
+       unsigned int index;                                                                                                                                                                                             \
+       char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
+                                                                                                                                                                                                                                       \
+       if ((header))                                                                                                                                                                                                   \
+               rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                              \
+       else                                                                                                                                                                                                                    \
+               rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                              \
+       for (index = 0; index < (size); index++) {                                                                                                                                              \
+               rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),        \
+                       "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
+       }                                                                                                                                                                                                                               \
+       rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
+                                                                                                                                                                                                                                       \
+       LOG_DEBUG(DATA, "%s", packet);                                                                                                                                                                  \
+} while(0)
+#else
+#define PRINT_PACKET(device, addr, size, header) do{} while(0)
+#endif
+
+/*
+ * Function to convert guest physical addresses to vhost virtual addresses. This
+ * is used to convert virtio buffer addresses.
+ */
+static inline uint64_t __attribute__((always_inline))
+gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
+{
+       struct virtio_memory_regions *region;
+       uint32_t regionidx;
+       uint64_t vhost_va = 0;
+
+       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+               region = &dev->mem->regions[regionidx];
+               if ((guest_pa >= region->guest_phys_address) &&
+                       (guest_pa <= region->guest_phys_address_end)) {
+                       vhost_va = region->address_offset + guest_pa;
+                       break;
+               }
+       }
+       LOG_DEBUG(DATA, "(%"PRIu64") GPA %p| VVA %p\n",
+               dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
+
+       return vhost_va;
+}
+
+/*
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       struct vring_desc *desc;
+       struct rte_mbuf *buff;
+       /* The virtio_hdr is initialised to 0. */
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
+       uint64_t buff_addr = 0;
+       uint64_t buff_hdr_addr = 0;
+       uint32_t head[MAX_PKT_BURST], packet_len = 0;
+       uint32_t head_idx, packet_success = 0;
+       uint16_t avail_idx, res_cur_idx;
+       uint16_t res_base_idx, res_end_idx;
+       uint16_t free_entries;
+       uint8_t success = 0;
+
+       LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
+       vq = dev->virtqueue_rx;
+       count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
+       /* As many data cores may want access to available buffers, they need to be reserved. */
+       do {
+
+               res_base_idx = vq->last_used_idx_res;
+
+               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+
+               free_entries = (avail_idx - res_base_idx);
+
+               /*check that we have enough buffers*/
+               if (unlikely(count > free_entries))
+                       count = free_entries;
+
+               if (count == 0)
+                       return 0;
+
+               res_end_idx = res_base_idx + count;
+               /* vq->last_used_idx_res is atomically updated. */
+               success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
+                                                                       res_end_idx);
+       } while (unlikely(success == 0));
+       res_cur_idx = res_base_idx;
+       LOG_DEBUG(DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
+
+       /* Prefetch available ring to retrieve indexes. */
+       rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
+
+       /* Retrieve all of the head indexes first to avoid caching issues. */
+       for (head_idx = 0; head_idx < count; head_idx++)
+               head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
+
+       /*Prefetch descriptor index. */
+       rte_prefetch0(&vq->desc[head[packet_success]]);
+
+       while (res_cur_idx != res_end_idx) {
+               /* Get descriptor from available ring */
+               desc = &vq->desc[head[packet_success]];
+               /* Prefetch descriptor address. */
+               rte_prefetch0(desc);
+
+               buff = pkts[packet_success];
+
+               /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
+               buff_addr = gpa_to_vva(dev, desc->addr);
+               /* Prefetch buffer address. */
+               rte_prefetch0((void*)(uintptr_t)buff_addr);
+
+               {
+                       /* Copy virtio_hdr to packet and increment buffer address */
+                       buff_hdr_addr = buff_addr;
+                       packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
+
+                       /*
+                        * If the descriptors are chained the header and data are placed in
+                        * separate buffers.
+                        */
+                       if (desc->flags & VRING_DESC_F_NEXT) {
+                               desc->len = vq->vhost_hlen;
+                               desc = &vq->desc[desc->next];
+                               /* Buffer address translation. */
+                               buff_addr = gpa_to_vva(dev, desc->addr);
+                               desc->len = rte_pktmbuf_data_len(buff);
+                       } else {
+                               buff_addr += vq->vhost_hlen;
+                               desc->len = packet_len;
+                       }
+               }
+
+               /* Update used ring with desc information */
+               vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
+               vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
+
+               /* Copy mbuf data to buffer */
+               rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
+
+               res_cur_idx++;
+               packet_success++;
+
+               /* Mergeable buffers are disabled, so a header is required per buffer. */
+               rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
+               if (res_cur_idx < res_end_idx) {
+                       /* Prefetch descriptor index. */
+                       rte_prefetch0(&vq->desc[head[packet_success]]);
+               }
+       }
+
+       rte_compiler_barrier();
+
+       /* Wait until it's our turn to add our buffer to the used ring. */
+       while (unlikely(vq->last_used_idx != res_base_idx))
+               rte_pause();
+
+       *(volatile uint16_t *)&vq->used->idx += count;
+
+       vq->last_used_idx = res_end_idx;
+
+       return count;
+}
+
+/*
+ * Compares a packet destination MAC address to a device MAC address.
+ */
+static inline int __attribute__((always_inline))
+ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
+{
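+       /* XOR 8 bytes of each address and mask with MAC_ADDR_CMP so that only the
+        * 48 bits covering the 6 MAC-address bytes are compared (little-endian layout). */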
+       return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
+}
+
+/*
+ * This function registers the device MAC address along with a
+ * VLAN tag to a VMDQ pool.
+ */
+static int
+link_vmdq(struct virtio_net *dev)
+{
+       int ret;
+       struct virtio_net_data_ll *dev_ll;
+
+       dev_ll = ll_root_used;
+
+       while (dev_ll != NULL) {
+               if ((dev != dev_ll->dev) && ether_addr_cmp(&dev->mac_address, &dev_ll->dev->mac_address)) {
+                       RTE_LOG(INFO, DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
+                       return -1;
+               }
+               dev_ll = dev_ll->next;
+       }
+
+       /* vlan_tag currently uses the device_id. */
+       dev->vlan_tag = vlan_tags[dev->device_fh];
+       dev->vmdq_rx_q = dev->device_fh * (num_queues/num_devices);
+
+       /* Print out VMDQ registration info. */
+       RTE_LOG(INFO, DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
+               dev->device_fh,
+               dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
+               dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
+               dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
+               dev->vlan_tag);
+
+       /* Register the MAC address. */
+       ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
+       if (ret) {
+               RTE_LOG(ERR, DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
+                                                                               dev->device_fh);
+               return -1;
+       }
+
+       /* Enable stripping of the vlan tag as we handle routing. */
+       rte_eth_dev_set_vlan_strip_on_queue(ports[0], dev->vmdq_rx_q, 1);
+
+       rte_compiler_barrier();
+       /* Set device as ready for RX. */
+       dev->ready = DEVICE_READY;
+
+       return 0;
+}
+
+/*
+ * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
+ * queue before disabling RX on the device.
+ */
+static inline void
+unlink_vmdq(struct virtio_net *dev)
+{
+       unsigned i = 0;
+       unsigned rx_count;
+       struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+
+       if (dev->ready == DEVICE_READY) {
+               /*clear MAC and VLAN settings*/
+               rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
+               for (i = 0; i < 6; i++)
+                       dev->mac_address.addr_bytes[i] = 0;
+
+               dev->vlan_tag = 0;
+
+               /*Clear out the receive buffers*/
+               rx_count = rte_eth_rx_burst(ports[0],
+                                       (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
+
+               while (rx_count) {
+                       for (i = 0; i < rx_count; i++)
+                               rte_pktmbuf_free(pkts_burst[i]);
+
+                       rx_count = rte_eth_rx_burst(ports[0],
+                                       (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
+               }
+
+               dev->ready = DEVICE_NOT_READY;
+       }
+}
+
+/*
+ * Check if the packet destination MAC address is for a local device. If so then put
+ * the packet on that device's RX queue. If not then return.
+ */
+static inline unsigned __attribute__((always_inline))
+virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
+{
+       struct virtio_net_data_ll *dev_ll;
+       struct ether_hdr *pkt_hdr;
+       uint64_t ret = 0;
+
+       pkt_hdr = (struct ether_hdr *)m->pkt.data;
+
+       /*get the used devices list*/
+       dev_ll = ll_root_used;
+
+       while (dev_ll != NULL) {
+               if (likely(dev_ll->dev->ready == DEVICE_READY) && ether_addr_cmp(&(pkt_hdr->d_addr),
+                                         &dev_ll->dev->mac_address)) {
+
+                       /* Drop the packet if the TX packet is destined for the TX device. */
+                       if (dev_ll->dev->device_fh == dev->device_fh) {
+                               LOG_DEBUG(DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
+                                                       dev_ll->dev->device_fh);
+                               return 0;
+                       }
+
+
+                       LOG_DEBUG(DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
+
+                       if (dev_ll->dev->remove) {
+                               /*drop the packet if the device is marked for removal*/
+                               LOG_DEBUG(DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
+                       } else {
+                               /*send the packet to the local virtio device*/
+                               ret = virtio_dev_rx(dev_ll->dev, &m, 1);
+                               if (enable_stats) {
+                                       rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx_total, 1);
+                                       rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx, ret);
+                                       dev_statistics[dev->device_fh].tx_total++;
+                                       dev_statistics[dev->device_fh].tx += ret;
+                               }
+                       }
+
+                       return 0;
+               }
+               dev_ll = dev_ll->next;
+       }
+
+       return -1;
+}
+
+/*
+ * This function routes the TX packet to the correct interface. This may be a local device
+ * or the physical port.
+ */
+static inline void __attribute__((always_inline))
+virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
+{
+       struct mbuf_table *tx_q;
+       struct vlan_ethhdr *vlan_hdr;
+       struct rte_mbuf **m_table;
+       struct rte_mbuf *mbuf;
+       unsigned len, ret;
+       const uint16_t lcore_id = rte_lcore_id();
+
+       /*check if destination is local VM*/
+       if (enable_vm2vm && (virtio_tx_local(dev, m) == 0)) {
+               return;
+       }
+
+       LOG_DEBUG(DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
+
+       /*Add packet to the port tx queue*/
+       tx_q = &lcore_tx_queue[lcore_id];
+       len = tx_q->len;
+
+       /* Allocate an mbuf and populate the structure. */
+       mbuf = rte_pktmbuf_alloc(mbuf_pool);
+       if(!mbuf)
+               return;
+
+       mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN;
+       mbuf->pkt.pkt_len = mbuf->pkt.data_len;
+
+       /* Copy ethernet header to mbuf. */
+       rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
+
+
+       /* Set up the VLAN header. Fields are converted to network byte order with htons(). */
+       vlan_hdr = (struct vlan_ethhdr *) mbuf->pkt.data;
+       vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
+       vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
+       vlan_hdr->h_vlan_TCI = htons(vlan_tag);
+
+       /* Copy the remaining packet contents to the mbuf. */
+       rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
+               (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
+       tx_q->m_table[len] = mbuf;
+       len++;
+       if (enable_stats) {
+               dev_statistics[dev->device_fh].tx_total++;
+               dev_statistics[dev->device_fh].tx++;
+       }
+
+       if (unlikely(len == MAX_PKT_BURST)) {
+               m_table = (struct rte_mbuf **)tx_q->m_table;
+               ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
+               /* Free any buffers not handled by TX and update the port stats. */
+               if (unlikely(ret < len)) {
+                       do {
+                               rte_pktmbuf_free(m_table[ret]);
+                       } while (++ret < len);
+               }
+
+               len = 0;
+       }
+
+       tx_q->len = len;
+       return;
+}
+
+static inline void __attribute__((always_inline))
+virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
+{
+       struct rte_mbuf m;
+       struct vhost_virtqueue *vq;
+       struct vring_desc *desc;
+       uint64_t buff_addr = 0;
+       uint32_t head[MAX_PKT_BURST];
+       uint32_t used_idx;
+       uint32_t i;
+       uint16_t free_entries, packet_success = 0;
+       uint16_t avail_idx;
+
+       vq = dev->virtqueue_tx;
+       avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+
+       /* If there are no available buffers then return. */
+       if (vq->last_used_idx == avail_idx)
+               return;
+
+       LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
+
+       /* Prefetch available ring to retrieve head indexes. */
+       rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
+
+       /*get the number of free entries in the ring*/
+       free_entries = avail_idx - vq->last_used_idx;
+       free_entries = unlikely(free_entries < MAX_PKT_BURST) ? free_entries : MAX_PKT_BURST;
+
+       LOG_DEBUG(DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
+       /* Retrieve all of the head indexes first to avoid caching issues. */
+       for (i = 0; i < free_entries; i++)
+               head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
+
+       /* Prefetch descriptor index. */
+       rte_prefetch0(&vq->desc[head[packet_success]]);
+
+       while (packet_success < free_entries) {
+               desc = &vq->desc[head[packet_success]];
+               /* Prefetch descriptor address. */
+               rte_prefetch0(desc);
+
+               if (packet_success < (free_entries - 1)) {
+                       /* Prefetch descriptor index. */
+                       rte_prefetch0(&vq->desc[head[packet_success+1]]);
+               }
+
+               /* Update used index buffer information. */
+               used_idx = vq->last_used_idx & (vq->size - 1);
+               vq->used->ring[used_idx].id = head[packet_success];
+               vq->used->ring[used_idx].len = 0;
+
+               /* Discard first buffer as it is the virtio header */
+               desc = &vq->desc[desc->next];
+
+               /* Buffer address translation. */
+               buff_addr = gpa_to_vva(dev, desc->addr);
+               /* Prefetch buffer address. */
+               rte_prefetch0((void*)(uintptr_t)buff_addr);
+
+               /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
+               m.pkt.data_len = desc->len;
+               m.pkt.data = (void*)(uintptr_t)buff_addr;
+               m.pkt.nb_segs = 1; 
+
+               virtio_tx_route(dev, &m, mbuf_pool, 0);
+
+               vq->last_used_idx++;
+               packet_success++;
+       }
+
+       rte_compiler_barrier();
+       vq->used->idx += packet_success;
+       /* Kick guest if required. */
+}
+
+/*
+ * This function is called by each data core. It handles all RX/TX registered with the
+ * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
+ * with all devices in the main linked list.
+ */ 
+static int
+switch_worker(__attribute__((unused)) void *arg)
+{
+       struct rte_mempool *mbuf_pool = arg;
+       struct virtio_net *dev = NULL;
+       struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+       struct virtio_net_data_ll *dev_ll;
+       struct mbuf_table *tx_q;
+       volatile struct lcore_ll_info *lcore_ll;
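+       /* TSC cycles in BURST_TX_DRAIN_US microseconds (rounded up): the TX drain interval. */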
+       const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
+       uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
+       unsigned ret, i;
+       const uint16_t lcore_id = rte_lcore_id();
+       const uint16_t num_cores = (uint16_t)rte_lcore_count();
+       uint16_t rx_count = 0;
+
+       RTE_LOG(INFO, DATA, "Processing on Core %u started\n", lcore_id);
+       lcore_ll = lcore_info[lcore_id].lcore_ll;
+       prev_tsc = 0;
+
+       tx_q = &lcore_tx_queue[lcore_id];
+       for (i = 0; i < num_cores; i ++) {
+               if (lcore_ids[i] == lcore_id) {
+                       tx_q->txq_id = i;
+                       break;
+               }
+       }
+
+       while(1) {
+               cur_tsc = rte_rdtsc();
+               /*
+                * TX burst queue drain
+                */
+               diff_tsc = cur_tsc - prev_tsc;
+               if (unlikely(diff_tsc > drain_tsc)) {
+
+                       if (tx_q->len) {
+                               LOG_DEBUG(DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
+
+                               /*Tx any packets in the queue*/
+                               ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
+                                                                          (struct rte_mbuf **)tx_q->m_table,
+                                                                          (uint16_t)tx_q->len);
+                               if (unlikely(ret < tx_q->len)) {
+                                       do {
+                                               rte_pktmbuf_free(tx_q->m_table[ret]);
+                                       } while (++ret < tx_q->len);
+                               }
+
+                               tx_q->len = 0;
+                       }
+
+                       prev_tsc = cur_tsc;
+
+               }
+
+               /* 
+                * If requested, inform the configuration core that we have exited the linked list
+                * and that no devices are in use.
+                */
+               if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL) 
+                       lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
+
+               /*
+                * Process devices
+                */
+               dev_ll = lcore_ll->ll_root_used;
+
+               while (dev_ll != NULL) {
+                       /*get virtio device ID*/
+                       dev = dev_ll->dev;
+
+                       if (unlikely(dev->remove)) {
+                               dev_ll = dev_ll->next;
+                               unlink_vmdq(dev);
+                               dev->ready = DEVICE_SAFE_REMOVE;
+                               continue;
+                       }
+                       if (likely(dev->ready == DEVICE_READY)) {
+                               /*Handle guest RX*/
+                               rx_count = rte_eth_rx_burst(ports[0],
+                                       (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
+
+                               if (rx_count) {
+                                       ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
+                                       if (enable_stats) {
+                                               rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx_total, rx_count);
+                                               rte_atomic64_add(&dev_statistics[dev_ll->dev->device_fh].rx, ret_count);
+                                       }
+                                       while (likely(rx_count)) {
+                                               rx_count--;
+                                               rte_pktmbuf_free_seg(pkts_burst[rx_count]);
+                                       }
+
+                               }
+                       }
+
+                       if (likely(!dev->remove))
+                               /*Handle guest TX*/
+                               virtio_dev_tx(dev, mbuf_pool);
+
+                       /*move to the next device in the list*/
+                       dev_ll = dev_ll->next;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Add an entry to a used linked list. A free entry must first be found in the free linked list
+ * using get_data_ll_free_entry();
+ */
+static void
+add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev)
+{
+       struct virtio_net_data_ll *ll = *ll_root_addr;
+
+       /* Set next as NULL and use a compiler barrier to avoid reordering. */
+       ll_dev->next = NULL;
+       rte_compiler_barrier();
+
+       /* If ll == NULL then this is the first device. */
+       if (ll) {
+               /* Increment to the tail of the linked list. */
+               while ((ll->next != NULL) )
+                       ll = ll->next;
+
+               ll->next = ll_dev;
+       } else {
+               *ll_root_addr = ll_dev;
+       }
+}
+
+/*
+ * Remove an entry from a used linked list. The entry must then be added to the free linked list
+ * using put_data_ll_free_entry().
+ */
+static void
+rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev, struct virtio_net_data_ll *ll_dev_last)
+{
+       struct virtio_net_data_ll *ll = *ll_root_addr;
+
+       if (ll_dev == ll)
+               *ll_root_addr = ll_dev->next;
+       else
+               ll_dev_last->next = ll_dev->next;
+}
+
+/*
+ * Find and return an entry from the free linked list.
+ */
+static struct virtio_net_data_ll *
+get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
+{
+       struct virtio_net_data_ll *ll_free = *ll_root_addr;
+       struct virtio_net_data_ll *ll_dev;
+
+       if (ll_free == NULL)
+               return NULL;
+
+       ll_dev = ll_free;
+       *ll_root_addr = ll_free->next;
+
+       return ll_dev;
+}
+
+/*
+ * Place an entry back on to the free linked list.
+ */
+static void
+put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr, struct virtio_net_data_ll *ll_dev)
+{
+       struct virtio_net_data_ll *ll_free = *ll_root_addr;
+
+       ll_dev->next = ll_free;
+       *ll_root_addr = ll_dev;
+}
+
+/*
+ * Creates a linked list of a given size.
+ */
+static struct virtio_net_data_ll *
+alloc_data_ll(uint32_t size)
+{
+       struct virtio_net_data_ll *ll_new;
+       uint32_t i;
+
+       /* Malloc and then chain the linked list. */
+       ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
+       if (ll_new == NULL) {
+               RTE_LOG(ERR, CONFIG, "Failed to allocate memory for ll_new.\n");
+               return NULL;
+       }
+
+       for (i = 0; i < size - 1; i++) {
+               ll_new[i].dev = NULL;
+               ll_new[i].next = &ll_new[i+1];
+       }
+       ll_new[i].next = NULL;
+
+       return (ll_new);
+}
+
+/*
+ * Create the main linked list along with each individual core's linked list. A used and a free list
+ * are created to manage entries.
+ */
+static int
+init_data_ll (void)
+{
+       int lcore;
+
+       RTE_LCORE_FOREACH_SLAVE(lcore) {
+               lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
+               if (lcore_info[lcore].lcore_ll == NULL) {
+                       RTE_LOG(ERR, CONFIG, "Failed to allocate memory for lcore_ll.\n");
+                       return -1;
+               }
+
+               lcore_info[lcore].lcore_ll->device_num = 0;
+               lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
+               lcore_info[lcore].lcore_ll->ll_root_used = NULL;
+               if (num_devices % num_switching_cores)
+                       lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
+               else
+                       lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
+       }
+
+       /* Allocate devices up to a maximum of MAX_DEVICES. */
+       ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
+
+       return 0;
+}
+/*
+ * Remove a device from the specific data core linked list and from the main linked list. The
+ * RX/TX thread must set the flag to indicate that it is safe to remove the device.
+ */
+static void
+destroy_device (volatile struct virtio_net *dev)
+{
+       struct virtio_net_data_ll *ll_lcore_dev_cur;
+       struct virtio_net_data_ll *ll_main_dev_cur;
+       struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
+       struct virtio_net_data_ll *ll_main_dev_last = NULL;
+       int lcore;
+
+       dev->flags &= ~VIRTIO_DEV_RUNNING;
+
+       /*set the remove flag. */
+       dev->remove = 1;
+
+       while(dev->ready != DEVICE_SAFE_REMOVE) {
+               rte_pause();
+       }
+
+       /* Search for entry to be removed from lcore ll */
+       ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
+       while (ll_lcore_dev_cur != NULL) {
+               if (ll_lcore_dev_cur->dev == dev) {
+                       break;
+               } else {
+                       ll_lcore_dev_last = ll_lcore_dev_cur;
+                       ll_lcore_dev_cur = ll_lcore_dev_cur->next;
+               }
+       }
+
+       /* Search for entry to be removed from main ll */
+       ll_main_dev_cur = ll_root_used;
+       ll_main_dev_last = NULL;
+       while (ll_main_dev_cur != NULL) {
+               if (ll_main_dev_cur->dev == dev) {
+                       break;
+               } else {
+                       ll_main_dev_last = ll_main_dev_cur;
+                       ll_main_dev_cur = ll_main_dev_cur->next;
+               }
+       }
+
+       if (ll_lcore_dev_cur == NULL || ll_main_dev_cur == NULL) {
+               RTE_LOG(ERR, XENHOST, "%s: could not find device in per_cpu list or main_list\n", __func__);
+               return;
+       }
+
+       /* Remove entries from the lcore and main ll. */
+       rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
+       rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
+
+       /* Set the dev_removal_flag on each lcore. */
+       RTE_LCORE_FOREACH_SLAVE(lcore) {
+               lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
+       }
+       
+       /* 
+        * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
+        * they can no longer access the device removed from the linked lists and that it is
+        * no longer in use.
+        */
+       RTE_LCORE_FOREACH_SLAVE(lcore) {
+               while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
+                       rte_pause();
+               }
+       }
+
+       /* Add the entries back to the lcore and main free ll.*/
+       put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
+       put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
+
+       /* Decrement number of device on the lcore. */
+       lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
+       
+       RTE_LOG(INFO, DATA, "  #####(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
+}
+
+/*
+ * A new device is added to a data core. First the device is added to the main linked list
+ * and then allocated to a specific data core.
+ */
+static int
+new_device (struct virtio_net *dev)
+{
+       struct virtio_net_data_ll *ll_dev;
+       int lcore, core_add = 0;
+       uint32_t device_num_min = num_devices;
+
+       /* Add device to main ll */
+       ll_dev = get_data_ll_free_entry(&ll_root_free);
+       if (ll_dev == NULL) {
+               RTE_LOG(INFO, DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
+                       "of %d devices per core has been reached\n",
+                       dev->device_fh, num_devices);
+               return -1;
+       }
+       ll_dev->dev = dev;
+       add_data_ll_entry(&ll_root_used, ll_dev);
+
+       /*reset ready flag*/
+       dev->ready = DEVICE_NOT_READY;
+       dev->remove = 0;
+
+       /* Find a suitable lcore to add the device. */
+       RTE_LCORE_FOREACH_SLAVE(lcore) {
+               if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
+                       device_num_min = lcore_info[lcore].lcore_ll->device_num;
+                       core_add = lcore;
+               }
+       }
+       /* Add device to lcore ll */
+       ll_dev->dev->coreid = core_add;
+       ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
+       if (ll_dev == NULL) {
+               RTE_LOG(INFO, DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
+               destroy_device(dev);
+               return -1;
+       }
+       ll_dev->dev = dev;
+       add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
+
+       /* Initialize device stats */
+       memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
+
+       lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
+       dev->flags |= VIRTIO_DEV_RUNNING;
+
+       RTE_LOG(INFO, DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
+
+       link_vmdq(dev);
+
+       return 0;
+}
+
+/*
+ * These callbacks allow devices to be added to the data core when configuration
+ * has fully completed.
+ */
+static const struct virtio_net_device_ops virtio_net_device_ops =
+{
+       .new_device =  new_device,
+       .destroy_device = destroy_device,
+};
+
+/*
+ * This thread wakes up periodically to print statistics if the user has
+ * enabled them.
+ */
+static void
+print_stats(void)
+{
+       struct virtio_net_data_ll *dev_ll;
+       uint64_t tx_dropped, rx_dropped;
+       uint64_t tx, tx_total, rx, rx_total;
+       uint32_t device_fh;
+       const char clr[] = { 27, '[', '2', 'J', '\0' };
+       const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
+
+       while(1) {
+               sleep(enable_stats);
+
+               /* Clear screen and move to top left */
+               printf("%s%s", clr, top_left);
+
+               printf("\nDevice statistics ====================================");
+
+               dev_ll = ll_root_used;
+               while (dev_ll != NULL) {
+                       device_fh = (uint32_t)dev_ll->dev->device_fh;
+                       tx_total = dev_statistics[device_fh].tx_total;
+                       tx = dev_statistics[device_fh].tx;
+                       tx_dropped = tx_total - tx;
+                       rx_total = rte_atomic64_read(&dev_statistics[device_fh].rx_total);
+                       rx = rte_atomic64_read(&dev_statistics[device_fh].rx);
+                       rx_dropped = rx_total - rx;
+
+                       printf("\nStatistics for device %"PRIu32" ------------------------------"
+                                       "\nTX total:            %"PRIu64""
+                                       "\nTX dropped:          %"PRIu64""
+                                       "\nTX successful:               %"PRIu64""
+                                       "\nRX total:            %"PRIu64""
+                                       "\nRX dropped:          %"PRIu64""
+                                       "\nRX successful:               %"PRIu64"",
+                                       device_fh,
+                                       tx_total,
+                                       tx_dropped,
+                                       tx,
+                                       rx_total,
+                                       rx_dropped,
+                                       rx);
+
+                       dev_ll = dev_ll->next;
+               }
+               printf("\n======================================================\n");
+       }
+}
+
+
+int init_virtio_net(struct virtio_net_device_ops const * const ops);
+
+/* 
+ * Main function, does initialisation and calls the per-lcore functions. The Xen
+ * vhost back end is also initialised here and the XenStore monitor loop is entered.
+ */
+int
+MAIN(int argc, char *argv[])
+{
+       struct rte_mempool *mbuf_pool;
+       unsigned lcore_id, core_id = 0;
+       unsigned nb_ports, valid_num_ports;
+       int ret;
+       uint8_t portid;
+       static pthread_t tid;
+
+       /* init EAL */
+       ret = rte_eal_init(argc, argv);
+       if (ret < 0)
+               rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
+       argc -= ret;
+       argv += ret;
+
+       /* parse app arguments */
+       ret = us_vhost_parse_args(argc, argv);
+       if (ret < 0)
+               rte_exit(EXIT_FAILURE, "Invalid argument\n");
+
+       if (rte_pmd_init_all() != 0 || rte_eal_pci_probe() != 0)
+               rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n");
+
+       for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
+               if (rte_lcore_is_enabled(lcore_id))
+                       lcore_ids[core_id ++] = lcore_id;
+
+       if (rte_lcore_count() > RTE_MAX_LCORE)
+               rte_exit(EXIT_FAILURE,"Not enough cores\n");
+
+       /* set the number of switching cores available */
+       num_switching_cores = rte_lcore_count()-1;
+
+       /* Get the number of physical ports. */
+       nb_ports = rte_eth_dev_count();
+       if (nb_ports > RTE_MAX_ETHPORTS)
+               nb_ports = RTE_MAX_ETHPORTS;
+
+       /*
+        * Update the global var NUM_PORTS and global array PORTS
+        * and get value of var VALID_NUM_PORTS according to system ports number
+        */
+       valid_num_ports = check_ports_num(nb_ports);
+
+       if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
+               RTE_LOG(INFO, PORT, "Current enabled port number is %u,"
+                       "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
+               return -1;
+       }
+
+       /* Create the mbuf pool. */
+       mbuf_pool = rte_mempool_create("MBUF_POOL", NUM_MBUFS_PER_PORT * valid_num_ports,
+                                      MBUF_SIZE, MBUF_CACHE_SIZE,
+                                      sizeof(struct rte_pktmbuf_pool_private),
+                                      rte_pktmbuf_pool_init, NULL,
+                                      rte_pktmbuf_init, NULL,
+                                      rte_socket_id(), 0);
+       if (mbuf_pool == NULL)
+               rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
+
+       /* Set log level. */
+       rte_set_log_level(LOG_LEVEL);
+
+       /* initialize all ports */
+       for (portid = 0; portid < nb_ports; portid++) {
+               /* skip ports that are not enabled */
+               if ((enabled_port_mask & (1 << portid)) == 0) {
+                       RTE_LOG(INFO, PORT, "Skipping disabled port %d\n", portid);
+                       continue;
+               }
+               if (port_init(portid, mbuf_pool) != 0)
+                       rte_exit(EXIT_FAILURE, "Cannot initialize network ports\n");
+       }
+
+       /* Initialise all linked lists. */
+       if (init_data_ll() == -1)
+               rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
+
+       /* Initialize device stats */
+       memset(&dev_statistics, 0, sizeof(dev_statistics));
+
+       /* Enable stats if the user option is set. */
+       if (enable_stats)
+               pthread_create(&tid, NULL, (void*)print_stats, NULL );
+
+       /* Launch all data cores. */
+       RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+               rte_eal_remote_launch(switch_worker, mbuf_pool, lcore_id);
+       }
+
+       init_virtio_xen(&virtio_net_device_ops);
+
+       virtio_monitor_loop();
+       return 0;
+}
diff --git a/examples/vhost_xen/main.h b/examples/vhost_xen/main.h
new file mode 100644 (file)
index 0000000..0ec39cc
--- /dev/null
@@ -0,0 +1,85 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+#ifdef RTE_EXEC_ENV_BAREMETAL
+#define MAIN _main
+#else
+#define MAIN main
+#endif
+
+//#define DEBUG
+
+#ifdef DEBUG
+#define LOG_LEVEL RTE_LOG_DEBUG
+#define LOG_DEBUG(log_type, fmt, args...) \
+       RTE_LOG(DEBUG, log_type, fmt, ##args)
+#else
+#define LOG_LEVEL RTE_LOG_INFO
+#define LOG_DEBUG(log_type, fmt, args...) do{} while(0)
+#endif
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_DATA RTE_LOGTYPE_USER2
+#define RTE_LOGTYPE_PORT RTE_LOGTYPE_USER3
+
+/*
+ * Device linked list structure for data path.
+ */
+struct virtio_net_data_ll
+{
+       struct virtio_net          *dev;   /* Pointer to device created by configuration core. */
+       struct virtio_net_data_ll  *next;  /* Pointer to next device in linked list. */
+};
+
+/*
+ * Structure containing data core specific information.
+ */
+struct lcore_ll_info
+{
+       struct virtio_net_data_ll    *ll_root_free;     /* Pointer to head in free linked list. */
+       struct virtio_net_data_ll    *ll_root_used;         /* Pointer to head of used linked list. */
+       uint32_t                      device_num;       /* Number of devices on lcore. */
+       volatile  uint8_t             dev_removal_flag; /* Flag to synchronize device removal. */
+};
+
+struct lcore_info
+{
+       struct lcore_ll_info    *lcore_ll;      /* Pointer to data core specific lcore_ll_info struct */
+};
+
+int MAIN(int argc, char **argv);
+#endif /* _MAIN_H_ */
diff --git a/examples/vhost_xen/vhost_monitor.c b/examples/vhost_xen/vhost_monitor.c
new file mode 100644 (file)
index 0000000..5d67ab2
--- /dev/null
@@ -0,0 +1,595 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <xen/xen-compat.h>
+#if __XEN_LATEST_INTERFACE_VERSION__ < 0x00040200
+#include <xs.h>
+#else
+#include <xenstore.h>
+#endif
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+
+#include "virtio-net.h"
+#include "xen_vhost.h"
+
+struct virtio_watch {
+       struct xs_handle *xs;
+       int watch_fd;
+};
+
+
+/* device ops to add/remove device to/from data core. */
+static struct virtio_net_device_ops const *notify_ops;
+
+/* root address of the linked list in the configuration core. */
+static struct virtio_net_config_ll *ll_root = NULL;
+
+/* root address of VM. */
+static struct xen_guestlist guest_root;
+
+static struct virtio_watch watch;
+
+static void
+vq_vring_init(struct vhost_virtqueue *vq, unsigned int num, uint8_t *p,
+       unsigned long align)
+{
+       vq->size = num;
+       vq->desc = (struct vring_desc *) p;
+       vq->avail = (struct vring_avail *) (p +
+               num * sizeof(struct vring_desc));
+       vq->used = (void *)
+               RTE_ALIGN_CEIL( (uintptr_t)(&vq->avail->ring[num]), align);
+
+}
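+
+/*
+ * Layout note: vq_vring_init() above places the rings contiguously in the
+ * shared memory, with the descriptor table first, the available ring
+ * immediately after it, and the used ring starting at the next 'align'-byte
+ * boundary. This matches the standard virtio vring layout, so the guest
+ * front end and this back end agree on where each ring lives.
+ */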
+
+static int
+init_watch(void)
+{
+       struct xs_handle *xs;
+       int ret;
+       int fd;
+
+       /* get a connection to the daemon */
+       xs = xs_daemon_open();
+       if (xs == NULL) {
+               RTE_LOG(ERR, XENHOST, "xs_daemon_open failed\n");
+               return (-1);
+       }
+
+       ret = xs_watch(xs, "/local/domain", "mytoken");
+       if (ret == 0) {
+               RTE_LOG(ERR, XENHOST, "%s: xs_watch failed\n", __func__);
+               xs_daemon_close(xs);
+               return (-1);
+       }
+
+       /* We are notified of read availability on the watch via the file descriptor. */
+       fd = xs_fileno(xs);
+       watch.xs = xs;
+       watch.watch_fd = fd;
+
+       TAILQ_INIT(&guest_root);
+       return 0;
+}
+
+static struct xen_guest *
+get_xen_guest(int dom_id)
+{
+       struct xen_guest *guest = NULL;
+
+       TAILQ_FOREACH(guest, &guest_root, next) {
+               if(guest->dom_id == dom_id)
+                       return guest;
+       }
+
+       return (NULL);
+}
+
+
+static struct xen_guest * 
+add_xen_guest(int32_t dom_id)
+{
+       struct xen_guest *guest = NULL;
+
+       if ((guest = get_xen_guest(dom_id)) != NULL)
+               return guest;
+
+       guest = (struct xen_guest * )calloc(1, sizeof(struct xen_guest));
+       if (guest) {
+               RTE_LOG(ERR, XENHOST, "  %s: return newly created guest with %d rings\n", __func__, guest->vring_num);
+               TAILQ_INSERT_TAIL(&guest_root, guest, next);
+               guest->dom_id = dom_id;
+       }
+
+       return guest;
+}
+
+static void
+cleanup_device(struct virtio_net_config_ll *ll_dev)
+{
+       if (ll_dev == NULL)
+               return;
+       if (ll_dev->dev.virtqueue_rx) {
+               rte_free(ll_dev->dev.virtqueue_rx);
+               ll_dev->dev.virtqueue_rx = NULL;
+       }
+       if (ll_dev->dev.virtqueue_tx) {
+               rte_free(ll_dev->dev.virtqueue_tx);
+               ll_dev->dev.virtqueue_tx = NULL;
+       }
+       free(ll_dev);
+}
+
+/*
+ * Add entry containing a device to the device configuration linked list. 
+ */
+static void
+add_config_ll_entry(struct virtio_net_config_ll *new_ll_dev)
+{
+       struct virtio_net_config_ll *ll_dev = ll_root;
+
+       /* If ll_dev == NULL then this is the first device so go to else */
+       if (ll_dev) {
+               /* If the 1st device_id != 0 then we insert our device here. */
+               if (ll_dev->dev.device_fh != 0) {
+                       new_ll_dev->dev.device_fh = 0;
+                       new_ll_dev->next = ll_dev;
+                       ll_root = new_ll_dev;
+               } else {                
+                       /* increment through the ll until we find an unused device_id,
+                        * insert the device at that entry
+                        */
+                       while ((ll_dev->next != NULL) && (ll_dev->dev.device_fh == (ll_dev->next->dev.device_fh - 1)))
+                               ll_dev = ll_dev->next;
+                       
+                       new_ll_dev->dev.device_fh = ll_dev->dev.device_fh + 1;
+                       new_ll_dev->next = ll_dev->next;
+                       ll_dev->next = new_ll_dev;
+               }
+       } else {
+               ll_root = new_ll_dev;
+               ll_root->dev.device_fh = 0;
+       }
+}
+
+
+/*
+ * Remove an entry from the device configuration linked list.
+ */
+static struct virtio_net_config_ll *
+rm_config_ll_entry(struct virtio_net_config_ll *ll_dev, struct virtio_net_config_ll *ll_dev_last)
+{      
+       /* First remove the device and then clean it up. */
+       if (ll_dev == ll_root) {
+               ll_root = ll_dev->next;
+               cleanup_device(ll_dev);
+               return ll_root;
+       } else {
+               ll_dev_last->next = ll_dev->next;
+               cleanup_device(ll_dev);
+               return ll_dev_last->next;
+       }
+}
+
+/*
+ * Retrieves an entry from the device configuration linked list.
+ */
+static struct virtio_net_config_ll * 
+get_config_ll_entry(unsigned int virtio_idx, unsigned int dom_id)
+{
+       struct virtio_net_config_ll *ll_dev = ll_root;
+
+       /* Loop through linked list until the dom_id is found. */
+       while (ll_dev != NULL) {
+               if (ll_dev->dev.dom_id == dom_id && ll_dev->dev.virtio_idx == virtio_idx) 
+                       return ll_dev;
+               ll_dev = ll_dev->next;
+       } 
+
+       return NULL;
+}
+
+/* 
+ * Initialise all variables in device structure. 
+ */
+static void
+init_dev(struct virtio_net *dev)
+{
+       RTE_SET_USED(dev);
+}
+
+
+static struct
+virtio_net_config_ll *new_device(unsigned int virtio_idx, struct xen_guest *guest)
+{
+       struct virtio_net_config_ll *new_ll_dev;
+       struct vhost_virtqueue *virtqueue_rx, *virtqueue_tx;
+       size_t size, vq_ring_size, vq_size = VQ_DESC_NUM;
+       void *vq_ring_virt_mem;
+       uint64_t gpa;
+       uint32_t i;
+
+       /* Setup device and virtqueues. */      
+       new_ll_dev   = calloc(1, sizeof(struct virtio_net_config_ll));
+       virtqueue_rx = rte_zmalloc(NULL, sizeof(struct vhost_virtqueue), CACHE_LINE_SIZE);
+       virtqueue_tx = rte_zmalloc(NULL, sizeof(struct vhost_virtqueue), CACHE_LINE_SIZE);
+       if (new_ll_dev == NULL || virtqueue_rx == NULL || virtqueue_tx == NULL)
+               goto err;
+
+       new_ll_dev->dev.virtqueue_rx = virtqueue_rx;    
+       new_ll_dev->dev.virtqueue_tx = virtqueue_tx;
+       new_ll_dev->dev.dom_id       = guest->dom_id;
+       new_ll_dev->dev.virtio_idx   = virtio_idx;
+       /* Initialise device and virtqueues. */
+       init_dev(&new_ll_dev->dev);
+
+       size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN);
+       vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
+       (void)vq_ring_size;
+
+       vq_ring_virt_mem = guest->vring[virtio_idx].rxvring_addr;
+       vq_vring_init(virtqueue_rx, vq_size, vq_ring_virt_mem, VIRTIO_PCI_VRING_ALIGN);
+       virtqueue_rx->size = vq_size;
+       virtqueue_rx->vhost_hlen = sizeof(struct virtio_net_hdr);
+
+       vq_ring_virt_mem = guest->vring[virtio_idx].txvring_addr;
+       vq_vring_init(virtqueue_tx, vq_size, vq_ring_virt_mem, VIRTIO_PCI_VRING_ALIGN);
+       virtqueue_tx->size = vq_size;
+       memcpy(&new_ll_dev->dev.mac_address, &guest->vring[virtio_idx].addr, sizeof(struct ether_addr));
+
+       /* virtio_memory has to be one per domid */
+       new_ll_dev->dev.mem = malloc(sizeof(struct virtio_memory) + sizeof(struct virtio_memory_regions) * MAX_XENVIRT_MEMPOOL);
+       new_ll_dev->dev.mem->nregions = guest->pool_num;
+       for (i = 0; i < guest->pool_num; i++) {
+               gpa = new_ll_dev->dev.mem->regions[i].guest_phys_address = (uint64_t)guest->mempool[i].gva; 
+               new_ll_dev->dev.mem->regions[i].guest_phys_address_end = gpa + guest->mempool[i].mempfn_num * getpagesize();
+               new_ll_dev->dev.mem->regions[i].address_offset = (uint64_t)guest->mempool[i].hva - gpa;
+       }
+
+       new_ll_dev->next = NULL;
+
+       /* Add entry to device configuration linked list. */
+       add_config_ll_entry(new_ll_dev);
+       return new_ll_dev;
+err:
+       if (new_ll_dev)
+               free(new_ll_dev);
+       if (virtqueue_rx)
+               rte_free(virtqueue_rx);
+       if (virtqueue_tx)
+               rte_free(virtqueue_tx);
+       return NULL;
+}
+
+static void
+destroy_guest(struct xen_guest *guest)
+{
+       uint32_t i;
+
+       for (i = 0; i < guest->vring_num; i++)
+               cleanup_vring(&guest->vring[i]);
+       /* clean mempool */
+       for (i = 0; i < guest->pool_num; i++)
+               cleanup_mempool(&guest->mempool[i]);
+       free(guest);
+
+       return;
+}
+
+/*
+ * This function will clean up the device and remove it from the device configuration linked list.
+ */
+static void 
+destroy_device(unsigned int virtio_idx, unsigned int dom_id)
+{
+       struct virtio_net_config_ll *ll_dev_cur_ctx, *ll_dev_last = NULL;
+       struct virtio_net_config_ll *ll_dev_cur = ll_root;
+
+       /* clean virtio device */
+       struct xen_guest *guest = NULL;
+       guest = get_xen_guest(dom_id);
+       if (guest == NULL)
+               return;
+
+       /* Find the linked list entry for the device to be removed. */
+       ll_dev_cur_ctx = get_config_ll_entry(virtio_idx, dom_id);
+       while (ll_dev_cur != NULL) {
+               /* If the entry for the device to be removed is found, remove it. */
+               if  (ll_dev_cur == ll_dev_cur_ctx) {
+                       if ((ll_dev_cur->dev.flags & VIRTIO_DEV_RUNNING))
+                               notify_ops->destroy_device(&(ll_dev_cur->dev));
+                       ll_dev_cur = rm_config_ll_entry(ll_dev_cur, ll_dev_last);
+               } else {
+                       ll_dev_last = ll_dev_cur;
+                       ll_dev_cur = ll_dev_cur->next;
+               }
+       }
+       RTE_LOG(INFO, XENHOST, "  %s guest:%p vring:%p rxvring:%p txvring:%p flag:%p\n",
+               __func__, guest, &guest->vring[virtio_idx], guest->vring[virtio_idx].rxvring_addr, guest->vring[virtio_idx].txvring_addr, guest->vring[virtio_idx].flag);
+       cleanup_vring(&guest->vring[virtio_idx]);
+       guest->vring[virtio_idx].removed = 1;
+       guest->vring_num -= 1;
+}
+
+
+static void 
+watch_unmap_event(void)
+{
+       int i;
+       struct xen_guest *guest  = NULL;
+       bool remove_request;
+
+       TAILQ_FOREACH(guest, &guest_root, next) {
+               for (i = 0; i < MAX_VIRTIO; i++) {
+                       if (guest->vring[i].dom_id && guest->vring[i].removed == 0 && *guest->vring[i].flag == 0) {
+                               RTE_LOG(INFO, XENHOST, "\n\n");
+                               RTE_LOG(INFO, XENHOST, "  #####%s:  (%d, %d) to be removed\n",
+                                       __func__,
+                                       guest->vring[i].dom_id,
+                                       i);
+                               destroy_device(i, guest->dom_id);
+                               RTE_LOG(INFO, XENHOST, "  %s: DOM %u, vring num: %d\n",
+                                       __func__,
+                                       guest->dom_id,
+                                       guest->vring_num);
+                       }
+               }
+       }
+
+_find_next_remove:
+       guest = NULL;
+       remove_request = false;
+       TAILQ_FOREACH(guest, &guest_root, next) {
+               if (guest->vring_num == 0) {
+                       remove_request = true;
+                       break;
+               }
+       }
+       if (remove_request == true) {
+               TAILQ_REMOVE(&guest_root, guest, next);
+               RTE_LOG(INFO, XENHOST, "  #####%s: destroy guest (%d)\n", __func__, guest->dom_id);
+               destroy_guest(guest);
+               goto _find_next_remove;
+       } 
+       return;
+}
+
+/*
+ * If the guest starts first, that is fine. If the host starts first, that is
+ * also fine. But if the guest starts, runs for some time, and the host then
+ * stops and restarts, last_used_idx goes back to 0. How should this be handled?
+ */
+
+static void virtio_init(void)
+{
+       uint32_t len, e_num;
+       uint32_t i,j;
+       char **dom;
+       char *status;
+       int dom_id;
+       char path[PATH_MAX];
+       char node[PATH_MAX];
+       xs_transaction_t th;
+       struct xen_guest *guest;
+       struct virtio_net_config_ll *net_config;
+       char *end;
+       int val;
+
+       /* init env for watch the node */
+       if (init_watch() < 0)
+               return;
+
+       dom = xs_directory(watch.xs, XBT_NULL, "/local/domain", &e_num);
+
+       for (i = 0; i < e_num; i++) {
+               errno = 0;
+               dom_id = strtol(dom[i], &end, 0);
+               if (errno != 0 || end == NULL || dom_id == 0)
+                       continue;
+
+               for (j = 0; j < RTE_MAX_ETHPORTS; j++) {
+                       rte_snprintf(node, PATH_MAX, "%s%d", VIRTIO_START, j);
+                       rte_snprintf(path, PATH_MAX, XEN_VM_NODE_FMT,
+                                       dom_id, node);
+
+                       th = xs_transaction_start(watch.xs);
+                       status = xs_read(watch.xs, th, path, &len);
+                       xs_transaction_end(watch.xs, th, false);
+
+                       if (status == NULL)
+                               break;
+
+                       /* if there's any valid virtio device */
+                       errno = 0;
+                       val = strtol(status, &end, 0);
+                       if (errno != 0 || end == NULL || dom_id == 0)
+                               val = 0;
+                       if (val == 1) {
+                               guest = add_xen_guest(dom_id);
+                               if (guest == NULL)
+                                       continue;
+                               RTE_LOG(INFO, XENHOST, "  a new virtio device exists, creating the virtio device\n\n");
+
+                               RTE_LOG(INFO, XENHOST, "  parse_vringnode dom_id %d virtioidx %d\n",dom_id,j);
+                               if (parse_vringnode(guest, j)) {
+                                       RTE_LOG(ERR, XENHOST, "  there is invalid information in xenstore\n");
+                                       TAILQ_REMOVE(&guest_root, guest, next);
+                                       destroy_guest(guest);
+
+                                       continue;
+                               }
+
+                               /*if pool_num > 0, then mempool has already been parsed*/
+                               if (guest->pool_num == 0 && parse_mempoolnode(guest)) {
+                                       RTE_LOG(ERR, XENHOST, "  there is erroneous information in xenstore\n");
+                                       TAILQ_REMOVE(&guest_root, guest, next);
+                                       destroy_guest(guest);
+                                       continue;
+                               }
+
+                               net_config = new_device(j, guest);
+                               /* everything is ready now, add it to the data core */
+                               notify_ops->new_device(&net_config->dev);
+                       }
+               }
+       }
+
+       free(dom);
+       return;
+}
+
+void
+virtio_monitor_loop(void)
+{
+       char **vec;
+       xs_transaction_t th;    
+       char *buf;
+       unsigned int len;
+       unsigned int dom_id;
+       uint32_t virtio_idx;
+       struct xen_guest *guest;
+       struct virtio_net_config_ll *net_config;
+       enum fieldnames {
+               FLD_NULL = 0,
+               FLD_LOCAL,
+               FLD_DOMAIN,
+               FLD_ID,
+               FLD_CONTROL,
+               FLD_DPDK,
+               FLD_NODE,
+               _NUM_FLD
+       };
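+       /*
+        * The watched path follows XEN_VM_NODE_FMT, e.g.
+        * "/local/domain/5/control/dpdk/event_type_start_0" (domain 5 is only
+        * an illustrative id). Split on '/', it yields _NUM_FLD fields: an
+        * empty leading field, "local", "domain", the domain id, "control",
+        * "dpdk" and the node name, which is what the FLD_* indices above
+        * refer to.
+        */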
+       char *str_fld[_NUM_FLD];
+       char *str;
+       char *end;
+
+       virtio_init();
+       while (1) {
+               watch_unmap_event();
+
+               usleep(50);
+               vec = xs_check_watch(watch.xs);
+
+               if (vec == NULL)
+                       continue;
+
+               th = xs_transaction_start(watch.xs);
+
+               buf = xs_read(watch.xs, th, vec[XS_WATCH_PATH],&len);
+               xs_transaction_end(watch.xs, th, false);
+
+               if (buf) {
+                       /* a vhost node exists */
+                       if (rte_strsplit(vec[XS_WATCH_PATH], strnlen(vec[XS_WATCH_PATH], PATH_MAX), 
+                                               str_fld, _NUM_FLD, '/') == _NUM_FLD) {
+                               if (strstr(str_fld[FLD_NODE], VIRTIO_START)) {
+                                       errno = 0;
+                                       str = str_fld[FLD_ID];
+                                       dom_id = strtoul(str, &end, 0);
+                                       if (errno != 0 || end == NULL || end == str ) {
+                                               RTE_LOG(INFO, XENHOST, "invalid domain id\n");
+                                               continue;
+                                       }
+
+                                       errno = 0;
+                                       str = str_fld[FLD_NODE] + sizeof(VIRTIO_START) - 1;
+                                       virtio_idx = strtoul(str, &end, 0);
+                                       if (errno != 0 || end == NULL || end == str
+                                                       || virtio_idx > MAX_VIRTIO) {
+                                               RTE_LOG(INFO, XENHOST, "invalid virtio idx\n");
+                                               continue;
+                                       }
+                                       RTE_LOG(INFO, XENHOST, "  #####virtio dev (%d, %d) is started\n", dom_id, virtio_idx);
+
+                                       guest = add_xen_guest(dom_id);
+                                       if (guest == NULL)
+                                               continue;
+                                       guest->dom_id = dom_id;
+                                       if (parse_vringnode(guest, virtio_idx)) {
+                                               RTE_LOG(ERR, XENHOST, "  there is invalid information in xenstore\n");
+                                               /* was the guest newly created or did it already exist? */
+                                               TAILQ_REMOVE(&guest_root, guest, next);
+                                               destroy_guest(guest);
+                                               continue;
+                                       }
+                                       /*if pool_num > 0, then mempool has already been parsed*/
+                                       if (guest->pool_num == 0 && parse_mempoolnode(guest)) {
+                                               RTE_LOG(ERR, XENHOST, "  there is erroneous information in xenstore\n");
+                                               TAILQ_REMOVE(&guest_root, guest, next);
+                                               destroy_guest(guest);
+                                               continue;
+                                       }
+
+
+                                       net_config = new_device(virtio_idx, guest);
+                                       RTE_LOG(INFO, XENHOST, "  Add to dataplane core\n");
+                                       notify_ops->new_device(&net_config->dev);
+
+                               }
+                       }
+               }
+
+               free(vec);              
+       }        
+       return;
+}
+
+/*
+ * Register ops so that we can add/remove device to data core.
+ */
+int 
+init_virtio_xen(struct virtio_net_device_ops const *const ops)
+{
+       notify_ops = ops;
+       if (xenhost_init())
+               return -1;
+       return 0;
+}
+
+
diff --git a/examples/vhost_xen/virtio-net.h b/examples/vhost_xen/virtio-net.h
new file mode 100644 (file)
index 0000000..8051a89
--- /dev/null
@@ -0,0 +1,115 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_NET_H_
+#define _VIRTIO_NET_H_
+
+#include <stdint.h>
+
+#define VQ_DESC_NUM 256
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+
+/*
+ * Structure contains variables relevant to TX/RX virtqueues.
+ */
+struct vhost_virtqueue
+{
+       struct vring_desc  *desc;             /* Virtqueue descriptor ring. */
+       struct vring_avail *avail;            /* Virtqueue available ring. */
+       struct vring_used  *used;             /* Virtqueue used ring. */
+       uint32_t           size;              /* Size of descriptor ring. */
+       uint32_t           vhost_hlen;        /* Vhost header length (varies depending on RX merge buffers). */
+       volatile uint16_t  last_used_idx;     /* Last index used on the available ring */
+       volatile uint16_t  last_used_idx_res; /* Used for multiple devices reserving buffers. */
+} __rte_cache_aligned;
+
+/*
+ * Device structure contains all configuration information relating to the device.
+ */
+struct virtio_net
+{
+       struct vhost_virtqueue  *virtqueue_tx;  /* Contains all TX virtqueue information. */ 
+       struct vhost_virtqueue  *virtqueue_rx;  /* Contains all RX virtqueue information. */
+       struct virtio_memory    *mem;           /* QEMU memory and memory region information. */
+       struct ether_addr       mac_address;    /* Device MAC address (Obtained on first TX packet). */
+       uint32_t                flags;          /* Device flags. Only used to check if device is running on data core. */
+       uint32_t                vlan_tag;       /* Vlan tag for device. Currently set to device_id (0-63). */
+       uint32_t                vmdq_rx_q;
+       uint64_t                device_fh;      /* device identifier. */
+       uint16_t                coreid;
+       volatile uint8_t        ready;          /* A device is set as ready if the MAC address has been set. */
+       volatile uint8_t        remove;         /* Device is marked for removal from the data core. */
+       uint32_t                virtio_idx;     /* Index of virtio device */
+       uint32_t                dom_id;         /* Domain id of xen guest */
+} __rte_cache_aligned;
+
+/*
+ * Device linked list structure for configuration.
+ */
+struct virtio_net_config_ll
+{
+       struct virtio_net               dev;    /* Virtio device. */
+       struct virtio_net_config_ll     *next; /* Next entry on linked list. */
+};
+
+/*
+ * Information relating to memory regions, including the offsets used to translate guest physical addresses to host virtual addresses.
+ */
+struct virtio_memory_regions {
+       uint64_t        guest_phys_address;     /* Base guest physical address of region. */
+       uint64_t        guest_phys_address_end; /* End guest physical address of region. */
+       uint64_t        memory_size;            /* Size of region. */
+       uint64_t        userspace_address;      /* Base userspace address of region. */
+       uint64_t        address_offset;         /* Offset of region for address translation. */
+};
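+
+/*
+ * Address translation sketch: for a guest physical address gpa that falls
+ * inside regions[i], the corresponding host virtual address is
+ *     hva = gpa + regions[i].address_offset;
+ * since address_offset is set up in new_device() as the host mapping address
+ * of the region minus its guest_phys_address.
+ */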
+
+/*
+ * Memory structure includes region and mapping information. 
+ */
+struct virtio_memory {
+       uint32_t                        nregions;       /* Number of memory regions. */
+       struct virtio_memory_regions    regions[0];     /* Memory region information. */
+};
+
+/*
+ * Device operations to add/remove device.
+ */
+struct virtio_net_device_ops {
+       int (* new_device)(struct virtio_net *);        /* Add device. */
+       void (* destroy_device) (volatile struct virtio_net *); /* Remove device. */
+};
+
+struct vhost_net_device_ops const * get_virtio_net_callbacks(void);
+
+#endif
diff --git a/examples/vhost_xen/xen_vhost.h b/examples/vhost_xen/xen_vhost.h
new file mode 100644 (file)
index 0000000..944cf0c
--- /dev/null
@@ -0,0 +1,149 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _XEN_VHOST_H_
+#define _XEN_VHOST_H_
+
+#include <stdint.h>
+
+#include <rte_tailq.h>
+#include <rte_ether.h>
+
+#include "virtio-net.h"
+
+#define RTE_LOGTYPE_XENHOST RTE_LOGTYPE_USER1
+
+#define XEN_VM_ROOTNODE_FMT  "/local/domain/%d/control/dpdk"
+#define XEN_VM_NODE_FMT      "/local/domain/%d/control/dpdk/%s"
+#define XEN_MEMPOOL_SUFFIX   "mempool_gref"
+#define XEN_RXVRING_SUFFIX   "rx_vring_gref"
+#define XEN_TXVRING_SUFFIX   "tx_vring_gref"
+#define XEN_GVA_SUFFIX       "mempool_va"
+#define XEN_VRINGFLAG_SUFFIX "vring_flag"
+#define XEN_ADDR_SUFFIX      "ether_addr"
+#define VIRTIO_START         "event_type_start_"
+
+#define XEN_GREF_SPLITTOKEN  ','
+
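+/*
+ * For illustration, with domain id 5 and virtio index 0 (both just example
+ * values) the back end would read keys such as:
+ *   /local/domain/5/control/dpdk/0_rx_vring_gref    = "gref1,gref2,..."
+ *   /local/domain/5/control/dpdk/0_tx_vring_gref    = "gref1,gref2,..."
+ *   /local/domain/5/control/dpdk/0_mempool_gref     = "gref1,gref2,..."
+ *   /local/domain/5/control/dpdk/event_type_start_0 = "1"
+ */
+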
+#define MAX_XENVIRT_MEMPOOL 16
+#define MAX_VIRTIO  32
+#define MAX_GREF_PER_NODE 64  /* 128 MB memory */
+
+#define PAGE_SIZE   4096
+#define PAGE_PFNNUM (PAGE_SIZE / sizeof(uint32_t))
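+
+/*
+ * Rough sizing behind the "128 MB" note above: a node carries at most
+ * MAX_GREF_PER_NODE (64) grant pages, each grant page packs
+ * PAGE_PFNNUM / 2 = 512 (gref, pfn) pairs, and every pair describes one
+ * 4 KB data page, i.e. 64 * 512 * 4 KB = 128 MB per node.
+ */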
+
+#define XEN_GNTDEV_FNAME "/dev/xen/gntdev"
+
+/* xen grant reference info in one grant node */
+struct xen_gnt {
+       uint32_t gref;  /* grant reference for this node */
+       union {
+               int gref;               /* grant reference */
+               uint32_t pfn_num;       /* guest pfn number of grant reference */
+       } gref_pfn[PAGE_PFNNUM];
+}__attribute__((__packed__));
+
+
+/* structure for mempool or vring node list */
+struct xen_gntnode {
+       uint32_t gnt_num;           /* grant reference number */
+       struct xen_gnt *gnt_info;   /* grant reference info */
+};
+
+
+struct xen_vring {
+       uint32_t dom_id;
+       uint32_t virtio_idx;    /* index of virtio device */
+       void *rxvring_addr;     /* mapped virtual address of rxvring */
+       void *txvring_addr;     /* mapped virtual address of txvring */
+       uint32_t rxpfn_num;     /* number of gpfn for rxvring */ 
+       uint32_t txpfn_num;     /* number of gpfn for txvring */
+       uint32_t *rxpfn_tbl;    /* array of rxvring gpfn */
+       uint32_t *txpfn_tbl;    /* array of txvring gpfn */
+       uint64_t *rx_pindex;    /* index used to release rx grefs */
+       uint64_t *tx_pindex;    /* index used to release tx grefs */
+       uint64_t  flag_index;
+       uint8_t  *flag;         /* cleared to zero on guest unmap */
+       struct ether_addr addr; /* ethernet address of virtio device */
+       uint8_t   removed;
+
+};
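+
+/*
+ * The 'flag' page acts as the unmap handshake: watch_unmap_event() treats a
+ * vring whose *flag has dropped to zero as one whose guest side has gone
+ * away, and tears the vring down.
+ */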
+
+struct xen_mempool {
+       uint32_t dom_id;      /* guest domain id */
+       uint32_t pool_idx;    /* index of memory pool */
+       void *gva;            /* guest virtual address of mbuf pool */
+       void *hva;            /* host virtual address of mbuf pool */
+       uint32_t mempfn_num;  /* number of gpfn for mbuf pool */
+       uint32_t *mempfn_tbl; /* array of mbuf pool gpfn */
+       uint64_t *pindex;     /* index used to release grefs */
+};
+
+struct xen_guest {
+       TAILQ_ENTRY(xen_guest) next;
+       int32_t dom_id;       /* guest domain id */
+       uint32_t pool_num;    /* number of mbuf pool of the guest */
+       uint32_t vring_num;   /* number of virtio ports of the guest */
+       /* array contain the guest mbuf pool info */
+       struct xen_mempool mempool[MAX_XENVIRT_MEMPOOL];
+       /* array contain the guest rx/tx vring info */
+       struct xen_vring vring[MAX_VIRTIO];
+};
+
+TAILQ_HEAD(xen_guestlist, xen_guest);
+
+int
+parse_mempoolnode(struct xen_guest *guest);
+
+int
+xenhost_init(void);
+
+int
+parse_vringnode(struct xen_guest *guest, uint32_t virtio_idx);
+
+void
+cleanup_mempool(struct xen_mempool *mempool);
+
+void
+cleanup_vring(struct xen_vring *vring);
+
+void
+virtio_monitor_loop(void);
+
+int
+init_virtio_xen(struct virtio_net_device_ops const * const);
+
+#endif
diff --git a/examples/vhost_xen/xenstore_parse.c b/examples/vhost_xen/xenstore_parse.c
new file mode 100644 (file)
index 0000000..6380b85
--- /dev/null
@@ -0,0 +1,786 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <xen/sys/gntalloc.h>
+#include <xen/sys/gntdev.h>
+#include <xen/xen-compat.h>
+#if __XEN_LATEST_INTERFACE_VERSION__ < 0x00040200
+#include <xs.h>
+#else
+#include <xenstore.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+
+#include "xen_vhost.h"
+
+/* xenstore handle */
+static struct xs_handle *xs = NULL;
+
+/* gntdev file descriptor to map grant pages */
+static int d_fd = -1;
+/*
+ *  The grant node format in xenstore for vring/mpool is like:
+ *  idx#_rx_vring_gref = "gref1#, gref2#, gref3#"
+ *  idx#_mempool_gref  = "gref1#, gref2#, gref3#"
+ *  each gref# is the grant reference for a shared page.
+ *  In each shared page, we store the grant_node_item items.
+ */
+struct grant_node_item {
+       uint32_t gref;
+       uint32_t pfn;
+} __attribute__((packed));
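+
+/*
+ * Each shared page can hold PAGE_SIZE / sizeof(struct grant_node_item) = 512
+ * of these items; a gref of 0 acts as a terminator, so a page may carry fewer
+ * valid entries (see cal_pagenum()).
+ */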
+
+int cmdline_parse_etheraddr(void *tk, const char *srcbuf,
+                           void *res);
+
+/* Map grant ref refid at addr_ori*/
+static void *
+xen_grant_mmap(void *addr_ori, int domid, int refid, uint64_t *pindex) 
+{ 
+       struct ioctl_gntdev_map_grant_ref arg;
+       void *addr = NULL;
+       int pg_sz = getpagesize();
+
+       arg.count = 1;
+       arg.refs[0].domid = domid;
+       arg.refs[0].ref = refid;
+
+       int rv = ioctl(d_fd, IOCTL_GNTDEV_MAP_GRANT_REF, &arg); 
+       if (rv) { 
+               RTE_LOG(ERR, XENHOST, "  %s: (%d,%d) %s (ioctl failed)\n", __func__,
+                               domid, refid, strerror(errno)); 
+               return NULL; 
+       } 
+
+       if (addr_ori == NULL)
+               addr = mmap(addr_ori, pg_sz, PROT_READ|PROT_WRITE, MAP_SHARED, 
+                               d_fd, arg.index); 
+       else
+               addr = mmap(addr_ori, pg_sz, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_FIXED,
+                               d_fd, arg.index); 
+
+       if (addr == MAP_FAILED) {
+               RTE_LOG(ERR, XENHOST, "  %s: (%d, %d) %s (map failed)\n", __func__,
+                               domid, refid, strerror(errno)); 
+               return NULL; 
+       }
+
+       if (pindex)
+               *pindex = arg.index;
+
+       return addr;
+} 
+
+/* Unmap one grant ref, and munmap must be called before this */
+static int
+xen_unmap_grant_ref(uint64_t index)
+{
+       struct ioctl_gntdev_unmap_grant_ref arg;
+       int rv;
+       
+       arg.count = 1;
+       arg.index = index;
+       rv = ioctl(d_fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &arg);
+       if (rv) {
+               RTE_LOG(ERR, XENHOST, "  %s: index 0x%" PRIx64 " unmap failed\n", __func__, index);
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Reserve a virtual address space.
+ * On success, returns the pointer. On failure, returns NULL.
+ */
+static void *
+get_xen_virtual(size_t size, size_t page_sz)
+{
+       void *addr;
+       uintptr_t aligned_addr;
+
+       addr = mmap(NULL, size + page_sz, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (addr == MAP_FAILED) {
+               RTE_LOG(ERR, XENHOST, "failed to get a virtual area\n");
+               return NULL;
+       }
+
+       aligned_addr = RTE_ALIGN_CEIL((uintptr_t)addr, page_sz);
+       munmap(addr, aligned_addr - (uintptr_t)addr);
+       munmap((void *)(aligned_addr + size), page_sz + (uintptr_t)addr - aligned_addr);
+       addr = (void *)(aligned_addr);
+
+       return addr;
+}
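+
+/*
+ * The reservation above deliberately over-maps by one page and then unmaps
+ * the unaligned head and the leftover tail, leaving exactly 'size' bytes
+ * starting on a page_sz boundary. Grant pages are later mmap()ed with
+ * MAP_FIXED into this window by map_gntnode().
+ */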
+
+static void
+free_xen_virtual(void *addr, size_t size, size_t page_sz __rte_unused)
+{
+       if (addr)
+               munmap(addr, size);
+}
+
+/*
+ * Returns the value string of a key in xenstore.
+ * @param path
+ *  Full path string for the key
+ * @return
+ *  Pointer to the value string, NULL on failure
+ */
+static char *
+xen_read_node(char *path, uint32_t *len)
+{
+       char *buf;
+
+       buf = xs_read(xs, XBT_NULL, path, len);
+       return buf;
+} 
+
+static int
+cal_pagenum(struct xen_gnt *gnt)
+{
+       unsigned int i;
+       /*
+        * the items in the page are in the format of
+        * gref#,pfn#,...,gref#,pfn#
+        * FIXME, 0 is reserved by system, use it as terminator.
+        */
+       for (i = 0; i < (PAGE_PFNNUM) / 2; i++) {
+               if (gnt->gref_pfn[i * 2].gref <= 0)
+                       break;
+       }
+
+       return i;
+}
+
+/* Frees memory allocated to a grant node */
+static void
+xen_free_gntnode(struct xen_gntnode *gntnode)
+{
+       if (gntnode == NULL)
+               return;
+       if (gntnode->gnt_info)
+               free(gntnode->gnt_info);
+       free(gntnode);  
+}
+
+/*
+ * Parse a grant node.
+ * @param domid
+ *  Guest domain id.
+ * @param path
+ *  Full path string for a grant node, like for the following (key, val) pair
+ *  idx#_mempool_gref = "gref#, gref#, gref#"
+ *  path = 'local/domain/domid/control/dpdk/idx#_mempool_gref'
+ *  each gref# refers to a shared page containing packed (gref,pfn) entries
+ * @return
+ *  Returns the pointer to xen_gntnode
+ */
+static struct xen_gntnode *
+parse_gntnode(int dom_id, char *path)
+{
+       char **gref_list = NULL;
+       uint32_t i, len, gref_num;
+       void *addr = NULL;
+       char *buf = NULL;
+       struct xen_gntnode *gntnode = NULL;
+       struct xen_gnt *gnt = NULL;
+       int pg_sz = getpagesize();
+       char *end;
+       uint64_t index;
+
+       if ((buf = xen_read_node(path, &len)) == NULL)
+               goto err;
+
+       gref_list = malloc(MAX_GREF_PER_NODE * sizeof(char *));
+       if (gref_list == NULL)
+               goto err;
+
+       gref_num = rte_strsplit(buf, len, gref_list, MAX_GREF_PER_NODE,
+                       XEN_GREF_SPLITTOKEN);
+       if (gref_num == 0) {
+               RTE_LOG(ERR, XENHOST, "  %s: invalid grant node format\n", __func__);
+               goto err;
+       }
+
+       gntnode = (struct xen_gntnode *)calloc(1, sizeof(struct xen_gntnode));
+       gnt = (struct xen_gnt *)calloc(gref_num, sizeof(struct xen_gnt));
+       if (gnt == NULL || gntnode == NULL)
+               goto err;
+       
+       for (i = 0; i < gref_num; i++) {
+               errno = 0;
+               gnt[i].gref = strtol(gref_list[i], &end, 0);
+               if (errno != 0 || end == NULL || end == gref_list[i] ||
+                       (*end != '\0' &&  *end != XEN_GREF_SPLITTOKEN)) {
+                       RTE_LOG(ERR, XENHOST, "  %s: parse grant node item failed\n", __func__);
+                       goto err;
+               }
+               addr = xen_grant_mmap(NULL, dom_id, gnt[i].gref, &index);
+               if (addr == NULL) {
+                       RTE_LOG(ERR, XENHOST, "  %s: map gref %u failed\n", __func__, gnt[i].gref); 
+                       goto err;
+               }
+               RTE_LOG(INFO, XENHOST, "      %s: map gref %u to %p\n", __func__, gnt[i].gref, addr);
+               memcpy(gnt[i].gref_pfn, addr, pg_sz);
+               if (munmap(addr, pg_sz)) {
+                       RTE_LOG(INFO, XENHOST, "  %s: unmap gref %u failed\n", __func__, gnt[i].gref);
+                       goto err;
+               }
+               if (xen_unmap_grant_ref(index)) {
+                       RTE_LOG(INFO, XENHOST, "  %s: release gref %u failed\n", __func__, gnt[i].gref);
+                       goto err;
+               }
+                       
+       }
+       
+       gntnode->gnt_num  = gref_num;
+       gntnode->gnt_info = gnt;        
+
+       free(buf);
+       free(gref_list);
+       return gntnode;
+
+err:
+       if (gnt)
+               free(gnt);
+       if (gntnode)
+               free(gntnode);
+       if (gref_list)
+               free(gref_list);
+       if (buf)
+               free(buf);
+       return NULL;
+}
+
+/*
+ * This function maps the grant node of a vring or mbuf pool to a contiguous virtual address space
+ * and returns the mapped address, pfn array and index array.
+ * @param gntnode
+ *  Pointer to grant node
+ * @param domid
+ *  Guest domain id
+ * @param ppfn
+ *  Pointer to pfn array, caller should free this array
+ * @param pgs
+ *  Pointer to number of pages
+ * @param ppindex
+ *  Pointer to index array, used to release grefs when freeing this node
+ * @return
+ *  Pointer to mapped virtual address, NULL on failure
+ */
+static void *
+map_gntnode(struct xen_gntnode *gntnode, int domid, uint32_t **ppfn, uint32_t *pgs, uint64_t **ppindex)
+{
+       struct xen_gnt *gnt;
+       uint32_t i, j;
+       size_t total_pages = 0;
+       void *addr;
+       uint32_t *pfn;
+       uint64_t *pindex;
+       uint32_t pfn_num = 0;
+       int pg_sz;
+
+       if (gntnode == NULL)
+               return NULL;
+
+       pg_sz = getpagesize();
+       for (i = 0; i < gntnode->gnt_num; i++) {
+               gnt = gntnode->gnt_info + i;
+               total_pages += cal_pagenum(gnt);  
+       }
+       if ((addr = get_xen_virtual(total_pages * pg_sz, pg_sz)) == NULL) {
+               RTE_LOG(ERR, XENHOST, "  %s: failed get_xen_virtual\n", __func__);
+               return NULL;
+       }
+       pfn = calloc(total_pages, (size_t)sizeof(uint32_t));
+       pindex = calloc(total_pages, (size_t)sizeof(uint64_t));
+       if (pfn == NULL || pindex == NULL) {
+               free_xen_virtual(addr, total_pages * pg_sz, pg_sz);
+               free(pfn);
+               free(pindex);
+               return NULL;
+       }
+
+       RTE_LOG(INFO, XENHOST, "    %s: total pages:%zu, map to [%p, %p]\n", __func__, total_pages, addr, RTE_PTR_ADD(addr, total_pages * pg_sz - 1));
+       for (i = 0; i < gntnode->gnt_num; i++) {
+               gnt = gntnode->gnt_info + i;
+               for (j = 0; j < (PAGE_PFNNUM) / 2; j++) {
+                       if ((gnt->gref_pfn[j * 2].gref) <= 0)
+                               goto _end;
+                       /*alternative: batch map, or through libxc*/
+                       if (xen_grant_mmap(RTE_PTR_ADD(addr, pfn_num * pg_sz),
+                                       domid, 
+                                       gnt->gref_pfn[j * 2].gref,
+                                       &pindex[pfn_num]) == NULL) {
+                               goto mmap_failed;
+                       }
+                       pfn[pfn_num] = gnt->gref_pfn[j * 2 + 1].pfn_num;
+                       pfn_num++;
+               }
+       }
+
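+/*
+ * The loop above normally exits through "goto _end" when it hits the
+ * non-positive gref terminator; falling through to here means a grant map
+ * failed (or no terminator was found), so undo every mapping made so far.
+ */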
+mmap_failed:
+       if (pfn_num)
+               munmap(addr, pfn_num * pg_sz);
+       for (i = 0; i < pfn_num; i++) {
+               xen_unmap_grant_ref(pindex[i]);
+       }
+       free(pindex);
+       free(pfn);
+       return NULL;
+
+_end:
+       if (ppindex)
+               *ppindex = pindex;
+       else
+               free(pindex);
+       if (ppfn)
+               *ppfn = pfn;
+       else
+               free(pfn);
+       if (pgs)
+               *pgs = total_pages;
+
+       return addr;
+}
+
+static int
+parse_mpool_va(struct xen_mempool *mempool)
+{
+       char path[PATH_MAX] = {0};
+       char *buf;
+       uint32_t len;
+       char *end;
+       int ret = -1;
+
+       errno = 0;
+       rte_snprintf(path, sizeof(path), 
+               XEN_VM_ROOTNODE_FMT"/%d_"XEN_GVA_SUFFIX,
+               mempool->dom_id, mempool->pool_idx);
+
+       if ((buf = xen_read_node(path, &len)) == NULL)
+               goto out;
+       mempool->gva = (void *)strtoul(buf, &end, 16);
+       if (errno != 0 || end == NULL || end == buf || *end != '\0') {
+               mempool->gva = NULL;
+               goto out;
+       }
+       ret = 0;        
+out:
+       if (buf)
+               free(buf);
+       return ret;
+}
+
+/*
+ * map mbuf pool
+ */
+static int 
+map_mempoolnode(struct xen_gntnode *gntnode,
+                       struct xen_mempool *mempool)
+{
+       if (gntnode == NULL || mempool == NULL)
+               return -1;
+       
+       mempool->hva = 
+               map_gntnode(gntnode, mempool->dom_id, &mempool->mempfn_tbl, &mempool->mempfn_num, &mempool->pindex);
+
+       RTE_LOG(INFO, XENHOST, "  %s: map mempool at %p\n", __func__, (void *)mempool->hva);
+       if (mempool->hva)
+               return 0;
+       else
+               return -1;
+}
+
+void
+cleanup_mempool(struct xen_mempool *mempool)
+{
+       int pg_sz = getpagesize();
+       uint32_t i;
+       
+       if (mempool->hva)
+               munmap(mempool->hva, mempool->mempfn_num * pg_sz);
+       mempool->hva = NULL;
+
+       if (mempool->pindex) {
+               RTE_LOG(INFO, XENHOST, "  %s: unmap dom %02u mempool%02u %u grefs\n",
+                       __func__,
+                       mempool->dom_id,
+                       mempool->pool_idx,
+                       mempool->mempfn_num);
+               for (i = 0; i < mempool->mempfn_num; i ++) {
+                       xen_unmap_grant_ref(mempool->pindex[i]); 
+               }
+       }
+       mempool->pindex = NULL;
+
+       if (mempool->mempfn_tbl)
+               free(mempool->mempfn_tbl);
+       mempool->mempfn_tbl = NULL;
+}
+
+/*
+ * process mempool node idx#_mempool_gref, idx = 0, 1, 2...
+ * until we encounter a node that doesn't exist.
+ */
+int
+parse_mempoolnode(struct xen_guest *guest)
+{
+       uint32_t i, len;
+       char path[PATH_MAX] = {0};
+       struct xen_gntnode *gntnode = NULL;
+       struct xen_mempool *mempool = NULL;
+       char *buf;
+
+       bzero(&guest->mempool, MAX_XENVIRT_MEMPOOL * sizeof(guest->mempool[0]));
+       guest->pool_num = 0;
+
+       while (1) {
+               /* check if null terminated */
+               rte_snprintf(path, sizeof(path),
+                       XEN_VM_ROOTNODE_FMT"/%d_"XEN_MEMPOOL_SUFFIX, 
+                       guest->dom_id,
+                       guest->pool_num);
+               
+               if ((buf = xen_read_node(path, &len)) != NULL) {
+                       /* this node exists */
+                       free(buf);
+               } else {
+                       if (guest->pool_num == 0) {
+                               RTE_LOG(ERR, XENHOST, "  %s: no mempool found\n", __func__);
+                               return -1;
+                       }
+                       break;
+               }
+
+               mempool = &guest->mempool[guest->pool_num];
+               mempool->dom_id = guest->dom_id;
+               mempool->pool_idx = guest->pool_num;
+       
+               RTE_LOG(INFO, XENHOST, "  %s: mempool %u parse gntnode %s\n", __func__, guest->pool_num, path);
+               gntnode = parse_gntnode(guest->dom_id, path);
+               if (gntnode == NULL)
+                       goto err;
+
+               if (parse_mpool_va(mempool))
+                       goto err;
+
+               RTE_LOG(INFO, XENHOST, "  %s: mempool %u map gntnode %s\n", __func__, guest->pool_num, path);
+               if (map_mempoolnode(gntnode, mempool))
+                       goto err;
+
+               xen_free_gntnode(gntnode);
+               guest->pool_num++;
+       }
+
+       return 0;
+err:
+       if (gntnode)
+               xen_free_gntnode(gntnode);
+       for (i = 0; i < MAX_XENVIRT_MEMPOOL; i++) {
+               cleanup_mempool(&guest->mempool[i]);
+       }
+       /* reinitialise mempool */
+       bzero(&guest->mempool, MAX_XENVIRT_MEMPOOL * sizeof(guest->mempool[0]));
+       return -1;
+}
+
+static int
+xen_map_vringflag(struct xen_vring *vring)
+{
+       char path[PATH_MAX] = {0};
+       char *buf;
+       uint32_t len, gref;
+       int pg_sz = getpagesize();
+       char *end;
+
+       rte_snprintf(path, sizeof(path), 
+               XEN_VM_ROOTNODE_FMT"/%d_"XEN_VRINGFLAG_SUFFIX,
+               vring->dom_id, vring->virtio_idx);
+
+       if ((buf = xen_read_node(path, &len)) == NULL)
+               goto err;
+
+       errno = 0;
+       gref = strtol(buf, &end, 0);
+       if (errno != 0 || end == NULL || end == buf) {
+               goto err;
+       }
+       vring->flag = xen_grant_mmap(0, vring->dom_id, gref, &vring->flag_index);
+       if (vring->flag == NULL || *vring->flag == 0)
+               goto err;
+
+       free(buf);
+       return 0;
+err:
+       if (buf)
+               free(buf);
+       if (vring->flag) {
+               munmap(vring->flag, pg_sz);
+               vring->flag = NULL;
+               xen_unmap_grant_ref(vring->flag_index);
+       }
+       return -1;
+}
+
+
+static int
+xen_map_rxvringnode(struct xen_gntnode *gntnode,
+                               struct xen_vring *vring)
+{
+       vring->rxvring_addr =
+               map_gntnode(gntnode, vring->dom_id, &vring->rxpfn_tbl, &vring->rxpfn_num, &vring->rx_pindex);
+       RTE_LOG(INFO, XENHOST, "  %s: map rx vring at %p\n", __func__, (void *)vring->rxvring_addr);
+       if (vring->rxvring_addr)
+               return 0;
+       else 
+               return -1;
+}
+
+static int
+xen_map_txvringnode(struct xen_gntnode *gntnode,
+                               struct xen_vring *vring)
+{
+       vring->txvring_addr =
+               map_gntnode(gntnode, vring->dom_id, &vring->txpfn_tbl, &vring->txpfn_num, &vring->tx_pindex);
+       RTE_LOG(INFO, XENHOST, "  %s: map tx vring at %p\n", __func__, (void *)vring->txvring_addr);
+       if (vring->txvring_addr)
+               return 0;
+       else
+               return -1;
+}
+
+void
+cleanup_vring(struct xen_vring *vring)
+{
+       int pg_sz = getpagesize();
+       uint32_t i;
+
+       RTE_LOG(INFO, XENHOST, "  %s: cleanup dom %u vring %u\n", __func__, vring->dom_id, vring->virtio_idx);
+       if (vring->rxvring_addr) {
+               munmap(vring->rxvring_addr, vring->rxpfn_num * pg_sz);
+               RTE_LOG(INFO, XENHOST, "  %s: unmap rx vring [%p, %p]\n",
+                       __func__,
+                       vring->rxvring_addr,
+                       RTE_PTR_ADD(vring->rxvring_addr,
+                       vring->rxpfn_num * pg_sz - 1));
+       }
+       vring->rxvring_addr = NULL;
+
+
+       if (vring->rx_pindex) {
+               RTE_LOG(INFO, XENHOST, "  %s: unmap rx vring %u grefs\n", __func__, vring->rxpfn_num);
+               for (i = 0; i < vring->rxpfn_num; i++) {
+                       xen_unmap_grant_ref(vring->rx_pindex[i]);
+               }
+       }
+       vring->rx_pindex = NULL;
+
+       if (vring->rxpfn_tbl)
+               free(vring->rxpfn_tbl);
+       vring->rxpfn_tbl = NULL;
+
+       if (vring->txvring_addr) {
+               munmap(vring->txvring_addr, vring->txpfn_num * pg_sz);
+               RTE_LOG(INFO, XENHOST, "  %s: unmap tx vring [%p, %p]\n",
+                       __func__,
+                       vring->txvring_addr,
+                       RTE_PTR_ADD(vring->txvring_addr,
+                       vring->txpfn_num * pg_sz - 1));
+       }
+       vring->txvring_addr = NULL;
+
+       if (vring->tx_pindex) {
+               RTE_LOG(INFO, XENHOST, "  %s: unmap tx vring %u grefs\n", __func__, vring->txpfn_num);
+               for (i = 0; i < vring->txpfn_num; i++) {
+                       xen_unmap_grant_ref(vring->tx_pindex[i]);
+               }
+       }
+       vring->tx_pindex = NULL;
+
+       if (vring->txpfn_tbl)
+               free(vring->txpfn_tbl);
+       vring->txpfn_tbl = NULL;
+
+       if (vring->flag) {
+               if (!munmap((void *)vring->flag, pg_sz))
+                       RTE_LOG(INFO, XENHOST, "  %s: unmap flag page at %p\n", __func__, vring->flag);
+               if (!xen_unmap_grant_ref(vring->flag_index))
+                       RTE_LOG(INFO, XENHOST, "  %s: release flag ref index 0x%" PRIx64 "\n", __func__, vring->flag_index);
+       }
+       vring->flag = NULL;
+       return;
+}
+
+
+
+static int
+xen_parse_etheraddr(struct xen_vring *vring)
+{
+       char path[PATH_MAX] = {0};
+       char *buf;
+       uint32_t len;
+       int ret = -1;
+
+       rte_snprintf(path, sizeof(path), 
+               XEN_VM_ROOTNODE_FMT"/%d_"XEN_ADDR_SUFFIX,
+               vring->dom_id, vring->virtio_idx);
+
+       if ((buf = xen_read_node(path, &len)) == NULL)
+               goto out;
+
+       if (cmdline_parse_etheraddr(NULL, buf, &vring->addr) < 0)
+               goto out;
+       ret = 0;
+out:
+       if (buf)
+               free(buf);
+       return ret;
+}
+
+
+int
+parse_vringnode(struct xen_guest *guest, uint32_t virtio_idx)
+{
+       char path[PATH_MAX] = {0};
+       struct xen_gntnode *rx_gntnode = NULL;
+       struct xen_gntnode *tx_gntnode = NULL;
+       struct xen_vring *vring = NULL;
+
+       /* check if null terminated */
+       rte_snprintf(path, sizeof(path),
+               XEN_VM_ROOTNODE_FMT"/%d_"XEN_RXVRING_SUFFIX,
+               guest->dom_id,
+               virtio_idx);
+       
+       RTE_LOG(INFO, XENHOST, "  %s: virtio %u parse rx gntnode %s\n", __func__, virtio_idx, path);
+       rx_gntnode = parse_gntnode(guest->dom_id, path);
+       if (rx_gntnode == NULL)
+               goto err;
+       
+       /* check if null terminated */
+       rte_snprintf(path, sizeof(path),
+               XEN_VM_ROOTNODE_FMT"/%d_"XEN_TXVRING_SUFFIX,
+               guest->dom_id,
+               virtio_idx);
+
+       RTE_LOG(INFO, XENHOST, "  %s: virtio %u parse tx gntnode %s\n", __func__, virtio_idx, path);
+       tx_gntnode = parse_gntnode(guest->dom_id, path);
+       if (tx_gntnode == NULL)
+               goto err;
+
+       vring = &guest->vring[virtio_idx];
+       bzero(vring, sizeof(*vring));
+       vring->dom_id = guest->dom_id;
+       vring->virtio_idx = virtio_idx;
+               
+       if (xen_parse_etheraddr(vring) != 0)
+               goto err;
+
+       RTE_LOG(INFO, XENHOST, "  %s: virtio %u map rx gntnode %s\n", __func__, virtio_idx, path);
+       if (xen_map_rxvringnode(rx_gntnode, vring) != 0)
+               goto err;
+
+       RTE_LOG(INFO, XENHOST, "  %s: virtio %u map tx gntnode %s\n", __func__, virtio_idx, path);
+       if (xen_map_txvringnode(tx_gntnode, vring) != 0)
+               goto err;
+
+       if (xen_map_vringflag(vring) != 0)
+               goto err;
+
+       guest->vring_num++;
+
+       xen_free_gntnode(rx_gntnode);
+       xen_free_gntnode(tx_gntnode);
+
+       return 0;
+
+err:
+       if (rx_gntnode)
+               xen_free_gntnode(rx_gntnode);
+       if (tx_gntnode)
+               xen_free_gntnode(tx_gntnode);
+       if (vring) {
+               cleanup_vring(vring);
+               bzero(vring, sizeof(*vring));
+       }
+       return -1;
+}
+
+/*
+ * Open xen grant dev driver
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+xen_grant_init(void)
+{
+       d_fd = open(XEN_GNTDEV_FNAME, O_RDWR);
+
+       return (d_fd == -1) ? -1 : 0;
+}
+
+/*
+ * Initialise xenstore handle and open grant dev driver.
+ * @return
+ *  0 on success, -1 on failure.
+ */
+int
+xenhost_init(void)
+{
+       xs = xs_daemon_open();
+       if (xs == NULL) {
+               rte_panic("failed to initialize xenstore daemon handle");
+               return -1;
+       }
+       if (xen_grant_init())
+               return -1;
+       return 0;
+}
index c9f0111..8d3e9dd 100644 (file)
@@ -45,6 +45,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += librte_pmd_ixgbe
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_RING) += librte_pmd_ring
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_PCAP) += librte_pmd_pcap
 DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += librte_pmd_virtio
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += librte_pmd_xenvirt
 DIRS-$(CONFIG_RTE_LIBRTE_HASH) += librte_hash
 DIRS-$(CONFIG_RTE_LIBRTE_LPM) += librte_lpm
 DIRS-$(CONFIG_RTE_LIBRTE_NET) += librte_net
index 2fef4cd..1c9c88b 100644 (file)
@@ -39,6 +39,9 @@
 #ifdef RTE_LIBRTE_PMD_PCAP
 #include <rte_eth_pcap.h>
 #endif
+#ifdef RTE_LIBRTE_PMD_XENVIRT
+#include <rte_eth_xenvirt.h>
+#endif
 #include "eal_private.h"
 
 struct device_init {
@@ -59,6 +62,12 @@ struct device_init dev_types[] = {
                        .dev_prefix = RTE_ETH_PCAP_PARAM_NAME,
                        .init_fn = rte_pmd_pcap_init
                },
+#endif
+#ifdef RTE_LIBRTE_PMD_XENVIRT
+               {
+                       .dev_prefix = RTE_ETH_XENVIRT_PARAM_NAME,
+                       .init_fn = rte_pmd_xenvirt_init
+               },
 #endif
                {
                        .dev_prefix = "-nodev-",
index 0592757..9cc0446 100644 (file)
@@ -51,6 +51,9 @@
 #ifdef RTE_LIBRTE_PMD_PCAP
 #include <rte_eth_pcap.h>
 #endif
+#ifdef RTE_LIBRTE_PMD_XENVIRT
+#include <rte_eth_xenvirt.h>
+#endif
 #include "eal_private.h"
 
 static char dev_list_str[4096];
@@ -102,6 +105,9 @@ is_valid_wl_entry(const char *device_str, size_t dev_buf_len)
 #endif
 #ifdef RTE_LIBRTE_PMD_PCAP
                        RTE_ETH_PCAP_PARAM_NAME,
+#endif
+#ifdef RTE_LIBRTE_PMD_XENVIRT
+                       RTE_ETH_XENVIRT_PARAM_NAME,
 #endif
                        "-nodev-" /* dummy value to prevent compiler warnings */
        };
index b212473..b525130 100644 (file)
@@ -44,6 +44,7 @@ CFLAGS += -I$(RTE_SDK)/lib/librte_ether
 CFLAGS += -I$(RTE_SDK)/lib/librte_ivshmem
 CFLAGS += -I$(RTE_SDK)/lib/librte_pmd_ring
 CFLAGS += -I$(RTE_SDK)/lib/librte_pmd_pcap
+CFLAGS += -I$(RTE_SDK)/lib/librte_pmd_xenvirt
 CFLAGS += $(WERROR_FLAGS) -O3
 
 # specific to linuxapp exec-env
diff --git a/lib/librte_pmd_xenvirt/Makefile b/lib/librte_pmd_xenvirt/Makefile
new file mode 100644 (file)
index 0000000..bf6d432
--- /dev/null
@@ -0,0 +1,58 @@
+#   BSD LICENSE
+# 
+#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+#   All rights reserved.
+# 
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+# 
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+# 
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_xenvirt.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += rte_eth_xenvirt.c rte_mempool_gntalloc.c rte_xen_lib.c
+
+#
+# Export include files
+#
+SYMLINK-y-include += rte_eth_xenvirt.h
+
+# this lib depends upon:
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += lib/librte_eal lib/librte_ether
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += lib/librte_mempool lib/librte_mbuf
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += lib/librte_net lib/librte_malloc
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += lib/librte_cmdline
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_pmd_xenvirt/rte_eth_xenvirt.c b/lib/librte_pmd_xenvirt/rte_eth_xenvirt.c
new file mode 100644 (file)
index 0000000..bad8dd4
--- /dev/null
@@ -0,0 +1,706 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <sys/user.h>
+#include <linux/binfmts.h>
+#include <xen/xen-compat.h>
+#if __XEN_LATEST_INTERFACE_VERSION__ < 0x00040200
+#include <xs.h>
+#else
+#include <xenstore.h>
+#endif
+#include <linux/virtio_ring.h>
+
+#include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+#include <rte_string_fns.h>
+#include <cmdline_parse.h>
+#include <cmdline_parse_etheraddr.h>
+
+#include "rte_xen_lib.h"
+#include "virtqueue.h"
+#include "rte_eth_xenvirt.h"
+
+#define VQ_DESC_NUM 256
+#define VIRTIO_MBUF_BURST_SZ 64
+
+/* virtio_idx is increased after new device is created.*/
+static int virtio_idx = 0;
+
+static const char *drivername = "xen dummy virtio PMD";
+
+static struct rte_eth_link pmd_link = {
+               .link_speed = 10000,
+               .link_duplex = ETH_LINK_FULL_DUPLEX,
+               .link_status = 0
+};
+
+static inline struct rte_mbuf *
+rte_rxmbuf_alloc(struct rte_mempool *mp)
+{
+       struct rte_mbuf *m;
+
+       m = __rte_mbuf_raw_alloc(mp);
+       __rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
+
+       return m;
+}
+
+
+static uint16_t
+eth_xenvirt_rx(void *q, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+       struct virtqueue *rxvq = q;
+       struct rte_mbuf *rxm, *new_mbuf;
+       uint16_t nb_used, num;
+       uint32_t len[VIRTIO_MBUF_BURST_SZ];
+       uint32_t i;
+       struct pmd_internals *pi = rxvq->internals;
+
+       nb_used = VIRTQUEUE_NUSED(rxvq);
+
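+       /* ensure nb_used is read before the used-ring entries are dequeued below */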
+       rte_compiler_barrier(); /* rmb */
+       num = (uint16_t)(likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
+       num = (uint16_t)(likely(num <= VIRTIO_MBUF_BURST_SZ) ? num : VIRTIO_MBUF_BURST_SZ);
+       if (unlikely(num == 0))
+               return 0;
+
+       num = virtqueue_dequeue_burst(rxvq, rx_pkts, len, num);
+       PMD_RX_LOG(DEBUG, "used:%d dequeue:%d\n", nb_used, num);
+       for (i = 0; i < num; i++) {
+               rxm = rx_pkts[i];
+               PMD_RX_LOG(DEBUG, "packet len:%d\n", len[i]);
+               rxm->pkt.next = NULL;
+               rxm->pkt.data = (char *)rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
+               rxm->pkt.data_len = (uint16_t)(len[i] - sizeof(struct virtio_net_hdr));
+               rxm->pkt.nb_segs = 1;
+               rxm->pkt.in_port = pi->port_id;
+               rxm->pkt.pkt_len  = (uint32_t)(len[i] - sizeof(struct virtio_net_hdr));
+       }
+       /* allocate new mbuf for the used descriptor */
+       while (likely(!virtqueue_full(rxvq))) {
+               new_mbuf = rte_rxmbuf_alloc(rxvq->mpool);
+               if (unlikely(new_mbuf == NULL)) {
+                       break;
+               }
+               if (unlikely(virtqueue_enqueue_recv_refill(rxvq, new_mbuf))) {
+                       rte_pktmbuf_free_seg(new_mbuf);
+                       break;
+               }
+       }
+       pi->eth_stats.ipackets += num;
+       return num;
+}
+
+static uint16_t
+eth_xenvirt_tx(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+       struct virtqueue *txvq = tx_queue;
+       struct rte_mbuf *txm;
+       uint16_t nb_used, nb_tx, num, i;
+       int error;
+       uint32_t len[VIRTIO_MBUF_BURST_SZ];
+       struct rte_mbuf *snd_pkts[VIRTIO_MBUF_BURST_SZ];
+       struct pmd_internals *pi = txvq->internals;
+
+       nb_tx = 0;
+
+       if (unlikely(nb_pkts == 0))
+               return 0;
+
+       PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
+       nb_used = VIRTQUEUE_NUSED(txvq);
+
+       rte_compiler_barrier();   /* rmb */
+
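+       /* first reclaim buffers that the host has finished transmitting */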
+       num = (uint16_t)(likely(nb_used <= VIRTIO_MBUF_BURST_SZ) ? nb_used : VIRTIO_MBUF_BURST_SZ);
+       num = virtqueue_dequeue_burst(txvq, snd_pkts, len, num);
+
+       for (i = 0; i < num; i++) {
+               /* mergeable is not supported, one segment only */
+               rte_pktmbuf_free_seg(snd_pkts[i]);
+       }
+
+       while (nb_tx < nb_pkts) {
+               if (likely(!virtqueue_full(txvq))) {
+               /* TODO drop tx_pkts if it contains multiple segments */
+                       txm = tx_pkts[nb_tx];
+                       error = virtqueue_enqueue_xmit(txvq, txm);
+                       if (unlikely(error)) {
+                               if (error == ENOSPC)
+                                       PMD_TX_LOG(ERR, "virtqueue_enqueue Free count = 0\n");
+                               else if (error == EMSGSIZE)
+                                       PMD_TX_LOG(ERR, "virtqueue_enqueue Free count < 1\n");
+                               else
+                                       PMD_TX_LOG(ERR, "virtqueue_enqueue error: %d\n", error);
+                               break;
+                       }
+                       nb_tx++;
+               } else {
+                       PMD_TX_LOG(ERR, "No free tx descriptors to transmit\n");
+                       /* virtqueue_notify not needed in our para-virt solution */
+                       break;
+               }
+       }
+       pi->eth_stats.opackets += nb_tx;
+       return nb_tx;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+       RTE_LOG(ERR, PMD, "%s\n", __func__);
+       return 0;
+}
+
+/*
+ * Create a shared page between guest and host.
+ * The host monitors this page: if it is cleared on unmap, the host
+ * then does the necessary clean up.
+ */
+static void
+gntalloc_vring_flag(int vtidx)
+{
+       char key_str[PATH_MAX];
+       char val_str[PATH_MAX];
+       uint32_t gref_tmp;
+       void *ptr;
+
+       if (grefwatch_from_alloc(&gref_tmp, &ptr)) {
+               RTE_LOG(ERR, PMD, "grefwatch_from_alloc error\n");
+               exit(EXIT_FAILURE);
+       }
+
+       *(uint8_t *)ptr = MAP_FLAG;
+       rte_snprintf(val_str, sizeof(val_str), "%u", gref_tmp);
+       rte_snprintf(key_str, sizeof(key_str),
+               DPDK_XENSTORE_PATH"%d"VRING_FLAG_STR, vtidx);
+       xenstore_write(key_str, val_str);
+}
+
+/*
+ * Notify the host that this virtio device has started,
+ * so the host can start polling it.
+ */
+static void
+dev_start_notify(int vtidx)
+{
+       char key_str[PATH_MAX];
+       char val_str[PATH_MAX];
+
+       RTE_LOG(INFO, PMD, "%s: virtio %d is started\n", __func__, vtidx);
+       gntalloc_vring_flag(vtidx);
+
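+       /* the host back end watches this key and starts polling the device */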
+       rte_snprintf(key_str, sizeof(key_str), "%s%s%d",
+               DPDK_XENSTORE_PATH, EVENT_TYPE_START_STR,
+                       vtidx);
+       rte_snprintf(val_str, sizeof(val_str), "1");
+       xenstore_write(key_str, val_str);
+}
+
+/*
+ * Notify the host that this virtio device has stopped,
+ * so the host can stop polling it.
+ */
+static void
+dev_stop_notify(int vtidx)
+{
+       RTE_SET_USED(vtidx);
+}
+
+
+static int
+update_mac_address(struct ether_addr *mac_addrs, int vtidx)
+{
+       char key_str[PATH_MAX];
+       char val_str[PATH_MAX];
+       int rv;
+
+       if (mac_addrs == NULL) {
+               RTE_LOG(ERR, PMD, "%s: NULL pointer mac specified\n", __func__);
+               return -1;
+       }
+       rv = rte_snprintf(key_str, sizeof(key_str),
+                       DPDK_XENSTORE_PATH"%d_ether_addr", vtidx);
+       if (rv == -1)
+               return rv;
+       rv = rte_snprintf(val_str, sizeof(val_str), "%02x:%02x:%02x:%02x:%02x:%02x",
+                       mac_addrs->addr_bytes[0],
+                       mac_addrs->addr_bytes[1],
+                       mac_addrs->addr_bytes[2],
+                       mac_addrs->addr_bytes[3],
+                       mac_addrs->addr_bytes[4],
+                       mac_addrs->addr_bytes[5]);
+       if (rv == -1)
+               return rv;
+       if (xenstore_write(key_str, val_str))
+               return -1;
+       return 0;
+}
+
+
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+       struct virtqueue *rxvq = dev->data->rx_queues[0];
+       struct virtqueue *txvq = dev->data->tx_queues[0];
+       struct rte_mbuf *m;
+       struct pmd_internals *pi = (struct pmd_internals *)dev->data->dev_private;
+       int rv;
+
+       dev->data->dev_link.link_status = 1;
+       while (!virtqueue_full(rxvq)) {
+               m = rte_rxmbuf_alloc(rxvq->mpool);
+               if (m == NULL)
+                       break;
+               /* Enqueue allocated buffers. */
+               if (virtqueue_enqueue_recv_refill(rxvq, m)) {
+                       rte_pktmbuf_free_seg(m);
+                       break;
+               }
+       }
+
+       rxvq->internals = pi;
+       txvq->internals = pi;
+
+       rv = update_mac_address(dev->data->mac_addrs, pi->virtio_idx);
+       if (rv)
+               return -1;
+       dev_start_notify(pi->virtio_idx);
+
+       return 0;
+}
+
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+       struct pmd_internals *pi = (struct pmd_internals *)dev->data->dev_private;
+
+       dev->data->dev_link.link_status = 0;
+       dev_stop_notify(pi->virtio_idx);
+}
+
+/*
+ * Notify the host that this virtio device is closed,
+ * so the host can do the necessary clean up for it.
+ */
+static void
+eth_dev_close(struct rte_eth_dev *dev)
+{
+       RTE_SET_USED(dev);
+}
+
+static void
+eth_dev_info(struct rte_eth_dev *dev,
+               struct rte_eth_dev_info *dev_info)
+{
+       struct pmd_internals *internals = dev->data->dev_private;
+
+       RTE_SET_USED(internals);
+       dev_info->driver_name = drivername;
+       dev_info->max_mac_addrs = 1;
+       dev_info->max_rx_pktlen = (uint32_t)2048;
+       dev_info->max_rx_queues = (uint16_t)1;
+       dev_info->max_tx_queues = (uint16_t)1;
+       dev_info->min_rx_bufsize = 0;
+       dev_info->pci_dev = NULL;
+}
+
+static void
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+       struct pmd_internals *internals = dev->data->dev_private;
+       if(stats)
+               rte_memcpy(stats, &internals->eth_stats, sizeof(*stats));
+}
+
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+       struct pmd_internals *internals = dev->data->dev_private;
+       /* Reset software totals */
+       memset(&internals->eth_stats, 0, sizeof(internals->eth_stats));
+}
+
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+               int wait_to_complete __rte_unused)
+{
+       return 0;
+}
+
+/*
+ * Create a shared vring between guest and host.
+ * Memory is allocated through the grant alloc driver, so it is not physically contiguous.
+ */
+static void *
+gntalloc_vring_create(int queue_type, uint32_t size, int vtidx)
+{
+       char key_str[PATH_MAX] = {0};
+       char val_str[PATH_MAX] = {0};
+       void *va = NULL;
+       int pg_size;
+       uint32_t pg_num;
+       uint32_t *gref_arr = NULL;
+       phys_addr_t *pa_arr = NULL;
+       uint64_t start_index;
+       int rv;
+
+       pg_size = getpagesize();
+       size    = RTE_ALIGN_CEIL(size, pg_size);
+       pg_num  = size / pg_size;
+
+       gref_arr = calloc(pg_num, sizeof(gref_arr[0]));
+       pa_arr  = calloc(pg_num, sizeof(pa_arr[0]));
+
+       if (gref_arr == NULL || pa_arr == NULL) {
+               RTE_LOG(ERR, PMD, "%s: calloc failed\n", __func__);
+               goto out;
+       }
+
+       va  = gntalloc(size, gref_arr, &start_index);
+       if (va == NULL) {
+               RTE_LOG(ERR, PMD, "%s: gntalloc failed\n", __func__);
+               goto out;
+       }
+
+       if (get_phys_map(va, pa_arr, pg_num, pg_size))
+               goto out;
+
+       /* write in xenstore gref and pfn for each page of vring */
+       if (grant_node_create(pg_num, gref_arr, pa_arr, val_str, sizeof(val_str))) {
+               gntfree(va, size, start_index);
+               va = NULL;
+               goto out;
+       }
+
+       if (queue_type == VTNET_RQ)
+               rv = rte_snprintf(key_str, sizeof(key_str), DPDK_XENSTORE_PATH"%d"RXVRING_XENSTORE_STR, vtidx);
+       else 
+               rv = rte_snprintf(key_str, sizeof(key_str), DPDK_XENSTORE_PATH"%d"TXVRING_XENSTORE_STR, vtidx);
+       if (rv == -1 || xenstore_write(key_str, val_str) == -1) {
+               gntfree(va, size, start_index);
+               va = NULL;
+       }
+out:
+       if (pa_arr)
+               free(pa_arr);
+       if (gref_arr)
+               free(gref_arr);
+
+       return va;
+}
+
+
+
+static struct virtqueue *
+virtio_queue_setup(struct rte_eth_dev *dev, int queue_type)
+{
+       struct virtqueue *vq = NULL;
+       uint16_t vq_size = VQ_DESC_NUM;
+       int i = 0;
+       char vq_name[VIRTQUEUE_MAX_NAME_SZ];
+       size_t size;
+       struct vring *vr;
+
+       /* Allocate memory for virtqueue. */
+       if (queue_type == VTNET_RQ) {
+               rte_snprintf(vq_name, sizeof(vq_name), "port%d_rvq",
+                               dev->data->port_id);
+               vq = rte_zmalloc(vq_name, sizeof(struct virtqueue) +
+                       vq_size * sizeof(struct vq_desc_extra), CACHE_LINE_SIZE);
+               if (vq == NULL) {
+                       RTE_LOG(ERR, PMD, "%s: unable to allocate virtqueue\n", __func__);
+                       return NULL;
+               }
+               memcpy(vq->vq_name, vq_name, sizeof(vq->vq_name));
+       } else if (queue_type == VTNET_TQ) {
+               rte_snprintf(vq_name, sizeof(vq_name), "port%d_tvq",
+                       dev->data->port_id);
+               vq = rte_zmalloc(vq_name, sizeof(struct virtqueue) +
+                       vq_size * sizeof(struct vq_desc_extra), CACHE_LINE_SIZE);
+               if (vq == NULL) {
+                       RTE_LOG(ERR, PMD, "%s: unable to allocate virtqueue\n", __func__);
+                       return NULL;
+               }
+               memcpy(vq->vq_name, vq_name, sizeof(vq->vq_name));
+       }
+
+       vq->vq_alignment = VIRTIO_PCI_VRING_ALIGN;
+       vq->vq_nentries = vq_size;
+       vq->vq_free_cnt = vq_size;
+       /* Calculate vring size according to virtio spec */
+       size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN);
+       vq->vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
+       /* Allocate memory for virtio vring through gntalloc driver*/
+       vq->vq_ring_virt_mem = gntalloc_vring_create(queue_type, vq->vq_ring_size,
+               ((struct pmd_internals *)dev->data->dev_private)->virtio_idx);
+       if (vq->vq_ring_virt_mem == NULL) {
+               RTE_LOG(ERR, PMD, "%s: failed to create vring\n", __func__);
+               rte_free(vq);
+               return NULL;
+       }
+       memset(vq->vq_ring_virt_mem, 0, vq->vq_ring_size);
+       vr = &vq->vq_ring;
+       vring_init(vr, vq_size, vq->vq_ring_virt_mem, vq->vq_alignment);
+       /*
+        * Locally maintained last consumed index; this index trails
+        * vq_ring.used->idx.
+        */
+       vq->vq_used_cons_idx = 0;
+       vq->vq_desc_head_idx = 0;
+       vq->vq_free_cnt = vq->vq_nentries;
+       memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
+
+       /* Chain all the descriptors in the ring with an END */
+       for (i = 0; i < vq_size - 1; i++)
+               vr->desc[i].next = (uint16_t)(i + 1);
+       vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
+
+       return vq;
+}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+                               uint16_t nb_rx_desc __rte_unused,
+                               unsigned int socket_id __rte_unused,
+                               const struct rte_eth_rxconf *rx_conf __rte_unused,
+                               struct rte_mempool *mb_pool)
+{
+       struct virtqueue *vq;
+       vq = dev->data->rx_queues[rx_queue_id] = virtio_queue_setup(dev, VTNET_RQ);
+       vq->mpool = mb_pool;
+       return 0;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+                               uint16_t nb_tx_desc __rte_unused,
+                               unsigned int socket_id __rte_unused,
+                               const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+       dev->data->tx_queues[tx_queue_id] = virtio_queue_setup(dev, VTNET_TQ);
+       return 0;
+}
+
+
+
+static struct eth_dev_ops ops = {
+               .dev_start = eth_dev_start,
+               .dev_stop = eth_dev_stop,
+               .dev_close = eth_dev_close,
+               .dev_configure = eth_dev_configure,
+               .dev_infos_get = eth_dev_info,
+               .rx_queue_setup = eth_rx_queue_setup,
+               .tx_queue_setup = eth_tx_queue_setup,
+               .rx_queue_release = eth_queue_release,
+               .tx_queue_release = eth_queue_release,
+               .link_update = eth_link_update,
+               .stats_get = eth_stats_get,
+               .stats_reset = eth_stats_reset,
+};
+
+
+static int 
+rte_eth_xenvirt_parse_args(struct xenvirt_dict *dict,
+                       const char *name, const char *params)
+{
+       int i;
+       char *pairs[RTE_ETH_XENVIRT_MAX_ARGS];
+       int num_of_pairs;
+       char *pair[2];
+       char *args;
+       int ret = -1;
+
+       if (params == NULL)
+               return 0;
+
+       args = rte_zmalloc(NULL, strlen(params) + 1, CACHE_LINE_SIZE);
+       if (args == NULL) {
+               RTE_LOG(ERR, PMD, "Couldn't parse %s device \n", name);
+               return -1;
+       }
+       rte_memcpy(args, params, strlen(params));
+
+       num_of_pairs = rte_strsplit(args, strnlen(args, MAX_ARG_STRLEN),
+                                       pairs,
+                                       RTE_ETH_XENVIRT_MAX_ARGS ,
+                                       RTE_ETH_XENVIRT_PAIRS_DELIM);
+
+       for (i = 0; i < num_of_pairs; i++) {
+               pair[0] = NULL;
+               pair[1] = NULL;
+               rte_strsplit(pairs[i], strnlen(pairs[i], MAX_ARG_STRLEN),
+                                       pair, 2,
+                                       RTE_ETH_XENVIRT_KEY_VALUE_DELIM);
+
+               if (pair[0] == NULL || pair[1] == NULL || pair[0][0] == 0
+                       || pair[1][0] == 0) {
+                       RTE_LOG(ERR, PMD,
+                               "Couldn't parse %s device, "
+                               "wrong key or value\n", name);
+                       goto err;
+               }
+
+               if (!strncmp(pair[0], RTE_ETH_XENVIRT_MAC_PARAM,
+                               sizeof(RTE_ETH_XENVIRT_MAC_PARAM))) {
+                       if (cmdline_parse_etheraddr(NULL,
+                                                       pair[1],
+                                                       &dict->addr) < 0) {
+                               RTE_LOG(ERR, PMD,
+                                       "Invalid %s device ether address\n",
+                                       name);
+                               goto err;
+                       }
+
+                       dict->addr_valid = 1;
+               }
+       }
+
+       ret = 0;
+err:
+       rte_free(args);
+       return ret;
+}
+
+enum dev_action {
+       DEV_CREATE,
+       DEV_ATTACH
+};
+
+
+static int
+eth_dev_xenvirt_create(const char *name, const char *params,
+               const unsigned numa_node,
+                enum dev_action action)
+{
+       struct rte_eth_dev_data *data = NULL;
+       struct rte_pci_device *pci_dev = NULL;
+       struct pmd_internals *internals = NULL;
+       struct rte_eth_dev *eth_dev = NULL;
+       struct xenvirt_dict dict;
+       bzero(&dict, sizeof(struct xenvirt_dict));
+
+       RTE_LOG(INFO, PMD, "Creating virtio rings backed ethdev on numa socket %u\n",
+                       numa_node);
+       RTE_SET_USED(action);
+
+       if (rte_eth_xenvirt_parse_args(&dict, name, params) < 0) {
+               RTE_LOG(ERR, PMD, "%s: Failed to parse ethdev parameters\n", __func__);
+               return -1;
+       }
+
+       /* now do all data allocation - for eth_dev structure, dummy pci driver
+        * and internal (private) data
+        */
+       data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+       if (data == NULL)
+               goto err;
+
+       pci_dev = rte_zmalloc_socket(name, sizeof(*pci_dev), 0, numa_node);
+       if (pci_dev == NULL)
+               goto err;
+
+       internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+       if (internals == NULL)
+               goto err;
+
+       /* reserve an ethdev entry */
+       eth_dev = rte_eth_dev_allocate();
+       if (eth_dev == NULL)
+               goto err;
+
+       pci_dev->numa_node = numa_node;
+
+       data->dev_private = internals;
+       data->port_id = eth_dev->data->port_id;
+       data->nb_rx_queues = (uint16_t)1;
+       data->nb_tx_queues = (uint16_t)1;
+       data->dev_link = pmd_link;
+       data->mac_addrs = rte_zmalloc("xen_virtio", ETHER_ADDR_LEN, 0);
+
+       if(dict.addr_valid)
+               memcpy(&data->mac_addrs->addr_bytes, &dict.addr, sizeof(struct ether_addr));
+       else
+               eth_random_addr(&data->mac_addrs->addr_bytes[0]);
+
+       eth_dev->data = data;
+       eth_dev->dev_ops = &ops;
+       eth_dev->pci_dev = pci_dev;
+
+       eth_dev->rx_pkt_burst = eth_xenvirt_rx;
+       eth_dev->tx_pkt_burst = eth_xenvirt_tx;
+
+       internals->virtio_idx = virtio_idx++;
+       internals->port_id = eth_dev->data->port_id;
+
+       return 0;
+
+err:
+       if (data)
+               rte_free(data);
+       if (pci_dev)
+               rte_free(pci_dev);
+       if (internals)
+               rte_free(internals);
+       return -1;
+}
+
+
+/*TODO: Support multiple process model */
+int
+rte_pmd_xenvirt_init(const char *name, const char *params)
+{
+       if (virtio_idx == 0) {
+               if (xenstore_init() != 0) {
+                       RTE_LOG(ERR, PMD, "%s: xenstore init failed\n", __func__);
+                       return -1;
+               }
+               if (gntalloc_open() != 0) {
+                       RTE_LOG(ERR, PMD, "%s: grant init failed\n", __func__);
+                       return -1;
+               }
+       }
+       return eth_dev_xenvirt_create(name, params, rte_socket_id(), DEV_CREATE);
+}
diff --git a/lib/librte_pmd_xenvirt/rte_eth_xenvirt.h b/lib/librte_pmd_xenvirt/rte_eth_xenvirt.h
new file mode 100644 (file)
index 0000000..cb1924a
--- /dev/null
@@ -0,0 +1,70 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_XENVIRT_H_
+#define _RTE_ETH_XENVIRT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_mempool.h>
+#include <rte_ring.h>
+
+#define RTE_ETH_XENVIRT_PARAM_NAME "eth_xenvirt"
+
+/**
+ * For use by the EAL only. Called as part of EAL init to set up any dummy NICs
+ * configured on command line.
+ */
+int rte_pmd_xenvirt_init(const char *name, const char *params);
+
+/**
+ * Creates mempool for xen virtio PMD.
+ * This function uses memzone_reserve to allocate memory for meta data,
+ * and uses grant alloc driver to allocate memory for data area.
+ * The input parameters are exactly the same as rte_mempool_create.
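+ *
+ * A minimal usage sketch (the pool name, element count and element size below
+ * are illustrative only, mirroring a typical rte_mempool_create() call):
+ *
+ *   mp = rte_mempool_gntalloc_create("xen_mbuf_pool", 8192,
+ *           2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM,
+ *           32, sizeof(struct rte_pktmbuf_pool_private),
+ *           rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, NULL,
+ *           rte_socket_id(), 0);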
+ */
+struct rte_mempool *
+rte_mempool_gntalloc_create(const char *name, unsigned elt_num, unsigned elt_size,
+                  unsigned cache_size, unsigned private_data_size,
+                  rte_mempool_ctor_t *mp_init, void *mp_init_arg,
+                  rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
+                  int socket_id, unsigned flags);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/librte_pmd_xenvirt/rte_mempool_gntalloc.c b/lib/librte_pmd_xenvirt/rte_mempool_gntalloc.c
new file mode 100644 (file)
index 0000000..c1c2db0
--- /dev/null
@@ -0,0 +1,298 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <xen/sys/gntalloc.h>
+
+#include <rte_common.h>
+#include <rte_mempool.h>
+#include <rte_memory.h>
+#include <rte_errno.h>
+
+#include "rte_xen_lib.h"
+#include "rte_eth_xenvirt.h"
+
+struct _gntarr {
+       uint32_t gref;
+       phys_addr_t pa;
+       uint64_t index;
+       void *va;
+};
+
+struct _mempool_gntalloc_info {
+       struct rte_mempool *mp;
+       uint32_t pg_num;
+       uint32_t *gref_arr;
+       phys_addr_t *pa_arr;
+       void *va;
+       uint32_t mempool_idx;
+       uint64_t start_index;
+};
+
+
+static rte_atomic32_t global_xenvirt_mempool_idx = RTE_ATOMIC32_INIT(-1);
+
+static int
+compare(const void *p1, const void *p2)
+{
+       const struct _gntarr *a = p1;
+       const struct _gntarr *b = p2;
+
+       /* compare without truncating the 64-bit address difference to int */
+       return (a->pa < b->pa) ? -1 : (a->pa > b->pa);
+}
+
+
+static struct _mempool_gntalloc_info
+_create_mempool(const char *name, unsigned elt_num, unsigned elt_size,
+                  unsigned cache_size, unsigned private_data_size,
+                  rte_mempool_ctor_t *mp_init, void *mp_init_arg,
+                  rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
+                  int socket_id, unsigned flags)
+{
+       struct _mempool_gntalloc_info mgi;
+       struct rte_mempool *mp = NULL;
+       struct rte_mempool_objsz  objsz;
+       uint32_t pg_num, rpg_num, pg_shift, pg_sz;
+       char *va, *orig_va, *uv; /* uv: address from which the unused pages can be freed */
+       ssize_t sz, usz; /* usz: unused size */
+       /*
+        * for each page allocated through xen_gntalloc driver,
+        * gref_arr: stores grant references,
+        * pa_arr: stores physical addresses,
+        * gnt_arr: stores all meta data
+        */
+       uint32_t *gref_arr = NULL;
+       phys_addr_t *pa_arr = NULL;
+       struct _gntarr *gnt_arr = NULL;
+       /* start index of the grant references, used for dealloc */
+       uint64_t start_index;
+       uint32_t i, j;
+       int rv = 0;
+       struct ioctl_gntalloc_dealloc_gref arg;
+
+       mgi.mp = NULL;
+       va = orig_va = uv = NULL;
+       pg_num = rpg_num = 0;
+       sz = 0;
+
+       pg_sz = getpagesize();
+       if (rte_is_power_of_2(pg_sz) == 0) {
+               goto out;
+       }
+       pg_shift = rte_bsf32(pg_sz);
+
+       rte_mempool_calc_obj_size(elt_size, flags, &objsz);
+       sz = rte_mempool_xmem_size(elt_num, objsz.total_size, pg_shift);
+       pg_num = sz >> pg_shift;
+
+       pa_arr = calloc(pg_num, sizeof(pa_arr[0]));
+       gref_arr = calloc(pg_num, sizeof(gref_arr[0]));
+       gnt_arr  = calloc(pg_num, sizeof(gnt_arr[0]));
+       if ((gnt_arr == NULL) || (gref_arr == NULL) || (pa_arr == NULL))
+               goto out;
+
+       /* grant index is continuous in ascending order */
+       orig_va = gntalloc(sz, gref_arr, &start_index);
+       if (orig_va == NULL)
+               goto out;
+
+       get_phys_map(orig_va, pa_arr, pg_num, pg_sz);
+       for (i = 0; i < pg_num; i++) {
+               gnt_arr[i].index = start_index + i * pg_sz;
+               gnt_arr[i].gref = gref_arr[i];
+               gnt_arr[i].pa = pa_arr[i];
+               gnt_arr[i].va  = RTE_PTR_ADD(orig_va, i * pg_sz);
+       }
+       qsort(gnt_arr, pg_num, sizeof(struct _gntarr), compare);
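+       /*
+        * sort by physical address so that, after the per-page remap below,
+        * physically contiguous pages are also virtually contiguous; this
+        * reduces waste when objects span page boundaries.
+        */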
+
+       va = get_xen_virtual(sz, pg_sz);
+       if (va == NULL) {
+               goto out;
+       }
+
+       /*
+        * map one by one, as the indexes aren't contiguous any more.
+        * this creates pg_num VMAs; does Linux have a limitation on this?
+        */
+       for (i = 0; i < pg_num; i++) {
+       /* update gref_arr and pa_arr after sort */
+               gref_arr[i] = gnt_arr[i].gref;
+               pa_arr[i]   = gnt_arr[i].pa;
+               gnt_arr[i].va = mmap(va + i * pg_sz, pg_sz, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_FIXED, gntalloc_fd, gnt_arr[i].index);
+               if ((gnt_arr[i].va == MAP_FAILED) || (gnt_arr[i].va != (va + i * pg_sz))) {
+                       RTE_LOG(ERR, PMD, "failed to map %d pages\n", i);
+                       goto mmap_failed;
+               }
+       }
+
+       /*
+        * Check that allocated size is big enough to hold elt_num
+        * objects and calculate how many bytes are actually required.
+        */
+       usz = rte_mempool_xmem_usage(va, elt_num, objsz.total_size, pa_arr, pg_num, pg_shift);
+       if (usz < 0) {
+               mp = NULL;
+               i = pg_num;
+               goto mmap_failed;
+       } else {
+               /* unmap unused pages if any */
+               uv = RTE_PTR_ADD(va, usz);
+               if ((usz = va + sz - uv) > 0) {
+
+                       RTE_LOG(ERR, PMD,
+                               "%s(%s): unmap unused %zu of %zu "
+                               "mmaped bytes @%p orig:%p\n",
+                               __func__, name, usz, sz, uv, va);
+                       munmap(uv, usz);
+                       i = (sz - usz) / pg_sz;
+                       for (; i < pg_num; i++) {
+                               arg.count = 1;
+                               arg.index = gnt_arr[i].index;
+                               rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg);
+                               if (rv) {
+                                       /* shouldn't fail here */
+                                       RTE_LOG(ERR, PMD, "va=%p pa=%p index=%p %s\n",
+                                               gnt_arr[i].va,
+                                               (void *)gnt_arr[i].pa, 
+                                               (void *)arg.index, strerror(errno));
+                                       rte_panic("gntdealloc failed when freeing pages\n");
+                               }
+                       }
+
+                       rpg_num = (sz - usz) >> pg_shift;
+               } else
+                       rpg_num = pg_num;
+                       
+               mp = rte_mempool_xmem_create(name, elt_num, elt_size,
+                               cache_size, private_data_size,
+                               mp_init, mp_init_arg,
+                               obj_init, obj_init_arg,
+                               socket_id, flags, va, pa_arr, rpg_num, pg_shift);
+
+               RTE_VERIFY(elt_num == mp->size);
+       }
+       mgi.mp = mp;
+       mgi.pg_num = rpg_num;
+       mgi.gref_arr = gref_arr;
+       mgi.pa_arr = pa_arr;
+       if (mp)
+               mgi.mempool_idx = rte_atomic32_add_return(&global_xenvirt_mempool_idx, 1);
+       mgi.start_index = start_index;
+       mgi.va = va;
+
+       if (mp == NULL) {
+               i = pg_num;
+               goto mmap_failed;
+       } 
+
+/*
+ * unmap only, without deallocating the grant references.
+ * unused pages have already been unmapped;
+ * unmapping them twice will fail, but it is safe.
+ */
+mmap_failed:
+       for (j = 0; j < i; j++) {
+               if (gnt_arr[j].va)
+                       munmap(gnt_arr[j].va, pg_sz);
+       }
+out:
+       if (gnt_arr)
+               free(gnt_arr);
+       if (orig_va)
+               munmap(orig_va, sz);
+       if (mp == NULL) {
+               if (gref_arr)
+                       free(gref_arr);
+               if (pa_arr)
+                       free(pa_arr);
+
+               /* some grefs have already been de-allocated from the list in the driver,
+                * so dealloc one by one; it is safe to deallocate twice
+                */
+               if (orig_va) {
+                       for (i = 0; i < pg_num; i++) {
+                               arg.count = 1;
+                               arg.index = start_index + i * pg_sz;
+                               rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg);
+                       }
+               }
+       }
+       return mgi;
+}
+
+struct rte_mempool *
+rte_mempool_gntalloc_create(const char *name, unsigned elt_num, unsigned elt_size,
+                  unsigned cache_size, unsigned private_data_size,
+                  rte_mempool_ctor_t *mp_init, void *mp_init_arg,
+                  rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
+                  int socket_id, unsigned flags)
+{
+       int rv;
+       uint32_t i;
+       struct _mempool_gntalloc_info mgi;
+       struct ioctl_gntalloc_dealloc_gref arg;
+       int pg_sz = getpagesize();
+
+       mgi = _create_mempool(name, elt_num, elt_size,
+                       cache_size, private_data_size,
+                       mp_init, mp_init_arg,
+                       obj_init, obj_init_arg,
+                       socket_id, flags);
+       if (mgi.mp) {
+               rv = grant_gntalloc_mbuf_pool(mgi.mp,
+                       mgi.pg_num,
+                       mgi.gref_arr,
+                       mgi.pa_arr,
+                       mgi.mempool_idx);
+               free(mgi.gref_arr);
+               free(mgi.pa_arr);
+               if (rv == 0)
+                       return mgi.mp;
+               /*
+                * in _create_mempool, unused pages have already been unmapped and deallocated;
+                * unmap and dealloc the remaining ones here.
+                */
+               munmap(mgi.va, pg_sz * mgi.pg_num);
+               for (i = 0; i < mgi.pg_num; i++) {
+                       arg.count = 1;
+                       arg.index = mgi.start_index + i * pg_sz;
+                       rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg);
+               }
+               return NULL;
+       }
+       return NULL;
+}
diff --git a/lib/librte_pmd_xenvirt/rte_xen_lib.c b/lib/librte_pmd_xenvirt/rte_xen_lib.c
new file mode 100644 (file)
index 0000000..1baa7e4
--- /dev/null
@@ -0,0 +1,430 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <xen/xen-compat.h>
+#if __XEN_LATEST_INTERFACE_VERSION__ < 0x00040200
+#include <xs.h>
+#else
+#include <xenstore.h>
+#endif
+#include <xen/sys/gntalloc.h>
+
+#include <rte_common.h>
+#include <rte_string_fns.h>
+
+#include "rte_xen_lib.h"
+
+/*
+ * The grant node format in xenstore for vring/mpool is:
+ * 0_rx_vring_gref = "gref1#, gref2#, gref3#"
+ * 0_mempool_gref  = "gref1#, gref2#, gref3#"
+ * each gref# is a grant reference for a shared page.
+ * In each shared page, we store the grant_node_item items.
+ */
+struct grant_node_item {
+       uint32_t gref;
+       uint32_t pfn;
+} __attribute__((packed));
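+
+/*
+ * The nodes are created under the local domain path, i.e. the full key for
+ * mempool 0 is "<dompath>/control/dpdk/0_mempool_gref" (see xenstore_write()
+ * below and DPDK_XENSTORE_PATH/MEMPOOL_XENSTORE_STR in rte_xen_lib.h).
+ */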
+
+/* fd for xen_gntalloc driver, used to allocate grant pages */
+int gntalloc_fd = -1;
+
+/* xenstore path for local domain, now it is '/local/domain/domid/' */
+static char *dompath = NULL;
+/* handle to xenstore read/write operations */
+static struct xs_handle *xs = NULL;
+
+/*
+ * Reserve a virtual address space.
+ * On success, returns the pointer. On failure, returns NULL.
+ */
+void *
+get_xen_virtual(size_t size, size_t page_sz)
+{
+       void *addr;
+       uintptr_t aligned_addr;
+
+       addr = mmap(NULL, size + page_sz, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (addr == MAP_FAILED) {
+               RTE_LOG(ERR, PMD, "failed to get a virtual area\n");
+               return NULL;
+       }
+
+       aligned_addr = RTE_ALIGN_CEIL((uintptr_t)addr, page_sz);
+       addr = (void *)(aligned_addr);
+
+       return addr;
+}
+
+/*
+ * Get the physical address for virtual memory starting at va.
+ */
+int
+get_phys_map(void *va, phys_addr_t pa[], uint32_t pg_num, uint32_t pg_sz)
+{
+       int32_t fd, rc = 0;
+       uint32_t i, nb;
+       off_t ofs;
+
+       ofs = (uintptr_t)va / pg_sz * sizeof(*pa);
+       nb = pg_num * sizeof(*pa);
+
+       if ((fd = open(PAGEMAP_FNAME, O_RDONLY)) < 0 ||
+                       (rc = pread(fd, pa, nb, ofs)) < 0 ||
+                       (rc -= nb) != 0) {
+               RTE_LOG(ERR, PMD, "%s: failed read of %u bytes from \'%s\' "
+                       "at offset %zu, error code: %d\n",
+                       __func__, nb, PAGEMAP_FNAME, ofs, errno);
+               rc = ENOENT;
+       }
+
+       close(fd);
+       for (i = 0; i != pg_num; i++)
+               pa[i] = (pa[i] & PAGEMAP_PFN_MASK) * pg_sz;
+
+       return rc;
+}
+
+int
+gntalloc_open(void)
+{
+       gntalloc_fd = open(XEN_GNTALLOC_FNAME, O_RDWR);
+       return (gntalloc_fd != -1) ? 0 : -1;
+}
+
+void
+gntalloc_close(void)
+{
+       if (gntalloc_fd != -1)
+               close(gntalloc_fd);
+       gntalloc_fd = -1;
+}
+
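+/*
+ * Allocate size/page_size grant pages, shared with DOM0_DOMID, through the
+ * gntalloc driver (IOCTL_GNTALLOC_ALLOC_GREF) and map them into this process.
+ * On success returns the mapped virtual address; gref[] (if not NULL) receives
+ * one grant reference per page and *start_index the gntalloc offset needed
+ * later by gntfree().
+ */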
+void *
+gntalloc(size_t size, uint32_t *gref, uint64_t *start_index)
+{
+       int page_size = getpagesize();
+       uint32_t i, pg_num;
+       void *va;
+       int rv;
+       struct ioctl_gntalloc_alloc_gref *arg;
+       struct ioctl_gntalloc_dealloc_gref arg_d;
+
+       if (size % page_size) {
+               RTE_LOG(ERR, PMD, "%s: %zu isn't multiple of page size\n",
+                       __func__, size);
+               return NULL;
+       }
+
+       pg_num = size / page_size;
+       arg = malloc(sizeof(*arg) + (pg_num - 1) * sizeof(uint32_t));
+       if (arg == NULL)
+               return NULL;
+       arg->domid = DOM0_DOMID;
+       arg->flags = GNTALLOC_FLAG_WRITABLE;
+       arg->count = pg_num;
+
+       rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_ALLOC_GREF, arg);
+       if (rv) {
+               RTE_LOG(ERR, PMD, "%s: ioctl error\n", __func__);
+               free(arg);
+               return NULL;
+       }
+
+       va = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gntalloc_fd, arg->index);
+       if (va == MAP_FAILED) {
+               RTE_LOG(ERR, PMD, "%s: mmap failed\n", __func__);
+               arg_d.count = pg_num;
+               arg_d.index = arg->index;
+               ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, arg_d);
+               free(arg);
+               return NULL;
+       }
+
+       if (gref) {
+               for (i = 0; i < pg_num; i++) {
+                       gref[i] = arg->gref_ids[i];
+               }
+       }
+       if (start_index)
+               *start_index = arg->index;
+
+       free(arg);
+
+       return va;
+}
+
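+/*
+ * Allocate and map a single writable grant page and register an
+ * UNMAP_NOTIFY_CLEAR_BYTE notification on it, so the first byte of the page
+ * is cleared when the mapping goes away.  Returns 0 on success, -1 on error.
+ */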
+int
+grefwatch_from_alloc(uint32_t *gref, void **pptr)
+{
+       int rv;
+       void *ptr;
+       int pg_size = getpagesize();
+       struct ioctl_gntalloc_alloc_gref arg = {
+               .domid = DOM0_DOMID,
+               .flags = GNTALLOC_FLAG_WRITABLE,
+               .count = 1
+       };
+       struct ioctl_gntalloc_dealloc_gref arg_d;
+       struct ioctl_gntalloc_unmap_notify notify = {
+               .action = UNMAP_NOTIFY_CLEAR_BYTE
+       };
+
+       rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_ALLOC_GREF, &arg);
+       if (rv) {
+               RTE_LOG(ERR, PMD, "%s: ioctl error\n", __func__);
+               return -1;
+       }
+
+       ptr = (void *)mmap(NULL, pg_size, PROT_READ|PROT_WRITE, MAP_SHARED, gntalloc_fd, arg.index);
+       arg_d.index = arg.index;
+       arg_d.count = 1;
+       if (ptr == MAP_FAILED) {
+               RTE_LOG(ERR, PMD, "%s: mmap failed\n", __func__);
+               ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg_d);
+               return -1;
+       }
+       if (pptr)
+               *pptr = ptr;
+       if (gref)
+               *gref = arg.gref_ids[0];
+
+       notify.index = arg.index;
+       rv = ioctl(gntalloc_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &notify);
+       if (rv) {
+               RTE_LOG(ERR, PMD, "%s: unmap notify failed\n", __func__);
+               munmap(ptr, pg_size);
+               ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg_d);
+               return -1;
+       }
+
+       return 0;
+}
+
+void
+gntfree(void *va, size_t sz, uint64_t start_index)
+{
+       struct ioctl_gntalloc_dealloc_gref arg_d;
+
+       if (va && sz) {
+               munmap(va, sz);
+               arg_d.count = sz / getpagesize();
+               arg_d.index = start_index;
+               ioctl(gntalloc_fd, IOCTL_GNTALLOC_DEALLOC_GREF, &arg_d);
+       }
+}
+
+static int 
+xenstore_cleanup(void)
+{
+       char store_path[PATH_MAX] = {0};
+
+       if (rte_snprintf(store_path, sizeof(store_path),
+               "%s%s", dompath, DPDK_XENSTORE_NODE) == -1)
+               return -1;
+
+       if (xs_rm(xs, XBT_NULL, store_path) == false) {
+               RTE_LOG(ERR, PMD, "%s: failed cleanup node\n", __func__);
+               return -1;
+       }
+
+       return 0;
+}
+
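+/*
+ * Open the xenstore handle, read the local domain id and its xenstore path,
+ * and (on the first call only) remove any stale "<dompath>/control/dpdk" node
+ * left over from a previous run.
+ */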
+int
+xenstore_init(void)
+{
+       unsigned int len, domid;
+       char *buf;
+       static int cleanup = 0;
+       char *end;
+
+       xs = xs_domain_open();
+       if (xs == NULL) {
+               RTE_LOG(ERR, PMD, "%s: xs_domain_open failed\n", __func__);
+               return -1;
+       }
+       buf = xs_read(xs, XBT_NULL, "domid", &len);
+       if (buf == NULL) {
+               RTE_LOG(ERR, PMD, "%s: failed read domid\n", __func__);
+               return -1;
+       }
+       errno = 0;
+       domid = strtoul(buf, &end, 0);
+       if (errno != 0 || end == NULL || end == buf || domid == 0)
+               return -1;
+
+       RTE_LOG(INFO, PMD, "retrieved dom ID = %d\n", domid);
+
+       dompath = xs_get_domain_path(xs, domid);
+       if (dompath == NULL)
+               return -1;
+
+       xs_transaction_start(xs); /* FIXME: the transaction handle is discarded and never ended */
+
+       if (cleanup == 0) {
+               if (xenstore_cleanup())
+                       return -1;
+               cleanup = 1;
+       }
+
+       return 0;
+}
+
+int
+xenstore_write(const char *key_str, const char *val_str)
+{
+       char grant_path[PATH_MAX];
+       int rv, len;
+
+       if (xs == NULL) {
+               RTE_LOG(ERR, PMD, "%s: xenstore init failed\n", __func__);
+               return -1;
+       }
+       rv = rte_snprintf(grant_path, sizeof(grant_path), "%s%s", dompath, key_str);
+       if (rv == -1) {
+               RTE_LOG(ERR, PMD, "%s: rte_snprintf %s %s failed\n",
+                       __func__, dompath, key_str);
+               return -1;
+       }
+       len = strnlen(val_str, PATH_MAX);
+
+       if (xs_write(xs, XBT_NULL, grant_path, val_str, len) == false) {
+               RTE_LOG(ERR, PMD, "%s: xs_write failed\n", __func__);
+               return -1;
+       }
+
+       return 0;
+}
+
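+/*
+ * Pack the (gref, pfn) pairs describing pg_num shared pages into freshly
+ * gntalloc'd pages of struct grant_node_item entries, and write the grant
+ * references of those new pages into val_str as a comma-separated list
+ * (the format documented at the top of this file).  Returns 0 on success.
+ */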
+int
+grant_node_create(uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, char *val_str, size_t str_size)
+{
+       uint64_t start_index;
+       int pg_size;
+       uint32_t pg_shift;
+       void *ptr = NULL;
+       uint32_t count, entries_per_pg;
+       uint32_t i, j = 0, k = 0;
+       uint32_t *gref_tmp;
+       int first = 1;
+       char tmp_str[PATH_MAX] = {0};
+       int rv = -1;
+
+       pg_size = getpagesize();
+       if (rte_is_power_of_2(pg_size) == 0) {
+               return -1;
+       }
+       pg_shift = rte_bsf32(pg_size);
+       if (pg_size % sizeof(struct grant_node_item)) {
+               RTE_LOG(ERR, PMD, "pg_size isn't a multiple of the grant node item size\n");
+               return -1;
+       }
+
+       entries_per_pg = pg_size / sizeof(struct grant_node_item);
+       count = (pg_num + entries_per_pg - 1) / entries_per_pg;
+       gref_tmp = malloc(count * sizeof(uint32_t));
+       if (gref_tmp == NULL)
+               return -1;
+       ptr = gntalloc(pg_size * count, gref_tmp, &start_index);
+       if (ptr == NULL) {
+               RTE_LOG(ERR, PMD, "%s: gntalloc error of %d pages\n", __func__, count);
+               free(gref_tmp);
+               return -1;
+       }
+
+       while (j < pg_num) {
+               if (first) {
+                       rv = rte_snprintf(val_str, str_size, "%u", gref_tmp[k]);
+                       first = 0;
+               } else {
+                       rte_snprintf(tmp_str, PATH_MAX, "%s", val_str);
+                       rv = rte_snprintf(val_str, str_size, "%s,%u", tmp_str, gref_tmp[k]);
+               }
+               k++;
+               if (rv == -1)
+                       break;
+
+               for (i = 0; i < entries_per_pg && j < pg_num ; i++) {
+                       ((struct grant_node_item *)ptr)->gref = gref_arr[j];
+                       ((struct grant_node_item *)ptr)->pfn =  pa_arr[j] >> pg_shift;
+                       ptr = RTE_PTR_ADD(ptr, sizeof(struct grant_node_item));
+                       j++;
+               }
+       }
+       if (rv == -1) {
+               gntfree(ptr, pg_size * count, start_index);
+       } else
+               rv = 0;
+       free(gref_tmp);
+       return rv;
+}
+
+
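+/*
+ * Publish a mempool's sharing information to xenstore: "<idx>_mempool_gref"
+ * holds the gref list produced by grant_node_create() and "<idx>_mempool_va"
+ * the guest virtual address of the mempool's element area.
+ */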
+int
+grant_gntalloc_mbuf_pool(struct rte_mempool *mpool, uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, int mempool_idx)
+{
+       char key_str[PATH_MAX] = {0};
+       char val_str[PATH_MAX] = {0};
+
+       rte_snprintf(val_str, sizeof(val_str), "");
+
+       if (grant_node_create(pg_num, gref_arr, pa_arr, val_str, sizeof(val_str))) {
+               return -1;
+       }
+
+       if (rte_snprintf(key_str, sizeof(key_str),
+               DPDK_XENSTORE_PATH"%d"MEMPOOL_XENSTORE_STR, mempool_idx) == -1)
+               return -1;
+       if (xenstore_write(key_str, val_str) == -1)
+               return -1;
+
+       if (rte_snprintf(key_str, sizeof(key_str),
+               DPDK_XENSTORE_PATH"%d"MEMPOOL_VA_XENSTORE_STR, mempool_idx) == -1)
+               return -1;
+       if (rte_snprintf(val_str, sizeof(val_str), "%p", (void *)(uintptr_t)mpool->elt_va_start) == -1)
+               return -1;
+       if (xenstore_write(key_str, val_str) == -1)
+               return -1;
+
+       return 0;
+}
diff --git a/lib/librte_pmd_xenvirt/rte_xen_lib.h b/lib/librte_pmd_xenvirt/rte_xen_lib.h
new file mode 100644 (file)
index 0000000..e555893
--- /dev/null
@@ -0,0 +1,113 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_XEN_DUMMY_PMD_H
+#define _RTE_XEN_DUMMY_PMD_H
+
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_mempool.h>
+#include <rte_ether.h>
+
+#define PAGEMAP_FNAME           "/proc/self/pagemap"
+#define XEN_GNTALLOC_FNAME      "/dev/xen/gntalloc"
+#define DPDK_XENSTORE_PATH      "/control/dpdk/"
+#define DPDK_XENSTORE_NODE      "/control/dpdk"
+/* format: 0_mempool_gref = "1537,1524,1533" */
+#define MEMPOOL_XENSTORE_STR    "_mempool_gref"
+/* format: 0_mempool_va = 0x80340000 */
+#define MEMPOOL_VA_XENSTORE_STR "_mempool_va"
+/* format: 0_rx_vring_gref = "1537,1524,1533" */
+#define RXVRING_XENSTORE_STR    "_rx_vring_gref"
+/* format: 0_tx_vring_gref = "1537,1524,1533" */
+#define TXVRING_XENSTORE_STR    "_tx_vring_gref"
+#define VRING_FLAG_STR          "_vring_flag"
+/* format: event_type_start_0 = 1 */
+#define EVENT_TYPE_START_STR    "event_type_start_"
+
+#define DOM0_DOMID 0
+/*
+ * The pfn (page frame number) occupies bits 0-54 of a pagemap entry
+ * (see pagemap.txt in the Linux Documentation).
+ */
+#define PAGEMAP_PFN_BITS       54
+#define PAGEMAP_PFN_MASK       RTE_LEN2MASK(PAGEMAP_PFN_BITS, phys_addr_t)
+
+#define MAP_FLAG       0xA5
+
+#define RTE_ETH_XENVIRT_PAIRS_DELIM    ';'
+#define RTE_ETH_XENVIRT_KEY_VALUE_DELIM        '='
+#define RTE_ETH_XENVIRT_MAX_ARGS       1
+#define RTE_ETH_XENVIRT_MAC_PARAM      "mac"
+struct xenvirt_dict {
+       uint8_t addr_valid;
+       struct ether_addr addr;
+};
+
+extern int gntalloc_fd;
+
+int
+gntalloc_open(void);
+
+void
+gntalloc_close(void);
+
+void *
+gntalloc(size_t sz, uint32_t *gref, uint64_t *start_index);
+
+void
+gntfree(void *va, size_t sz, uint64_t start_index);
+
+int
+xenstore_init(void);
+
+int
+xenstore_write(const char *key_str, const char *val_str);
+
+int
+get_phys_map(void *va, phys_addr_t pa[], uint32_t pg_num, uint32_t pg_sz);
+
+void *
+get_xen_virtual(size_t size, size_t page_sz);
+
+int
+grefwatch_from_alloc(uint32_t *gref, void **pptr);
+
+int
+grant_node_create(uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, char *val_str, size_t str_size);
+
+int
+grant_gntalloc_mbuf_pool(struct rte_mempool *mpool, uint32_t pg_num, uint32_t *gref_arr, phys_addr_t *pa_arr, int mempool_idx);
+
+#endif
diff --git a/lib/librte_pmd_xenvirt/virtio_logs.h b/lib/librte_pmd_xenvirt/virtio_logs.h
new file mode 100644 (file)
index 0000000..2591c6a
--- /dev/null
@@ -0,0 +1,70 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_LOGS_H_
+#define _VIRTIO_LOGS_H_
+
+#include <rte_log.h>
+
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_INIT
+#define PMD_INIT_LOG(level, fmt, args...) \
+       RTE_LOG(level, PMD, "%s(): " fmt "\n", __func__, ## args)
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+#else
+#define PMD_INIT_LOG(level, fmt, args...) do { } while(0)
+#define PMD_INIT_FUNC_TRACE() do { } while(0)
+#endif
+
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_RX
+#define PMD_RX_LOG(level, fmt, args...) \
+       RTE_LOG(level, PMD, "%s() rx: " fmt , __func__, ## args)
+#else
+#define PMD_RX_LOG(level, fmt, args...) do { } while(0)
+#endif
+
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_TX
+#define PMD_TX_LOG(level, fmt, args...) \
+       RTE_LOG(level, PMD, "%s() tx: " fmt , __func__, ## args)
+#else
+#define PMD_TX_LOG(level, fmt, args...) do { } while(0)
+#endif
+
+
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DRIVER
+#define PMD_DRV_LOG(level, fmt, args...) \
+       RTE_LOG(level, PMD, "%s(): " fmt , __func__, ## args)
+#else
+#define PMD_DRV_LOG(level, fmt, args...) do { } while(0)
+#endif
+
+#endif /* _VIRTIO_LOGS_H_ */
diff --git a/lib/librte_pmd_xenvirt/virtqueue.h b/lib/librte_pmd_xenvirt/virtqueue.h
new file mode 100644 (file)
index 0000000..3844448
--- /dev/null
@@ -0,0 +1,279 @@
+/*-
+ *   BSD LICENSE
+ * 
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ * 
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ * 
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ * 
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+
+#include "virtio_logs.h"
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+/*
+ * Address translation is between gva<->hva,
+ * rather than gpa<->hva as in the virtio spec.
+ */
+#define RTE_MBUF_DATA_DMA_ADDR(mb) \
+       ((uint64_t)((mb)->pkt.data))
+
+enum { VTNET_RQ = 0, VTNET_TQ = 1, VTNET_CQ = 2 };
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+#define VIRTQUEUE_MAX_NAME_SZ  32
+
+struct pmd_internals {
+       struct rte_eth_stats eth_stats;
+       int port_id;
+       int virtio_idx;
+};
+
+
+struct virtqueue {
+       char vq_name[VIRTQUEUE_MAX_NAME_SZ];
+       struct rte_mempool       *mpool;  /**< mempool for mbuf allocation */
+       uint16_t    queue_id;             /**< DPDK queue index. */
+       uint16_t    vq_queue_index;       /**< PCI queue index */
+       uint8_t     port_id;              /**< Device port identifier. */
+
+       void        *vq_ring_virt_mem;    /**< virtual address of vring*/
+       int         vq_alignment;
+       int         vq_ring_size;
+
+       struct vring vq_ring;    /**< vring keeping desc, used and avail */
+       struct pmd_internals *internals;  /**< virtio device internal info. */
+       uint16_t    vq_nentries; /**< vring desc numbers */
+       uint16_t    vq_desc_head_idx;
+       uint16_t    vq_free_cnt; /**< num of desc available */
+       uint16_t vq_used_cons_idx; /**< Last consumed desc in used table, trails vq_ring.used->idx*/
+
+       struct vq_desc_extra {
+               void              *cookie;
+               uint16_t          ndescs;
+       } vq_descx[0] __rte_cache_aligned;
+};
+
+
+#ifdef  RTE_LIBRTE_XENVIRT_DEBUG_DUMP
+#define VIRTQUEUE_DUMP(vq) do { \
+       uint16_t used_idx, nused; \
+       used_idx = (vq)->vq_ring.used->idx; \
+       nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+       PMD_INIT_LOG(DEBUG, \
+         "VQ: %s - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+         " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+         " avail.flags=0x%x; used.flags=0x%x\n", \
+         (vq)->vq_name, (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+         (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+         (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+         (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+#else
+#define VIRTQUEUE_DUMP(vq) do { } while (0)
+#endif
+
+
+/**
+ *  Dump virtqueue internal structures, for debug purpose only.
+ */
+void virtqueue_dump(struct virtqueue *vq);
+
+/**
+ *  Get all mbufs to be freed.
+ */
+struct rte_mbuf * virtqueue_detatch_unused(struct virtqueue *vq);
+
+static inline int __attribute__((always_inline))
+virtqueue_full(const struct virtqueue *vq)
+{
+       return (vq->vq_free_cnt == 0);
+}
+
+#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void __attribute__((always_inline))
+vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx)
+{
+       uint16_t avail_idx;
+       /*
+        * Place the head of the descriptor chain into the next slot and make
+        * it usable to the host. The chain is made available now rather than
+        * deferring to virtqueue_notify() in the hopes that if the host is
+        * currently running on another CPU, we can keep it processing the new
+        * descriptor.
+        */
+       avail_idx = (uint16_t)(vq->vq_ring.avail->idx & (vq->vq_nentries - 1));
+       vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+       rte_compiler_barrier();  /* wmb: for the IA memory model a compiler barrier is enough */
+       vq->vq_ring.avail->idx++;
+}
+
+static inline void  __attribute__((always_inline))
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+       struct vring_desc *dp;
+       struct vq_desc_extra *dxp;
+
+       dp  = &vq->vq_ring.desc[desc_idx];
+       dxp = &vq->vq_descx[desc_idx];
+       vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
+       while (dp->flags & VRING_DESC_F_NEXT) {
+               dp = &vq->vq_ring.desc[dp->next];
+       }
+       dxp->ndescs = 0;
+       
+       /*
+        * We must append the existing free chain, if any, to the end of
+        * newly freed chain. If the virtqueue was completely used, then
+        * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
+        */
+       dp->next = vq->vq_desc_head_idx;
+       vq->vq_desc_head_idx = desc_idx;
+}
+
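+/*
+ * Post one mbuf to the RX ring.  The descriptor starts
+ * sizeof(struct virtio_net_hdr) bytes before the end of the mbuf headroom,
+ * so the virtio_net header lands immediately in front of the packet data.
+ */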
+static inline int  __attribute__((always_inline))
+virtqueue_enqueue_recv_refill(struct virtqueue *rxvq, struct rte_mbuf *cookie)
+{
+       const uint16_t needed = 1;
+       const uint16_t head_idx = rxvq->vq_desc_head_idx;
+       struct vring_desc *start_dp = rxvq->vq_ring.desc;
+       struct vq_desc_extra *dxp;
+
+       if (unlikely(rxvq->vq_free_cnt == 0))
+               return -ENOSPC;
+       if (unlikely(rxvq->vq_free_cnt < needed))
+               return -EMSGSIZE;
+       if (unlikely(head_idx >= rxvq->vq_nentries))
+               return -EFAULT;
+
+       dxp = &rxvq->vq_descx[head_idx];
+       dxp->cookie = (void *)cookie;
+       dxp->ndescs = needed;
+
+       start_dp[head_idx].addr  =
+               (uint64_t) ((uint64_t)cookie->buf_addr + RTE_PKTMBUF_HEADROOM - sizeof(struct virtio_net_hdr));
+       start_dp[head_idx].len   = cookie->buf_len - RTE_PKTMBUF_HEADROOM + sizeof(struct virtio_net_hdr);
+       start_dp[head_idx].flags = VRING_DESC_F_WRITE;
+       rxvq->vq_desc_head_idx   = start_dp[head_idx].next;
+       rxvq->vq_free_cnt        = (uint16_t)(rxvq->vq_free_cnt - needed);
+       vq_ring_update_avail(rxvq, head_idx);
+
+       return 0;
+}
+
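+/*
+ * Enqueue one mbuf on the TX ring using two chained descriptors: a
+ * placeholder virtio_net header descriptor (addr 0) followed by one covering
+ * the mbuf data (addresses are guest virtual, see RTE_MBUF_DATA_DMA_ADDR above).
+ */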
+static inline int  __attribute__((always_inline))
+virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
+{
+
+       const uint16_t needed = 2;
+       struct vring_desc *start_dp =  txvq->vq_ring.desc;
+       uint16_t head_idx = txvq->vq_desc_head_idx;
+       uint16_t idx      = head_idx;
+       struct vq_desc_extra *dxp;
+
+       if (unlikely(txvq->vq_free_cnt == 0))
+               return -ENOSPC;
+       if (unlikely(txvq->vq_free_cnt < needed))
+               return -EMSGSIZE;
+       if (unlikely(head_idx >= txvq->vq_nentries)) 
+               return -EFAULT;
+
+       dxp = &txvq->vq_descx[idx];
+       dxp->cookie = (void *)cookie;
+       dxp->ndescs = needed;
+
+       /* TODO: save one desc here? */
+       start_dp[idx].addr  = 0;
+       start_dp[idx].len   = sizeof(struct virtio_net_hdr);
+       start_dp[idx].flags = VRING_DESC_F_NEXT;
+       idx = start_dp[idx].next;
+       start_dp[idx].addr  = RTE_MBUF_DATA_DMA_ADDR(cookie);
+       start_dp[idx].len   = cookie->pkt.data_len;
+       start_dp[idx].flags = 0;
+       idx = start_dp[idx].next;
+       txvq->vq_desc_head_idx = idx;
+       txvq->vq_free_cnt = (uint16_t)(txvq->vq_free_cnt - needed);
+       vq_ring_update_avail(txvq, head_idx);
+
+       return 0;
+}
+
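+/*
+ * Harvest up to num completed buffers from the used ring: return the mbuf
+ * cookies and their lengths, and put the consumed descriptor chains back on
+ * the free list.  Returns the number of buffers actually dequeued.
+ */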
+static inline uint16_t  __attribute__((always_inline))
+virtqueue_dequeue_burst(struct virtqueue *vq, struct rte_mbuf **rx_pkts, uint32_t *len, uint16_t num)
+{
+       struct vring_used_elem *uep;
+       struct rte_mbuf *cookie;
+       uint16_t used_idx, desc_idx;
+       uint16_t i;
+       /* Caller does the check */
+       for (i = 0; i < num; i++) {
+               used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+               uep = &vq->vq_ring.used->ring[used_idx];
+               desc_idx = (uint16_t) uep->id;
+               cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;
+               if (unlikely(cookie == NULL)) {
+                       PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u\n",
+                               vq->vq_used_cons_idx);
+                       RTE_LOG(ERR, PMD, "%s: inconsistent (%u, %u)\n", __func__, used_idx, desc_idx);
+                       break;
+               }
+               len[i] = uep->len;
+               rx_pkts[i]  = cookie;
+               vq->vq_used_cons_idx++;
+               vq_ring_free_chain(vq, desc_idx);
+               vq->vq_descx[desc_idx].cookie = NULL;
+       }
+       return i;
+}
+
+#endif /* _VIRTQUEUE_H_ */
index a974dc8..f19a1ab 100644 (file)
@@ -82,10 +82,6 @@ ifeq ($(CONFIG_RTE_LIBRTE_VIRTIO_PMD),y)
 LDLIBS += -lrte_pmd_virtio
 endif
 
-ifeq ($(CONFIG_RTE_LIBRTE_CMDLINE),y)
-LDLIBS += -lrte_cmdline
-endif
-
 ifeq ($(CONFIG_RTE_LIBRTE_TIMER),y)
 LDLIBS += -lrte_timer
 endif
@@ -155,6 +151,16 @@ ifeq ($(CONFIG_RTE_LIBRTE_EAL),y)
 LDLIBS += -lrte_eal
 endif
 
+
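+# Link the xenvirt PMD (and libxenstore) here; -lrte_cmdline is moved below
+# this block, presumably so that the PMD's cmdline references still resolve
+# when linking against static libraries.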
+ifeq ($(CONFIG_RTE_LIBRTE_PMD_XENVIRT),y)
+LDLIBS += -lrte_pmd_xenvirt
+LDLIBS += -lxenstore
+endif
+
+ifeq ($(CONFIG_RTE_LIBRTE_CMDLINE),y)
+LDLIBS += -lrte_cmdline
+endif
+
 ifeq ($(CONFIG_RTE_LIBRTE_PMD_PCAP),y)
 LDLIBS += -lrte_pmd_pcap -lpcap
 endif