vhost: use new APIs to handle features
[dpdk.git] / lib / librte_vhost / vhost_user.c
index c4714b7..65fd0fc 100644 (file)
@@ -51,6 +51,9 @@
 #include "vhost.h"
 #include "vhost_user.h"
 
+#define VIRTIO_MIN_MTU 68
+#define VIRTIO_MAX_MTU 65535
+
 static const char *vhost_message_str[VHOST_USER_MAX] = {
        [VHOST_USER_NONE] = "VHOST_USER_NONE",
        [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
@@ -72,20 +75,9 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
        [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
        [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
        [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
+       [VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
 };
 
-struct orig_region_map {
-       int fd;
-       uint64_t mapped_address;
-       uint64_t mapped_size;
-       uint64_t blksz;
-};
-
-#define orig_region(ptr, nregions) \
-       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
-               sizeof(struct virtio_memory) + \
-               sizeof(struct virtio_memory_regions) * (nregions)))
-
 static uint64_t
 get_blk_size(int fd)
 {
@@ -99,18 +91,17 @@ get_blk_size(int fd)
 static void
 free_mem_region(struct virtio_net *dev)
 {
-       struct orig_region_map *region;
-       unsigned int idx;
+       uint32_t i;
+       struct virtio_memory_region *reg;
 
        if (!dev || !dev->mem)
                return;
 
-       region = orig_region(dev->mem, dev->mem->nregions);
-       for (idx = 0; idx < dev->mem->nregions; idx++) {
-               if (region[idx].mapped_address) {
-                       munmap((void *)(uintptr_t)region[idx].mapped_address,
-                                       region[idx].mapped_size);
-                       close(region[idx].fd);
+       for (i = 0; i < dev->mem->nregions; i++) {
+               reg = &dev->mem->regions[i];
+               if (reg->host_user_addr) {
+                       munmap(reg->mmap_addr, reg->mmap_size);
+                       close(reg->fd);
                }
        }
 }
@@ -120,7 +111,7 @@ vhost_backend_cleanup(struct virtio_net *dev)
 {
        if (dev->mem) {
                free_mem_region(dev);
-               free(dev->mem);
+               rte_free(dev->mem);
                dev->mem = NULL;
        }
        if (dev->log_addr) {
@@ -134,29 +125,17 @@ vhost_backend_cleanup(struct virtio_net *dev)
  * the device hasn't been initialised.
  */
 static int
-vhost_set_owner(int vid)
+vhost_user_set_owner(void)
 {
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
        return 0;
 }
 
 static int
-vhost_reset_owner(int vid)
+vhost_user_reset_owner(struct virtio_net *dev)
 {
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
+               notify_ops->destroy_device(dev->vid);
        }
 
        cleanup_device(dev, 0);
@@ -167,35 +146,28 @@ vhost_reset_owner(int vid)
 /*
  * The features that we support are requested.
  */
-static int
-vhost_get_features(int vid, uint64_t *pu)
+static uint64_t
+vhost_user_get_features(struct virtio_net *dev)
 {
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
+       uint64_t features = 0;
 
-       /* Send our supported features. */
-       *pu = VHOST_FEATURES;
-       return 0;
+       rte_vhost_driver_get_features(dev->ifname, &features);
+       return features;
 }
 
 /*
  * We receive the negotiated features supported by us and the virtio device.
  */
 static int
-vhost_set_features(int vid, uint64_t *pu)
+vhost_user_set_features(struct virtio_net *dev, uint64_t features)
 {
-       struct virtio_net *dev;
+       uint64_t vhost_features = 0;
 
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-       if (*pu & ~VHOST_FEATURES)
+       rte_vhost_driver_get_features(dev->ifname, &vhost_features);
+       if (features & ~vhost_features)
                return -1;
 
-       dev->features = *pu;
+       dev->features = features;
        if (dev->features &
                ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
                dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
@@ -215,16 +187,35 @@ vhost_set_features(int vid, uint64_t *pu)
  * The virtio device sends us the size of the descriptor ring.
  */
 static int
-vhost_set_vring_num(int vid, struct vhost_vring_state *state)
+vhost_user_set_vring_num(struct virtio_net *dev,
+                        struct vhost_vring_state *state)
 {
-       struct virtio_net *dev;
+       struct vhost_virtqueue *vq = dev->virtqueue[state->index];
+
+       vq->size = state->num;
+
+       if (dev->dequeue_zero_copy) {
+               vq->nr_zmbuf = 0;
+               vq->last_zmbuf_idx = 0;
+               vq->zmbuf_size = vq->size;
+               vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
+                                        sizeof(struct zcopy_mbuf), 0);
+               if (vq->zmbufs == NULL) {
+                       RTE_LOG(WARNING, VHOST_CONFIG,
+                               "failed to allocate mem for zero copy; "
+                               "zero copy is force disabled\n");
+                       dev->dequeue_zero_copy = 0;
+               }
+       }
 
-       dev = get_device(vid);
-       if (dev == NULL)
+       vq->shadow_used_ring = rte_malloc(NULL,
+                               vq->size * sizeof(struct vring_used_elem),
+                               RTE_CACHE_LINE_SIZE);
+       if (!vq->shadow_used_ring) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to allocate memory for shadow used ring.\n");
                return -1;
-
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[state->index]->size = state->num;
+       }
 
        return 0;
 }
@@ -317,25 +308,23 @@ numa_realloc(struct virtio_net *dev, int index __rte_unused)
  * used to convert the ring addresses to our address space.
  */
 static uint64_t
-qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
+qva_to_vva(struct virtio_net *dev, uint64_t qva)
 {
-       struct virtio_memory_regions *region;
-       uint64_t vhost_va = 0;
-       uint32_t regionidx = 0;
+       struct virtio_memory_region *reg;
+       uint32_t i;
 
        /* Find the region where the address lives. */
-       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
-               region = &dev->mem->regions[regionidx];
-               if ((qemu_va >= region->userspace_address) &&
-                       (qemu_va <= region->userspace_address +
-                       region->memory_size)) {
-                       vhost_va = qemu_va + region->guest_phys_address +
-                               region->address_offset -
-                               region->userspace_address;
-                       break;
+       for (i = 0; i < dev->mem->nregions; i++) {
+               reg = &dev->mem->regions[i];
+
+               if (qva >= reg->guest_user_addr &&
+                   qva <  reg->guest_user_addr + reg->size) {
+                       return qva - reg->guest_user_addr +
+                              reg->host_user_addr;
                }
        }
-       return vhost_va;
+
+       return 0;
 }
 
 /*
@@ -343,13 +332,11 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
  * This function then converts these to our address space.
  */
 static int
-vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
+vhost_user_set_vring_addr(struct virtio_net *dev, struct vhost_vring_addr *addr)
 {
-       struct virtio_net *dev;
        struct vhost_virtqueue *vq;
 
-       dev = get_device(vid);
-       if ((dev == NULL) || (dev->mem == NULL))
+       if (dev->mem == NULL)
                return -1;
 
        /* addr->index refers to the queue index. The txq 1, rxq is 0. */
@@ -391,7 +378,8 @@ vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
                        "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
                        "some packets maybe resent for Tx and dropped for Rx\n",
                        vq->last_used_idx, vq->used->idx);
-               vq->last_used_idx     = vq->used->idx;
+               vq->last_used_idx  = vq->used->idx;
+               vq->last_avail_idx = vq->used->idx;
        }
 
        vq->log_guest_addr = addr->log_guest_addr;
@@ -412,133 +400,133 @@ vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
  * The virtio device sends us the available ring last used index.
  */
 static int
-vhost_set_vring_base(int vid, struct vhost_vring_state *state)
+vhost_user_set_vring_base(struct virtio_net *dev,
+                         struct vhost_vring_state *state)
 {
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[state->index]->last_used_idx = state->num;
+       dev->virtqueue[state->index]->last_used_idx  = state->num;
+       dev->virtqueue[state->index]->last_avail_idx = state->num;
 
        return 0;
 }
 
-/*
- * We send the virtio device our available ring last used index.
- */
-static int
-vhost_get_vring_base(int vid, uint32_t index,
-       struct vhost_vring_state *state)
+/*
+ * Record one guest page (guest physical address, host physical address and
+ * size) in dev->guest_pages, doubling the array when it is full.
+ *
+ * If host_phys_addr directly follows the end of the last recorded entry,
+ * that entry is extended by size instead of adding a new one.
+ *
+ * The function returns void, so a failure to grow the array can only be
+ * logged: the existing array is kept intact and the page is not recorded.
+ */
+static void
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+                  uint64_t host_phys_addr, uint64_t size)
 {
-       struct virtio_net *dev;
+       struct guest_page *page, *last_page;
 
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
+       if (dev->nr_guest_pages == dev->max_guest_pages) {
+               /*
+                * Grow through a temporary: assigning realloc()'s result
+                * straight to dev->guest_pages would leak the old array and
+                * leave a NULL pointer to be dereferenced below on failure.
+                */
+               void *new_pages = realloc(dev->guest_pages,
+                               dev->max_guest_pages * 2 * sizeof(*page));
+
+               if (new_pages == NULL) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "failed to grow the guest pages array\n");
+                       return;
+               }
+
+               /* Only commit the new capacity once the memory exists. */
+               dev->guest_pages = new_pages;
+               dev->max_guest_pages *= 2;
+       }
 
-       state->index = index;
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       state->num = dev->virtqueue[state->index]->last_used_idx;
+       if (dev->nr_guest_pages > 0) {
+               last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
+               /* merge if the two pages are continuous */
+               if (host_phys_addr == last_page->host_phys_addr +
+                                     last_page->size) {
+                       last_page->size += size;
+                       return;
+               }
+       }
 
-       return 0;
+       page = &dev->guest_pages[dev->nr_guest_pages++];
+       page->guest_phys_addr = guest_phys_addr;
+       page->host_phys_addr  = host_phys_addr;
+       page->size = size;
 }
 
-/*
- * The virtio device sends an eventfd to interrupt the guest. This fd gets
- * copied into our process space.
- */
-static int
-vhost_set_vring_call(int vid, struct vhost_vring_file *file)
+static void
+add_guest_pages(struct virtio_net *dev, struct virtio_memory_region *reg,
+               uint64_t page_size)
 {
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-       uint32_t cur_qp_idx = file->index / VIRTIO_QNUM;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /*
-        * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
-        * we get, so we do vring queue pair allocation here.
-        */
-       if (cur_qp_idx + 1 > dev->virt_qp_nb) {
-               if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
-                       return -1;
+       uint64_t reg_size = reg->size;
+       uint64_t host_user_addr  = reg->host_user_addr;
+       uint64_t guest_phys_addr = reg->guest_phys_addr;
+       uint64_t host_phys_addr;
+       uint64_t size;
+
+       host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+       size = page_size - (guest_phys_addr & (page_size - 1));
+       size = RTE_MIN(size, reg_size);
+
+       add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+       host_user_addr  += size;
+       guest_phys_addr += size;
+       reg_size -= size;
+
+       while (reg_size > 0) {
+               size = RTE_MIN(reg_size, page_size);
+               host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
+                                                 host_user_addr);
+               add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+
+               host_user_addr  += size;
+               guest_phys_addr += size;
+               reg_size -= size;
        }
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[file->index];
-       assert(vq != NULL);
-
-       if (vq->callfd >= 0)
-               close(vq->callfd);
-
-       vq->callfd = file->fd;
-
-       return 0;
 }
 
-/*
- * The virtio device sends an eventfd that it can use to notify us.
- * This fd gets copied into our process space.
- */
-static int
-vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+/* Log every recorded guest page; built only with RTE_LIBRTE_VHOST_DEBUG. */
+static void
+dump_guest_pages(struct virtio_net *dev)
 {
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[file->index];
-
-       if (vq->kickfd >= 0)
-               close(vq->kickfd);
+       uint32_t i;
+       struct guest_page *page;
 
-       vq->kickfd = file->fd;
+       for (i = 0; i < dev->nr_guest_pages; i++) {
+               page = &dev->guest_pages[i];
 
-       return 0;
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "guest physical page region %u\n"
+                       "\t guest_phys_addr: %" PRIx64 "\n"
+                       "\t host_phys_addr : %" PRIx64 "\n"
+                       "\t size           : %" PRIx64 "\n",
+                       i,
+                       page->guest_phys_addr,
+                       page->host_phys_addr,
+                       page->size);
+       }
 }
+#else
+#define dump_guest_pages(dev)
+#endif
 
 static int
-user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
+vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 {
        struct VhostUserMemory memory = pmsg->payload.memory;
-       struct virtio_memory_regions *pregion;
-       uint64_t mapped_address, mapped_size;
-       struct virtio_net *dev;
-       unsigned int idx = 0;
-       struct orig_region_map *pregion_orig;
+       struct virtio_memory_region *reg;
+       void *mmap_addr;
+       uint64_t mmap_size;
+       uint64_t mmap_offset;
        uint64_t alignment;
-
-       /* unmap old memory regions one by one*/
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
+       uint32_t i;
+       int fd;
 
        /* Remove from the data plane. */
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
+               notify_ops->destroy_device(dev->vid);
        }
 
        if (dev->mem) {
                free_mem_region(dev);
-               free(dev->mem);
+               rte_free(dev->mem);
                dev->mem = NULL;
        }
 
-       dev->mem = calloc(1,
-               sizeof(struct virtio_memory) +
-               sizeof(struct virtio_memory_regions) * memory.nregions +
-               sizeof(struct orig_region_map) * memory.nregions);
+       dev->nr_guest_pages = 0;
+       if (!dev->guest_pages) {
+               dev->max_guest_pages = 8;
+               dev->guest_pages = malloc(dev->max_guest_pages *
+                                               sizeof(struct guest_page));
+       }
+
+       dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct virtio_memory) +
+               sizeof(struct virtio_memory_region) * memory.nregions, 0);
        if (dev->mem == NULL) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) failed to allocate memory for dev->mem\n",
@@ -547,22 +535,17 @@ user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
        }
        dev->mem->nregions = memory.nregions;
 
-       pregion_orig = orig_region(dev->mem, memory.nregions);
-       for (idx = 0; idx < memory.nregions; idx++) {
-               pregion = &dev->mem->regions[idx];
-               pregion->guest_phys_address =
-                       memory.regions[idx].guest_phys_addr;
-               pregion->guest_phys_address_end =
-                       memory.regions[idx].guest_phys_addr +
-                       memory.regions[idx].memory_size;
-               pregion->memory_size =
-                       memory.regions[idx].memory_size;
-               pregion->userspace_address =
-                       memory.regions[idx].userspace_addr;
-
-               /* This is ugly */
-               mapped_size = memory.regions[idx].memory_size +
-                       memory.regions[idx].mmap_offset;
+       for (i = 0; i < memory.nregions; i++) {
+               fd  = pmsg->fds[i];
+               reg = &dev->mem->regions[i];
+
+               reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
+               reg->guest_user_addr = memory.regions[i].userspace_addr;
+               reg->size            = memory.regions[i].memory_size;
+               reg->fd              = fd;
+
+               mmap_offset = memory.regions[i].mmap_offset;
+               mmap_size   = reg->size + mmap_offset;
 
                /* mmap() without flag of MAP_ANONYMOUS, should be called
                 * with length argument aligned with hugepagesz at older
@@ -572,67 +555,57 @@ user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
                 * to avoid failure, make sure in caller to keep length
                 * aligned.
                 */
-               alignment = get_blk_size(pmsg->fds[idx]);
+               alignment = get_blk_size(fd);
                if (alignment == (uint64_t)-1) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "couldn't get hugepage size through fstat\n");
                        goto err_mmap;
                }
-               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
+               mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
 
-               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
-                       mapped_size,
-                       PROT_READ | PROT_WRITE, MAP_SHARED,
-                       pmsg->fds[idx],
-                       0);
+               mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                                MAP_SHARED | MAP_POPULATE, fd, 0);
 
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
-                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
-                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
-                       mapped_size, memory.regions[idx].mmap_offset,
-                       alignment);
-
-               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+               if (mmap_addr == MAP_FAILED) {
                        RTE_LOG(ERR, VHOST_CONFIG,
-                               "mmap qemu guest failed.\n");
+                               "mmap region %u failed.\n", i);
                        goto err_mmap;
                }
 
-               pregion_orig[idx].mapped_address = mapped_address;
-               pregion_orig[idx].mapped_size = mapped_size;
-               pregion_orig[idx].blksz = alignment;
-               pregion_orig[idx].fd = pmsg->fds[idx];
-
-               mapped_address +=  memory.regions[idx].mmap_offset;
+               reg->mmap_addr = mmap_addr;
+               reg->mmap_size = mmap_size;
+               reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+                                     mmap_offset;
 
-               pregion->address_offset = mapped_address -
-                       pregion->guest_phys_address;
+               if (dev->dequeue_zero_copy)
+                       add_guest_pages(dev, reg, alignment);
 
-               if (memory.regions[idx].guest_phys_addr == 0) {
-                       dev->mem->base_address =
-                               memory.regions[idx].userspace_addr;
-                       dev->mem->mapped_address =
-                               pregion->address_offset;
-               }
-
-               LOG_DEBUG(VHOST_CONFIG,
-                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
-                       idx,
-                       (void *)(uintptr_t)pregion->guest_phys_address,
-                       (void *)(uintptr_t)pregion->userspace_address,
-                        pregion->memory_size);
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "guest memory region %u, size: 0x%" PRIx64 "\n"
+                       "\t guest physical addr: 0x%" PRIx64 "\n"
+                       "\t guest virtual  addr: 0x%" PRIx64 "\n"
+                       "\t host  virtual  addr: 0x%" PRIx64 "\n"
+                       "\t mmap addr : 0x%" PRIx64 "\n"
+                       "\t mmap size : 0x%" PRIx64 "\n"
+                       "\t mmap align: 0x%" PRIx64 "\n"
+                       "\t mmap off  : 0x%" PRIx64 "\n",
+                       i, reg->size,
+                       reg->guest_phys_addr,
+                       reg->guest_user_addr,
+                       reg->host_user_addr,
+                       (uint64_t)(uintptr_t)mmap_addr,
+                       mmap_size,
+                       alignment,
+                       mmap_offset);
        }
 
+       dump_guest_pages(dev);
+
        return 0;
 
 err_mmap:
-       while (idx--) {
-               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
-                               pregion_orig[idx].mapped_size);
-               close(pregion_orig[idx].fd);
-       }
-       free(dev->mem);
+       free_mem_region(dev);
+       rte_free(dev->mem);
        dev->mem = NULL;
        return -1;
 }
@@ -668,9 +641,10 @@ virtio_is_ready(struct virtio_net *dev)
 }
 
 static void
-user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
+vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 {
        struct vhost_vring_file file;
+       struct vhost_virtqueue *vq;
 
        file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
        if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
@@ -679,7 +653,12 @@ user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
                file.fd = pmsg->fds[0];
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring call idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_call(vid, &file);
+
+       vq = dev->virtqueue[file.index];
+       if (vq->callfd >= 0)
+               close(vq->callfd);
+
+       vq->callfd = file.fd;
 }
 
 /*
@@ -687,13 +666,10 @@ user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
  *  device is ready for packet processing.
  */
 static void
-user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
+vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
 {
        struct vhost_vring_file file;
-       struct virtio_net *dev = get_device(vid);
-
-       if (!dev)
-               return;
+       struct vhost_virtqueue *vq;
 
        file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
        if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
@@ -702,32 +678,62 @@ user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
                file.fd = pmsg->fds[0];
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring kick idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_kick(vid, &file);
 
-       if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
-               if (notify_ops->new_device(vid) == 0)
-                       dev->flags |= VIRTIO_DEV_RUNNING;
+       vq = dev->virtqueue[file.index];
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+       vq->kickfd = file.fd;
+
+       if (virtio_is_ready(dev)) {
+               dev->flags |= VIRTIO_DEV_READY;
+
+               if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+                       if (dev->dequeue_zero_copy) {
+                               RTE_LOG(INFO, VHOST_CONFIG,
+                                               "dequeue zero copy is enabled\n");
+                       }
+
+                       if (notify_ops->new_device(dev->vid) == 0)
+                               dev->flags |= VIRTIO_DEV_RUNNING;
+               }
        }
 }
 
+/*
+ * Release the zero-copy state of a virtqueue: free the mbuf attached to
+ * every entry on vq->zmbuf_list, unlink the entries, then free the backing
+ * vq->zmbufs array.
+ */
+static void
+free_zmbufs(struct vhost_virtqueue *vq)
+{
+       struct zcopy_mbuf *zmbuf, *next;
+
+       for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+            zmbuf != NULL; zmbuf = next) {
+               /* Fetch the successor before the entry is unlinked. */
+               next = TAILQ_NEXT(zmbuf, next);
+
+               rte_pktmbuf_free(zmbuf->mbuf);
+               TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+       }
+
+       rte_free(vq->zmbufs);
+       /*
+        * Do not leave a dangling pointer behind: a second call on the same
+        * queue would otherwise hand the freed array to rte_free() again.
+        */
+       vq->zmbufs = NULL;
+}
+
 /*
  * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
  */
 static int
-user_get_vring_base(int vid, struct vhost_vring_state *state)
+vhost_user_get_vring_base(struct virtio_net *dev,
+                         struct vhost_vring_state *state)
 {
-       struct virtio_net *dev = get_device(vid);
+       struct vhost_virtqueue *vq = dev->virtqueue[state->index];
 
-       if (dev == NULL)
-               return -1;
        /* We have to stop the queue (virtio) if it is running. */
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
+               notify_ops->destroy_device(dev->vid);
        }
 
+       dev->flags &= ~VIRTIO_DEV_READY;
+
        /* Here we are safe to get the last used index */
-       vhost_get_vring_base(vid, state->index, state);
+       state->num = vq->last_used_idx;
 
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring base idx:%d file:%d\n", state->index, state->num);
@@ -736,10 +742,15 @@ user_get_vring_base(int vid, struct vhost_vring_state *state)
         * sent and only sent in vhost_vring_stop.
         * TODO: cleanup the vring, it isn't usable since here.
         */
-       if (dev->virtqueue[state->index]->kickfd >= 0)
-               close(dev->virtqueue[state->index]->kickfd);
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
 
-       dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+       if (dev->dequeue_zero_copy)
+               free_zmbufs(vq);
+       rte_free(vq->shadow_used_ring);
+       vq->shadow_used_ring = NULL;
 
        return 0;
 }
@@ -749,21 +760,17 @@ user_get_vring_base(int vid, struct vhost_vring_state *state)
  * enable the virtio queue pair.
  */
 static int
-user_set_vring_enable(int vid, struct vhost_vring_state *state)
+vhost_user_set_vring_enable(struct virtio_net *dev,
+                           struct vhost_vring_state *state)
 {
-       struct virtio_net *dev;
        int enable = (int)state->num;
 
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
        RTE_LOG(INFO, VHOST_CONFIG,
                "set queue enable: %d to qp idx: %d\n",
                enable, state->index);
 
        if (notify_ops->vring_state_changed)
-               notify_ops->vring_state_changed(vid, state->index, enable);
+               notify_ops->vring_state_changed(dev->vid, state->index, enable);
 
        dev->virtqueue[state->index]->enabled = enable;
 
@@ -771,29 +778,22 @@ user_set_vring_enable(int vid, struct vhost_vring_state *state)
 }
 
 static void
-user_set_protocol_features(int vid, uint64_t protocol_features)
+vhost_user_set_protocol_features(struct virtio_net *dev,
+                                uint64_t protocol_features)
 {
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+       if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
                return;
 
        dev->protocol_features = protocol_features;
 }
 
 static int
-user_set_log_base(int vid, struct VhostUserMsg *msg)
+vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
 {
-       struct virtio_net *dev;
        int fd = msg->fds[0];
        uint64_t size, off;
        void *addr;
 
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
        if (fd < 0) {
                RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
                return -1;
@@ -846,15 +846,10 @@ user_set_log_base(int vid, struct VhostUserMsg *msg)
  * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
  */
 static int
-user_send_rarp(int vid, struct VhostUserMsg *msg)
+vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
 {
-       struct virtio_net *dev;
        uint8_t *mac = (uint8_t *)&msg->payload.u64;
 
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
        RTE_LOG(DEBUG, VHOST_CONFIG,
                ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
                mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
@@ -873,6 +868,22 @@ user_send_rarp(int vid, struct VhostUserMsg *msg)
        return 0;
 }
 
+/*
+ * Validate the MTU carried in msg->payload.u64 and store it in dev->mtu.
+ *
+ * Returns 0 when the value lies within [VIRTIO_MIN_MTU, VIRTIO_MAX_MTU];
+ * otherwise logs an error, leaves dev->mtu untouched and returns -1.
+ */
+static int
+vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+       uint64_t mtu = msg->payload.u64;
+
+       if (mtu >= VIRTIO_MIN_MTU && mtu <= VIRTIO_MAX_MTU) {
+               dev->mtu = mtu;
+               return 0;
+       }
+
+       RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", mtu);
+
+       return -1;
+}
+
 /* return bytes# of read on success or negative val on failure. */
 static int
 read_vhost_message(int sockfd, struct VhostUserMsg *msg)
@@ -912,6 +923,7 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg)
                return 0;
 
        msg->flags &= ~VHOST_USER_VERSION_MASK;
+       msg->flags &= ~VHOST_USER_NEED_REPLY;
        msg->flags |= VHOST_USER_VERSION;
        msg->flags |= VHOST_USER_REPLY_MASK;
 
@@ -921,13 +933,57 @@ send_vhost_message(int sockfd, struct VhostUserMsg *msg)
        return ret;
 }
 
+/*
+ * Allocate a queue pair if it hasn't been allocated yet.
+ *
+ * For the per-vring requests listed in the switch below, extract the vring
+ * index from the message, reject it if its queue pair is out of range, and
+ * allocate the queue pair when its first virtqueue slot is still empty.
+ * Every other request is accepted without touching the device.
+ *
+ * Returns 0 when nothing had to be done, -1 on an invalid index, or the
+ * result of alloc_vring_queue_pair() when an allocation is attempted.
+ */
+static int
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
+{
+       /*
+        * Keep the index at 32 bits: the message fields it is read from are
+        * wider than 16 bits (presumably unsigned int, confirm against the
+        * payload definitions), so a narrower local would truncate a value
+        * such as 0x10000 to 0 and let it slip past the range check below
+        * while the message handlers still use the untruncated index.
+        */
+       uint32_t vring_idx;
+       uint32_t qp_idx;
+
+       switch (msg->request) {
+       case VHOST_USER_SET_VRING_KICK:
+       case VHOST_USER_SET_VRING_CALL:
+       case VHOST_USER_SET_VRING_ERR:
+               /* The index is packed into the low bits of the u64 payload. */
+               vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+               break;
+       case VHOST_USER_SET_VRING_NUM:
+       case VHOST_USER_SET_VRING_BASE:
+       case VHOST_USER_SET_VRING_ENABLE:
+               vring_idx = msg->payload.state.index;
+               break;
+       case VHOST_USER_SET_VRING_ADDR:
+               vring_idx = msg->payload.addr.index;
+               break;
+       default:
+               return 0;
+       }
+
+       qp_idx = vring_idx / VIRTIO_QNUM;
+       if (qp_idx >= VHOST_MAX_QUEUE_PAIRS) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "invalid vring index: %u\n", vring_idx);
+               return -1;
+       }
+
+       /* The first virtqueue of the pair tells whether it already exists. */
+       if (dev->virtqueue[qp_idx * VIRTIO_QNUM])
+               return 0;
+
+       return alloc_vring_queue_pair(dev, qp_idx);
+}
+
 int
 vhost_user_msg_handler(int vid, int fd)
 {
+       struct virtio_net *dev;
        struct VhostUserMsg msg;
-       uint64_t features = 0;
        int ret;
 
+       dev = get_device(vid); /* handlers below take dev, not vid */
+       if (dev == NULL)
+               return -1;
+       /* Read one vhost-user message from fd and dispatch on its request. */
        ret = read_vhost_message(fd, &msg);
        if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
                if (ret < 0)
@@ -943,18 +999,25 @@ vhost_user_msg_handler(int vid, int fd)
                return -1;
        }
 
+       ret = 0; /* NOTE(review): reassigned below before any read */
        RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
                vhost_message_str[msg.request]);
+       /* Allocate the queue pair a vring message refers to, if needed. */
+       ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to alloc queue\n");
+               return -1;
+       }
+
        switch (msg.request) {
        case VHOST_USER_GET_FEATURES:
-               ret = vhost_get_features(vid, &features);
-               msg.payload.u64 = features;
+               msg.payload.u64 = vhost_user_get_features(dev);
                msg.size = sizeof(msg.payload.u64);
                send_vhost_message(fd, &msg);
                break;
        case VHOST_USER_SET_FEATURES:
-               features = msg.payload.u64;
-               vhost_set_features(vid, &features);
+               vhost_user_set_features(dev, msg.payload.u64);
                break;
 
        case VHOST_USER_GET_PROTOCOL_FEATURES:
@@ -963,22 +1026,22 @@ vhost_user_msg_handler(int vid, int fd)
                send_vhost_message(fd, &msg);
                break;
        case VHOST_USER_SET_PROTOCOL_FEATURES:
-               user_set_protocol_features(vid, msg.payload.u64);
+               vhost_user_set_protocol_features(dev, msg.payload.u64);
                break;
 
        case VHOST_USER_SET_OWNER:
-               vhost_set_owner(vid);
+               vhost_user_set_owner();
                break;
        case VHOST_USER_RESET_OWNER:
-               vhost_reset_owner(vid);
+               vhost_user_reset_owner(dev);
                break;
 
        case VHOST_USER_SET_MEM_TABLE:
-               user_set_mem_table(vid, &msg);
+               ret = vhost_user_set_mem_table(dev, &msg);
                break;
 
        case VHOST_USER_SET_LOG_BASE:
-               user_set_log_base(vid, &msg);
+               vhost_user_set_log_base(dev, &msg);
 
                /* it needs a reply */
                msg.size = sizeof(msg.payload.u64);
@@ -990,26 +1053,26 @@ vhost_user_msg_handler(int vid, int fd)
                break;
 
        case VHOST_USER_SET_VRING_NUM:
-               vhost_set_vring_num(vid, &msg.payload.state);
+               vhost_user_set_vring_num(dev, &msg.payload.state);
                break;
        case VHOST_USER_SET_VRING_ADDR:
-               vhost_set_vring_addr(vid, &msg.payload.addr);
+               vhost_user_set_vring_addr(dev, &msg.payload.addr);
                break;
        case VHOST_USER_SET_VRING_BASE:
-               vhost_set_vring_base(vid, &msg.payload.state);
+               vhost_user_set_vring_base(dev, &msg.payload.state);
                break;
 
        case VHOST_USER_GET_VRING_BASE:
-               ret = user_get_vring_base(vid, &msg.payload.state);
+               vhost_user_get_vring_base(dev, &msg.payload.state);
                msg.size = sizeof(msg.payload.state);
                send_vhost_message(fd, &msg);
                break;
 
        case VHOST_USER_SET_VRING_KICK:
-               user_set_vring_kick(vid, &msg);
+               vhost_user_set_vring_kick(dev, &msg);
                break;
        case VHOST_USER_SET_VRING_CALL:
-               user_set_vring_call(vid, &msg);
+               vhost_user_set_vring_call(dev, &msg);
                break;
 
        case VHOST_USER_SET_VRING_ERR:
@@ -1025,16 +1088,27 @@ vhost_user_msg_handler(int vid, int fd)
                break;
 
        case VHOST_USER_SET_VRING_ENABLE:
-               user_set_vring_enable(vid, &msg.payload.state);
+               vhost_user_set_vring_enable(dev, &msg.payload.state);
                break;
        case VHOST_USER_SEND_RARP:
-               user_send_rarp(vid, &msg);
+               vhost_user_send_rarp(dev, &msg);
+               break;
+
+       case VHOST_USER_NET_SET_MTU:
+               ret = vhost_user_net_set_mtu(dev, &msg);
                break;
 
        default:
+               ret = -1; /* request has no case in this switch */
                break;
 
        }
 
+       if (msg.flags & VHOST_USER_NEED_REPLY) { /* reply was requested */
+               msg.payload.u64 = !!ret; /* 0 if ret == 0, else 1 */
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+       }
+
        return 0;
 }