vhost: refactor code structure
author Yuanhan Liu <yuanhan.liu@linux.intel.com>
Thu, 18 Aug 2016 08:48:39 +0000 (16:48 +0800)
committer Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tue, 13 Sep 2016 03:25:08 +0000 (05:25 +0200)
The code structure is a bit messy now. For example, vhost-user message
handling is spread across three different files:

    vhost-net-user.c  virtio-net.c  virtio-net-user.c

Here, vhost-net-user.c is the entry point that receives all those
messages and then invokes the right handler for each specific message.
Some of those handlers live in virtio-net.c, while others live in
virtio-net-user.c.

The truth is that all of them belong in one file: vhost_user.c.

So this patch refactors the source code structure: mainly renaming
files and moving code into the files better suited to hold it. Thus,
no functional changes are made.

After the refactor, the code structure becomes:

- socket.c      handles all vhost-user socket file related stuff, such
                as socket file creation for server mode and reconnection
                for client mode.

- vhost.c       mainly handles vhost device creation/destruction/reset.
                Most of the vhost API implementations are there, too.

- vhost_user.c  all stuff about vhost-user message handling goes there.

- virtio_net.c  all stuff about virtio-net goes there. It holds the
                virtio-net Rx/Tx implementation only so far: it's just a
                rename of vhost_rxtx.c.
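
To make the layout above concrete, here is a minimal, hypothetical
usage sketch of the public API that ends up in socket.c.
rte_vhost_driver_register() and rte_vhost_driver_session_start() are
taken from the diff below; the helper name, the socket path and the
surrounding application setup (EAL initialization and so on) are
illustrative assumptions only:

    #include <rte_virtio_net.h>

    int
    start_vhost_backend(void)
    {
            /*
             * Register one vhost-user socket; server mode is the
             * default. Passing RTE_VHOST_USER_CLIENT in flags would
             * connect to an existing socket instead, with reconnect
             * enabled unless RTE_VHOST_USER_NO_RECONNECT is also given.
             */
            if (rte_vhost_driver_register("/tmp/vhost-user.sock", 0) < 0)
                    return -1;

            /*
             * Dispatch vhost-user messages for all registered sockets;
             * this loops forever under normal operation.
             */
            return rte_vhost_driver_session_start();
    }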

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
14 files changed:
lib/librte_vhost/Makefile
lib/librte_vhost/socket.c [new file with mode: 0644]
lib/librte_vhost/vhost-net-user.c [deleted file]
lib/librte_vhost/vhost-net-user.h [deleted file]
lib/librte_vhost/vhost-net.h [deleted file]
lib/librte_vhost/vhost.c [new file with mode: 0644]
lib/librte_vhost/vhost.h [new file with mode: 0644]
lib/librte_vhost/vhost_rxtx.c [deleted file]
lib/librte_vhost/vhost_user.c [new file with mode: 0644]
lib/librte_vhost/vhost_user.h [new file with mode: 0644]
lib/librte_vhost/virtio-net-user.c [deleted file]
lib/librte_vhost/virtio-net-user.h [deleted file]
lib/librte_vhost/virtio-net.c [deleted file]
lib/librte_vhost/virtio_net.c [new file with mode: 0644]

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 277390f..415ffc6 100644
@@ -47,10 +47,8 @@ LDLIBS += -lnuma
 endif
 
 # all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += fd_man.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c socket.c vhost.c vhost_user.c \
+                                  virtio_net.c
 
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
new file mode 100644
index 0000000..bf03f84
--- /dev/null
@@ -0,0 +1,610 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+       char *path;
+       int listenfd;
+       int connfd;
+       bool is_server;
+       bool reconnect;
+};
+
+struct vhost_user_connection {
+       struct vhost_user_socket *vsocket;
+       int vid;
+};
+
+#define MAX_VHOST_SOCKET 1024
+struct vhost_user {
+       struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+       struct fdset fdset;
+       int vsocket_cnt;
+       pthread_mutex_t mutex;
+};
+
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int vhost_user_create_client(struct vhost_user_socket *vsocket);
+
+static struct vhost_user vhost_user = {
+       .fdset = {
+               .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+               .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+               .num = 0
+       },
+       .vsocket_cnt = 0,
+       .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/* return the number of bytes read on success, or a negative value on failure */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+       struct iovec iov;
+       struct msghdr msgh;
+       size_t fdsize = fd_num * sizeof(int);
+       char control[CMSG_SPACE(fdsize)];
+       struct cmsghdr *cmsg;
+       int ret;
+
+       memset(&msgh, 0, sizeof(msgh));
+       iov.iov_base = buf;
+       iov.iov_len  = buflen;
+
+       msgh.msg_iov = &iov;
+       msgh.msg_iovlen = 1;
+       msgh.msg_control = control;
+       msgh.msg_controllen = sizeof(control);
+
+       ret = recvmsg(sockfd, &msgh, 0);
+       if (ret <= 0) {
+               RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+               return ret;
+       }
+
+       if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+               RTE_LOG(ERR, VHOST_CONFIG, "truncated msg\n");
+               return -1;
+       }
+
+       for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+               cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+               if ((cmsg->cmsg_level == SOL_SOCKET) &&
+                       (cmsg->cmsg_type == SCM_RIGHTS)) {
+                       memcpy(fds, CMSG_DATA(cmsg), fdsize);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+       struct iovec iov;
+       struct msghdr msgh;
+       size_t fdsize = fd_num * sizeof(int);
+       char control[CMSG_SPACE(fdsize)];
+       struct cmsghdr *cmsg;
+       int ret;
+
+       memset(&msgh, 0, sizeof(msgh));
+       iov.iov_base = buf;
+       iov.iov_len = buflen;
+
+       msgh.msg_iov = &iov;
+       msgh.msg_iovlen = 1;
+
+       if (fds && fd_num > 0) {
+               msgh.msg_control = control;
+               msgh.msg_controllen = sizeof(control);
+               cmsg = CMSG_FIRSTHDR(&msgh);
+               cmsg->cmsg_len = CMSG_LEN(fdsize);
+               cmsg->cmsg_level = SOL_SOCKET;
+               cmsg->cmsg_type = SCM_RIGHTS;
+               memcpy(CMSG_DATA(cmsg), fds, fdsize);
+       } else {
+               msgh.msg_control = NULL;
+               msgh.msg_controllen = 0;
+       }
+
+       do {
+               ret = sendmsg(sockfd, &msgh, 0);
+       } while (ret < 0 && errno == EINTR);
+
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
+               return ret;
+       }
+
+       return ret;
+}
+
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+       int vid;
+       size_t size;
+       struct vhost_user_connection *conn;
+       int ret;
+
+       conn = malloc(sizeof(*conn));
+       if (conn == NULL) {
+               close(fd);
+               return;
+       }
+
+       vid = vhost_new_device();
+       if (vid == -1) {
+               close(fd);
+               free(conn);
+               return;
+       }
+
+       size = strnlen(vsocket->path, PATH_MAX);
+       vhost_set_ifname(vid, vsocket->path, size);
+
+       RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+       vsocket->connfd = fd;
+       conn->vsocket = vsocket;
+       conn->vid = vid;
+       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+                       NULL, conn);
+       if (ret < 0) {
+               vsocket->connfd = -1;
+               free(conn);
+               close(fd);
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to add fd %d into vhost server fdset\n",
+                       fd);
+       }
+}
+
+/* callback invoked when there is a new vhost-user connection from a client */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+       struct vhost_user_socket *vsocket = dat;
+
+       fd = accept(fd, NULL, NULL);
+       if (fd < 0)
+               return;
+
+       RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
+       vhost_user_add_connection(fd, vsocket);
+}
+
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+       struct vhost_user_connection *conn = dat;
+       struct vhost_user_socket *vsocket = conn->vsocket;
+       int ret;
+
+       ret = vhost_user_msg_handler(conn->vid, connfd);
+       if (ret < 0) {
+               vsocket->connfd = -1;
+               close(connfd);
+               *remove = 1;
+               free(conn);
+               vhost_destroy_device(conn->vid);
+
+               if (vsocket->reconnect)
+                       vhost_user_create_client(vsocket);
+       }
+}
+
+static int
+create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
+{
+       int fd;
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (fd < 0)
+               return -1;
+       RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+               is_server ? "server" : "client", fd);
+
+       if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "vhost-user: can't set nonblocking mode for socket, fd: "
+                       "%d (%s)\n", fd, strerror(errno));
+               close(fd);
+               return -1;
+       }
+
+       memset(un, 0, sizeof(*un));
+       un->sun_family = AF_UNIX;
+       strncpy(un->sun_path, path, sizeof(un->sun_path));
+       un->sun_path[sizeof(un->sun_path) - 1] = '\0';
+
+       return fd;
+}
+
+static int
+vhost_user_create_server(struct vhost_user_socket *vsocket)
+{
+       int fd;
+       int ret;
+       struct sockaddr_un un;
+       const char *path = vsocket->path;
+
+       fd = create_unix_socket(path, &un, vsocket->is_server);
+       if (fd < 0)
+               return -1;
+
+       ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to bind to %s: %s; remove it and try again\n",
+                       path, strerror(errno));
+               goto err;
+       }
+       RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+       ret = listen(fd, MAX_VIRTIO_BACKLOG);
+       if (ret < 0)
+               goto err;
+
+       vsocket->listenfd = fd;
+       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
+                 NULL, vsocket);
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to add listen fd %d to vhost server fdset\n",
+                       fd);
+               goto err;
+       }
+
+       return 0;
+
+err:
+       close(fd);
+       return -1;
+}
+
+struct vhost_user_reconnect {
+       struct sockaddr_un un;
+       int fd;
+       struct vhost_user_socket *vsocket;
+
+       TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+       struct vhost_user_reconnect_tailq_list head;
+       pthread_mutex_t mutex;
+};
+
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+       int ret, flags;
+
+       ret = connect(fd, un, sz);
+       if (ret < 0 && errno != EISCONN)
+               return -1;
+
+       flags = fcntl(fd, F_GETFL, 0);
+       if (flags < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "can't get flags for connfd %d\n", fd);
+               return -2;
+       }
+       if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                               "can't disable nonblocking on fd %d\n", fd);
+               return -2;
+       }
+       return 0;
+}
+
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+       int ret;
+       struct vhost_user_reconnect *reconn, *next;
+
+       while (1) {
+               pthread_mutex_lock(&reconn_list.mutex);
+
+               /*
+                * An equivalent implementation of TAILQ_FOREACH_SAFE,
+                * which does not exist on all platforms.
+                */
+               for (reconn = TAILQ_FIRST(&reconn_list.head);
+                    reconn != NULL; reconn = next) {
+                       next = TAILQ_NEXT(reconn, next);
+
+                       ret = vhost_user_connect_nonblock(reconn->fd,
+                                               (struct sockaddr *)&reconn->un,
+                                               sizeof(reconn->un));
+                       if (ret == -2) {
+                               close(reconn->fd);
+                               RTE_LOG(ERR, VHOST_CONFIG,
+                                       "reconnection for fd %d failed\n",
+                                       reconn->fd);
+                               goto remove_fd;
+                       }
+                       if (ret == -1)
+                               continue;
+
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "%s: connected\n", reconn->vsocket->path);
+                       vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
+                       free(reconn);
+               }
+
+               pthread_mutex_unlock(&reconn_list.mutex);
+               sleep(1);
+       }
+
+       return NULL;
+}
+
+static int
+vhost_user_reconnect_init(void)
+{
+       int ret;
+
+       pthread_mutex_init(&reconn_list.mutex, NULL);
+       TAILQ_INIT(&reconn_list.head);
+
+       ret = pthread_create(&reconn_tid, NULL,
+                            vhost_user_client_reconnect, NULL);
+       if (ret < 0)
+               RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
+
+       return ret;
+}
+
+static int
+vhost_user_create_client(struct vhost_user_socket *vsocket)
+{
+       int fd;
+       int ret;
+       struct sockaddr_un un;
+       const char *path = vsocket->path;
+       struct vhost_user_reconnect *reconn;
+
+       fd = create_unix_socket(path, &un, vsocket->is_server);
+       if (fd < 0)
+               return -1;
+
+       ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
+                                         sizeof(un));
+       if (ret == 0) {
+               vhost_user_add_connection(fd, vsocket);
+               return 0;
+       }
+
+       RTE_LOG(ERR, VHOST_CONFIG,
+               "failed to connect to %s: %s\n",
+               path, strerror(errno));
+
+       if (ret == -2 || !vsocket->reconnect) {
+               close(fd);
+               return -1;
+       }
+
+       RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
+       reconn = malloc(sizeof(*reconn));
+       if (reconn == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to allocate memory for reconnect\n");
+               close(fd);
+               return -1;
+       }
+       reconn->un = un;
+       reconn->fd = fd;
+       reconn->vsocket = vsocket;
+       pthread_mutex_lock(&reconn_list.mutex);
+       TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+       pthread_mutex_unlock(&reconn_list.mutex);
+
+       return 0;
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as server
+ * (the default case), or as client (when the RTE_VHOST_USER_CLIENT
+ * flag is set).
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+       int ret = -1;
+       struct vhost_user_socket *vsocket;
+
+       if (!path)
+               return -1;
+
+       pthread_mutex_lock(&vhost_user.mutex);
+
+       if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "error: the number of vhost sockets reaches maximum\n");
+               goto out;
+       }
+
+       vsocket = malloc(sizeof(struct vhost_user_socket));
+       if (!vsocket)
+               goto out;
+       memset(vsocket, 0, sizeof(struct vhost_user_socket));
+       vsocket->path = strdup(path);
+       vsocket->connfd = -1;
+
+       if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+               vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+               if (vsocket->reconnect && reconn_tid == 0) {
+                       if (vhost_user_reconnect_init() < 0) {
+                               free(vsocket->path);
+                               free(vsocket);
+                               goto out;
+                       }
+               }
+               ret = vhost_user_create_client(vsocket);
+       } else {
+               vsocket->is_server = true;
+               ret = vhost_user_create_server(vsocket);
+       }
+       if (ret < 0) {
+               free(vsocket->path);
+               free(vsocket);
+               goto out;
+       }
+
+       vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+
+out:
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return ret;
+}
+
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+       int found = false;
+       struct vhost_user_reconnect *reconn, *next;
+
+       pthread_mutex_lock(&reconn_list.mutex);
+
+       for (reconn = TAILQ_FIRST(&reconn_list.head);
+            reconn != NULL; reconn = next) {
+               next = TAILQ_NEXT(reconn, next);
+
+               if (reconn->vsocket == vsocket) {
+                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
+                       close(reconn->fd);
+                       free(reconn);
+                       found = true;
+                       break;
+               }
+       }
+       pthread_mutex_unlock(&reconn_list.mutex);
+       return found;
+}
+
+/**
+ * Unregister the specified vhost socket
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+       int i;
+       int count;
+       struct vhost_user_connection *conn;
+
+       pthread_mutex_lock(&vhost_user.mutex);
+
+       for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+               struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+               if (!strcmp(vsocket->path, path)) {
+                       if (vsocket->is_server) {
+                               fdset_del(&vhost_user.fdset, vsocket->listenfd);
+                               close(vsocket->listenfd);
+                               unlink(path);
+                       } else if (vsocket->reconnect) {
+                               vhost_user_remove_reconnect(vsocket);
+                       }
+
+                       conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
+                       if (conn) {
+                               RTE_LOG(INFO, VHOST_CONFIG,
+                                       "free connfd = %d for device '%s'\n",
+                                       vsocket->connfd, path);
+                               close(vsocket->connfd);
+                               vhost_destroy_device(conn->vid);
+                               free(conn);
+                       }
+
+                       free(vsocket->path);
+                       free(vsocket);
+
+                       count = --vhost_user.vsocket_cnt;
+                       vhost_user.vsockets[i] = vhost_user.vsockets[count];
+                       vhost_user.vsockets[count] = NULL;
+                       pthread_mutex_unlock(&vhost_user.mutex);
+
+                       return 0;
+               }
+       }
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return -1;
+}
+
+int
+rte_vhost_driver_session_start(void)
+{
+       fdset_event_dispatch(&vhost_user.fdset);
+       return 0;
+}
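
A note for readers comparing against the deleted vhost-net-user.c
below: read_fd_message() and send_fd_message() are no longer static,
since the message handling that calls them moves to vhost_user.c. Both
helpers wrap SCM_RIGHTS ancillary data, so file descriptors travel
along with the message bytes. A minimal caller sketch, assuming the
prototypes are exported through vhost_user.h (the helper name and the
payload are made up for illustration):

    #include <string.h>

    #include "vhost_user.h"

    /*
     * Hypothetical helper: send an 8-byte payload plus one file
     * descriptor over a connected vhost-user socket. send_fd_message()
     * packs the fd into an SCM_RIGHTS control message for us.
     */
    static int
    send_one_fd(int connfd, int fd)
    {
            char buf[8];

            memset(buf, 0, sizeof(buf));
            return send_fd_message(connfd, buf, sizeof(buf), &fd, 1);
    }
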
diff --git a/lib/librte_vhost/vhost-net-user.c b/lib/librte_vhost/vhost-net-user.c
deleted file mode 100644
index b35594d..0000000
+++ /dev/null
@@ -1,795 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/queue.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-
-#include <rte_log.h>
-#include <rte_virtio_net.h>
-
-#include "fd_man.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-#include "virtio-net-user.h"
-
-/*
- * Every time rte_vhost_driver_register() is invoked, an associated
- * vhost_user_socket struct will be created.
- */
-struct vhost_user_socket {
-       char *path;
-       int listenfd;
-       int connfd;
-       bool is_server;
-       bool reconnect;
-};
-
-struct vhost_user_connection {
-       struct vhost_user_socket *vsocket;
-       int vid;
-};
-
-#define MAX_VHOST_SOCKET 1024
-struct vhost_user {
-       struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
-       struct fdset fdset;
-       int vsocket_cnt;
-       pthread_mutex_t mutex;
-};
-
-#define MAX_VIRTIO_BACKLOG 128
-
-static void vhost_user_server_new_connection(int fd, void *data, int *remove);
-static void vhost_user_msg_handler(int fd, void *dat, int *remove);
-static int vhost_user_create_client(struct vhost_user_socket *vsocket);
-
-static struct vhost_user vhost_user = {
-       .fdset = {
-               .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
-               .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
-               .num = 0
-       },
-       .vsocket_cnt = 0,
-       .mutex = PTHREAD_MUTEX_INITIALIZER,
-};
-
-static const char *vhost_message_str[VHOST_USER_MAX] = {
-       [VHOST_USER_NONE] = "VHOST_USER_NONE",
-       [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
-       [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
-       [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
-       [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
-       [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
-       [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
-       [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
-       [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
-       [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
-       [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
-       [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
-       [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
-       [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
-       [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
-       [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
-       [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
-       [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
-       [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
-       [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
-};
-
-/* return the number of bytes read on success, or a negative value on failure */
-static int
-read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-       struct iovec iov;
-       struct msghdr msgh;
-       size_t fdsize = fd_num * sizeof(int);
-       char control[CMSG_SPACE(fdsize)];
-       struct cmsghdr *cmsg;
-       int ret;
-
-       memset(&msgh, 0, sizeof(msgh));
-       iov.iov_base = buf;
-       iov.iov_len  = buflen;
-
-       msgh.msg_iov = &iov;
-       msgh.msg_iovlen = 1;
-       msgh.msg_control = control;
-       msgh.msg_controllen = sizeof(control);
-
-       ret = recvmsg(sockfd, &msgh, 0);
-       if (ret <= 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
-               return ret;
-       }
-
-       if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
-               RTE_LOG(ERR, VHOST_CONFIG, "truncated msg\n");
-               return -1;
-       }
-
-       for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
-               cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
-               if ((cmsg->cmsg_level == SOL_SOCKET) &&
-                       (cmsg->cmsg_type == SCM_RIGHTS)) {
-                       memcpy(fds, CMSG_DATA(cmsg), fdsize);
-                       break;
-               }
-       }
-
-       return ret;
-}
-
-/* return the number of bytes read on success, or a negative value on failure */
-static int
-read_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
-       int ret;
-
-       ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
-               msg->fds, VHOST_MEMORY_MAX_NREGIONS);
-       if (ret <= 0)
-               return ret;
-
-       if (msg && msg->size) {
-               if (msg->size > sizeof(msg->payload)) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "invalid msg size: %d\n", msg->size);
-                       return -1;
-               }
-               ret = read(sockfd, &msg->payload, msg->size);
-               if (ret <= 0)
-                       return ret;
-               if (ret != (int)msg->size) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "read control message failed\n");
-                       return -1;
-               }
-       }
-
-       return ret;
-}
-
-static int
-send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-
-       struct iovec iov;
-       struct msghdr msgh;
-       size_t fdsize = fd_num * sizeof(int);
-       char control[CMSG_SPACE(fdsize)];
-       struct cmsghdr *cmsg;
-       int ret;
-
-       memset(&msgh, 0, sizeof(msgh));
-       iov.iov_base = buf;
-       iov.iov_len = buflen;
-
-       msgh.msg_iov = &iov;
-       msgh.msg_iovlen = 1;
-
-       if (fds && fd_num > 0) {
-               msgh.msg_control = control;
-               msgh.msg_controllen = sizeof(control);
-               cmsg = CMSG_FIRSTHDR(&msgh);
-               cmsg->cmsg_len = CMSG_LEN(fdsize);
-               cmsg->cmsg_level = SOL_SOCKET;
-               cmsg->cmsg_type = SCM_RIGHTS;
-               memcpy(CMSG_DATA(cmsg), fds, fdsize);
-       } else {
-               msgh.msg_control = NULL;
-               msgh.msg_controllen = 0;
-       }
-
-       do {
-               ret = sendmsg(sockfd, &msgh, 0);
-       } while (ret < 0 && errno == EINTR);
-
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
-               return ret;
-       }
-
-       return ret;
-}
-
-static int
-send_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
-       int ret;
-
-       if (!msg)
-               return 0;
-
-       msg->flags &= ~VHOST_USER_VERSION_MASK;
-       msg->flags |= VHOST_USER_VERSION;
-       msg->flags |= VHOST_USER_REPLY_MASK;
-
-       ret = send_fd_message(sockfd, (char *)msg,
-               VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
-
-       return ret;
-}
-
-
-static void
-vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
-{
-       int vid;
-       size_t size;
-       struct vhost_user_connection *conn;
-       int ret;
-
-       conn = malloc(sizeof(*conn));
-       if (conn == NULL) {
-               close(fd);
-               return;
-       }
-
-       vid = vhost_new_device();
-       if (vid == -1) {
-               close(fd);
-               free(conn);
-               return;
-       }
-
-       size = strnlen(vsocket->path, PATH_MAX);
-       vhost_set_ifname(vid, vsocket->path, size);
-
-       RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
-
-       vsocket->connfd = fd;
-       conn->vsocket = vsocket;
-       conn->vid = vid;
-       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler,
-                       NULL, conn);
-       if (ret < 0) {
-               vsocket->connfd = -1;
-               free(conn);
-               close(fd);
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to add fd %d into vhost server fdset\n",
-                       fd);
-       }
-}
-
-/* callback invoked when there is a new vhost-user connection from a client */
-static void
-vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
-{
-       struct vhost_user_socket *vsocket = dat;
-
-       fd = accept(fd, NULL, NULL);
-       if (fd < 0)
-               return;
-
-       RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
-       vhost_user_add_connection(fd, vsocket);
-}
-
-/* callback invoked when there is a message on the connfd */
-static void
-vhost_user_msg_handler(int connfd, void *dat, int *remove)
-{
-       int vid;
-       struct vhost_user_connection *conn = dat;
-       struct VhostUserMsg msg;
-       uint64_t features;
-       int ret;
-
-       vid = conn->vid;
-       ret = read_vhost_message(connfd, &msg);
-       if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
-               struct vhost_user_socket *vsocket = conn->vsocket;
-
-               if (ret < 0)
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost read message failed\n");
-               else if (ret == 0)
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "vhost peer closed\n");
-               else
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost read incorrect message\n");
-
-               vsocket->connfd = -1;
-               close(connfd);
-               *remove = 1;
-               free(conn);
-               vhost_destroy_device(vid);
-
-               if (vsocket->reconnect)
-                       vhost_user_create_client(vsocket);
-
-               return;
-       }
-
-       RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
-               vhost_message_str[msg.request]);
-       switch (msg.request) {
-       case VHOST_USER_GET_FEATURES:
-               ret = vhost_get_features(vid, &features);
-               msg.payload.u64 = features;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_FEATURES:
-               features = msg.payload.u64;
-               vhost_set_features(vid, &features);
-               break;
-
-       case VHOST_USER_GET_PROTOCOL_FEATURES:
-               msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_PROTOCOL_FEATURES:
-               user_set_protocol_features(vid, msg.payload.u64);
-               break;
-
-       case VHOST_USER_SET_OWNER:
-               vhost_set_owner(vid);
-               break;
-       case VHOST_USER_RESET_OWNER:
-               vhost_reset_owner(vid);
-               break;
-
-       case VHOST_USER_SET_MEM_TABLE:
-               user_set_mem_table(vid, &msg);
-               break;
-
-       case VHOST_USER_SET_LOG_BASE:
-               user_set_log_base(vid, &msg);
-
-               /* it needs a reply */
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_LOG_FD:
-               close(msg.fds[0]);
-               RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
-               break;
-
-       case VHOST_USER_SET_VRING_NUM:
-               vhost_set_vring_num(vid, &msg.payload.state);
-               break;
-       case VHOST_USER_SET_VRING_ADDR:
-               vhost_set_vring_addr(vid, &msg.payload.addr);
-               break;
-       case VHOST_USER_SET_VRING_BASE:
-               vhost_set_vring_base(vid, &msg.payload.state);
-               break;
-
-       case VHOST_USER_GET_VRING_BASE:
-               ret = user_get_vring_base(vid, &msg.payload.state);
-               msg.size = sizeof(msg.payload.state);
-               send_vhost_message(connfd, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_KICK:
-               user_set_vring_kick(vid, &msg);
-               break;
-       case VHOST_USER_SET_VRING_CALL:
-               user_set_vring_call(vid, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_ERR:
-               if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
-                       close(msg.fds[0]);
-               RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
-               break;
-
-       case VHOST_USER_GET_QUEUE_NUM:
-               msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_ENABLE:
-               user_set_vring_enable(vid, &msg.payload.state);
-               break;
-       case VHOST_USER_SEND_RARP:
-               user_send_rarp(vid, &msg);
-               break;
-
-       default:
-               break;
-
-       }
-}
-
-static int
-create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
-{
-       int fd;
-
-       fd = socket(AF_UNIX, SOCK_STREAM, 0);
-       if (fd < 0)
-               return -1;
-       RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
-               is_server ? "server" : "client", fd);
-
-       if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "vhost-user: can't set nonblocking mode for socket, fd: "
-                       "%d (%s)\n", fd, strerror(errno));
-               close(fd);
-               return -1;
-       }
-
-       memset(un, 0, sizeof(*un));
-       un->sun_family = AF_UNIX;
-       strncpy(un->sun_path, path, sizeof(un->sun_path));
-       un->sun_path[sizeof(un->sun_path) - 1] = '\0';
-
-       return fd;
-}
-
-static int
-vhost_user_create_server(struct vhost_user_socket *vsocket)
-{
-       int fd;
-       int ret;
-       struct sockaddr_un un;
-       const char *path = vsocket->path;
-
-       fd = create_unix_socket(path, &un, vsocket->is_server);
-       if (fd < 0)
-               return -1;
-
-       ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to bind to %s: %s; remove it and try again\n",
-                       path, strerror(errno));
-               goto err;
-       }
-       RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
-
-       ret = listen(fd, MAX_VIRTIO_BACKLOG);
-       if (ret < 0)
-               goto err;
-
-       vsocket->listenfd = fd;
-       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
-                 NULL, vsocket);
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to add listen fd %d to vhost server fdset\n",
-                       fd);
-               goto err;
-       }
-
-       return 0;
-
-err:
-       close(fd);
-       return -1;
-}
-
-struct vhost_user_reconnect {
-       struct sockaddr_un un;
-       int fd;
-       struct vhost_user_socket *vsocket;
-
-       TAILQ_ENTRY(vhost_user_reconnect) next;
-};
-
-TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
-struct vhost_user_reconnect_list {
-       struct vhost_user_reconnect_tailq_list head;
-       pthread_mutex_t mutex;
-};
-
-static struct vhost_user_reconnect_list reconn_list;
-static pthread_t reconn_tid;
-
-static int
-vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
-{
-       int ret, flags;
-
-       ret = connect(fd, un, sz);
-       if (ret < 0 && errno != EISCONN)
-               return -1;
-
-       flags = fcntl(fd, F_GETFL, 0);
-       if (flags < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "can't get flags for connfd %d\n", fd);
-               return -2;
-       }
-       if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                               "can't disable nonblocking on fd %d\n", fd);
-               return -2;
-       }
-       return 0;
-}
-
-static void *
-vhost_user_client_reconnect(void *arg __rte_unused)
-{
-       int ret;
-       struct vhost_user_reconnect *reconn, *next;
-
-       while (1) {
-               pthread_mutex_lock(&reconn_list.mutex);
-
-               /*
-                * An equivalent implementation of TAILQ_FOREACH_SAFE,
-                * which does not exist on all platforms.
-                */
-               for (reconn = TAILQ_FIRST(&reconn_list.head);
-                    reconn != NULL; reconn = next) {
-                       next = TAILQ_NEXT(reconn, next);
-
-                       ret = vhost_user_connect_nonblock(reconn->fd,
-                                               (struct sockaddr *)&reconn->un,
-                                               sizeof(reconn->un));
-                       if (ret == -2) {
-                               close(reconn->fd);
-                               RTE_LOG(ERR, VHOST_CONFIG,
-                                       "reconnection for fd %d failed\n",
-                                       reconn->fd);
-                               goto remove_fd;
-                       }
-                       if (ret == -1)
-                               continue;
-
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "%s: connected\n", reconn->vsocket->path);
-                       vhost_user_add_connection(reconn->fd, reconn->vsocket);
-remove_fd:
-                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
-                       free(reconn);
-               }
-
-               pthread_mutex_unlock(&reconn_list.mutex);
-               sleep(1);
-       }
-
-       return NULL;
-}
-
-static int
-vhost_user_reconnect_init(void)
-{
-       int ret;
-
-       pthread_mutex_init(&reconn_list.mutex, NULL);
-       TAILQ_INIT(&reconn_list.head);
-
-       ret = pthread_create(&reconn_tid, NULL,
-                            vhost_user_client_reconnect, NULL);
-       if (ret < 0)
-               RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
-
-       return ret;
-}
-
-static int
-vhost_user_create_client(struct vhost_user_socket *vsocket)
-{
-       int fd;
-       int ret;
-       struct sockaddr_un un;
-       const char *path = vsocket->path;
-       struct vhost_user_reconnect *reconn;
-
-       fd = create_unix_socket(path, &un, vsocket->is_server);
-       if (fd < 0)
-               return -1;
-
-       ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
-                                         sizeof(un));
-       if (ret == 0) {
-               vhost_user_add_connection(fd, vsocket);
-               return 0;
-       }
-
-       RTE_LOG(ERR, VHOST_CONFIG,
-               "failed to connect to %s: %s\n",
-               path, strerror(errno));
-
-       if (ret == -2 || !vsocket->reconnect) {
-               close(fd);
-               return -1;
-       }
-
-       RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
-       reconn = malloc(sizeof(*reconn));
-       if (reconn == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to allocate memory for reconnect\n");
-               close(fd);
-               return -1;
-       }
-       reconn->un = un;
-       reconn->fd = fd;
-       reconn->vsocket = vsocket;
-       pthread_mutex_lock(&reconn_list.mutex);
-       TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
-       pthread_mutex_unlock(&reconn_list.mutex);
-
-       return 0;
-}
-
-/*
- * Register a new vhost-user socket; here we could act as server
- * (the default case), or as client (when the RTE_VHOST_USER_CLIENT
- * flag is set).
- */
-int
-rte_vhost_driver_register(const char *path, uint64_t flags)
-{
-       int ret = -1;
-       struct vhost_user_socket *vsocket;
-
-       if (!path)
-               return -1;
-
-       pthread_mutex_lock(&vhost_user.mutex);
-
-       if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "error: the number of vhost sockets reaches maximum\n");
-               goto out;
-       }
-
-       vsocket = malloc(sizeof(struct vhost_user_socket));
-       if (!vsocket)
-               goto out;
-       memset(vsocket, 0, sizeof(struct vhost_user_socket));
-       vsocket->path = strdup(path);
-       vsocket->connfd = -1;
-
-       if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
-               vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
-               if (vsocket->reconnect && reconn_tid == 0) {
-                       if (vhost_user_reconnect_init() < 0) {
-                               free(vsocket->path);
-                               free(vsocket);
-                               goto out;
-                       }
-               }
-               ret = vhost_user_create_client(vsocket);
-       } else {
-               vsocket->is_server = true;
-               ret = vhost_user_create_server(vsocket);
-       }
-       if (ret < 0) {
-               free(vsocket->path);
-               free(vsocket);
-               goto out;
-       }
-
-       vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
-
-out:
-       pthread_mutex_unlock(&vhost_user.mutex);
-
-       return ret;
-}
-
-static bool
-vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
-{
-       int found = false;
-       struct vhost_user_reconnect *reconn, *next;
-
-       pthread_mutex_lock(&reconn_list.mutex);
-
-       for (reconn = TAILQ_FIRST(&reconn_list.head);
-            reconn != NULL; reconn = next) {
-               next = TAILQ_NEXT(reconn, next);
-
-               if (reconn->vsocket == vsocket) {
-                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
-                       close(reconn->fd);
-                       free(reconn);
-                       found = true;
-                       break;
-               }
-       }
-       pthread_mutex_unlock(&reconn_list.mutex);
-       return found;
-}
-
-/**
- * Unregister the specified vhost socket
- */
-int
-rte_vhost_driver_unregister(const char *path)
-{
-       int i;
-       int count;
-       struct vhost_user_connection *conn;
-
-       pthread_mutex_lock(&vhost_user.mutex);
-
-       for (i = 0; i < vhost_user.vsocket_cnt; i++) {
-               struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
-
-               if (!strcmp(vsocket->path, path)) {
-                       if (vsocket->is_server) {
-                               fdset_del(&vhost_user.fdset, vsocket->listenfd);
-                               close(vsocket->listenfd);
-                               unlink(path);
-                       } else if (vsocket->reconnect) {
-                               vhost_user_remove_reconnect(vsocket);
-                       }
-
-                       conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
-                       if (conn) {
-                               RTE_LOG(INFO, VHOST_CONFIG,
-                                       "free connfd = %d for device '%s'\n",
-                                       vsocket->connfd, path);
-                               close(vsocket->connfd);
-                               vhost_destroy_device(conn->vid);
-                               free(conn);
-                       }
-
-                       free(vsocket->path);
-                       free(vsocket);
-
-                       count = --vhost_user.vsocket_cnt;
-                       vhost_user.vsockets[i] = vhost_user.vsockets[count];
-                       vhost_user.vsockets[count] = NULL;
-                       pthread_mutex_unlock(&vhost_user.mutex);
-
-                       return 0;
-               }
-       }
-       pthread_mutex_unlock(&vhost_user.mutex);
-
-       return -1;
-}
-
-int
-rte_vhost_driver_session_start(void)
-{
-       fdset_event_dispatch(&vhost_user.fdset);
-       return 0;
-}
diff --git a/lib/librte_vhost/vhost-net-user.h b/lib/librte_vhost/vhost-net-user.h
deleted file mode 100644
index f533239..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_USER_H
-#define _VHOST_NET_USER_H
-
-#include <stdint.h>
-#include <linux/vhost.h>
-
-#include "rte_virtio_net.h"
-
-/* refer to hw/virtio/vhost-user.c */
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
-typedef enum VhostUserRequest {
-       VHOST_USER_NONE = 0,
-       VHOST_USER_GET_FEATURES = 1,
-       VHOST_USER_SET_FEATURES = 2,
-       VHOST_USER_SET_OWNER = 3,
-       VHOST_USER_RESET_OWNER = 4,
-       VHOST_USER_SET_MEM_TABLE = 5,
-       VHOST_USER_SET_LOG_BASE = 6,
-       VHOST_USER_SET_LOG_FD = 7,
-       VHOST_USER_SET_VRING_NUM = 8,
-       VHOST_USER_SET_VRING_ADDR = 9,
-       VHOST_USER_SET_VRING_BASE = 10,
-       VHOST_USER_GET_VRING_BASE = 11,
-       VHOST_USER_SET_VRING_KICK = 12,
-       VHOST_USER_SET_VRING_CALL = 13,
-       VHOST_USER_SET_VRING_ERR = 14,
-       VHOST_USER_GET_PROTOCOL_FEATURES = 15,
-       VHOST_USER_SET_PROTOCOL_FEATURES = 16,
-       VHOST_USER_GET_QUEUE_NUM = 17,
-       VHOST_USER_SET_VRING_ENABLE = 18,
-       VHOST_USER_SEND_RARP = 19,
-       VHOST_USER_MAX
-} VhostUserRequest;
-
-typedef struct VhostUserMemoryRegion {
-       uint64_t guest_phys_addr;
-       uint64_t memory_size;
-       uint64_t userspace_addr;
-       uint64_t mmap_offset;
-} VhostUserMemoryRegion;
-
-typedef struct VhostUserMemory {
-       uint32_t nregions;
-       uint32_t padding;
-       VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-} VhostUserMemory;
-
-typedef struct VhostUserLog {
-       uint64_t mmap_size;
-       uint64_t mmap_offset;
-} VhostUserLog;
-
-typedef struct VhostUserMsg {
-       VhostUserRequest request;
-
-#define VHOST_USER_VERSION_MASK     0x3
-#define VHOST_USER_REPLY_MASK       (0x1 << 2)
-       uint32_t flags;
-       uint32_t size; /* the following payload size */
-       union {
-#define VHOST_USER_VRING_IDX_MASK   0xff
-#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
-               uint64_t u64;
-               struct vhost_vring_state state;
-               struct vhost_vring_addr addr;
-               VhostUserMemory memory;
-               VhostUserLog    log;
-       } payload;
-       int fds[VHOST_MEMORY_MAX_NREGIONS];
-} __attribute((packed)) VhostUserMsg;
-
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
-/* The version of the protocol we support */
-#define VHOST_USER_VERSION    0x1
-
-/*****************************************************************************/
-#endif
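
The VhostUserMsg definition above fixes the wire format: a packed
header that ends right before the payload union (hence
VHOST_USER_HDR_SIZE being offsetof(VhostUserMsg, payload.u64)),
followed by msg.size bytes of payload. read_vhost_message() in the
deleted vhost-net-user.c reads a message in exactly those two steps; a
simplified sketch of the same idea, using plain read() instead of
read_fd_message() so ancillary fds are ignored (the function name is
made up, and the message definitions are assumed to be in scope):

    #include <unistd.h>

    /*
     * Read one vhost-user message in two steps: the fixed-size header
     * first (which carries the payload size), then the payload itself.
     */
    static int
    read_msg_sketch(int sockfd, struct VhostUserMsg *msg)
    {
            if (read(sockfd, msg, VHOST_USER_HDR_SIZE) !=
                            (ssize_t)VHOST_USER_HDR_SIZE)
                    return -1;

            if (msg->size > sizeof(msg->payload))
                    return -1;      /* payload larger than the union */

            if (msg->size &&
                read(sockfd, &msg->payload, msg->size) != (ssize_t)msg->size)
                    return -1;

            return 0;
    }
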
diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
deleted file mode 100644
index 38593a2..0000000
+++ /dev/null
@@ -1,250 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_CDEV_H_
-#define _VHOST_NET_CDEV_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <linux/vhost.h>
-
-#include <rte_log.h>
-
-#include "rte_virtio_net.h"
-
-/* Used to indicate that the device is running on a data core */
-#define VIRTIO_DEV_RUNNING 1
-
-/* Backend value set by guest. */
-#define VIRTIO_DEV_STOPPED -1
-
-#define BUF_VECTOR_MAX 256
-
-/**
- * Structure contains buffer address, length and descriptor index
- * from vring to do scatter RX.
- */
-struct buf_vector {
-       uint64_t buf_addr;
-       uint32_t buf_len;
-       uint32_t desc_idx;
-};
-
-/**
- * Structure contains variables relevant to RX/TX virtqueues.
- */
-struct vhost_virtqueue {
-       struct vring_desc       *desc;
-       struct vring_avail      *avail;
-       struct vring_used       *used;
-       uint32_t                size;
-
-       /* Last index used on the available ring */
-       volatile uint16_t       last_used_idx;
-#define VIRTIO_INVALID_EVENTFD         (-1)
-#define VIRTIO_UNINITIALIZED_EVENTFD   (-2)
-
-       /* Backend value to determine if device should be started/stopped */
-       int                     backend;
-       /* Used to notify the guest (trigger interrupt) */
-       int                     callfd;
-       /* Currently unused as polling mode is enabled */
-       int                     kickfd;
-       int                     enabled;
-
-       /* Physical address of used ring, for logging */
-       uint64_t                log_guest_addr;
-} __rte_cache_aligned;
-
-/* Old kernels have no such macro defined */
-#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
- #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
-#endif
-
-
-/*
- * Make an extra wrapper for VIRTIO_NET_F_MQ and
- * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
- * introduced since kernel v3.8. This makes our
- * code buildable for older kernel.
- */
-#ifdef VIRTIO_NET_F_MQ
- #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
- #define VHOST_SUPPORTS_MQ     (1ULL << VIRTIO_NET_F_MQ)
-#else
- #define VHOST_MAX_QUEUE_PAIRS 1
- #define VHOST_SUPPORTS_MQ     0
-#endif
-
-/*
- * Define virtio 1.0 for older kernels
- */
-#ifndef VIRTIO_F_VERSION_1
- #define VIRTIO_F_VERSION_1 32
-#endif
-
-/**
- * Device structure contains all configuration information relating
- * to the device.
- */
-struct virtio_net {
-       /* Frontend (QEMU) memory and memory region information */
-       struct virtio_memory    *mem;
-       uint64_t                features;
-       uint64_t                protocol_features;
-       int                     vid;
-       uint32_t                flags;
-       uint16_t                vhost_hlen;
-       /* to tell if we need broadcast rarp packet */
-       rte_atomic16_t          broadcast_rarp;
-       uint32_t                virt_qp_nb;
-       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
-#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
-       char                    ifname[IF_NAME_SZ];
-       uint64_t                log_size;
-       uint64_t                log_base;
-       uint64_t                log_addr;
-       struct ether_addr       mac;
-
-} __rte_cache_aligned;
-
-/**
- * Information relating to memory regions including offsets to
- * addresses in QEMUs memory file.
- */
-struct virtio_memory_regions {
-       uint64_t guest_phys_address;
-       uint64_t guest_phys_address_end;
-       uint64_t memory_size;
-       uint64_t userspace_address;
-       uint64_t address_offset;
-};
-
-
-/**
- * Memory structure includes region and mapping information.
- */
-struct virtio_memory {
-       /* Base QEMU userspace address of the memory file. */
-       uint64_t base_address;
-       uint64_t mapped_address;
-       uint64_t mapped_size;
-       uint32_t nregions;
-       struct virtio_memory_regions regions[0];
-};
-
-
-/* Macros for printing using RTE_LOG */
-#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
-#define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
-
-#ifdef RTE_LIBRTE_VHOST_DEBUG
-#define VHOST_MAX_PRINT_BUFF 6072
-#define LOG_LEVEL RTE_LOG_DEBUG
-#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
-#define PRINT_PACKET(device, addr, size, header) do { \
-       char *pkt_addr = (char *)(addr); \
-       unsigned int index; \
-       char packet[VHOST_MAX_PRINT_BUFF]; \
-       \
-       if ((header)) \
-               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
-       else \
-               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
-       for (index = 0; index < (size); index++) { \
-               snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
-                       "%02hhx ", pkt_addr[index]); \
-       } \
-       snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
-       \
-       LOG_DEBUG(VHOST_DATA, "%s", packet); \
-} while (0)
-#else
-#define LOG_LEVEL RTE_LOG_INFO
-#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
-#define PRINT_PACKET(device, addr, size, header) do {} while (0)
-#endif
-
-/**
- * Function to convert guest physical addresses to vhost virtual addresses.
- * This is used to convert guest virtio buffer addresses.
- */
-static inline uint64_t __attribute__((always_inline))
-gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
-{
-       struct virtio_memory_regions *region;
-       uint32_t regionidx;
-       uint64_t vhost_va = 0;
-
-       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
-               region = &dev->mem->regions[regionidx];
-               if ((guest_pa >= region->guest_phys_address) &&
-                       (guest_pa <= region->guest_phys_address_end)) {
-                       vhost_va = region->address_offset + guest_pa;
-                       break;
-               }
-       }
-       return vhost_va;
-}
-
-struct virtio_net_device_ops const *notify_ops;
-struct virtio_net *get_device(int vid);
-
-int vhost_new_device(void);
-void vhost_destroy_device(int);
-
-void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
-
-int vhost_get_features(int, uint64_t *);
-int vhost_set_features(int, uint64_t *);
-
-int vhost_set_vring_num(int, struct vhost_vring_state *);
-int vhost_set_vring_addr(int, struct vhost_vring_addr *);
-int vhost_set_vring_base(int, struct vhost_vring_state *);
-int vhost_get_vring_base(int, uint32_t, struct vhost_vring_state *);
-
-int vhost_set_vring_kick(int, struct vhost_vring_file *);
-int vhost_set_vring_call(int, struct vhost_vring_file *);
-
-int vhost_set_backend(int, struct vhost_vring_file *);
-
-int vhost_set_owner(int);
-int vhost_reset_owner(int);
-
-/*
- * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
- */
-void vhost_backend_cleanup(struct virtio_net *dev);
-
-#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
new file mode 100644 (file)
index 0000000..46095c3
--- /dev/null
@@ -0,0 +1,409 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_virtio_net.h>
+
+#include "vhost.h"
+
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+/* Features supported by this lib. */
+#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+                               (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+                               (1ULL << VIRTIO_NET_F_CTRL_RX) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+                               (VHOST_SUPPORTS_MQ)            | \
+                               (1ULL << VIRTIO_F_VERSION_1)   | \
+                               (1ULL << VHOST_F_LOG_ALL)      | \
+                               (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+                               (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+                               (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+                               (1ULL << VIRTIO_NET_F_CSUM)    | \
+                               (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_TSO6))
+
+uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* device ops to add/remove device to/from data core. */
+struct virtio_net_device_ops const *notify_ops;
+
+struct virtio_net *
+get_device(int vid)
+{
+       struct virtio_net *dev = vhost_devices[vid];
+
+       if (unlikely(!dev)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) device not found.\n", vid);
+       }
+
+       return dev;
+}
+
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+       if ((vq->callfd >= 0) && (destroy != 0))
+               close(vq->callfd);
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory and close any file descriptors owned by a device;
+ * the structures themselves are released later by free_device().
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+       uint32_t i;
+
+       vhost_backend_cleanup(dev);
+
+       for (i = 0; i < dev->virt_qp_nb; i++) {
+               cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
+               cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
+       }
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+       uint32_t i;
+
+       for (i = 0; i < dev->virt_qp_nb; i++)
+               rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+
+       rte_free(dev);
+}
+
+static void
+init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+       memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+       vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+       /* Backends are set to -1 indicating an inactive device. */
+       vq->backend = -1;
+
+       /* always set the default vq pair to enabled */
+       if (qp_idx == 0)
+               vq->enabled = 1;
+}
+
+static void
+init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+       uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+
+       init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
+       init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
+}
+
+static void
+reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+       int callfd;
+
+       callfd = vq->callfd;
+       init_vring_queue(vq, qp_idx);
+       vq->callfd = callfd;
+}
+
+static void
+reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+       uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+
+       reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
+       reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
+}
+
+int
+alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+       struct vhost_virtqueue *virtqueue = NULL;
+       uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
+       uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
+
+       virtqueue = rte_malloc(NULL,
+                              sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
+       if (virtqueue == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Failed to allocate memory for virt qp:%d.\n", qp_idx);
+               return -1;
+       }
+
+       dev->virtqueue[virt_rx_q_idx] = virtqueue;
+       dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
+
+       init_vring_queue_pair(dev, qp_idx);
+
+       dev->virt_qp_nb += 1;
+
+       return 0;
+}
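
Worth spelling out the layout alloc_vring_queue_pair() relies on: the RX and TX virtqueues of a pair occupy two adjacent slots of dev->virtqueue[], which is why a single rte_malloc() of two vhost_virtqueue structures covers both directions. A minimal sketch of the index arithmetic (the helper names are illustrative, not part of the library; the VIRTIO_* values mirror rte_virtio_net.h):

    #include <stdint.h>

    #define VIRTIO_RXQ  0           /* mirrors rte_virtio_net.h */
    #define VIRTIO_TXQ  1
    #define VIRTIO_QNUM 2

    /* Slot of queue pair qp_idx's RX queue in dev->virtqueue[]. */
    static inline uint32_t rxq_slot(uint32_t qp_idx)
    {
            return qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
    }

    /* Slot of queue pair qp_idx's TX queue in dev->virtqueue[]. */
    static inline uint32_t txq_slot(uint32_t qp_idx)
    {
            return qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
    }
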
+
+/*
+ * Reset some variables in the device structure, while keeping a few
+ * others untouched, such as vid, ifname and virt_qp_nb: they
+ * should remain the same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+       uint32_t i;
+
+       dev->features = 0;
+       dev->protocol_features = 0;
+       dev->flags = 0;
+
+       for (i = 0; i < dev->virt_qp_nb; i++)
+               reset_vring_queue_pair(dev, i);
+}
+
+/*
+ * Invoked when a new vhost device is created (e.g. from the CUSE open
+ * path). The device structure is initialised and stored in a free slot
+ * of the vhost device array.
+ */
+int
+vhost_new_device(void)
+{
+       struct virtio_net *dev;
+       int i;
+
+       dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+       if (dev == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Failed to allocate memory for new dev.\n");
+               return -1;
+       }
+
+       for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+               if (vhost_devices[i] == NULL)
+                       break;
+       }
+       if (i == MAX_VHOST_DEVICE) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Failed to find a free slot for new device.\n");
+               rte_free(dev);
+               return -1;
+       }
+
+       vhost_devices[i] = dev;
+       dev->vid = i;
+
+       return i;
+}
+
+/*
+ * Invoked when a vhost device is released (e.g. from the CUSE release
+ * path). This function cleans up the device and removes it from the
+ * vhost device array.
+ */
+void
+vhost_destroy_device(int vid)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return;
+
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       cleanup_device(dev, 1);
+       free_device(dev);
+
+       vhost_devices[vid] = NULL;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+       struct virtio_net *dev;
+       unsigned int len;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return;
+
+       len = if_len > sizeof(dev->ifname) ?
+               sizeof(dev->ifname) : if_len;
+
+       strncpy(dev->ifname, if_name, len);
+       dev->ifname[sizeof(dev->ifname) - 1] = '\0';
+}
+
+
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+       struct virtio_net *dev = get_device(vid);
+       int numa_node;
+       int ret;
+
+       if (dev == NULL)
+               return -1;
+
+       ret = get_mempolicy(&numa_node, NULL, 0, dev,
+                           MPOL_F_NODE | MPOL_F_ADDR);
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to query numa node: %d\n", vid, ret);
+               return -1;
+       }
+
+       return numa_node;
+#else
+       RTE_SET_USED(vid);
+       return -1;
+#endif
+}
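
The get_mempolicy() call above is a small trick: with MPOL_F_NODE | MPOL_F_ADDR the kernel ignores the policy arguments and reports the NUMA node backing the page that contains the given address, here the device structure itself. The same trick in isolation (a sketch; Linux-only, needs the libnuma headers):

    #include <numaif.h>

    /* Return the NUMA node backing the page holding 'addr', or -1. */
    static int node_of_addr(void *addr)
    {
            int node;

            if (get_mempolicy(&node, NULL, 0, addr,
                              MPOL_F_NODE | MPOL_F_ADDR) < 0)
                    return -1;

            return node;
    }
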
+
+uint32_t
+rte_vhost_get_queue_num(int vid)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return 0;
+
+       return dev->virt_qp_nb;
+}
+
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return -1;
+
+       len = RTE_MIN(len, sizeof(dev->ifname));
+
+       strncpy(buf, dev->ifname, len);
+       buf[len - 1] = '\0';
+
+       return 0;
+}
+
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+       struct virtio_net *dev;
+       struct vhost_virtqueue *vq;
+
+       dev = get_device(vid);
+       if (!dev)
+               return 0;
+
+       vq = dev->virtqueue[queue_id];
+       if (!vq->enabled)
+               return 0;
+
+       return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return -1;
+
+       if (enable) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "guest notification isn't supported.\n");
+               return -1;
+       }
+
+       dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
+       return 0;
+}
+
+uint64_t rte_vhost_feature_get(void)
+{
+       return VHOST_FEATURES;
+}
+
+int rte_vhost_feature_disable(uint64_t feature_mask)
+{
+       VHOST_FEATURES = VHOST_FEATURES & ~feature_mask;
+       return 0;
+}
+
+int rte_vhost_feature_enable(uint64_t feature_mask)
+{
+       if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
+               VHOST_FEATURES = VHOST_FEATURES | feature_mask;
+               return 0;
+       }
+       return -1;
+}
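
These three feature helpers are the application-facing knobs for trimming what gets advertised during negotiation; since the advertised set is taken from VHOST_FEATURES, they only affect devices that connect afterwards. A small usage sketch (the wrapper function is illustrative; the feature bit and the API are the ones defined above):

    #include <linux/virtio_net.h>
    #include <rte_virtio_net.h>

    /* Advertise everything this lib supports except mergeable RX
     * buffers; must run before any vhost device connects. */
    static void disable_mergeable_rx(void)
    {
            rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
    }
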
+
+/*
+ * Register ops so that we can add/remove devices to/from a data core.
+ */
+int
+rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
+{
+       notify_ops = ops;
+
+       return 0;
+}
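
For completeness, a hedged sketch of what a caller registers, assuming the new_device/destroy_device members of struct virtio_net_device_ops in this API revision take a vid (the callback bodies are illustrative):

    #include <rte_virtio_net.h>

    static int my_new_device(int vid)
    {
            /* e.g. hand vid's virtqueues to a polling core */
            return 0;
    }

    static void my_destroy_device(int vid)
    {
            /* e.g. stop polling vid before the device goes away */
    }

    static const struct virtio_net_device_ops my_ops = {
            .new_device     = my_new_device,
            .destroy_device = my_destroy_device,
    };

    /* during application init: */
    /* rte_vhost_driver_callback_register(&my_ops); */
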
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
new file mode 100644 (file)
index 0000000..c2dfc3c
--- /dev/null
@@ -0,0 +1,242 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+
+#include <rte_log.h>
+
+#include "rte_virtio_net.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+       uint64_t buf_addr;
+       uint32_t buf_len;
+       uint32_t desc_idx;
+};
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+       struct vring_desc       *desc;
+       struct vring_avail      *avail;
+       struct vring_used       *used;
+       uint32_t                size;
+
+       /* Last index used on the available ring */
+       volatile uint16_t       last_used_idx;
+#define VIRTIO_INVALID_EVENTFD         (-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD   (-2)
+
+       /* Backend value to determine if device should be started/stopped */
+       int                     backend;
+       /* Used to notify the guest (trigger interrupt) */
+       int                     callfd;
+       /* Currently unused as polling mode is enabled */
+       int                     kickfd;
+       int                     enabled;
+
+       /* Physical address of used ring, for logging */
+       uint64_t                log_guest_addr;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macro defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+
+/*
+ * Make an extra wrapper for VIRTIO_NET_F_MQ and
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, as they were
+ * introduced in kernel v3.8. This keeps our
+ * code buildable for older kernels.
+ */
+#ifdef VIRTIO_NET_F_MQ
+ #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
+ #define VHOST_SUPPORTS_MQ     (1ULL << VIRTIO_NET_F_MQ)
+#else
+ #define VHOST_MAX_QUEUE_PAIRS 1
+ #define VHOST_SUPPORTS_MQ     0
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+       /* Frontend (QEMU) memory and memory region information */
+       struct virtio_memory    *mem;
+       uint64_t                features;
+       uint64_t                protocol_features;
+       int                     vid;
+       uint32_t                flags;
+       uint16_t                vhost_hlen;
+       /* to tell if we need to broadcast a RARP packet */
+       rte_atomic16_t          broadcast_rarp;
+       uint32_t                virt_qp_nb;
+       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+       char                    ifname[IF_NAME_SZ];
+       uint64_t                log_size;
+       uint64_t                log_base;
+       uint64_t                log_addr;
+       struct ether_addr       mac;
+
+} __rte_cache_aligned;
+
+/**
+ * Information relating to memory regions including offsets to
+ * addresses in QEMU's memory file.
+ */
+struct virtio_memory_regions {
+       uint64_t guest_phys_address;
+       uint64_t guest_phys_address_end;
+       uint64_t memory_size;
+       uint64_t userspace_address;
+       uint64_t address_offset;
+};
+
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct virtio_memory {
+       /* Base QEMU userspace address of the memory file. */
+       uint64_t base_address;
+       uint64_t mapped_address;
+       uint64_t mapped_size;
+       uint32_t nregions;
+       struct virtio_memory_regions regions[0];
+};
+
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define LOG_LEVEL RTE_LOG_DEBUG
+#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define PRINT_PACKET(device, addr, size, header) do { \
+       char *pkt_addr = (char *)(addr); \
+       unsigned int index; \
+       char packet[VHOST_MAX_PRINT_BUFF]; \
+       \
+       if ((header)) \
+               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+       else \
+               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+       for (index = 0; index < (size); index++) { \
+               snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+                       "%02hhx ", pkt_addr[index]); \
+       } \
+       snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+       \
+       LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define LOG_LEVEL RTE_LOG_INFO
+#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE       1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/**
+ * Function to convert guest physical addresses to vhost virtual addresses.
+ * This is used to convert guest virtio buffer addresses.
+ */
+static inline uint64_t __attribute__((always_inline))
+gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
+{
+       struct virtio_memory_regions *region;
+       uint32_t regionidx;
+       uint64_t vhost_va = 0;
+
+       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+               region = &dev->mem->regions[regionidx];
+               if ((guest_pa >= region->guest_phys_address) &&
+                       (guest_pa <= region->guest_phys_address_end)) {
+                       vhost_va = region->address_offset + guest_pa;
+                       break;
+               }
+       }
+       return vhost_va;
+}
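
To make the translation concrete: address_offset is precomputed per region as (host mapped address - guest physical start), so a lookup is a range check plus one addition. With illustrative numbers:

    /* One region covering guest physical [1 GB, 2 GB), with the host
     * mapping at 0x7f0000000000 (all values illustrative):
     *
     *   guest_phys_address     = 0x40000000
     *   guest_phys_address_end = 0x7fffffff
     *   address_offset         = 0x7f0000000000 - 0x40000000
     *
     * gpa_to_vva(dev, 0x40001000) then returns 0x7f0000001000. */
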
+
+extern struct virtio_net_device_ops const *notify_ops;
+struct virtio_net *get_device(int vid);
+
+int vhost_new_device(void);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+
+int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+
+/*
+ * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
deleted file mode 100644 (file)
index 08a73fd..0000000
+++ /dev/null
@@ -1,924 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <linux/virtio_net.h>
-
-#include <rte_mbuf.h>
-#include <rte_memcpy.h>
-#include <rte_ether.h>
-#include <rte_ip.h>
-#include <rte_virtio_net.h>
-#include <rte_tcp.h>
-#include <rte_udp.h>
-#include <rte_sctp.h>
-#include <rte_arp.h>
-
-#include "vhost-net.h"
-
-#define MAX_PKT_BURST 32
-#define VHOST_LOG_PAGE 4096
-
-static inline void __attribute__((always_inline))
-vhost_log_page(uint8_t *log_base, uint64_t page)
-{
-       log_base[page / 8] |= 1 << (page % 8);
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
-{
-       uint64_t page;
-
-       if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
-                  !dev->log_base || !len))
-               return;
-
-       if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
-               return;
-
-       /* To make sure guest memory updates are committed before logging */
-       rte_smp_wmb();
-
-       page = addr / VHOST_LOG_PAGE;
-       while (page * VHOST_LOG_PAGE < addr + len) {
-               vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
-               page += 1;
-       }
-}
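
Although vhost_rxtx.c is deleted here, the commit only renames it (its contents move to virtio_net.c), so the logging logic above lives on and is worth a concrete note: the dirty log is one bit per 4 KB page of guest memory, and a write marks every page it touches. The same arithmetic in isolation, minus the device/feature checks:

    #include <stdint.h>

    #define VHOST_LOG_PAGE 4096

    /* Mark every 4 KB page touched by [addr, addr + len) as dirty. */
    static void mark_dirty(uint8_t *log_base, uint64_t addr, uint64_t len)
    {
            uint64_t page = addr / VHOST_LOG_PAGE;

            if (len == 0)
                    return;

            while (page * VHOST_LOG_PAGE < addr + len) {
                    log_base[page / 8] |= 1 << (page % 8);
                    page++;
            }
    }
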
-
-static inline void __attribute__((always_inline))
-vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                    uint64_t offset, uint64_t len)
-{
-       vhost_log_write(dev, vq->log_guest_addr + offset, len);
-}
-
-static bool
-is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
-{
-       return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
-}
-
-static void
-virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
-{
-       if (m_buf->ol_flags & PKT_TX_L4_MASK) {
-               net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-               net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
-
-               switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
-               case PKT_TX_TCP_CKSUM:
-                       net_hdr->csum_offset = (offsetof(struct tcp_hdr,
-                                               cksum));
-                       break;
-               case PKT_TX_UDP_CKSUM:
-                       net_hdr->csum_offset = (offsetof(struct udp_hdr,
-                                               dgram_cksum));
-                       break;
-               case PKT_TX_SCTP_CKSUM:
-                       net_hdr->csum_offset = (offsetof(struct sctp_hdr,
-                                               cksum));
-                       break;
-               }
-       }
-
-       if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
-               if (m_buf->ol_flags & PKT_TX_IPV4)
-                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
-               else
-                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
-               net_hdr->gso_size = m_buf->tso_segsz;
-               net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
-                                       + m_buf->l4_len;
-       }
-}
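
The translation above is driven entirely by mbuf offload flags. A transmit path that wants the guest to see VIRTIO_NET_HDR_F_NEEDS_CSUM would prepare its mbuf roughly like this (a sketch; the flags and length fields are standard rte_mbuf ones, the wrapper function is illustrative):

    #include <rte_ether.h>
    #include <rte_ip.h>
    #include <rte_mbuf.h>

    /* Ask virtio_enqueue_offload() to request a TCP checksum from
     * the guest for an IPv4 packet without a VLAN tag. */
    static void request_guest_tcp_csum(struct rte_mbuf *m)
    {
            m->l2_len = sizeof(struct ether_hdr);
            m->l3_len = sizeof(struct ipv4_hdr);
            m->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;
            /* => net_hdr->flags gets VIRTIO_NET_HDR_F_NEEDS_CSUM,
             *    csum_start  = l2_len + l3_len,
             *    csum_offset = offsetof(struct tcp_hdr, cksum). */
    }
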
-
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-                   struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-       if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-               *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-       else
-               *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                 struct rte_mbuf *m, uint16_t desc_idx)
-{
-       uint32_t desc_avail, desc_offset;
-       uint32_t mbuf_avail, mbuf_offset;
-       uint32_t cpy_len;
-       struct vring_desc *desc;
-       uint64_t desc_addr;
-       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-       desc = &vq->desc[desc_idx];
-       desc_addr = gpa_to_vva(dev, desc->addr);
-       /*
-        * The check of 'desc_addr' is placed outside the 'unlikely' macro to
-        * avoid a performance issue with some versions of gcc (4.8.4 and
-        * 5.3.0), which otherwise store the offset on the stack instead of
-        * in a register.
-        */
-       if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-               return -1;
-
-       rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-       virtio_enqueue_offload(m, &virtio_hdr.hdr);
-       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-       vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-       desc_offset = dev->vhost_hlen;
-       desc_avail  = desc->len - dev->vhost_hlen;
-
-       mbuf_avail  = rte_pktmbuf_data_len(m);
-       mbuf_offset = 0;
-       while (mbuf_avail != 0 || m->next != NULL) {
-               /* done with current mbuf, fetch next */
-               if (mbuf_avail == 0) {
-                       m = m->next;
-
-                       mbuf_offset = 0;
-                       mbuf_avail  = rte_pktmbuf_data_len(m);
-               }
-
-               /* done with current desc buf, fetch next */
-               if (desc_avail == 0) {
-                       if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-                               /* Not enough room in the vring buffer */
-                               return -1;
-                       }
-                       if (unlikely(desc->next >= vq->size))
-                               return -1;
-
-                       desc = &vq->desc[desc->next];
-                       desc_addr = gpa_to_vva(dev, desc->addr);
-                       if (unlikely(!desc_addr))
-                               return -1;
-
-                       desc_offset = 0;
-                       desc_avail  = desc->len;
-               }
-
-               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-                       cpy_len);
-               vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-                            cpy_len, 0);
-
-               mbuf_avail  -= cpy_len;
-               mbuf_offset += cpy_len;
-               desc_avail  -= cpy_len;
-               desc_offset += cpy_len;
-       }
-
-       return 0;
-}
-
-/**
- * This function adds buffers to the virtio device's RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are successfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-             struct rte_mbuf **pkts, uint32_t count)
-{
-       struct vhost_virtqueue *vq;
-       uint16_t avail_idx, free_entries, start_idx;
-       uint16_t desc_indexes[MAX_PKT_BURST];
-       uint16_t used_idx;
-       uint32_t i;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-                       dev->vid, __func__, queue_id);
-               return 0;
-       }
-
-       vq = dev->virtqueue[queue_id];
-       if (unlikely(vq->enabled == 0))
-               return 0;
-
-       avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-       start_idx = vq->last_used_idx;
-       free_entries = avail_idx - start_idx;
-       count = RTE_MIN(count, free_entries);
-       count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-       if (count == 0)
-               return 0;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-               dev->vid, start_idx, start_idx + count);
-
-       /* Retrieve all of the desc indexes first to avoid caching issues. */
-       rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-       for (i = 0; i < count; i++) {
-               used_idx = (start_idx + i) & (vq->size - 1);
-               desc_indexes[i] = vq->avail->ring[used_idx];
-               vq->used->ring[used_idx].id = desc_indexes[i];
-               vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-                                              dev->vhost_hlen;
-               vhost_log_used_vring(dev, vq,
-                       offsetof(struct vring_used, ring[used_idx]),
-                       sizeof(vq->used->ring[used_idx]));
-       }
-
-       rte_prefetch0(&vq->desc[desc_indexes[0]]);
-       for (i = 0; i < count; i++) {
-               uint16_t desc_idx = desc_indexes[i];
-               int err;
-
-               err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-               if (unlikely(err)) {
-                       used_idx = (start_idx + i) & (vq->size - 1);
-                       vq->used->ring[used_idx].len = dev->vhost_hlen;
-                       vhost_log_used_vring(dev, vq,
-                               offsetof(struct vring_used, ring[used_idx]),
-                               sizeof(vq->used->ring[used_idx]));
-               }
-
-               if (i + 1 < count)
-                       rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-       }
-
-       rte_smp_wmb();
-
-       *(volatile uint16_t *)&vq->used->idx += count;
-       vq->last_used_idx += count;
-       vhost_log_used_vring(dev, vq,
-               offsetof(struct vring_used, idx),
-               sizeof(vq->used->idx));
-
-       /* flush used->idx update before we read avail->flags. */
-       rte_mb();
-
-       /* Kick the guest if necessary. */
-       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-                       && (vq->callfd >= 0))
-               eventfd_write(vq->callfd, (eventfd_t)1);
-       return count;
-}
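
A side note on the index arithmetic used throughout this file: vring sizes are powers of two, so the free-running 16-bit indexes are reduced to ring slots with a mask, and differences stay correct across uint16_t wrap-around. A tiny self-contained illustration:

    #include <stdint.h>

    static void ring_index_demo(void)
    {
            const uint16_t size = 256;          /* always a power of two */
            uint16_t last_used_idx = 65534;     /* about to wrap */
            uint16_t avail_idx = 1;             /* already wrapped */

            /* Unsigned subtraction survives the wrap: == 3. */
            uint16_t free_entries = avail_idx - last_used_idx;

            /* Free-running index reduced to a ring slot: == 254. */
            uint16_t slot = last_used_idx & (size - 1);

            (void)free_entries;
            (void)slot;
    }
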
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-            uint32_t *allocated, uint32_t *vec_idx,
-            struct buf_vector *buf_vec)
-{
-       uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-       uint32_t vec_id = *vec_idx;
-       uint32_t len    = *allocated;
-
-       while (1) {
-               if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-                       return -1;
-
-               len += vq->desc[idx].len;
-               buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-               buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-               buf_vec[vec_id].desc_idx = idx;
-               vec_id++;
-
-               if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-                       break;
-
-               idx = vq->desc[idx].next;
-       }
-
-       *allocated = len;
-       *vec_idx   = vec_id;
-
-       return 0;
-}
-
-/*
- * Returns -1 on failure, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-                           uint16_t *end, struct buf_vector *buf_vec)
-{
-       uint16_t cur_idx;
-       uint16_t avail_idx;
-       uint32_t allocated = 0;
-       uint32_t vec_idx = 0;
-       uint16_t tries = 0;
-
-       cur_idx  = vq->last_used_idx;
-
-       while (1) {
-               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-               if (unlikely(cur_idx == avail_idx))
-                       return -1;
-
-               if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-                                         &vec_idx, buf_vec) < 0))
-                       return -1;
-
-               cur_idx++;
-               tries++;
-
-               if (allocated >= size)
-                       break;
-
-               /*
-                * If we have tried all available ring items and still
-                * can't get enough buffers, something abnormal has
-                * happened.
-                */
-               if (unlikely(tries >= vq->size))
-                       return -1;
-       }
-
-       *end = cur_idx;
-       return 0;
-}
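
In other words, the loop keeps pulling whole descriptor chains off the avail ring until at least 'size' bytes have been reserved; an illustrative trace (numbers invented):

    /* size = 4000, guest posts single-descriptor 2048-byte chains,
     * cur_idx = 10 on entry:
     *
     *   fill_vec_buf(avail[10]) -> allocated = 2048   (not enough yet)
     *   fill_vec_buf(avail[11]) -> allocated = 4096   (>= 4000, done)
     *
     * => *end = 12, and buf_vec[0..1] describe both chains. */
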
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                           uint16_t end_idx, struct rte_mbuf *m,
-                           struct buf_vector *buf_vec)
-{
-       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-       uint32_t vec_idx = 0;
-       uint16_t start_idx = vq->last_used_idx;
-       uint16_t cur_idx = start_idx;
-       uint64_t desc_addr;
-       uint32_t mbuf_offset, mbuf_avail;
-       uint32_t desc_offset, desc_avail;
-       uint32_t cpy_len;
-       uint16_t desc_idx, used_idx;
-
-       if (unlikely(m == NULL))
-               return 0;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-               dev->vid, cur_idx, end_idx);
-
-       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-       if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-               return 0;
-
-       rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-       virtio_hdr.num_buffers = end_idx - start_idx;
-       LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-               dev->vid, virtio_hdr.num_buffers);
-
-       virtio_enqueue_offload(m, &virtio_hdr.hdr);
-       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-       vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-       desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-       desc_offset = dev->vhost_hlen;
-
-       mbuf_avail  = rte_pktmbuf_data_len(m);
-       mbuf_offset = 0;
-       while (mbuf_avail != 0 || m->next != NULL) {
-               /* done with current desc buf, get the next one */
-               if (desc_avail == 0) {
-                       desc_idx = buf_vec[vec_idx].desc_idx;
-
-                       if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-                               /* Update used ring with desc information */
-                               used_idx = cur_idx++ & (vq->size - 1);
-                               vq->used->ring[used_idx].id  = desc_idx;
-                               vq->used->ring[used_idx].len = desc_offset;
-                               vhost_log_used_vring(dev, vq,
-                                       offsetof(struct vring_used,
-                                                ring[used_idx]),
-                                       sizeof(vq->used->ring[used_idx]));
-                       }
-
-                       vec_idx++;
-                       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-                       if (unlikely(!desc_addr))
-                               return 0;
-
-                       /* Prefetch buffer address. */
-                       rte_prefetch0((void *)(uintptr_t)desc_addr);
-                       desc_offset = 0;
-                       desc_avail  = buf_vec[vec_idx].buf_len;
-               }
-
-               /* done with current mbuf, get the next one */
-               if (mbuf_avail == 0) {
-                       m = m->next;
-
-                       mbuf_offset = 0;
-                       mbuf_avail  = rte_pktmbuf_data_len(m);
-               }
-
-               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-                       cpy_len);
-               vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-                       cpy_len);
-               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-                       cpy_len, 0);
-
-               mbuf_avail  -= cpy_len;
-               mbuf_offset += cpy_len;
-               desc_avail  -= cpy_len;
-               desc_offset += cpy_len;
-       }
-
-       used_idx = cur_idx & (vq->size - 1);
-       vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-       vq->used->ring[used_idx].len = desc_offset;
-       vhost_log_used_vring(dev, vq,
-               offsetof(struct vring_used, ring[used_idx]),
-               sizeof(vq->used->ring[used_idx]));
-
-       return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-       struct rte_mbuf **pkts, uint32_t count)
-{
-       struct vhost_virtqueue *vq;
-       uint32_t pkt_idx = 0, nr_used = 0;
-       uint16_t end;
-       struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-                       dev->vid, __func__, queue_id);
-               return 0;
-       }
-
-       vq = dev->virtqueue[queue_id];
-       if (unlikely(vq->enabled == 0))
-               return 0;
-
-       count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-       if (count == 0)
-               return 0;
-
-       for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-               uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
-               if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-                                                        &end, buf_vec) < 0)) {
-                       LOG_DEBUG(VHOST_DATA,
-                               "(%d) failed to get enough desc from vring\n",
-                               dev->vid);
-                       break;
-               }
-
-               nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-                                                     pkts[pkt_idx], buf_vec);
-               rte_smp_wmb();
-
-               *(volatile uint16_t *)&vq->used->idx += nr_used;
-               vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-                       sizeof(vq->used->idx));
-               vq->last_used_idx += nr_used;
-       }
-
-       if (likely(pkt_idx)) {
-               /* flush used->idx update before we read avail->flags. */
-               rte_mb();
-
-               /* Kick the guest if necessary. */
-               if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-                               && (vq->callfd >= 0))
-                       eventfd_write(vq->callfd, (eventfd_t)1);
-       }
-
-       return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-       struct rte_mbuf **pkts, uint16_t count)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (!dev)
-               return 0;
-
-       if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-               return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-       else
-               return virtio_dev_rx(dev, queue_id, pkts, count);
-}
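
From the application side, this entry point pairs naturally with the ethdev burst API. A hedged sketch of forwarding NIC traffic into a guest (port and queue choices are illustrative; VIRTIO_RXQ is the public index of the first pair's RX queue):

    #include <rte_ethdev.h>
    #include <rte_mbuf.h>
    #include <rte_virtio_net.h>

    #define BURST 32

    static void nic_to_vhost(uint8_t port, int vid)
    {
            struct rte_mbuf *pkts[BURST];
            uint16_t nb_rx, nb_in, i;

            nb_rx = rte_eth_rx_burst(port, 0, pkts, BURST);
            nb_in = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts, nb_rx);

            /* Free whatever the guest had no descriptors for. */
            for (i = nb_in; i < nb_rx; i++)
                    rte_pktmbuf_free(pkts[i]);
    }
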
-
-static void
-parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
-{
-       struct ipv4_hdr *ipv4_hdr;
-       struct ipv6_hdr *ipv6_hdr;
-       void *l3_hdr = NULL;
-       struct ether_hdr *eth_hdr;
-       uint16_t ethertype;
-
-       eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
-
-       m->l2_len = sizeof(struct ether_hdr);
-       ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
-
-       if (ethertype == ETHER_TYPE_VLAN) {
-               struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
-
-               m->l2_len += sizeof(struct vlan_hdr);
-               ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
-       }
-
-       l3_hdr = (char *)eth_hdr + m->l2_len;
-
-       switch (ethertype) {
-       case ETHER_TYPE_IPv4:
-               ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
-               *l4_proto = ipv4_hdr->next_proto_id;
-               m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
-               *l4_hdr = (char *)l3_hdr + m->l3_len;
-               m->ol_flags |= PKT_TX_IPV4;
-               break;
-       case ETHER_TYPE_IPv6:
-               ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
-               *l4_proto = ipv6_hdr->proto;
-               m->l3_len = sizeof(struct ipv6_hdr);
-               *l4_hdr = (char *)l3_hdr + m->l3_len;
-               m->ol_flags |= PKT_TX_IPV6;
-               break;
-       default:
-               m->l3_len = 0;
-               *l4_proto = 0;
-               break;
-       }
-}
-
-static inline void __attribute__((always_inline))
-vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
-{
-       uint16_t l4_proto = 0;
-       void *l4_hdr = NULL;
-       struct tcp_hdr *tcp_hdr = NULL;
-
-       parse_ethernet(m, &l4_proto, &l4_hdr);
-       if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-               if (hdr->csum_start == (m->l2_len + m->l3_len)) {
-                       switch (hdr->csum_offset) {
-                       case (offsetof(struct tcp_hdr, cksum)):
-                               if (l4_proto == IPPROTO_TCP)
-                                       m->ol_flags |= PKT_TX_TCP_CKSUM;
-                               break;
-                       case (offsetof(struct udp_hdr, dgram_cksum)):
-                               if (l4_proto == IPPROTO_UDP)
-                                       m->ol_flags |= PKT_TX_UDP_CKSUM;
-                               break;
-                       case (offsetof(struct sctp_hdr, cksum)):
-                               if (l4_proto == IPPROTO_SCTP)
-                                       m->ol_flags |= PKT_TX_SCTP_CKSUM;
-                               break;
-                       default:
-                               break;
-                       }
-               }
-       }
-
-       if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
-               switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
-               case VIRTIO_NET_HDR_GSO_TCPV4:
-               case VIRTIO_NET_HDR_GSO_TCPV6:
-                       tcp_hdr = (struct tcp_hdr *)l4_hdr;
-                       m->ol_flags |= PKT_TX_TCP_SEG;
-                       m->tso_segsz = hdr->gso_size;
-                       m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
-                       break;
-               default:
-                       RTE_LOG(WARNING, VHOST_DATA,
-                               "unsupported gso type %u.\n", hdr->gso_type);
-                       break;
-               }
-       }
-}
-
-#define RARP_PKT_SIZE  64
-
-static int
-make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
-{
-       struct ether_hdr *eth_hdr;
-       struct arp_hdr  *rarp;
-
-       if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
-               RTE_LOG(WARNING, VHOST_DATA,
-                       "failed to make RARP; mbuf size too small %u (< %d)\n",
-                       rarp_mbuf->buf_len, RARP_PKT_SIZE);
-               return -1;
-       }
-
-       /* Ethernet header. */
-       eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
-       memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
-       ether_addr_copy(mac, &eth_hdr->s_addr);
-       eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
-
-       /* RARP header. */
-       rarp = (struct arp_hdr *)(eth_hdr + 1);
-       rarp->arp_hrd = htons(ARP_HRD_ETHER);
-       rarp->arp_pro = htons(ETHER_TYPE_IPv4);
-       rarp->arp_hln = ETHER_ADDR_LEN;
-       rarp->arp_pln = 4;
-       rarp->arp_op  = htons(ARP_OP_REVREQUEST);
-
-       ether_addr_copy(mac, &rarp->arp_data.arp_sha);
-       ether_addr_copy(mac, &rarp->arp_data.arp_tha);
-       memset(&rarp->arp_data.arp_sip, 0x00, 4);
-       memset(&rarp->arp_data.arp_tip, 0x00, 4);
-
-       rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
-
-       return 0;
-}
-
-static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                 struct rte_mbuf *m, uint16_t desc_idx,
-                 struct rte_mempool *mbuf_pool)
-{
-       struct vring_desc *desc;
-       uint64_t desc_addr;
-       uint32_t desc_avail, desc_offset;
-       uint32_t mbuf_avail, mbuf_offset;
-       uint32_t cpy_len;
-       struct rte_mbuf *cur = m, *prev = m;
-       struct virtio_net_hdr *hdr;
-       /* A counter to guard against a looping descriptor chain */
-       uint32_t nr_desc = 1;
-
-       desc = &vq->desc[desc_idx];
-       if (unlikely(desc->len < dev->vhost_hlen))
-               return -1;
-
-       desc_addr = gpa_to_vva(dev, desc->addr);
-       if (unlikely(!desc_addr))
-               return -1;
-
-       hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
-       rte_prefetch0(hdr);
-
-       /*
-        * A virtio driver normally uses at least 2 desc buffers
-        * for Tx: the first for storing the header, and others
-        * for storing the data.
-        */
-       if (likely((desc->len == dev->vhost_hlen) &&
-                  (desc->flags & VRING_DESC_F_NEXT) != 0)) {
-               desc = &vq->desc[desc->next];
-
-               desc_addr = gpa_to_vva(dev, desc->addr);
-               if (unlikely(!desc_addr))
-                       return -1;
-
-               rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-               desc_offset = 0;
-               desc_avail  = desc->len;
-               nr_desc    += 1;
-
-               PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
-       } else {
-               desc_avail  = desc->len - dev->vhost_hlen;
-               desc_offset = dev->vhost_hlen;
-       }
-
-       mbuf_offset = 0;
-       mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
-       while (1) {
-               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-               rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-                       (void *)((uintptr_t)(desc_addr + desc_offset)),
-                       cpy_len);
-
-               mbuf_avail  -= cpy_len;
-               mbuf_offset += cpy_len;
-               desc_avail  -= cpy_len;
-               desc_offset += cpy_len;
-
-               /* This desc buffer is exhausted, get the next one */
-               if (desc_avail == 0) {
-                       if ((desc->flags & VRING_DESC_F_NEXT) == 0)
-                               break;
-
-                       if (unlikely(desc->next >= vq->size ||
-                                    ++nr_desc > vq->size))
-                               return -1;
-                       desc = &vq->desc[desc->next];
-
-                       desc_addr = gpa_to_vva(dev, desc->addr);
-                       if (unlikely(!desc_addr))
-                               return -1;
-
-                       rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-                       desc_offset = 0;
-                       desc_avail  = desc->len;
-
-                       PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
-               }
-
-               /*
-                * This mbuf is full, allocate a new one
-                * to hold more data.
-                */
-               if (mbuf_avail == 0) {
-                       cur = rte_pktmbuf_alloc(mbuf_pool);
-                       if (unlikely(cur == NULL)) {
-                               RTE_LOG(ERR, VHOST_DATA, "Failed to "
-                                       "allocate memory for mbuf.\n");
-                               return -1;
-                       }
-
-                       prev->next = cur;
-                       prev->data_len = mbuf_offset;
-                       m->nb_segs += 1;
-                       m->pkt_len += mbuf_offset;
-                       prev = cur;
-
-                       mbuf_offset = 0;
-                       mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
-               }
-       }
-
-       prev->data_len = mbuf_offset;
-       m->pkt_len    += mbuf_offset;
-
-       if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
-               vhost_dequeue_offload(hdr, m);
-
-       return 0;
-}
-
-uint16_t
-rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-       struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
-{
-       struct virtio_net *dev;
-       struct rte_mbuf *rarp_mbuf = NULL;
-       struct vhost_virtqueue *vq;
-       uint32_t desc_indexes[MAX_PKT_BURST];
-       uint32_t used_idx;
-       uint32_t i = 0;
-       uint16_t free_entries;
-       uint16_t avail_idx;
-
-       dev = get_device(vid);
-       if (!dev)
-               return 0;
-
-       if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
-               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-                       dev->vid, __func__, queue_id);
-               return 0;
-       }
-
-       vq = dev->virtqueue[queue_id];
-       if (unlikely(vq->enabled == 0))
-               return 0;
-
-       /*
-        * Construct a RARP broadcast packet, and inject it into the "pkts"
-        * array, to make it look like the guest actually sent such a packet.
-        *
-        * Check user_send_rarp() for more information.
-        */
-       if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
-                                        &dev->broadcast_rarp.cnt, 1, 0))) {
-               rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
-               if (rarp_mbuf == NULL) {
-                       RTE_LOG(ERR, VHOST_DATA,
-                               "Failed to allocate memory for mbuf.\n");
-                       return 0;
-               }
-
-               if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
-                       rte_pktmbuf_free(rarp_mbuf);
-                       rarp_mbuf = NULL;
-               } else {
-                       count -= 1;
-               }
-       }
-
-       avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
-       free_entries = avail_idx - vq->last_used_idx;
-       if (free_entries == 0)
-               goto out;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-
-       /* Prefetch available ring to retrieve head indexes. */
-       used_idx = vq->last_used_idx & (vq->size - 1);
-       rte_prefetch0(&vq->avail->ring[used_idx]);
-       rte_prefetch0(&vq->used->ring[used_idx]);
-
-       count = RTE_MIN(count, MAX_PKT_BURST);
-       count = RTE_MIN(count, free_entries);
-       LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
-                       dev->vid, count);
-
-       /* Retrieve all of the head indexes first to avoid caching issues. */
-       for (i = 0; i < count; i++) {
-               used_idx = (vq->last_used_idx + i) & (vq->size - 1);
-               desc_indexes[i] = vq->avail->ring[used_idx];
-
-               vq->used->ring[used_idx].id  = desc_indexes[i];
-               vq->used->ring[used_idx].len = 0;
-               vhost_log_used_vring(dev, vq,
-                               offsetof(struct vring_used, ring[used_idx]),
-                               sizeof(vq->used->ring[used_idx]));
-       }
-
-       /* Prefetch descriptor index. */
-       rte_prefetch0(&vq->desc[desc_indexes[0]]);
-       for (i = 0; i < count; i++) {
-               int err;
-
-               if (likely(i + 1 < count))
-                       rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
-
-               pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-               if (unlikely(pkts[i] == NULL)) {
-                       RTE_LOG(ERR, VHOST_DATA,
-                               "Failed to allocate memory for mbuf.\n");
-                       break;
-               }
-               err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
-                                       mbuf_pool);
-               if (unlikely(err)) {
-                       rte_pktmbuf_free(pkts[i]);
-                       break;
-               }
-       }
-
-       rte_smp_wmb();
-       rte_smp_rmb();
-       vq->used->idx += i;
-       vq->last_used_idx += i;
-       vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-                       sizeof(vq->used->idx));
-
-       /* Kick guest if required. */
-       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-                       && (vq->callfd >= 0))
-               eventfd_write(vq->callfd, (eventfd_t)1);
-
-out:
-       if (unlikely(rarp_mbuf != NULL)) {
-               /*
-                * Inject it to the head of "pkts" array, so that switch's mac
-                * learning table will get updated first.
-                */
-               memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
-               pkts[0] = rarp_mbuf;
-               i += 1;
-       }
-
-       return i;
-}
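
For readers following the data path, here is a minimal usage sketch of the
API above (illustrative only, not part of this patch; "vid" and "pool" stand
in for handles the application obtains from its own vhost setup code):

    struct rte_mbuf *pkts[32];  /* the library caps a burst at MAX_PKT_BURST (32) */
    uint16_t i, nr;

    /* drain packets the guest has transmitted on queue VIRTIO_TXQ */
    nr = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, pool, pkts, 32);
    for (i = 0; i < nr; i++) {
            /* forward pkts[i] to a NIC or vswitch, then release it */
            rte_pktmbuf_free(pkts[i]);
    }

Note that when a RARP injection is pending (see user_send_rarp() in the new
vhost_user.c below), the injected packet is placed at pkts[0], so that a
switch updates its MAC learning table before seeing regular traffic.
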
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
new file mode 100644 (file)
index 0000000..c4714b7
--- /dev/null
@@ -0,0 +1,1040 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "vhost.h"
+#include "vhost_user.h"
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+       [VHOST_USER_NONE] = "VHOST_USER_NONE",
+       [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+       [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+       [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+       [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+       [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+       [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+       [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+       [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+       [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+       [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+       [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+       [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+       [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+       [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
+       [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
+       [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
+       [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
+       [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
+       [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
+};
+
+struct orig_region_map {
+       int fd;
+       uint64_t mapped_address;
+       uint64_t mapped_size;
+       uint64_t blksz;
+};
+
+#define orig_region(ptr, nregions) \
+       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
+               sizeof(struct virtio_memory) + \
+               sizeof(struct virtio_memory_regions) * (nregions)))
+
+static uint64_t
+get_blk_size(int fd)
+{
+       struct stat stat;
+       int ret;
+
+       ret = fstat(fd, &stat);
+       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+       struct orig_region_map *region;
+       unsigned int idx;
+
+       if (!dev || !dev->mem)
+               return;
+
+       region = orig_region(dev->mem, dev->mem->nregions);
+       for (idx = 0; idx < dev->mem->nregions; idx++) {
+               if (region[idx].mapped_address) {
+                       munmap((void *)(uintptr_t)region[idx].mapped_address,
+                                       region[idx].mapped_size);
+                       close(region[idx].fd);
+               }
+       }
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+       if (dev->mem) {
+               free_mem_region(dev);
+               free(dev->mem);
+               dev->mem = NULL;
+       }
+       if (dev->log_addr) {
+               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+               dev->log_addr = 0;
+       }
+}
+
+/*
+ * This function just returns success at the moment; it only fails
+ * when the device has not been initialised.
+ */
+static int
+vhost_set_owner(int vid)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       return 0;
+}
+
+static int
+vhost_reset_owner(int vid)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       cleanup_device(dev, 0);
+       reset_device(dev);
+       return 0;
+}
+
+/*
+ * The virtio device requests the set of features we support.
+ */
+static int
+vhost_get_features(int vid, uint64_t *pu)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /* Send our supported features. */
+       *pu = VHOST_FEATURES;
+       return 0;
+}
+
+/*
+ * We receive the features negotiated between us and the virtio device.
+ */
+static int
+vhost_set_features(int vid, uint64_t *pu)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+       if (*pu & ~VHOST_FEATURES)
+               return -1;
+
+       dev->features = *pu;
+       if (dev->features &
+               ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
+               dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+       } else {
+               dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+       }
+       LOG_DEBUG(VHOST_CONFIG,
+               "(%d) mergeable RX buffers %s, virtio 1 %s\n",
+               dev->vid,
+               (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
+               (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
+
+       return 0;
+}
+
+/*
+ * The virtio device sends us the size of the descriptor ring.
+ */
+static int
+vhost_set_vring_num(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
+       dev->virtqueue[state->index]->size = state->num;
+
+       return 0;
+}
+
+/*
+ * Reallocate the virtio_net and vhost_virtqueue data structures so that
+ * they reside on the same NUMA node as the vring descriptor memory.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+       int oldnode, newnode;
+       struct virtio_net *old_dev;
+       struct vhost_virtqueue *old_vq, *vq;
+       int ret;
+
+       /*
+        * Virtqueues are allocated in pairs, so do the realloc
+        * on the first queue of each pair only.
+        */
+       if (index % VIRTIO_QNUM != 0)
+               return dev;
+
+       old_dev = dev;
+       vq = old_vq = dev->virtqueue[index];
+
+       ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+                           MPOL_F_NODE | MPOL_F_ADDR);
+
+       /* check if we need to reallocate vq */
+       ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+                            MPOL_F_NODE | MPOL_F_ADDR);
+       if (ret) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Unable to get vq numa information.\n");
+               return dev;
+       }
+       if (oldnode != newnode) {
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "reallocate vq from %d to %d node\n", oldnode, newnode);
+               vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
+                                      newnode);
+               if (!vq)
+                       return dev;
+
+               memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
+               rte_free(old_vq);
+       }
+
+       /* check if we need to reallocate dev */
+       ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+                           MPOL_F_NODE | MPOL_F_ADDR);
+       if (ret) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Unable to get dev numa information.\n");
+               goto out;
+       }
+       if (oldnode != newnode) {
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "reallocate dev from %d to %d node\n",
+                       oldnode, newnode);
+               dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+               if (!dev) {
+                       dev = old_dev;
+                       goto out;
+               }
+
+               memcpy(dev, old_dev, sizeof(*dev));
+               rte_free(old_dev);
+       }
+
+out:
+       dev->virtqueue[index] = vq;
+       dev->virtqueue[index + 1] = vq + 1;
+       vhost_devices[dev->vid] = dev;
+
+       return dev;
+}
+#else
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+       return dev;
+}
+#endif
+
+/*
+ * Convert a QEMU virtual address to a vhost virtual address. This is
+ * used to map the ring addresses into our address space.
+ */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
+{
+       struct virtio_memory_regions *region;
+       uint64_t vhost_va = 0;
+       uint32_t regionidx = 0;
+
+       /* Find the region where the address lives. */
+       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+               region = &dev->mem->regions[regionidx];
+               if ((qemu_va >= region->userspace_address) &&
+                       (qemu_va <= region->userspace_address +
+                       region->memory_size)) {
+                       vhost_va = qemu_va + region->guest_phys_address +
+                               region->address_offset -
+                               region->userspace_address;
+                       break;
+               }
+       }
+       return vhost_va;
+}
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ */
+static int
+vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
+{
+       struct virtio_net *dev;
+       struct vhost_virtqueue *vq;
+
+       dev = get_device(vid);
+       if ((dev == NULL) || (dev->mem == NULL))
+               return -1;
+
+       /* addr->index refers to the queue index. The txq is 1, rxq is 0. */
+       vq = dev->virtqueue[addr->index];
+
+       /* The addresses are converted from QEMU virtual to Vhost virtual. */
+       vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
+                       addr->desc_user_addr);
+       if (vq->desc == 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to find desc ring address.\n",
+                       dev->vid);
+               return -1;
+       }
+
+       dev = numa_realloc(dev, addr->index);
+       vq = dev->virtqueue[addr->index];
+
+       vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
+                       addr->avail_user_addr);
+       if (vq->avail == 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to find avail ring address.\n",
+                       dev->vid);
+               return -1;
+       }
+
+       vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
+                       addr->used_user_addr);
+       if (vq->used == 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to find used ring address.\n",
+                       dev->vid);
+               return -1;
+       }
+
+       if (vq->last_used_idx != vq->used->idx) {
+               RTE_LOG(WARNING, VHOST_CONFIG,
+                       "last_used_idx (%u) and vq->used->idx (%u) mismatch; "
+                       "some packets may be resent for Tx and dropped for Rx\n",
+                       vq->last_used_idx, vq->used->idx);
+               vq->last_used_idx     = vq->used->idx;
+       }
+
+       vq->log_guest_addr = addr->log_guest_addr;
+
+       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+                       dev->vid, vq->desc);
+       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+                       dev->vid, vq->avail);
+       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+                       dev->vid, vq->used);
+       LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+                       dev->vid, vq->log_guest_addr);
+
+       return 0;
+}
+
+/*
+ * The virtio device sends us the available ring last used index.
+ */
+static int
+vhost_set_vring_base(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
+       dev->virtqueue[state->index]->last_used_idx = state->num;
+
+       return 0;
+}
+
+/*
+ * We send the virtio device our available ring last used index.
+ */
+static int
+vhost_get_vring_base(int vid, uint32_t index,
+       struct vhost_vring_state *state)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       state->index = index;
+       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
+       state->num = dev->virtqueue[state->index]->last_used_idx;
+
+       return 0;
+}
+
+/*
+ * The virtio device sends an eventfd to interrupt the guest. This fd gets
+ * copied into our process space.
+ */
+static int
+vhost_set_vring_call(int vid, struct vhost_vring_file *file)
+{
+       struct virtio_net *dev;
+       struct vhost_virtqueue *vq;
+       uint32_t cur_qp_idx = file->index / VIRTIO_QNUM;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /*
+        * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
+        * we get, so we do vring queue pair allocation here.
+        */
+       if (cur_qp_idx + 1 > dev->virt_qp_nb) {
+               if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
+                       return -1;
+       }
+
+       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
+       vq = dev->virtqueue[file->index];
+       assert(vq != NULL);
+
+       if (vq->callfd >= 0)
+               close(vq->callfd);
+
+       vq->callfd = file->fd;
+
+       return 0;
+}
+
+/*
+ * The virtio device sends an eventfd that it can use to notify us.
+ * This fd gets copied into our process space.
+ */
+static int
+vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
+{
+       struct virtio_net *dev;
+       struct vhost_virtqueue *vq;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
+       vq = dev->virtqueue[file->index];
+
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+
+       vq->kickfd = file->fd;
+
+       return 0;
+}
+
+static int
+user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
+{
+       struct VhostUserMemory memory = pmsg->payload.memory;
+       struct virtio_memory_regions *pregion;
+       uint64_t mapped_address, mapped_size;
+       struct virtio_net *dev;
+       unsigned int idx = 0;
+       struct orig_region_map *pregion_orig;
+       uint64_t alignment;
+
+       /* unmap old memory regions one by one */
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /* Remove from the data plane. */
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       if (dev->mem) {
+               free_mem_region(dev);
+               free(dev->mem);
+               dev->mem = NULL;
+       }
+
+       dev->mem = calloc(1,
+               sizeof(struct virtio_memory) +
+               sizeof(struct virtio_memory_regions) * memory.nregions +
+               sizeof(struct orig_region_map) * memory.nregions);
+       if (dev->mem == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to allocate memory for dev->mem\n",
+                       dev->vid);
+               return -1;
+       }
+       dev->mem->nregions = memory.nregions;
+
+       pregion_orig = orig_region(dev->mem, memory.nregions);
+       for (idx = 0; idx < memory.nregions; idx++) {
+               pregion = &dev->mem->regions[idx];
+               pregion->guest_phys_address =
+                       memory.regions[idx].guest_phys_addr;
+               pregion->guest_phys_address_end =
+                       memory.regions[idx].guest_phys_addr +
+                       memory.regions[idx].memory_size;
+               pregion->memory_size =
+                       memory.regions[idx].memory_size;
+               pregion->userspace_address =
+                       memory.regions[idx].userspace_addr;
+
+               /* This is ugly */
+               mapped_size = memory.regions[idx].memory_size +
+                       memory.regions[idx].mmap_offset;
+
+               /* mmap() without the MAP_ANONYMOUS flag must be called
+                * with a length argument aligned to the hugepage size on
+                * older longterm Linux kernels, such as 2.6.32 and 3.2.72,
+                * or mmap() will fail with EINVAL.
+                *
+                * To avoid that failure, make sure the length we pass
+                * here stays aligned.
+                */
+               alignment = get_blk_size(pmsg->fds[idx]);
+               if (alignment == (uint64_t)-1) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "couldn't get hugepage size through fstat\n");
+                       goto err_mmap;
+               }
+               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
+
+               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
+                       mapped_size,
+                       PROT_READ | PROT_WRITE, MAP_SHARED,
+                       pmsg->fds[idx],
+                       0);
+
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
+                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
+                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
+                       mapped_size, memory.regions[idx].mmap_offset,
+                       alignment);
+
+               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "mmap of qemu guest memory failed.\n");
+                       goto err_mmap;
+               }
+
+               pregion_orig[idx].mapped_address = mapped_address;
+               pregion_orig[idx].mapped_size = mapped_size;
+               pregion_orig[idx].blksz = alignment;
+               pregion_orig[idx].fd = pmsg->fds[idx];
+
+               mapped_address +=  memory.regions[idx].mmap_offset;
+
+               pregion->address_offset = mapped_address -
+                       pregion->guest_phys_address;
+
+               if (memory.regions[idx].guest_phys_addr == 0) {
+                       dev->mem->base_address =
+                               memory.regions[idx].userspace_addr;
+                       dev->mem->mapped_address =
+                               pregion->address_offset;
+               }
+
+               LOG_DEBUG(VHOST_CONFIG,
+                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
+                       idx,
+                       (void *)(uintptr_t)pregion->guest_phys_address,
+                       (void *)(uintptr_t)pregion->userspace_address,
+                        pregion->memory_size);
+       }
+
+       return 0;
+
+err_mmap:
+       while (idx--) {
+               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
+                               pregion_orig[idx].mapped_size);
+               close(pregion_orig[idx].fd);
+       }
+       free(dev->mem);
+       dev->mem = NULL;
+       return -1;
+}
+
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+       return vq && vq->desc   &&
+              vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+              vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
+}
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+       struct vhost_virtqueue *rvq, *tvq;
+       uint32_t i;
+
+       for (i = 0; i < dev->virt_qp_nb; i++) {
+               rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
+               tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
+
+               if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "virtio is not ready for processing.\n");
+                       return 0;
+               }
+       }
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "virtio is now ready for processing.\n");
+       return 1;
+}
+
+static void
+user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
+{
+       struct vhost_vring_file file;
+
+       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+               file.fd = VIRTIO_INVALID_EVENTFD;
+       else
+               file.fd = pmsg->fds[0];
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring call idx:%d file:%d\n", file.index, file.fd);
+       vhost_set_vring_call(vid, &file);
+}
+
+/*
+ *  In vhost-user, when we receive a kick message, we test whether the
+ *  virtio device is ready for packet processing.
+ */
+static void
+user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
+{
+       struct vhost_vring_file file;
+       struct virtio_net *dev = get_device(vid);
+
+       if (!dev)
+               return;
+
+       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+               file.fd = VIRTIO_INVALID_EVENTFD;
+       else
+               file.fd = pmsg->fds[0];
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring kick idx:%d file:%d\n", file.index, file.fd);
+       vhost_set_vring_kick(vid, &file);
+
+       if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
+               if (notify_ops->new_device(vid) == 0)
+                       dev->flags |= VIRTIO_DEV_RUNNING;
+       }
+}
+
+/*
+ * When virtio is stopped, QEMU sends us the GET_VRING_BASE message.
+ */
+static int
+user_get_vring_base(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return -1;
+       /* We have to stop the queue (virtio) if it is running. */
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       /* Here we are safe to get the last used index */
+       vhost_get_vring_base(vid, state->index, state);
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring base idx:%d file:%d\n", state->index, state->num);
+       /*
+        * Based on the current QEMU vhost-user implementation, this message
+        * is sent in, and only in, vhost_vring_stop.
+        * TODO: clean up the vring; it isn't usable from this point on.
+        */
+       if (dev->virtqueue[state->index]->kickfd >= 0)
+               close(dev->virtqueue[state->index]->kickfd);
+
+       dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+       return 0;
+}
+
+/*
+ * When the virtio queues are ready to work, QEMU sends us a message
+ * to enable the virtio queue pair.
+ */
+static int
+user_set_vring_enable(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev;
+       int enable = (int)state->num;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "set queue enable: %d to qp idx: %d\n",
+               enable, state->index);
+
+       if (notify_ops->vring_state_changed)
+               notify_ops->vring_state_changed(vid, state->index, enable);
+
+       dev->virtqueue[state->index]->enabled = enable;
+
+       return 0;
+}
+
+static void
+user_set_protocol_features(int vid, uint64_t protocol_features)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+               return;
+
+       dev->protocol_features = protocol_features;
+}
+
+static int
+user_set_log_base(int vid, struct VhostUserMsg *msg)
+{
+       struct virtio_net *dev;
+       int fd = msg->fds[0];
+       uint64_t size, off;
+       void *addr;
+
+       dev = get_device(vid);
+       if (!dev)
+               return -1;
+
+       if (fd < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+               return -1;
+       }
+
+       if (msg->size != sizeof(VhostUserLog)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "invalid log base msg size: %"PRId32" != %d\n",
+                       msg->size, (int)sizeof(VhostUserLog));
+               return -1;
+       }
+
+       size = msg->payload.log.mmap_size;
+       off  = msg->payload.log.mmap_offset;
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "log mmap size: %"PRId64", offset: %"PRId64"\n",
+               size, off);
+
+       /*
+        * mmap from offset 0 to work around a hugepage mmap bug: mmap
+        * fails when the offset is not page-size aligned.
+        */
+       addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       close(fd);
+       if (addr == MAP_FAILED) {
+               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+               return -1;
+       }
+
+       /*
+        * Free any previously mapped log memory, in case
+        * VHOST_USER_SET_LOG_BASE is received more than once.
+        */
+       if (dev->log_addr) {
+               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+       }
+       dev->log_addr = (uint64_t)(uintptr_t)addr;
+       dev->log_base = dev->log_addr + off;
+       dev->log_size = size;
+
+       return 0;
+}
+
+/*
+ * A RARP packet is constructed and broadcast to notify switches about
+ * the new location of the migrated VM, so that packets from outside
+ * are not lost after migration.
+ *
+ * However, we don't actually "send" the RARP packet here; instead, we
+ * set the 'broadcast_rarp' flag and let rte_vhost_dequeue_burst()
+ * inject it.
+ */
+static int
+user_send_rarp(int vid, struct VhostUserMsg *msg)
+{
+       struct virtio_net *dev;
+       uint8_t *mac = (uint8_t *)&msg->payload.u64;
+
+       dev = get_device(vid);
+       if (!dev)
+               return -1;
+
+       RTE_LOG(DEBUG, VHOST_CONFIG,
+               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+       memcpy(dev->mac.addr_bytes, mac, 6);
+
+       /*
+        * Set the flag to inject a RARP broadcast packet at
+        * rte_vhost_dequeue_burst().
+        *
+        * rte_smp_wmb() makes sure the MAC address is copied
+        * before the flag is set.
+        */
+       rte_smp_wmb();
+       rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+       return 0;
+}
+
+/* Return the number of bytes read on success, or a negative value on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+       int ret;
+
+       ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+               msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+       if (ret <= 0)
+               return ret;
+
+       if (msg && msg->size) {
+               if (msg->size > sizeof(msg->payload)) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "invalid msg size: %d\n", msg->size);
+                       return -1;
+               }
+               ret = read(sockfd, &msg->payload, msg->size);
+               if (ret <= 0)
+                       return ret;
+               if (ret != (int)msg->size) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "read control message failed\n");
+                       return -1;
+               }
+       }
+
+       return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+       int ret;
+
+       if (!msg)
+               return 0;
+
+       msg->flags &= ~VHOST_USER_VERSION_MASK;
+       msg->flags |= VHOST_USER_VERSION;
+       msg->flags |= VHOST_USER_REPLY_MASK;
+
+       ret = send_fd_message(sockfd, (char *)msg,
+               VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+       return ret;
+}
+
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+       struct VhostUserMsg msg;
+       uint64_t features = 0;
+       int ret;
+
+       ret = read_vhost_message(fd, &msg);
+       if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+               if (ret < 0)
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost read message failed\n");
+               else if (ret == 0)
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "vhost peer closed\n");
+               else
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost read incorrect message\n");
+
+               return -1;
+       }
+
+       RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+               vhost_message_str[msg.request]);
+       switch (msg.request) {
+       case VHOST_USER_GET_FEATURES:
+               ret = vhost_get_features(vid, &features);
+               msg.payload.u64 = features;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+       case VHOST_USER_SET_FEATURES:
+               features = msg.payload.u64;
+               vhost_set_features(vid, &features);
+               break;
+
+       case VHOST_USER_GET_PROTOCOL_FEATURES:
+               msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+       case VHOST_USER_SET_PROTOCOL_FEATURES:
+               user_set_protocol_features(vid, msg.payload.u64);
+               break;
+
+       case VHOST_USER_SET_OWNER:
+               vhost_set_owner(vid);
+               break;
+       case VHOST_USER_RESET_OWNER:
+               vhost_reset_owner(vid);
+               break;
+
+       case VHOST_USER_SET_MEM_TABLE:
+               user_set_mem_table(vid, &msg);
+               break;
+
+       case VHOST_USER_SET_LOG_BASE:
+               user_set_log_base(vid, &msg);
+
+               /* it needs a reply */
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+       case VHOST_USER_SET_LOG_FD:
+               close(msg.fds[0]);
+               RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+               break;
+
+       case VHOST_USER_SET_VRING_NUM:
+               vhost_set_vring_num(vid, &msg.payload.state);
+               break;
+       case VHOST_USER_SET_VRING_ADDR:
+               vhost_set_vring_addr(vid, &msg.payload.addr);
+               break;
+       case VHOST_USER_SET_VRING_BASE:
+               vhost_set_vring_base(vid, &msg.payload.state);
+               break;
+
+       case VHOST_USER_GET_VRING_BASE:
+               ret = user_get_vring_base(vid, &msg.payload.state);
+               msg.size = sizeof(msg.payload.state);
+               send_vhost_message(fd, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_KICK:
+               user_set_vring_kick(vid, &msg);
+               break;
+       case VHOST_USER_SET_VRING_CALL:
+               user_set_vring_call(vid, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_ERR:
+               if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+                       close(msg.fds[0]);
+               RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+               break;
+
+       case VHOST_USER_GET_QUEUE_NUM:
+               msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_ENABLE:
+               user_set_vring_enable(vid, &msg.payload.state);
+               break;
+       case VHOST_USER_SEND_RARP:
+               user_send_rarp(vid, &msg);
+               break;
+
+       default:
+               break;
+
+       }
+
+       return 0;
+}
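
To make the address translation in qva_to_vva() above concrete, here is a
worked example with made-up numbers (illustrative only). Suppose a region
was registered with userspace_addr = 0x7f0000000000 and guest_phys_addr =
0x40000000, and user_set_mem_table() mapped it (after applying mmap_offset)
at 0x7e0000000000, so that

    address_offset = mapped_address - guest_phys_address
                   = 0x7e0000000000 - 0x40000000

Then a QEMU virtual address 0x1000 bytes into the region translates as

    vhost_va = qemu_va + guest_phys_address + address_offset
             - userspace_address
             = 0x7f0000001000 + 0x40000000
             + (0x7e0000000000 - 0x40000000) - 0x7f0000000000
             = 0x7e0000001000

i.e. the same 0x1000 offset, but within our own mapping of the region.
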
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
new file mode 100644 (file)
index 0000000..ba78d32
--- /dev/null
@@ -0,0 +1,128 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_virtio_net.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+#define VHOST_USER_PROTOCOL_F_MQ       0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD        1
+#define VHOST_USER_PROTOCOL_F_RARP     2
+
+#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP))
+
+typedef enum VhostUserRequest {
+       VHOST_USER_NONE = 0,
+       VHOST_USER_GET_FEATURES = 1,
+       VHOST_USER_SET_FEATURES = 2,
+       VHOST_USER_SET_OWNER = 3,
+       VHOST_USER_RESET_OWNER = 4,
+       VHOST_USER_SET_MEM_TABLE = 5,
+       VHOST_USER_SET_LOG_BASE = 6,
+       VHOST_USER_SET_LOG_FD = 7,
+       VHOST_USER_SET_VRING_NUM = 8,
+       VHOST_USER_SET_VRING_ADDR = 9,
+       VHOST_USER_SET_VRING_BASE = 10,
+       VHOST_USER_GET_VRING_BASE = 11,
+       VHOST_USER_SET_VRING_KICK = 12,
+       VHOST_USER_SET_VRING_CALL = 13,
+       VHOST_USER_SET_VRING_ERR = 14,
+       VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+       VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+       VHOST_USER_GET_QUEUE_NUM = 17,
+       VHOST_USER_SET_VRING_ENABLE = 18,
+       VHOST_USER_SEND_RARP = 19,
+       VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef struct VhostUserMemoryRegion {
+       uint64_t guest_phys_addr;
+       uint64_t memory_size;
+       uint64_t userspace_addr;
+       uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+       uint32_t nregions;
+       uint32_t padding;
+       VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+       uint64_t mmap_size;
+       uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserMsg {
+       VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+       uint32_t flags;
+       uint32_t size; /* the following payload size */
+       union {
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
+               uint64_t u64;
+               struct vhost_vring_state state;
+               struct vhost_vring_addr addr;
+               VhostUserMemory memory;
+               VhostUserLog    log;
+       } payload;
+       int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION    0x1
+
+
+/* vhost_user.c */
+int vhost_user_msg_handler(int vid, int fd);
+
+/* socket.c */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
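
Since VhostUserMsg is packed, VHOST_USER_HDR_SIZE works out to
request (4) + flags (4) + size (4) = 12 bytes. A sketch of how a reply is
framed with these definitions, e.g. answering GET_FEATURES ("features" is
assumed to hold the negotiated feature bits; this mirrors what
send_vhost_message() does):

    struct VhostUserMsg msg;

    msg.request = VHOST_USER_GET_FEATURES;
    msg.flags   = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
    msg.payload.u64 = features;
    msg.size    = sizeof(msg.payload.u64);

    /* 12-byte header + 8-byte payload = 20 bytes on the wire */
    send_fd_message(fd, (char *)&msg,
                    VHOST_USER_HDR_SIZE + msg.size, NULL, 0);
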
diff --git a/lib/librte_vhost/virtio-net-user.c b/lib/librte_vhost/virtio-net-user.c
deleted file mode 100644 (file)
index e7c4347..0000000
+++ /dev/null
@@ -1,470 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-
-#include "virtio-net-user.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-
-struct orig_region_map {
-       int fd;
-       uint64_t mapped_address;
-       uint64_t mapped_size;
-       uint64_t blksz;
-};
-
-#define orig_region(ptr, nregions) \
-       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
-               sizeof(struct virtio_memory) + \
-               sizeof(struct virtio_memory_regions) * (nregions)))
-
-static uint64_t
-get_blk_size(int fd)
-{
-       struct stat stat;
-       int ret;
-
-       ret = fstat(fd, &stat);
-       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
-}
-
-static void
-free_mem_region(struct virtio_net *dev)
-{
-       struct orig_region_map *region;
-       unsigned int idx;
-
-       if (!dev || !dev->mem)
-               return;
-
-       region = orig_region(dev->mem, dev->mem->nregions);
-       for (idx = 0; idx < dev->mem->nregions; idx++) {
-               if (region[idx].mapped_address) {
-                       munmap((void *)(uintptr_t)region[idx].mapped_address,
-                                       region[idx].mapped_size);
-                       close(region[idx].fd);
-               }
-       }
-}
-
-void
-vhost_backend_cleanup(struct virtio_net *dev)
-{
-       if (dev->mem) {
-               free_mem_region(dev);
-               free(dev->mem);
-               dev->mem = NULL;
-       }
-       if (dev->log_addr) {
-               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
-               dev->log_addr = 0;
-       }
-}
-
-int
-user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
-{
-       struct VhostUserMemory memory = pmsg->payload.memory;
-       struct virtio_memory_regions *pregion;
-       uint64_t mapped_address, mapped_size;
-       struct virtio_net *dev;
-       unsigned int idx = 0;
-       struct orig_region_map *pregion_orig;
-       uint64_t alignment;
-
-       /* unmap old memory regions one by one*/
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* Remove from the data plane. */
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       if (dev->mem) {
-               free_mem_region(dev);
-               free(dev->mem);
-               dev->mem = NULL;
-       }
-
-       dev->mem = calloc(1,
-               sizeof(struct virtio_memory) +
-               sizeof(struct virtio_memory_regions) * memory.nregions +
-               sizeof(struct orig_region_map) * memory.nregions);
-       if (dev->mem == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to allocate memory for dev->mem\n",
-                       dev->vid);
-               return -1;
-       }
-       dev->mem->nregions = memory.nregions;
-
-       pregion_orig = orig_region(dev->mem, memory.nregions);
-       for (idx = 0; idx < memory.nregions; idx++) {
-               pregion = &dev->mem->regions[idx];
-               pregion->guest_phys_address =
-                       memory.regions[idx].guest_phys_addr;
-               pregion->guest_phys_address_end =
-                       memory.regions[idx].guest_phys_addr +
-                       memory.regions[idx].memory_size;
-               pregion->memory_size =
-                       memory.regions[idx].memory_size;
-               pregion->userspace_address =
-                       memory.regions[idx].userspace_addr;
-
-               /* This is ugly */
-               mapped_size = memory.regions[idx].memory_size +
-                       memory.regions[idx].mmap_offset;
-
-               /* mmap() without flag of MAP_ANONYMOUS, should be called
-                * with length argument aligned with hugepagesz at older
-                * longterm version Linux, like 2.6.32 and 3.2.72, or
-                * mmap() will fail with EINVAL.
-                *
-                * to avoid failure, make sure in caller to keep length
-                * aligned.
-                */
-               alignment = get_blk_size(pmsg->fds[idx]);
-               if (alignment == (uint64_t)-1) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "couldn't get hugepage size through fstat\n");
-                       goto err_mmap;
-               }
-               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
-
-               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
-                       mapped_size,
-                       PROT_READ | PROT_WRITE, MAP_SHARED,
-                       pmsg->fds[idx],
-                       0);
-
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
-                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
-                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
-                       mapped_size, memory.regions[idx].mmap_offset,
-                       alignment);
-
-               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "mmap qemu guest failed.\n");
-                       goto err_mmap;
-               }
-
-               pregion_orig[idx].mapped_address = mapped_address;
-               pregion_orig[idx].mapped_size = mapped_size;
-               pregion_orig[idx].blksz = alignment;
-               pregion_orig[idx].fd = pmsg->fds[idx];
-
-               mapped_address +=  memory.regions[idx].mmap_offset;
-
-               pregion->address_offset = mapped_address -
-                       pregion->guest_phys_address;
-
-               if (memory.regions[idx].guest_phys_addr == 0) {
-                       dev->mem->base_address =
-                               memory.regions[idx].userspace_addr;
-                       dev->mem->mapped_address =
-                               pregion->address_offset;
-               }
-
-               LOG_DEBUG(VHOST_CONFIG,
-                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
-                       idx,
-                       (void *)(uintptr_t)pregion->guest_phys_address,
-                       (void *)(uintptr_t)pregion->userspace_address,
-                        pregion->memory_size);
-       }
-
-       return 0;
-
-err_mmap:
-       while (idx--) {
-               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
-                               pregion_orig[idx].mapped_size);
-               close(pregion_orig[idx].fd);
-       }
-       free(dev->mem);
-       dev->mem = NULL;
-       return -1;
-}
-
-static int
-vq_is_ready(struct vhost_virtqueue *vq)
-{
-       return vq && vq->desc   &&
-              vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
-              vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
-}
-
-static int
-virtio_is_ready(struct virtio_net *dev)
-{
-       struct vhost_virtqueue *rvq, *tvq;
-       uint32_t i;
-
-       for (i = 0; i < dev->virt_qp_nb; i++) {
-               rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
-               tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
-
-               if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "virtio is not ready for processing.\n");
-                       return 0;
-               }
-       }
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "virtio is now ready for processing.\n");
-       return 1;
-}
-
-void
-user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
-{
-       struct vhost_vring_file file;
-
-       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
-       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
-               file.fd = VIRTIO_INVALID_EVENTFD;
-       else
-               file.fd = pmsg->fds[0];
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring call idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_call(vid, &file);
-}
-
-
-/*
- *  In vhost-user, when we receive kick message, will test whether virtio
- *  device is ready for packet processing.
- */
-void
-user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
-{
-       struct vhost_vring_file file;
-       struct virtio_net *dev = get_device(vid);
-
-       if (!dev)
-               return;
-
-       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
-       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
-               file.fd = VIRTIO_INVALID_EVENTFD;
-       else
-               file.fd = pmsg->fds[0];
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring kick idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_kick(vid, &file);
-
-       if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
-               if (notify_ops->new_device(vid) == 0)
-                       dev->flags |= VIRTIO_DEV_RUNNING;
-       }
-}
-
-/*
- * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
- */
-int
-user_get_vring_base(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-       /* We have to stop the queue (virtio) if it is running. */
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       /* Here we are safe to get the last used index */
-       vhost_get_vring_base(vid, state->index, state);
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring base idx:%d file:%d\n", state->index, state->num);
-       /*
-        * Based on current qemu vhost-user implementation, this message is
-        * sent and only sent in vhost_vring_stop.
-        * TODO: cleanup the vring, it isn't usable since here.
-        */
-       if (dev->virtqueue[state->index]->kickfd >= 0)
-               close(dev->virtqueue[state->index]->kickfd);
-
-       dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
-       return 0;
-}
-
-/*
- * when virtio queues are ready to work, qemu will send us to
- * enable the virtio queue pair.
- */
-int
-user_set_vring_enable(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-       int enable = (int)state->num;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "set queue enable: %d to qp idx: %d\n",
-               enable, state->index);
-
-       if (notify_ops->vring_state_changed)
-               notify_ops->vring_state_changed(vid, state->index, enable);
-
-       dev->virtqueue[state->index]->enabled = enable;
-
-       return 0;
-}
-
-void
-user_set_protocol_features(int vid, uint64_t protocol_features)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
-               return;
-
-       dev->protocol_features = protocol_features;
-}
-
-int
-user_set_log_base(int vid, struct VhostUserMsg *msg)
-{
-       struct virtio_net *dev;
-       int fd = msg->fds[0];
-       uint64_t size, off;
-       void *addr;
-
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
-       if (fd < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
-               return -1;
-       }
-
-       if (msg->size != sizeof(VhostUserLog)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "invalid log base msg size: %"PRId32" != %d\n",
-                       msg->size, (int)sizeof(VhostUserLog));
-               return -1;
-       }
-
-       size = msg->payload.log.mmap_size;
-       off  = msg->payload.log.mmap_offset;
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "log mmap size: %"PRId64", offset: %"PRId64"\n",
-               size, off);
-
-       /*
-        * mmap from 0 to workaround a hugepage mmap bug: mmap will
-        * fail when offset is not page size aligned.
-        */
-       addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-       close(fd);
-       if (addr == MAP_FAILED) {
-               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
-               return -1;
-       }
-
-       /*
-        * Free previously mapped log memory on occasionally
-        * multiple VHOST_USER_SET_LOG_BASE.
-        */
-       if (dev->log_addr) {
-               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
-       }
-       dev->log_addr = (uint64_t)(uintptr_t)addr;
-       dev->log_base = dev->log_addr + off;
-       dev->log_size = size;
-
-       return 0;
-}
-
-/*
- * An rarp packet is constructed and broadcasted to notify switches about
- * the new location of the migrated VM, so that packets from outside will
- * not be lost after migration.
- *
- * However, we don't actually "send" a rarp packet here, instead, we set
- * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
- */
-int
-user_send_rarp(int vid, struct VhostUserMsg *msg)
-{
-       struct virtio_net *dev;
-       uint8_t *mac = (uint8_t *)&msg->payload.u64;
-
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
-       RTE_LOG(DEBUG, VHOST_CONFIG,
-               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
-               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
-       memcpy(dev->mac.addr_bytes, mac, 6);
-
-       /*
-        * Set the flag to inject a RARP broadcast packet at
-        * rte_vhost_dequeue_burst().
-        *
-        * rte_smp_wmb() is for making sure the mac is copied
-        * before the flag is set.
-        */
-       rte_smp_wmb();
-       rte_atomic16_set(&dev->broadcast_rarp, 1);
-
-       return 0;
-}
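
The broadcast_rarp handshake this file hands over to vhost_user.c is worth
sketching, since it pairs with rte_vhost_dequeue_burst() in virtio_net.c
(behavior is unchanged by this patch):

    vhost-user thread                   data-plane thread
    -----------------                   -----------------
    memcpy(dev->mac.addr_bytes, ...);
    rte_smp_wmb();
    rte_atomic16_set(                   if (rte_atomic16_cmpset(
        &dev->broadcast_rarp, 1);           &dev->broadcast_rarp.cnt, 1, 0))
                                            make_rarp_packet(m, &dev->mac);

The write barrier makes the MAC visible before the flag is, and the cmpset
lets exactly one dequeue call win the flag and build the RARP packet.
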
diff --git a/lib/librte_vhost/virtio-net-user.h b/lib/librte_vhost/virtio-net-user.h
deleted file mode 100644 (file)
index e1b967b..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VIRTIO_NET_USER_H
-#define _VIRTIO_NET_USER_H
-
-#include "vhost-net.h"
-#include "vhost-net-user.h"
-
-#define VHOST_USER_PROTOCOL_F_MQ       0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD        1
-#define VHOST_USER_PROTOCOL_F_RARP     2
-
-#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
-                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP))
-
-int user_set_mem_table(int, struct VhostUserMsg *);
-
-void user_set_vring_call(int, struct VhostUserMsg *);
-
-void user_set_vring_kick(int, struct VhostUserMsg *);
-
-void user_set_protocol_features(int vid, uint64_t protocol_features);
-int user_set_log_base(int vid, struct VhostUserMsg *);
-int user_send_rarp(int vid, struct VhostUserMsg *);
-
-int user_get_vring_base(int, struct vhost_vring_state *);
-
-int user_set_vring_enable(int vid, struct vhost_vring_state *state);
-
-#endif
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
deleted file mode 100644 (file)
index 1785695..0000000
+++ /dev/null
@@ -1,847 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/vhost.h>
-#include <linux/virtio_net.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#ifdef RTE_LIBRTE_VHOST_NUMA
-#include <numaif.h>
-#endif
-
-#include <sys/socket.h>
-
-#include <rte_ethdev.h>
-#include <rte_log.h>
-#include <rte_string_fns.h>
-#include <rte_memory.h>
-#include <rte_malloc.h>
-#include <rte_virtio_net.h>
-
-#include "vhost-net.h"
-
-#define MAX_VHOST_DEVICE       1024
-static struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
-
-/* device ops to add/remove device to/from data core. */
-struct virtio_net_device_ops const *notify_ops;
-
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
-/* Features supported by this lib. */
-#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
-                               (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
-                               (1ULL << VIRTIO_NET_F_CTRL_RX) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
-                               (VHOST_SUPPORTS_MQ)            | \
-                               (1ULL << VIRTIO_F_VERSION_1)   | \
-                               (1ULL << VHOST_F_LOG_ALL)      | \
-                               (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
-                               (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
-                               (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
-                               (1ULL << VIRTIO_NET_F_CSUM)    | \
-                               (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_TSO6))
-
-static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
-
-
-/*
- * Converts QEMU virtual address to Vhost virtual address. This function is
- * used to convert the ring addresses to our address space.
- */
-static uint64_t
-qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
-{
-       struct virtio_memory_regions *region;
-       uint64_t vhost_va = 0;
-       uint32_t regionidx = 0;
-
-       /* Find the region where the address lives. */
-       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
-               region = &dev->mem->regions[regionidx];
-               if ((qemu_va >= region->userspace_address) &&
-                       (qemu_va <= region->userspace_address +
-                       region->memory_size)) {
-                       vhost_va = qemu_va + region->guest_phys_address +
-                               region->address_offset -
-                               region->userspace_address;
-                       break;
-               }
-       }
-       return vhost_va;
-}
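-
-/*
- * A worked example of the translation above, with illustrative numbers:
- * for a region with userspace_address = 0x7f0000000000,
- * guest_phys_address = 0x0 and address_offset = 0x2aaa00000000, the
- * QEMU virtual address 0x7f0000001000 maps to
- *
- *     0x7f0000001000 + 0x0 + 0x2aaa00000000 - 0x7f0000000000
- *         = 0x2aaa00001000
- *
- * in our (vhost) address space.
- */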
-
-struct virtio_net *
-get_device(int vid)
-{
-       struct virtio_net *dev = vhost_devices[vid];
-
-       if (unlikely(!dev)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) device not found.\n", vid);
-       }
-
-       return dev;
-}
-
-static void
-cleanup_vq(struct vhost_virtqueue *vq, int destroy)
-{
-       if ((vq->callfd >= 0) && (destroy != 0))
-               close(vq->callfd);
-       if (vq->kickfd >= 0)
-               close(vq->kickfd);
-}
-
-/*
- * Unmap any memory, close any file descriptors and
- * free any memory owned by a device.
- */
-static void
-cleanup_device(struct virtio_net *dev, int destroy)
-{
-       uint32_t i;
-
-       vhost_backend_cleanup(dev);
-
-       for (i = 0; i < dev->virt_qp_nb; i++) {
-               cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
-               cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
-       }
-}
-
-/*
- * Release virtqueues and device memory.
- */
-static void
-free_device(struct virtio_net *dev)
-{
-       uint32_t i;
-
-       for (i = 0; i < dev->virt_qp_nb; i++)
-               rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
-
-       rte_free(dev);
-}
-
-static void
-init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
-{
-       memset(vq, 0, sizeof(struct vhost_virtqueue));
-
-       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
-       vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
-       /* Backends are set to -1 indicating an inactive device. */
-       vq->backend = -1;
-
-       /* always set the default vq pair to enabled */
-       if (qp_idx == 0)
-               vq->enabled = 1;
-}
-
-static void
-init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
-       uint32_t base_idx = qp_idx * VIRTIO_QNUM;
-
-       init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
-       init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
-}
-
-static void
-reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
-{
-       int callfd;
-
-       callfd = vq->callfd;
-       init_vring_queue(vq, qp_idx);
-       vq->callfd = callfd;
-}
-
-static void
-reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
-       uint32_t base_idx = qp_idx * VIRTIO_QNUM;
-
-       reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
-       reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
-}
-
-static int
-alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
-       struct vhost_virtqueue *virtqueue = NULL;
-       uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
-       uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
-
-       virtqueue = rte_malloc(NULL,
-                              sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
-       if (virtqueue == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Failed to allocate memory for virt qp:%d.\n", qp_idx);
-               return -1;
-       }
-
-       dev->virtqueue[virt_rx_q_idx] = virtqueue;
-       dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
-
-       init_vring_queue_pair(dev, qp_idx);
-
-       dev->virt_qp_nb += 1;
-
-       return 0;
-}
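-
-/*
- * The resulting virtqueue layout, for illustration: with
- * VIRTIO_QNUM = 2, VIRTIO_RXQ = 0 and VIRTIO_TXQ = 1, queue pair 0
- * occupies virtqueue[0] (Rx) and virtqueue[1] (Tx), queue pair 1
- * occupies virtqueue[2] and virtqueue[3], and so on; both queues of
- * a pair come from the single rte_malloc() above.
- */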
-
-/*
- * Reset some variables in the device structure, while keeping a few
- * others untouched, such as vid, ifname and virt_qp_nb: they should
- * remain the same unless the device is removed.
- */
-static void
-reset_device(struct virtio_net *dev)
-{
-       uint32_t i;
-
-       dev->features = 0;
-       dev->protocol_features = 0;
-       dev->flags = 0;
-
-       for (i = 0; i < dev->virt_qp_nb; i++)
-               reset_vring_queue_pair(dev, i);
-}
-
-/*
- * Function is called from the CUSE open function. A device structure is
- * allocated and added to the first free slot in the device array.
- */
-int
-vhost_new_device(void)
-{
-       struct virtio_net *dev;
-       int i;
-
-       dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
-       if (dev == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Failed to allocate memory for new dev.\n");
-               return -1;
-       }
-
-       for (i = 0; i < MAX_VHOST_DEVICE; i++) {
-               if (vhost_devices[i] == NULL)
-                       break;
-       }
-       if (i == MAX_VHOST_DEVICE) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Failed to find a free slot for new device.\n");
-               rte_free(dev);
-               return -1;
-       }
-
-       vhost_devices[i] = dev;
-       dev->vid = i;
-
-       return i;
-}
-
-/*
- * Function is called from the CUSE release function. This function will
- * clean up the device and remove it from the device array.
- */
-void
-vhost_destroy_device(int vid)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return;
-
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       cleanup_device(dev, 1);
-       free_device(dev);
-
-       vhost_devices[vid] = NULL;
-}
-
-void
-vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
-{
-       struct virtio_net *dev;
-       unsigned int len;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return;
-
-       len = if_len > sizeof(dev->ifname) ?
-               sizeof(dev->ifname) : if_len;
-
-       strncpy(dev->ifname, if_name, len);
-       dev->ifname[sizeof(dev->ifname) - 1] = '\0';
-}
-
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_OWNER
- * This function simply returns success, provided the
- * device has been initialised.
- */
-int
-vhost_set_owner(int vid)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_RESET_OWNER
- */
-int
-vhost_reset_owner(int vid)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       cleanup_device(dev, 0);
-       reset_device(dev);
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_GET_FEATURES
- * The features that we support are requested.
- */
-int
-vhost_get_features(int vid, uint64_t *pu)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* Send our supported features. */
-       *pu = VHOST_FEATURES;
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_FEATURES
- * We receive the negotiated features supported by us and the virtio device.
- */
-int
-vhost_set_features(int vid, uint64_t *pu)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-       if (*pu & ~VHOST_FEATURES)
-               return -1;
-
-       dev->features = *pu;
-       if (dev->features &
-               ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
-               dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-       } else {
-               dev->vhost_hlen = sizeof(struct virtio_net_hdr);
-       }
-       LOG_DEBUG(VHOST_CONFIG,
-               "(%d) mergeable RX buffers %s, virtio 1 %s\n",
-               dev->vid,
-               (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
-               (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_NUM
- * The virtio device sends us the size of the descriptor ring.
- */
-int
-vhost_set_vring_num(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[state->index]->size = state->num;
-
-       return 0;
-}
-
-/*
- * Reallocate the virtio_net and vhost_virtqueue data structures so that they
- * reside on the same NUMA node as the memory backing the vring descriptors.
- */
-#ifdef RTE_LIBRTE_VHOST_NUMA
-static struct virtio_net*
-numa_realloc(struct virtio_net *dev, int index)
-{
-       int oldnode, newnode;
-       struct virtio_net *old_dev;
-       struct vhost_virtqueue *old_vq, *vq;
-       int ret;
-
-       /*
-        * vqs are allocated in pairs, so we should try to do the realloc
-        * on the first queue of a queue pair only.
-        */
-       if (index % VIRTIO_QNUM != 0)
-               return dev;
-
-       old_dev = dev;
-       vq = old_vq = dev->virtqueue[index];
-
-       ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
-                           MPOL_F_NODE | MPOL_F_ADDR);
-
-       /* check if we need to reallocate vq */
-       ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
-                            MPOL_F_NODE | MPOL_F_ADDR);
-       if (ret) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Unable to get vq numa information.\n");
-               return dev;
-       }
-       if (oldnode != newnode) {
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "reallocate vq from %d to %d node\n", oldnode, newnode);
-               vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
-                                      newnode);
-               if (!vq)
-                       return dev;
-
-               memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
-               rte_free(old_vq);
-       }
-
-       /* check if we need to reallocate dev */
-       ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
-                           MPOL_F_NODE | MPOL_F_ADDR);
-       if (ret) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Unable to get dev numa information.\n");
-               goto out;
-       }
-       if (oldnode != newnode) {
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "reallocate dev from %d to %d node\n",
-                       oldnode, newnode);
-               dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
-               if (!dev) {
-                       dev = old_dev;
-                       goto out;
-               }
-
-               memcpy(dev, old_dev, sizeof(*dev));
-               rte_free(old_dev);
-       }
-
-out:
-       dev->virtqueue[index] = vq;
-       dev->virtqueue[index + 1] = vq + 1;
-       vhost_devices[dev->vid] = dev;
-
-       return dev;
-}
-#else
-static struct virtio_net*
-numa_realloc(struct virtio_net *dev, int index __rte_unused)
-{
-       return dev;
-}
-#endif
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_ADDR
- * The virtio device sends us the desc, used and avail ring addresses.
- * This function then converts these to our address space.
- */
-int
-vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if ((dev == NULL) || (dev->mem == NULL))
-               return -1;
-
-       /* addr->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[addr->index];
-
-       /* The addresses are converted from QEMU virtual to Vhost virtual. */
-       vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
-                       addr->desc_user_addr);
-       if (vq->desc == 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to find desc ring address.\n",
-                       dev->vid);
-               return -1;
-       }
-
-       dev = numa_realloc(dev, addr->index);
-       vq = dev->virtqueue[addr->index];
-
-       vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
-                       addr->avail_user_addr);
-       if (vq->avail == 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to find avail ring address.\n",
-                       dev->vid);
-               return -1;
-       }
-
-       vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
-                       addr->used_user_addr);
-       if (vq->used == 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to find used ring address.\n",
-                       dev->vid);
-               return -1;
-       }
-
-       if (vq->last_used_idx != vq->used->idx) {
-               RTE_LOG(WARNING, VHOST_CONFIG,
-                       "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
-                       "some packets maybe resent for Tx and dropped for Rx\n",
-                       vq->last_used_idx, vq->used->idx);
-               vq->last_used_idx     = vq->used->idx;
-       }
-
-       vq->log_guest_addr = addr->log_guest_addr;
-
-       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
-                       dev->vid, vq->desc);
-       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
-                       dev->vid, vq->avail);
-       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
-                       dev->vid, vq->used);
-       LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
-                       dev->vid, vq->log_guest_addr);
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_BASE
- * The virtio device sends us the available ring last used index.
- */
-int
-vhost_set_vring_base(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[state->index]->last_used_idx = state->num;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_GET_VRING_BASE
- * We send the virtio device our available ring last used index.
- */
-int
-vhost_get_vring_base(int vid, uint32_t index,
-       struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       state->index = index;
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       state->num = dev->virtqueue[state->index]->last_used_idx;
-
-       return 0;
-}
-
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_CALL
- * The virtio device sends an eventfd to interrupt the guest. This fd gets
- * copied into our process space.
- */
-int
-vhost_set_vring_call(int vid, struct vhost_vring_file *file)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-       uint32_t cur_qp_idx = file->index / VIRTIO_QNUM;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /*
-        * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
-        * we get, so we do vring queue pair allocation here.
-        */
-       if (cur_qp_idx + 1 > dev->virt_qp_nb) {
-               if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
-                       return -1;
-       }
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[file->index];
-       assert(vq != NULL);
-
-       if (vq->callfd >= 0)
-               close(vq->callfd);
-
-       vq->callfd = file->fd;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_KICK
- * The virtio device sends an eventfd that it can use to notify us.
- * This fd gets copied into our process space.
- */
-int
-vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[file->index];
-
-       if (vq->kickfd >= 0)
-               close(vq->kickfd);
-
-       vq->kickfd = file->fd;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_NET_SET_BACKEND
- * To complete device initialisation when the virtio driver is loaded,
- * we are provided with a valid fd for a tap device (not used by us).
- * If this happens then we can add the device to a data core.
- * When the virtio driver is removed we get fd=-1.
- * At that point we remove the device from the data core.
- * The device will still exist in the device configuration linked list.
- */
-int
-vhost_set_backend(int vid, struct vhost_vring_file *file)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[file->index]->backend = file->fd;
-
-       /*
-        * If the device isn't already running and both backend fds are set,
-        * we add the device.
-        */
-       if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
-               if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED &&
-                   dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) {
-                       if (notify_ops->new_device(vid) < 0)
-                               return -1;
-                       dev->flags |= VIRTIO_DEV_RUNNING;
-               }
-       } else if (file->fd == VIRTIO_DEV_STOPPED) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       return 0;
-}
-
-int
-rte_vhost_get_numa_node(int vid)
-{
-#ifdef RTE_LIBRTE_VHOST_NUMA
-       struct virtio_net *dev = get_device(vid);
-       int numa_node;
-       int ret;
-
-       if (dev == NULL)
-               return -1;
-
-       ret = get_mempolicy(&numa_node, NULL, 0, dev,
-                           MPOL_F_NODE | MPOL_F_ADDR);
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to query numa node: %d\n", vid, ret);
-               return -1;
-       }
-
-       return numa_node;
-#else
-       RTE_SET_USED(vid);
-       return -1;
-#endif
-}
-
-uint32_t
-rte_vhost_get_queue_num(int vid)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return 0;
-
-       return dev->virt_qp_nb;
-}
-
-int
-rte_vhost_get_ifname(int vid, char *buf, size_t len)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-
-       len = RTE_MIN(len, sizeof(dev->ifname));
-
-       strncpy(buf, dev->ifname, len);
-       buf[len - 1] = '\0';
-
-       return 0;
-}
-
-uint16_t
-rte_vhost_avail_entries(int vid, uint16_t queue_id)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if (!dev)
-               return 0;
-
-       vq = dev->virtqueue[queue_id];
-       if (!vq->enabled)
-               return 0;
-
-       return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
-}
-
-int
-rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-
-       if (enable) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "guest notification isn't supported.\n");
-               return -1;
-       }
-
-       dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
-       return 0;
-}
-
-uint64_t rte_vhost_feature_get(void)
-{
-       return VHOST_FEATURES;
-}
-
-int rte_vhost_feature_disable(uint64_t feature_mask)
-{
-       VHOST_FEATURES = VHOST_FEATURES & ~feature_mask;
-       return 0;
-}
-
-int rte_vhost_feature_enable(uint64_t feature_mask)
-{
-       if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
-               VHOST_FEATURES = VHOST_FEATURES | feature_mask;
-               return 0;
-       }
-       return -1;
-}
-
-/*
- * Register ops so that we can add/remove device to data core.
- */
-int
-rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
-{
-       notify_ops = ops;
-
-       return 0;
-}
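-
-/*
- * A minimal usage sketch (illustrative only; assumes the
- * virtio_net_device_ops definition from rte_virtio_net.h):
- *
- *     static int
- *     new_device(int vid)
- *     {
- *             // start polling the device's virtqueues
- *             return 0;
- *     }
- *
- *     static void
- *     destroy_device(int vid)
- *     {
- *             // stop polling and release per-device state
- *     }
- *
- *     static const struct virtio_net_device_ops ops = {
- *             .new_device     = new_device,
- *             .destroy_device = destroy_device,
- *     };
- *
- *     rte_vhost_driver_callback_register(&ops);
- */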
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
new file mode 100644 (file)
index 0000000..8a151af
--- /dev/null
@@ -0,0 +1,924 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
+
+#include <rte_mbuf.h>
+#include <rte_memcpy.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_virtio_net.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
+#include <rte_arp.h>
+
+#include "vhost.h"
+
+#define MAX_PKT_BURST 32
+#define VHOST_LOG_PAGE 4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+       log_base[page / 8] |= 1 << (page % 8);
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+       uint64_t page;
+
+       if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+                  !dev->log_base || !len))
+               return;
+
+       if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+               return;
+
+       /* To make sure guest memory updates are committed before logging */
+       rte_smp_wmb();
+
+       page = addr / VHOST_LOG_PAGE;
+       while (page * VHOST_LOG_PAGE < addr + len) {
+               vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+               page += 1;
+       }
+}
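+
+/*
+ * A worked example of the bitmap math above: logging a 100-byte write
+ * at guest physical address 8192 touches only page 8192 / 4096 = 2,
+ * so vhost_log_page() sets bit (2 % 8) in byte (2 / 8) of the log:
+ *
+ *     log_base[0] |= 1 << 2;
+ *
+ * A write crossing a page boundary would loop and set one bit per
+ * touched page.
+ */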
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                    uint64_t offset, uint64_t len)
+{
+       vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+static bool
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
+{
+       return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
+}
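+
+/*
+ * For example, with virt_qp_nb = 2 the valid Rx (is_tx = 0) indexes
+ * are the even ones {0, 2}, the valid Tx (is_tx = 1) indexes are the
+ * odd ones {1, 3}, and anything >= 4 is rejected.
+ */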
+
+static void
+virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
+{
+       if (m_buf->ol_flags & PKT_TX_L4_MASK) {
+               net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+               net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
+
+               switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
+               case PKT_TX_TCP_CKSUM:
+                       net_hdr->csum_offset = (offsetof(struct tcp_hdr,
+                                               cksum));
+                       break;
+               case PKT_TX_UDP_CKSUM:
+                       net_hdr->csum_offset = (offsetof(struct udp_hdr,
+                                               dgram_cksum));
+                       break;
+               case PKT_TX_SCTP_CKSUM:
+                       net_hdr->csum_offset = (offsetof(struct sctp_hdr,
+                                               cksum));
+                       break;
+               }
+       }
+
+       if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
+               if (m_buf->ol_flags & PKT_TX_IPV4)
+                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+               else
+                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+               net_hdr->gso_size = m_buf->tso_segsz;
+               net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+                                       + m_buf->l4_len;
+       }
+}
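+
+/*
+ * For instance, an mbuf carrying a TCP segment with PKT_TX_TCP_CKSUM
+ * set, l2_len = 14 and l3_len = 20 is translated into:
+ *
+ *     net_hdr->flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ *     net_hdr->csum_start  = 34;   // l2_len + l3_len
+ *     net_hdr->csum_offset = 16;   // offsetof(struct tcp_hdr, cksum)
+ *
+ * telling the guest to compute the TCP checksum at byte 34 + 16.
+ */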
+
+static inline void
+copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
+                   struct virtio_net_hdr_mrg_rxbuf hdr)
+{
+       if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
+               *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
+       else
+               *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+}
+
+static inline int __attribute__((always_inline))
+copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                 struct rte_mbuf *m, uint16_t desc_idx)
+{
+       uint32_t desc_avail, desc_offset;
+       uint32_t mbuf_avail, mbuf_offset;
+       uint32_t cpy_len;
+       struct vring_desc *desc;
+       uint64_t desc_addr;
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+
+       desc = &vq->desc[desc_idx];
+       desc_addr = gpa_to_vva(dev, desc->addr);
+       /*
+        * The check of 'desc_addr' is placed outside the 'unlikely' macro
+        * to avoid a performance issue with some versions of gcc (4.8.4 and
+        * 5.3.0), which otherwise store the offset on the stack instead of
+        * in a register.
+        */
+       if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
+               return -1;
+
+       rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+       virtio_enqueue_offload(m, &virtio_hdr.hdr);
+       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+       vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+       desc_offset = dev->vhost_hlen;
+       desc_avail  = desc->len - dev->vhost_hlen;
+
+       mbuf_avail  = rte_pktmbuf_data_len(m);
+       mbuf_offset = 0;
+       while (mbuf_avail != 0 || m->next != NULL) {
+               /* done with current mbuf, fetch next */
+               if (mbuf_avail == 0) {
+                       m = m->next;
+
+                       mbuf_offset = 0;
+                       mbuf_avail  = rte_pktmbuf_data_len(m);
+               }
+
+               /* done with current desc buf, fetch next */
+               if (desc_avail == 0) {
+                       if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
+                               /* Not enough room in the vring buffer */
+                               return -1;
+                       }
+                       if (unlikely(desc->next >= vq->size))
+                               return -1;
+
+                       desc = &vq->desc[desc->next];
+                       desc_addr = gpa_to_vva(dev, desc->addr);
+                       if (unlikely(!desc_addr))
+                               return -1;
+
+                       desc_offset = 0;
+                       desc_avail  = desc->len;
+               }
+
+               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+                       cpy_len);
+               vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
+               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+                            cpy_len, 0);
+
+               mbuf_avail  -= cpy_len;
+               mbuf_offset += cpy_len;
+               desc_avail  -= cpy_len;
+               desc_offset += cpy_len;
+       }
+
+       return 0;
+}
+
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue. This function works when the mbuf is scattered, but
+ * it doesn't support the mergeable feature.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
+             struct rte_mbuf **pkts, uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       uint16_t avail_idx, free_entries, start_idx;
+       uint16_t desc_indexes[MAX_PKT_BURST];
+       uint16_t used_idx;
+       uint32_t i;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+                       dev->vid, __func__, queue_id);
+               return 0;
+       }
+
+       vq = dev->virtqueue[queue_id];
+       if (unlikely(vq->enabled == 0))
+               return 0;
+
+       avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+       start_idx = vq->last_used_idx;
+       free_entries = avail_idx - start_idx;
+       count = RTE_MIN(count, free_entries);
+       count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
+       if (count == 0)
+               return 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
+               dev->vid, start_idx, start_idx + count);
+
+       /* Retrieve all of the desc indexes first to avoid caching issues. */
+       rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
+       for (i = 0; i < count; i++) {
+               used_idx = (start_idx + i) & (vq->size - 1);
+               desc_indexes[i] = vq->avail->ring[used_idx];
+               vq->used->ring[used_idx].id = desc_indexes[i];
+               vq->used->ring[used_idx].len = pkts[i]->pkt_len +
+                                              dev->vhost_hlen;
+               vhost_log_used_vring(dev, vq,
+                       offsetof(struct vring_used, ring[used_idx]),
+                       sizeof(vq->used->ring[used_idx]));
+       }
+
+       rte_prefetch0(&vq->desc[desc_indexes[0]]);
+       for (i = 0; i < count; i++) {
+               uint16_t desc_idx = desc_indexes[i];
+               int err;
+
+               err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
+               if (unlikely(err)) {
+                       used_idx = (start_idx + i) & (vq->size - 1);
+                       vq->used->ring[used_idx].len = dev->vhost_hlen;
+                       vhost_log_used_vring(dev, vq,
+                               offsetof(struct vring_used, ring[used_idx]),
+                               sizeof(vq->used->ring[used_idx]));
+               }
+
+               if (i + 1 < count)
+                       rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+       }
+
+       rte_smp_wmb();
+
+       *(volatile uint16_t *)&vq->used->idx += count;
+       vq->last_used_idx += count;
+       vhost_log_used_vring(dev, vq,
+               offsetof(struct vring_used, idx),
+               sizeof(vq->used->idx));
+
+       /* flush used->idx update before we read avail->flags. */
+       rte_mb();
+
+       /* Kick the guest if necessary. */
+       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                       && (vq->callfd >= 0))
+               eventfd_write(vq->callfd, (eventfd_t)1);
+       return count;
+}
+
+static inline int
+fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
+            uint32_t *allocated, uint32_t *vec_idx,
+            struct buf_vector *buf_vec)
+{
+       uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
+       uint32_t vec_id = *vec_idx;
+       uint32_t len    = *allocated;
+
+       while (1) {
+               if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
+                       return -1;
+
+               len += vq->desc[idx].len;
+               buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
+               buf_vec[vec_id].buf_len  = vq->desc[idx].len;
+               buf_vec[vec_id].desc_idx = idx;
+               vec_id++;
+
+               if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
+                       break;
+
+               idx = vq->desc[idx].next;
+       }
+
+       *allocated = len;
+       *vec_idx   = vec_id;
+
+       return 0;
+}
+
+/*
+ * Returns -1 on failure, 0 on success.
+ */
+static inline int
+reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
+                           uint16_t *end, struct buf_vector *buf_vec)
+{
+       uint16_t cur_idx;
+       uint16_t avail_idx;
+       uint32_t allocated = 0;
+       uint32_t vec_idx = 0;
+       uint16_t tries = 0;
+
+       cur_idx  = vq->last_used_idx;
+
+       while (1) {
+               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+               if (unlikely(cur_idx == avail_idx))
+                       return -1;
+
+               if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
+                                         &vec_idx, buf_vec) < 0))
+                       return -1;
+
+               cur_idx++;
+               tries++;
+
+               if (allocated >= size)
+                       break;
+
+               /*
+                * If we have tried all available ring items and still
+                * can't get enough buffers, something abnormal has
+                * happened.
+                */
+               if (unlikely(tries >= vq->size))
+                       return -1;
+       }
+
+       *end = cur_idx;
+       return 0;
+}
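+
+/*
+ * A worked example (assuming each avail ring entry is a single
+ * 1518-byte descriptor): reserving room for a 2100-byte packet,
+ * vhost header included, takes two loop iterations; the first
+ * fill_vec_buf() call collects 1518 bytes, allocated < size so the
+ * loop continues, and the second brings the total to 3036 >= 2100,
+ * leaving *end two slots past vq->last_used_idx.
+ */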
+
+static inline uint32_t __attribute__((always_inline))
+copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                           uint16_t end_idx, struct rte_mbuf *m,
+                           struct buf_vector *buf_vec)
+{
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+       uint32_t vec_idx = 0;
+       uint16_t start_idx = vq->last_used_idx;
+       uint16_t cur_idx = start_idx;
+       uint64_t desc_addr;
+       uint32_t mbuf_offset, mbuf_avail;
+       uint32_t desc_offset, desc_avail;
+       uint32_t cpy_len;
+       uint16_t desc_idx, used_idx;
+
+       if (unlikely(m == NULL))
+               return 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
+               dev->vid, cur_idx, end_idx);
+
+       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+       if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+               return 0;
+
+       rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+       virtio_hdr.num_buffers = end_idx - start_idx;
+       LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
+               dev->vid, virtio_hdr.num_buffers);
+
+       virtio_enqueue_offload(m, &virtio_hdr.hdr);
+       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+       vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
+       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+       desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+       desc_offset = dev->vhost_hlen;
+
+       mbuf_avail  = rte_pktmbuf_data_len(m);
+       mbuf_offset = 0;
+       while (mbuf_avail != 0 || m->next != NULL) {
+               /* done with current desc buf, get the next one */
+               if (desc_avail == 0) {
+                       desc_idx = buf_vec[vec_idx].desc_idx;
+
+                       if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
+                               /* Update used ring with desc information */
+                               used_idx = cur_idx++ & (vq->size - 1);
+                               vq->used->ring[used_idx].id  = desc_idx;
+                               vq->used->ring[used_idx].len = desc_offset;
+                               vhost_log_used_vring(dev, vq,
+                                       offsetof(struct vring_used,
+                                                ring[used_idx]),
+                                       sizeof(vq->used->ring[used_idx]));
+                       }
+
+                       vec_idx++;
+                       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+                       if (unlikely(!desc_addr))
+                               return 0;
+
+                       /* Prefetch buffer address. */
+                       rte_prefetch0((void *)(uintptr_t)desc_addr);
+                       desc_offset = 0;
+                       desc_avail  = buf_vec[vec_idx].buf_len;
+               }
+
+               /* done with current mbuf, get the next one */
+               if (mbuf_avail == 0) {
+                       m = m->next;
+
+                       mbuf_offset = 0;
+                       mbuf_avail  = rte_pktmbuf_data_len(m);
+               }
+
+               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+                       cpy_len);
+               vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
+                       cpy_len);
+               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+                       cpy_len, 0);
+
+               mbuf_avail  -= cpy_len;
+               mbuf_offset += cpy_len;
+               desc_avail  -= cpy_len;
+               desc_offset += cpy_len;
+       }
+
+       used_idx = cur_idx & (vq->size - 1);
+       vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
+       vq->used->ring[used_idx].len = desc_offset;
+       vhost_log_used_vring(dev, vq,
+               offsetof(struct vring_used, ring[used_idx]),
+               sizeof(vq->used->ring[used_idx]));
+
+       return end_idx - start_idx;
+}
+
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
+       struct rte_mbuf **pkts, uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       uint32_t pkt_idx = 0, nr_used = 0;
+       uint16_t end;
+       struct buf_vector buf_vec[BUF_VECTOR_MAX];
+
+       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+                       dev->vid, __func__, queue_id);
+               return 0;
+       }
+
+       vq = dev->virtqueue[queue_id];
+       if (unlikely(vq->enabled == 0))
+               return 0;
+
+       count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+       if (count == 0)
+               return 0;
+
+       for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+               uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+
+               if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
+                                                        &end, buf_vec) < 0)) {
+                       LOG_DEBUG(VHOST_DATA,
+                               "(%d) failed to get enough desc from vring\n",
+                               dev->vid);
+                       break;
+               }
+
+               nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
+                                                     pkts[pkt_idx], buf_vec);
+               rte_smp_wmb();
+
+               *(volatile uint16_t *)&vq->used->idx += nr_used;
+               vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+                       sizeof(vq->used->idx));
+               vq->last_used_idx += nr_used;
+       }
+
+       if (likely(pkt_idx)) {
+               /* flush used->idx update before we read avail->flags. */
+               rte_mb();
+
+               /* Kick the guest if necessary. */
+               if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                               && (vq->callfd >= 0))
+                       eventfd_write(vq->callfd, (eventfd_t)1);
+       }
+
+       return pkt_idx;
+}
+
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+       struct rte_mbuf **pkts, uint16_t count)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (!dev)
+               return 0;
+
+       if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+               return virtio_dev_merge_rx(dev, queue_id, pkts, count);
+       else
+               return virtio_dev_rx(dev, queue_id, pkts, count);
+}
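+
+/*
+ * A minimal forwarding sketch (illustrative; 'port' and 'vid' are
+ * hypothetical and assumed to be set up elsewhere):
+ *
+ *     struct rte_mbuf *pkts[MAX_PKT_BURST];
+ *     uint16_t nb_rx, i;
+ *
+ *     nb_rx = rte_eth_rx_burst(port, 0, pkts, MAX_PKT_BURST);
+ *     rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts, nb_rx);
+ *
+ *     // enqueue copies the packet data into guest buffers, so the
+ *     // mbufs remain owned by the caller and must still be freed
+ *     for (i = 0; i < nb_rx; i++)
+ *             rte_pktmbuf_free(pkts[i]);
+ */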
+
+static void
+parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
+{
+       struct ipv4_hdr *ipv4_hdr;
+       struct ipv6_hdr *ipv6_hdr;
+       void *l3_hdr = NULL;
+       struct ether_hdr *eth_hdr;
+       uint16_t ethertype;
+
+       eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+       m->l2_len = sizeof(struct ether_hdr);
+       ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+
+       if (ethertype == ETHER_TYPE_VLAN) {
+               struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+
+               m->l2_len += sizeof(struct vlan_hdr);
+               ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+       }
+
+       l3_hdr = (char *)eth_hdr + m->l2_len;
+
+       switch (ethertype) {
+       case ETHER_TYPE_IPv4:
+               ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
+               *l4_proto = ipv4_hdr->next_proto_id;
+               m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
+               *l4_hdr = (char *)l3_hdr + m->l3_len;
+               m->ol_flags |= PKT_TX_IPV4;
+               break;
+       case ETHER_TYPE_IPv6:
+               ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
+               *l4_proto = ipv6_hdr->proto;
+               m->l3_len = sizeof(struct ipv6_hdr);
+               *l4_hdr = (char *)l3_hdr + m->l3_len;
+               m->ol_flags |= PKT_TX_IPV6;
+               break;
+       default:
+               m->l3_len = 0;
+               *l4_proto = 0;
+               break;
+       }
+}
+
+static inline void __attribute__((always_inline))
+vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
+{
+       uint16_t l4_proto = 0;
+       void *l4_hdr = NULL;
+       struct tcp_hdr *tcp_hdr = NULL;
+
+       parse_ethernet(m, &l4_proto, &l4_hdr);
+       if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+               if (hdr->csum_start == (m->l2_len + m->l3_len)) {
+                       switch (hdr->csum_offset) {
+                       case (offsetof(struct tcp_hdr, cksum)):
+                               if (l4_proto == IPPROTO_TCP)
+                                       m->ol_flags |= PKT_TX_TCP_CKSUM;
+                               break;
+                       case (offsetof(struct udp_hdr, dgram_cksum)):
+                               if (l4_proto == IPPROTO_UDP)
+                                       m->ol_flags |= PKT_TX_UDP_CKSUM;
+                               break;
+                       case (offsetof(struct sctp_hdr, cksum)):
+                               if (l4_proto == IPPROTO_SCTP)
+                                       m->ol_flags |= PKT_TX_SCTP_CKSUM;
+                               break;
+                       default:
+                               break;
+                       }
+               }
+       }
+
+       if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+               switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+               case VIRTIO_NET_HDR_GSO_TCPV4:
+               case VIRTIO_NET_HDR_GSO_TCPV6:
+                       tcp_hdr = (struct tcp_hdr *)l4_hdr;
+                       m->ol_flags |= PKT_TX_TCP_SEG;
+                       m->tso_segsz = hdr->gso_size;
+                       m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+                       break;
+               default:
+                       RTE_LOG(WARNING, VHOST_DATA,
+                               "unsupported gso type %u.\n", hdr->gso_type);
+                       break;
+               }
+       }
+}
+
+#define RARP_PKT_SIZE  64
+
+static int
+make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
+{
+       struct ether_hdr *eth_hdr;
+       struct arp_hdr  *rarp;
+
+       if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
+               RTE_LOG(WARNING, VHOST_DATA,
+                       "failed to make RARP; mbuf size too small %u (< %d)\n",
+                       rarp_mbuf->buf_len, RARP_PKT_SIZE);
+               return -1;
+       }
+
+       /* Ethernet header. */
+       eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
+       memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
+       ether_addr_copy(mac, &eth_hdr->s_addr);
+       eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
+
+       /* RARP header. */
+       rarp = (struct arp_hdr *)(eth_hdr + 1);
+       rarp->arp_hrd = htons(ARP_HRD_ETHER);
+       rarp->arp_pro = htons(ETHER_TYPE_IPv4);
+       rarp->arp_hln = ETHER_ADDR_LEN;
+       rarp->arp_pln = 4;
+       rarp->arp_op  = htons(ARP_OP_REVREQUEST);
+
+       ether_addr_copy(mac, &rarp->arp_data.arp_sha);
+       ether_addr_copy(mac, &rarp->arp_data.arp_tha);
+       memset(&rarp->arp_data.arp_sip, 0x00, 4);
+       memset(&rarp->arp_data.arp_tip, 0x00, 4);
+
+       rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
+
+       return 0;
+}
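+
+/*
+ * The resulting frame, for reference: a 14-byte Ethernet header
+ * (broadcast destination, RARP ethertype) followed by a 28-byte ARP
+ * payload carrying ARP_OP_REVREQUEST with the guest MAC as both
+ * sender and target hardware address, padded out to RARP_PKT_SIZE
+ * (64) bytes.
+ */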
+
+static inline int __attribute__((always_inline))
+copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                 struct rte_mbuf *m, uint16_t desc_idx,
+                 struct rte_mempool *mbuf_pool)
+{
+       struct vring_desc *desc;
+       uint64_t desc_addr;
+       uint32_t desc_avail, desc_offset;
+       uint32_t mbuf_avail, mbuf_offset;
+       uint32_t cpy_len;
+       struct rte_mbuf *cur = m, *prev = m;
+       struct virtio_net_hdr *hdr;
+       /* A counter to avoid a dead loop in the desc chain */
+       uint32_t nr_desc = 1;
+
+       desc = &vq->desc[desc_idx];
+       if (unlikely(desc->len < dev->vhost_hlen))
+               return -1;
+
+       desc_addr = gpa_to_vva(dev, desc->addr);
+       if (unlikely(!desc_addr))
+               return -1;
+
+       hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+       rte_prefetch0(hdr);
+
+       /*
+        * A virtio driver normally uses at least 2 desc buffers
+        * for Tx: the first for storing the header, and the others
+        * for storing the data.
+        */
+       if (likely((desc->len == dev->vhost_hlen) &&
+                  (desc->flags & VRING_DESC_F_NEXT) != 0)) {
+               desc = &vq->desc[desc->next];
+
+               desc_addr = gpa_to_vva(dev, desc->addr);
+               if (unlikely(!desc_addr))
+                       return -1;
+
+               rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+               desc_offset = 0;
+               desc_avail  = desc->len;
+               nr_desc    += 1;
+
+               PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+       } else {
+               desc_avail  = desc->len - dev->vhost_hlen;
+               desc_offset = dev->vhost_hlen;
+       }
+
+       mbuf_offset = 0;
+       mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+       while (1) {
+               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+               rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
+                       (void *)((uintptr_t)(desc_addr + desc_offset)),
+                       cpy_len);
+
+               mbuf_avail  -= cpy_len;
+               mbuf_offset += cpy_len;
+               desc_avail  -= cpy_len;
+               desc_offset += cpy_len;
+
+               /* This desc has reached its end; get the next one */
+               if (desc_avail == 0) {
+                       if ((desc->flags & VRING_DESC_F_NEXT) == 0)
+                               break;
+
+                       if (unlikely(desc->next >= vq->size ||
+                                    ++nr_desc > vq->size))
+                               return -1;
+                       desc = &vq->desc[desc->next];
+
+                       desc_addr = gpa_to_vva(dev, desc->addr);
+                       if (unlikely(!desc_addr))
+                               return -1;
+
+                       rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+                       desc_offset = 0;
+                       desc_avail  = desc->len;
+
+                       PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+               }
+
+               /*
+                * This mbuf has reached its end; get a new one
+                * to hold more data.
+                */
+               if (mbuf_avail == 0) {
+                       cur = rte_pktmbuf_alloc(mbuf_pool);
+                       if (unlikely(cur == NULL)) {
+                               RTE_LOG(ERR, VHOST_DATA, "Failed to "
+                                       "allocate memory for mbuf.\n");
+                               return -1;
+                       }
+
+                       prev->next = cur;
+                       prev->data_len = mbuf_offset;
+                       m->nb_segs += 1;
+                       m->pkt_len += mbuf_offset;
+                       prev = cur;
+
+                       mbuf_offset = 0;
+                       mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+               }
+       }
+
+       prev->data_len = mbuf_offset;
+       m->pkt_len    += mbuf_offset;
+
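+       /*
+        * Convert csum/GSO requests carried in the virtio-net header
+        * into the corresponding mbuf offload metadata.
+        */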
+       if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
+               vhost_dequeue_offload(hdr, m);
+
+       return 0;
+}
+
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+       struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+       struct virtio_net *dev;
+       struct rte_mbuf *rarp_mbuf = NULL;
+       struct vhost_virtqueue *vq;
+       uint32_t desc_indexes[MAX_PKT_BURST];
+       uint32_t used_idx;
+       uint32_t i = 0;
+       uint16_t free_entries;
+       uint16_t avail_idx;
+
+       dev = get_device(vid);
+       if (!dev)
+               return 0;
+
+       if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
+               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+                       dev->vid, __func__, queue_id);
+               return 0;
+       }
+
+       vq = dev->virtqueue[queue_id];
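+       /* The queue must have been enabled via VHOST_USER_SET_VRING_ENABLE. */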
+       if (unlikely(vq->enabled == 0))
+               return 0;
+
+       /*
+        * Construct a RARP broadcast packet and inject it into the "pkts"
+        * array, so that it looks like the guest itself sent it.
+        *
+        * See user_send_rarp() for more information.
+        */
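+       /*
+        * The flag is atomically tested and cleared so that the RARP
+        * packet is injected at most once per request, even when
+        * multiple threads poll the same queue.
+        */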
+       if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
+                                        &dev->broadcast_rarp.cnt, 1, 0))) {
+               rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+               if (rarp_mbuf == NULL) {
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       return 0;
+               }
+
+               if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
+                       rte_pktmbuf_free(rarp_mbuf);
+                       rarp_mbuf = NULL;
+               } else {
+                       count -= 1;
+               }
+       }
+
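+       /*
+        * Read the avail index through a volatile pointer, so the
+        * compiler reloads the value the guest may just have bumped.
+        */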
+       avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+       free_entries = avail_idx - vq->last_used_idx;
+       if (free_entries == 0)
+               goto out;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+       /* Prefetch the avail ring (to fetch head indexes) and the used ring. */
+       used_idx = vq->last_used_idx & (vq->size - 1);
+       rte_prefetch0(&vq->avail->ring[used_idx]);
+       rte_prefetch0(&vq->used->ring[used_idx]);
+
+       count = RTE_MIN(count, MAX_PKT_BURST);
+       count = RTE_MIN(count, free_entries);
+       LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+                       dev->vid, count);
+
+       /* Retrieve all of the head indexes first to avoid caching issues. */
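+       /*
+        * The used ring entries are filled in the same pass: len is 0
+        * because dequeue writes nothing back to the guest buffers, and
+        * every write is logged so live migration can track dirty pages.
+        */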
+       for (i = 0; i < count; i++) {
+               used_idx = (vq->last_used_idx + i) & (vq->size - 1);
+               desc_indexes[i] = vq->avail->ring[used_idx];
+
+               vq->used->ring[used_idx].id  = desc_indexes[i];
+               vq->used->ring[used_idx].len = 0;
+               vhost_log_used_vring(dev, vq,
+                               offsetof(struct vring_used, ring[used_idx]),
+                               sizeof(vq->used->ring[used_idx]));
+       }
+
+       /* Prefetch the first descriptor; the loop below prefetches one ahead. */
+       rte_prefetch0(&vq->desc[desc_indexes[0]]);
+       for (i = 0; i < count; i++) {
+               int err;
+
+               if (likely(i + 1 < count))
+                       rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
+
+               pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+               if (unlikely(pkts[i] == NULL)) {
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       break;
+               }
+               err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
+                                       mbuf_pool);
+               if (unlikely(err)) {
+                       rte_pktmbuf_free(pkts[i]);
+                       break;
+               }
+       }
+
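+       /*
+        * Ensure the used ring updates above are visible to the guest
+        * before the used index itself is advanced.
+        */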
+       rte_smp_wmb();
+       rte_smp_rmb();
+       vq->used->idx += i;
+       vq->last_used_idx += i;
+       vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+                       sizeof(vq->used->idx));
+
+       /* Kick guest if required. */
+       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                       && (vq->callfd >= 0))
+               eventfd_write(vq->callfd, (eventfd_t)1);
+
+out:
+       if (unlikely(rarp_mbuf != NULL)) {
+               /*
+                * Inject it at the head of the "pkts" array, so that the
+                * switch's MAC learning table gets updated first.
+                */
+               memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
+               pkts[0] = rarp_mbuf;
+               i += 1;
+       }
+
+       return i;
+}