vhost: refactor code structure

author Yuanhan Liu <yuanhan.liu@linux.intel.com>

Thu, 18 Aug 2016 08:48:39 +0000 (16:48 +0800)

committer Yuanhan Liu <yuanhan.liu@linux.intel.com>

Tue, 13 Sep 2016 03:25:08 +0000 (05:25 +0200)
author Yuanhan Liu <yuanhan.liu@linux.intel.com>
Thu, 18 Aug 2016 08:48:39 +0000 (16:48 +0800)
committer Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tue, 13 Sep 2016 03:25:08 +0000 (05:25 +0200)
diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile

index 277390f611e030b811c1e109bc113a01f65bdc53..415ffc6e34f888368ec29afc664ece6e3e80106b 100644 (file)
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -47,10 +47,8 @@ LDLIBS += -lnuma
  endif
  
  # all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += fd_man.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c socket.c vhost.c vhost_user.c \
+                                  virtio_net.c
  
  # install includes
  SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c

new file mode 100644 (file)

index 0000000..bf03f84
--- /dev/null
+++ b/lib/librte_vhost/socket.c
@@ -0,0 +1,610 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created. It describes one registered
+ * unix socket path and is freed again by rte_vhost_driver_unregister().
+ */
+struct vhost_user_socket {
+       char *path;             /* strdup()'ed copy of the registered path */
+       int listenfd;           /* listening fd, set in server mode only */
+       int connfd;             /* fd of the current connection, -1 if none */
+       bool is_server;         /* true: bind + listen, false: connect */
+       bool reconnect;         /* client mode: retry a failed/lost connection */
+};
+
+/*
+ * Per-connection context. It is handed to fdset_add() as the callback
+ * data of the connected fd, so that vhost_user_read_cb() can get from
+ * the fd back to the owning socket and to the vhost device.
+ */
+struct vhost_user_connection {
+       struct vhost_user_socket *vsocket;      /* socket the connection came from */
+       int vid;                                /* id returned by vhost_new_device() */
+};
+
+/* Maximum number of vhost-user sockets that can be registered at once. */
+#define MAX_VHOST_SOCKET 1024
+/* Library-wide state shared by all registered vhost-user sockets. */
+struct vhost_user {
+       /* registered sockets; entries [0, vsocket_cnt) are in use */
+       struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+       /* fds polled by fdset_event_dispatch() */
+       struct fdset fdset;
+       int vsocket_cnt;
+       /* taken by register/unregister around vsockets[] and vsocket_cnt */
+       pthread_mutex_t mutex;
+};
+
+/* Backlog handed to listen() for server mode sockets. */
+#define MAX_VIRTIO_BACKLOG 128
+
+/* Forward declarations of callbacks and helpers defined further down. */
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int vhost_user_create_client(struct vhost_user_socket *vsocket);
+
+/*
+ * The single instance of the global state. Every fdset slot starts out
+ * as {-1, NULL, NULL, NULL, 0}; presumably -1 marks an unused fd, see
+ * fd_man.h for the slot layout.
+ */
+static struct vhost_user vhost_user = {
+       .fdset = {
+               .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+               .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+               .num = 0
+       },
+       .vsocket_cnt = 0,
+       .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/*
+ * Receive one message from sockfd into buf (at most buflen bytes),
+ * together with any file descriptors passed as SCM_RIGHTS ancillary
+ * data.
+ *
+ * fds must have room for fd_num descriptors. On success the descriptors
+ * that were actually received are stored at the front of fds and every
+ * remaining slot is set to -1, so callers never see stale values.
+ *
+ * Returns the number of bytes read on success, 0 when the peer closed
+ * the connection, or a negative value on failure or truncation.
+ */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+       struct iovec iov;
+       struct msghdr msgh;
+       size_t fdsize = fd_num * sizeof(int);
+       char control[CMSG_SPACE(fdsize)];
+       struct cmsghdr *cmsg;
+       int got_fds = 0;
+       int ret;
+
+       memset(&msgh, 0, sizeof(msgh));
+       iov.iov_base = buf;
+       iov.iov_len  = buflen;
+
+       msgh.msg_iov = &iov;
+       msgh.msg_iovlen = 1;
+       msgh.msg_control = control;
+       msgh.msg_controllen = sizeof(control);
+
+       ret = recvmsg(sockfd, &msgh, 0);
+       if (ret <= 0) {
+               RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+               return ret;
+       }
+
+       if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+               RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
+               return -1;
+       }
+
+       for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+               cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+               if ((cmsg->cmsg_level == SOL_SOCKET) &&
+                       (cmsg->cmsg_type == SCM_RIGHTS)) {
+                       /*
+                        * Only copy the descriptors the peer really
+                        * sent; the payload may be shorter than fdsize.
+                        */
+                       size_t nfds = (cmsg->cmsg_len - CMSG_LEN(0)) /
+                                       sizeof(int);
+
+                       if (nfds > (size_t)fd_num)
+                               nfds = fd_num;
+                       memcpy(fds, CMSG_DATA(cmsg), nfds * sizeof(int));
+                       got_fds = nfds;
+                       break;
+               }
+       }
+
+       /* mark the slots that received no descriptor as invalid */
+       while (got_fds < fd_num)
+               fds[got_fds++] = -1;
+
+       return ret;
+}
+
+/*
+ * Send buflen bytes from buf over sockfd. When fds is non-NULL and
+ * fd_num is positive, the descriptors are attached as SCM_RIGHTS
+ * ancillary data. sendmsg() is retried while it fails with EINTR.
+ *
+ * Returns the value of the last sendmsg() call: the number of bytes
+ * sent, or a negative value on error.
+ */
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+       size_t fdsize = fd_num * sizeof(int);
+       char control[CMSG_SPACE(fdsize)];
+       struct cmsghdr *cmsg;
+       struct msghdr msgh;
+       struct iovec iov;
+       int ret;
+
+       iov.iov_base = buf;
+       iov.iov_len = buflen;
+
+       memset(&msgh, 0, sizeof(msgh));
+       msgh.msg_iov = &iov;
+       msgh.msg_iovlen = 1;
+
+       /* no ancillary data unless descriptors were supplied */
+       msgh.msg_control = NULL;
+       msgh.msg_controllen = 0;
+
+       if (fds && fd_num > 0) {
+               msgh.msg_control = control;
+               msgh.msg_controllen = sizeof(control);
+
+               cmsg = CMSG_FIRSTHDR(&msgh);
+               cmsg->cmsg_level = SOL_SOCKET;
+               cmsg->cmsg_type = SCM_RIGHTS;
+               cmsg->cmsg_len = CMSG_LEN(fdsize);
+               memcpy(CMSG_DATA(cmsg), fds, fdsize);
+       }
+
+       for (;;) {
+               ret = sendmsg(sockfd, &msgh, 0);
+               if (ret >= 0 || errno != EINTR)
+                       break;
+       }
+
+       if (ret < 0)
+               RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
+
+       return ret;
+}
+
+/*
+ * Create a vhost device for a freshly established connection and start
+ * watching fd for vhost-user messages.
+ *
+ * This function takes ownership of fd: on any failure fd is closed and
+ * everything set up here, including the vhost device, is released.
+ */
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+       int vid;
+       size_t size;
+       struct vhost_user_connection *conn;
+       int ret;
+
+       conn = malloc(sizeof(*conn));
+       if (conn == NULL) {
+               close(fd);
+               return;
+       }
+
+       vid = vhost_new_device();
+       if (vid == -1) {
+               close(fd);
+               free(conn);
+               return;
+       }
+
+       /* the device is named after the socket path it was reached through */
+       size = strnlen(vsocket->path, PATH_MAX);
+       vhost_set_ifname(vid, vsocket->path, size);
+
+       RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+       vsocket->connfd = fd;
+       conn->vsocket = vsocket;
+       conn->vid = vid;
+       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+                       NULL, conn);
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to add fd %d into vhost server fdset\n",
+                       fd);
+               vsocket->connfd = -1;
+               /* nobody will ever service this device: do not leak it */
+               vhost_destroy_device(vid);
+               free(conn);
+               close(fd);
+       }
+}
+
+/*
+ * fdset callback for a listening socket: accept the pending client and
+ * turn the new fd into a vhost-user connection. A failed accept() is
+ * silently ignored.
+ */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+       struct vhost_user_socket *vsocket = dat;
+       int connfd = accept(fd, NULL, NULL);
+
+       if (connfd >= 0) {
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "new vhost user connection is %d\n", connfd);
+               vhost_user_add_connection(connfd, vsocket);
+       }
+}
+
+/*
+ * fdset callback for a connected vhost-user fd: let the message handler
+ * process one message. When the handler fails, the connection is torn
+ * down (fd closed and removed from the fdset, device destroyed) and,
+ * if the socket asks for it, a client reconnection is started.
+ */
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+       struct vhost_user_connection *conn = dat;
+       struct vhost_user_socket *vsocket = conn->vsocket;
+       /* cache the id: conn is freed before the device is destroyed */
+       int vid = conn->vid;
+       int ret;
+
+       ret = vhost_user_msg_handler(vid, connfd);
+       if (ret < 0) {
+               vsocket->connfd = -1;
+               close(connfd);
+               *remove = 1;
+               free(conn);
+               vhost_destroy_device(vid);
+
+               if (vsocket->reconnect)
+                       vhost_user_create_client(vsocket);
+       }
+}
+
+/*
+ * Open an AF_UNIX stream socket and fill in *un with path (truncated
+ * to fit sun_path, always NUL terminated). Client sockets are switched
+ * to non-blocking mode; server sockets are left untouched.
+ *
+ * Returns the new fd, or -1 on failure.
+ */
+static int
+create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
+{
+       const char *role = is_server ? "server" : "client";
+       int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+
+       if (sock < 0)
+               return -1;
+       RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+               role, sock);
+
+       if (!is_server) {
+               if (fcntl(sock, F_SETFL, O_NONBLOCK)) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost-user: can't set nonblocking mode for socket, fd: "
+                               "%d (%s)\n", sock, strerror(errno));
+                       close(sock);
+                       return -1;
+               }
+       }
+
+       memset(un, 0, sizeof(*un));
+       un->sun_family = AF_UNIX;
+       /* the last byte stays 0 from the memset, so the path is terminated */
+       strncpy(un->sun_path, path, sizeof(un->sun_path) - 1);
+
+       return sock;
+}
+
+/*
+ * Server mode set-up: create the unix socket, bind it to the socket
+ * path, start listening and register the listening fd so that incoming
+ * clients are accepted by vhost_user_server_new_connection().
+ *
+ * Returns 0 on success, -1 on failure (the socket is closed again).
+ */
+static int
+vhost_user_create_server(struct vhost_user_socket *vsocket)
+{
+       struct sockaddr_un un;
+       const char *path = vsocket->path;
+       int fd;
+
+       fd = create_unix_socket(path, &un, vsocket->is_server);
+       if (fd < 0)
+               return -1;
+
+       if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to bind to %s: %s; remove it and try again\n",
+                       path, strerror(errno));
+               goto err;
+       }
+       RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+       if (listen(fd, MAX_VIRTIO_BACKLOG) < 0)
+               goto err;
+
+       vsocket->listenfd = fd;
+       if (fdset_add(&vhost_user.fdset, fd,
+                     vhost_user_server_new_connection, NULL, vsocket) < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to add listen fd %d to vhost server fdset\n",
+                       fd);
+               goto err;
+       }
+
+       return 0;
+
+err:
+       close(fd);
+       return -1;
+}
+
+/*
+ * One pending client connection: a socket that could not be connected
+ * yet and is retried by the reconnect thread.
+ */
+struct vhost_user_reconnect {
+       struct sockaddr_un un;                  /* address to connect to */
+       int fd;                                 /* socket used for the attempts */
+       struct vhost_user_socket *vsocket;      /* socket being reconnected */
+
+       TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+/* List of pending reconnections together with the mutex guarding it. */
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+       struct vhost_user_reconnect_tailq_list head;
+       pthread_mutex_t mutex;
+};
+
+/* Both are set up by vhost_user_reconnect_init(). */
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
+/*
+ * Try to connect fd to the given address and, once connected, switch
+ * the socket back to blocking mode.
+ *
+ * Returns:
+ *    0  connected (EISCONN counts as connected as well)
+ *   -1  connect() failed; the caller may try again later
+ *   -2  the fd flags could not be read or changed; fd is unusable
+ */
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+       int flags;
+
+       if (connect(fd, un, sz) < 0 && errno != EISCONN)
+               return -1;
+
+       flags = fcntl(fd, F_GETFL, 0);
+       if (flags < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "can't get flags for connfd %d\n", fd);
+               return -2;
+       }
+
+       /* already blocking: nothing left to do */
+       if (!(flags & O_NONBLOCK))
+               return 0;
+
+       if (fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                               "can't disable nonblocking on fd %d\n", fd);
+               return -2;
+       }
+
+       return 0;
+}
+
+/*
+ * Body of the reconnect thread. Once per second it walks the list of
+ * pending reconnections under reconn_list.mutex and retries each one:
+ * an entry is dropped from the list when it finally connects or when
+ * its fd turns out to be unusable, and is kept for the next round when
+ * connect() simply failed again. The loop never terminates.
+ */
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+       int ret;
+       struct vhost_user_reconnect *reconn, *next;
+
+       while (1) {
+               pthread_mutex_lock(&reconn_list.mutex);
+
+               /*
+                * An equal implementation of TAILQ_FOREACH_SAFE,
+                * which does not exist on all platforms.
+                */
+               for (reconn = TAILQ_FIRST(&reconn_list.head);
+                    reconn != NULL; reconn = next) {
+                       /* fetched up front: reconn may be freed below */
+                       next = TAILQ_NEXT(reconn, next);
+
+                       ret = vhost_user_connect_nonblock(reconn->fd,
+                                               (struct sockaddr *)&reconn->un,
+                                               sizeof(reconn->un));
+                       if (ret == -2) {
+                               /* fd is unusable: give up on this entry */
+                               close(reconn->fd);
+                               RTE_LOG(ERR, VHOST_CONFIG,
+                                       "reconnection for fd %d failed\n",
+                                       reconn->fd);
+                               goto remove_fd;
+                       }
+                       /* still not connected: retry on the next round */
+                       if (ret == -1)
+                               continue;
+
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "%s: connected\n", reconn->vsocket->path);
+                       /* the connection code takes over the fd from here */
+                       vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
+                       free(reconn);
+               }
+
+               pthread_mutex_unlock(&reconn_list.mutex);
+               sleep(1);
+       }
+
+       return NULL;
+}
+
+/*
+ * Initialize the list of pending client reconnections and start the
+ * thread that services it.
+ *
+ * Returns 0 on success, -1 when the thread could not be created.
+ */
+static int
+vhost_user_reconnect_init(void)
+{
+       int ret;
+
+       pthread_mutex_init(&reconn_list.mutex, NULL);
+       TAILQ_INIT(&reconn_list.head);
+
+       /*
+        * pthread_create() reports failure through a positive error
+        * number, never a negative value: map it to the -1 callers
+        * test for.
+        */
+       ret = pthread_create(&reconn_tid, NULL,
+                            vhost_user_client_reconnect, NULL);
+       if (ret != 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to create reconnect thread\n");
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Client mode set-up: create the unix socket and try to connect to the
+ * socket path right away. When that fails and the socket has reconnect
+ * enabled, the attempt is queued for the reconnect thread instead.
+ *
+ * Returns 0 when the connection was established or queued for a later
+ * retry, -1 otherwise (the socket is closed again).
+ */
+static int
+vhost_user_create_client(struct vhost_user_socket *vsocket)
+{
+       struct vhost_user_reconnect *reconn;
+       const char *path = vsocket->path;
+       struct sockaddr_un un;
+       int fd;
+       int ret;
+
+       fd = create_unix_socket(path, &un, vsocket->is_server);
+       if (fd < 0)
+               return -1;
+
+       ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
+                                         sizeof(un));
+       if (ret == 0) {
+               vhost_user_add_connection(fd, vsocket);
+               return 0;
+       }
+
+       RTE_LOG(ERR, VHOST_CONFIG,
+               "failed to connect to %s: %s\n",
+               path, strerror(errno));
+
+       /* an unusable fd, or no reconnection wanted: give up */
+       if (ret == -2 || !vsocket->reconnect)
+               goto err;
+
+       RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
+       reconn = malloc(sizeof(*reconn));
+       if (reconn == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to allocate memory for reconnect\n");
+               goto err;
+       }
+
+       reconn->vsocket = vsocket;
+       reconn->fd = fd;
+       reconn->un = un;
+
+       pthread_mutex_lock(&reconn_list.mutex);
+       TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+       pthread_mutex_unlock(&reconn_list.mutex);
+
+       return 0;
+
+err:
+       close(fd);
+       return -1;
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as server
+ * (the default case), or as client (when the RTE_VHOST_USER_CLIENT
+ * flag is set). In client mode, reconnection is enabled unless
+ * RTE_VHOST_USER_NO_RECONNECT is set as well.
+ *
+ * Returns 0 on success and -1 on failure: NULL path, too many sockets
+ * registered, out of memory, or the socket could not be set up.
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+       int ret = -1;
+       struct vhost_user_socket *vsocket;
+
+       if (!path)
+               return -1;
+
+       pthread_mutex_lock(&vhost_user.mutex);
+
+       if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "error: the number of vhost sockets reaches maximum\n");
+               goto out;
+       }
+
+       vsocket = malloc(sizeof(struct vhost_user_socket));
+       if (!vsocket)
+               goto out;
+       memset(vsocket, 0, sizeof(struct vhost_user_socket));
+       vsocket->connfd = -1;
+       /* the path is dereferenced later on: fail early without a copy */
+       vsocket->path = strdup(path);
+       if (vsocket->path == NULL)
+               goto out_free;
+
+       if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+               vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+               /* the reconnect thread is started on first use only */
+               if (vsocket->reconnect && reconn_tid == 0) {
+                       if (vhost_user_reconnect_init() < 0)
+                               goto out_free;
+               }
+               ret = vhost_user_create_client(vsocket);
+       } else {
+               vsocket->is_server = true;
+               ret = vhost_user_create_server(vsocket);
+       }
+       if (ret < 0)
+               goto out_free;
+
+       vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return ret;
+
+out_free:
+       free(vsocket->path);
+       free(vsocket);
+out:
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return ret;
+}
+
+/*
+ * Drop the pending reconnection that belongs to vsocket, if there is
+ * one: the entry is unlinked from the list, its fd closed and its
+ * memory freed. Returns true when an entry was found.
+ */
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+       struct vhost_user_reconnect *entry;
+       bool removed = false;
+
+       pthread_mutex_lock(&reconn_list.mutex);
+
+       /* the walk stops right after the removal, so no _SAFE variant is needed */
+       TAILQ_FOREACH(entry, &reconn_list.head, next) {
+               if (entry->vsocket != vsocket)
+                       continue;
+
+               TAILQ_REMOVE(&reconn_list.head, entry, next);
+               close(entry->fd);
+               free(entry);
+               removed = true;
+               break;
+       }
+
+       pthread_mutex_unlock(&reconn_list.mutex);
+       return removed;
+}
+
+/**
+ * Unregister the specified vhost socket.
+ *
+ * The listening socket (server mode) or the pending reconnection
+ * (client mode with reconnect) is torn down, the live connection, if
+ * any, is closed and its vhost device destroyed, and the socket is
+ * removed from the table of registered sockets.
+ *
+ * Returns 0 on success, -1 when path is NULL or was never registered.
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+       int i;
+       int count;
+       struct vhost_user_connection *conn;
+
+       /* same contract as rte_vhost_driver_register(): NULL is an error */
+       if (!path)
+               return -1;
+
+       pthread_mutex_lock(&vhost_user.mutex);
+
+       for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+               struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+               if (!strcmp(vsocket->path, path)) {
+                       if (vsocket->is_server) {
+                               fdset_del(&vhost_user.fdset, vsocket->listenfd);
+                               close(vsocket->listenfd);
+                               unlink(path);
+                       } else if (vsocket->reconnect) {
+                               vhost_user_remove_reconnect(vsocket);
+                       }
+
+                       /* conn is the callback data stored with the fd */
+                       conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
+                       if (conn) {
+                               RTE_LOG(INFO, VHOST_CONFIG,
+                                       "free connfd = %d for device '%s'\n",
+                                       vsocket->connfd, path);
+                               close(vsocket->connfd);
+                               vhost_destroy_device(conn->vid);
+                               free(conn);
+                       }
+
+                       free(vsocket->path);
+                       free(vsocket);
+
+                       /* keep the table dense: move the last entry here */
+                       count = --vhost_user.vsocket_cnt;
+                       vhost_user.vsockets[i] = vhost_user.vsockets[count];
+                       vhost_user.vsockets[count] = NULL;
+                       pthread_mutex_unlock(&vhost_user.mutex);
+
+                       return 0;
+               }
+       }
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return -1;
+}
+
+/*
+ * Run the event dispatcher on the fdset holding the listening and
+ * connected vhost-user fds, in the calling thread. Returns 0 once
+ * fdset_event_dispatch() returns.
+ */
+int
+rte_vhost_driver_session_start(void)
+{
+       fdset_event_dispatch(&vhost_user.fdset);
+       return 0;
+}
diff --git a/lib/librte_vhost/vhost-net-user.c b/lib/librte_vhost/vhost-net-user.c

deleted file mode 100644 (file)

index b35594d..0000000
--- a/lib/librte_vhost/vhost-net-user.c
+++ /dev/null
@@ -1,795 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/queue.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-
-#include <rte_log.h>
-#include <rte_virtio_net.h>
-
-#include "fd_man.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-#include "virtio-net-user.h"
-
-/*
- * Every time rte_vhost_driver_register() is invoked, an associated
- * vhost_user_socket struct will be created.
- */
-struct vhost_user_socket {
-       char *path;
-       int listenfd;
-       int connfd;
-       bool is_server;
-       bool reconnect;
-};
-
-struct vhost_user_connection {
-       struct vhost_user_socket *vsocket;
-       int vid;
-};
-
-#define MAX_VHOST_SOCKET 1024
-struct vhost_user {
-       struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
-       struct fdset fdset;
-       int vsocket_cnt;
-       pthread_mutex_t mutex;
-};
-
-#define MAX_VIRTIO_BACKLOG 128
-
-static void vhost_user_server_new_connection(int fd, void *data, int *remove);
-static void vhost_user_msg_handler(int fd, void *dat, int *remove);
-static int vhost_user_create_client(struct vhost_user_socket *vsocket);
-
-static struct vhost_user vhost_user = {
-       .fdset = {
-               .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
-               .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
-               .num = 0
-       },
-       .vsocket_cnt = 0,
-       .mutex = PTHREAD_MUTEX_INITIALIZER,
-};
-
-static const char *vhost_message_str[VHOST_USER_MAX] = {
-       [VHOST_USER_NONE] = "VHOST_USER_NONE",
-       [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
-       [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
-       [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
-       [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
-       [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
-       [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
-       [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
-       [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
-       [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
-       [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
-       [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
-       [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
-       [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
-       [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
-       [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
-       [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
-       [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
-       [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
-       [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
-};
-
-/* return bytes# of read on success or negative val on failure. */
-static int
-read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-       struct iovec iov;
-       struct msghdr msgh;
-       size_t fdsize = fd_num * sizeof(int);
-       char control[CMSG_SPACE(fdsize)];
-       struct cmsghdr *cmsg;
-       int ret;
-
-       memset(&msgh, 0, sizeof(msgh));
-       iov.iov_base = buf;
-       iov.iov_len  = buflen;
-
-       msgh.msg_iov = &iov;
-       msgh.msg_iovlen = 1;
-       msgh.msg_control = control;
-       msgh.msg_controllen = sizeof(control);
-
-       ret = recvmsg(sockfd, &msgh, 0);
-       if (ret <= 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
-               return ret;
-       }
-
-       if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
-               RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
-               return -1;
-       }
-
-       for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
-               cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
-               if ((cmsg->cmsg_level == SOL_SOCKET) &&
-                       (cmsg->cmsg_type == SCM_RIGHTS)) {
-                       memcpy(fds, CMSG_DATA(cmsg), fdsize);
-                       break;
-               }
-       }
-
-       return ret;
-}
-
-/* return bytes# of read on success or negative val on failure. */
-static int
-read_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
-       int ret;
-
-       ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
-               msg->fds, VHOST_MEMORY_MAX_NREGIONS);
-       if (ret <= 0)
-               return ret;
-
-       if (msg && msg->size) {
-               if (msg->size > sizeof(msg->payload)) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "invalid msg size: %d\n", msg->size);
-                       return -1;
-               }
-               ret = read(sockfd, &msg->payload, msg->size);
-               if (ret <= 0)
-                       return ret;
-               if (ret != (int)msg->size) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "read control message failed\n");
-                       return -1;
-               }
-       }
-
-       return ret;
-}
-
-static int
-send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-
-       struct iovec iov;
-       struct msghdr msgh;
-       size_t fdsize = fd_num * sizeof(int);
-       char control[CMSG_SPACE(fdsize)];
-       struct cmsghdr *cmsg;
-       int ret;
-
-       memset(&msgh, 0, sizeof(msgh));
-       iov.iov_base = buf;
-       iov.iov_len = buflen;
-
-       msgh.msg_iov = &iov;
-       msgh.msg_iovlen = 1;
-
-       if (fds && fd_num > 0) {
-               msgh.msg_control = control;
-               msgh.msg_controllen = sizeof(control);
-               cmsg = CMSG_FIRSTHDR(&msgh);
-               cmsg->cmsg_len = CMSG_LEN(fdsize);
-               cmsg->cmsg_level = SOL_SOCKET;
-               cmsg->cmsg_type = SCM_RIGHTS;
-               memcpy(CMSG_DATA(cmsg), fds, fdsize);
-       } else {
-               msgh.msg_control = NULL;
-               msgh.msg_controllen = 0;
-       }
-
-       do {
-               ret = sendmsg(sockfd, &msgh, 0);
-       } while (ret < 0 && errno == EINTR);
-
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
-               return ret;
-       }
-
-       return ret;
-}
-
-static int
-send_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
-       int ret;
-
-       if (!msg)
-               return 0;
-
-       msg->flags &= ~VHOST_USER_VERSION_MASK;
-       msg->flags |= VHOST_USER_VERSION;
-       msg->flags |= VHOST_USER_REPLY_MASK;
-
-       ret = send_fd_message(sockfd, (char *)msg,
-               VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
-
-       return ret;
-}
-
-
-static void
-vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
-{
-       int vid;
-       size_t size;
-       struct vhost_user_connection *conn;
-       int ret;
-
-       conn = malloc(sizeof(*conn));
-       if (conn == NULL) {
-               close(fd);
-               return;
-       }
-
-       vid = vhost_new_device();
-       if (vid == -1) {
-               close(fd);
-               free(conn);
-               return;
-       }
-
-       size = strnlen(vsocket->path, PATH_MAX);
-       vhost_set_ifname(vid, vsocket->path, size);
-
-       RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
-
-       vsocket->connfd = fd;
-       conn->vsocket = vsocket;
-       conn->vid = vid;
-       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler,
-                       NULL, conn);
-       if (ret < 0) {
-               vsocket->connfd = -1;
-               free(conn);
-               close(fd);
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to add fd %d into vhost server fdset\n",
-                       fd);
-       }
-}
-
-/* call back when there is new vhost-user connection from client  */
-static void
-vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
-{
-       struct vhost_user_socket *vsocket = dat;
-
-       fd = accept(fd, NULL, NULL);
-       if (fd < 0)
-               return;
-
-       RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
-       vhost_user_add_connection(fd, vsocket);
-}
-
-/* callback when there is message on the connfd */
-static void
-vhost_user_msg_handler(int connfd, void *dat, int *remove)
-{
-       int vid;
-       struct vhost_user_connection *conn = dat;
-       struct VhostUserMsg msg;
-       uint64_t features;
-       int ret;
-
-       vid = conn->vid;
-       ret = read_vhost_message(connfd, &msg);
-       if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
-               struct vhost_user_socket *vsocket = conn->vsocket;
-
-               if (ret < 0)
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost read message failed\n");
-               else if (ret == 0)
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "vhost peer closed\n");
-               else
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost read incorrect message\n");
-
-               vsocket->connfd = -1;
-               close(connfd);
-               *remove = 1;
-               free(conn);
-               vhost_destroy_device(vid);
-
-               if (vsocket->reconnect)
-                       vhost_user_create_client(vsocket);
-
-               return;
-       }
-
-       RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
-               vhost_message_str[msg.request]);
-       switch (msg.request) {
-       case VHOST_USER_GET_FEATURES:
-               ret = vhost_get_features(vid, &features);
-               msg.payload.u64 = features;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_FEATURES:
-               features = msg.payload.u64;
-               vhost_set_features(vid, &features);
-               break;
-
-       case VHOST_USER_GET_PROTOCOL_FEATURES:
-               msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_PROTOCOL_FEATURES:
-               user_set_protocol_features(vid, msg.payload.u64);
-               break;
-
-       case VHOST_USER_SET_OWNER:
-               vhost_set_owner(vid);
-               break;
-       case VHOST_USER_RESET_OWNER:
-               vhost_reset_owner(vid);
-               break;
-
-       case VHOST_USER_SET_MEM_TABLE:
-               user_set_mem_table(vid, &msg);
-               break;
-
-       case VHOST_USER_SET_LOG_BASE:
-               user_set_log_base(vid, &msg);
-
-               /* it needs a reply */
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_LOG_FD:
-               close(msg.fds[0]);
-               RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
-               break;
-
-       case VHOST_USER_SET_VRING_NUM:
-               vhost_set_vring_num(vid, &msg.payload.state);
-               break;
-       case VHOST_USER_SET_VRING_ADDR:
-               vhost_set_vring_addr(vid, &msg.payload.addr);
-               break;
-       case VHOST_USER_SET_VRING_BASE:
-               vhost_set_vring_base(vid, &msg.payload.state);
-               break;
-
-       case VHOST_USER_GET_VRING_BASE:
-               ret = user_get_vring_base(vid, &msg.payload.state);
-               msg.size = sizeof(msg.payload.state);
-               send_vhost_message(connfd, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_KICK:
-               user_set_vring_kick(vid, &msg);
-               break;
-       case VHOST_USER_SET_VRING_CALL:
-               user_set_vring_call(vid, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_ERR:
-               if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
-                       close(msg.fds[0]);
-               RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
-               break;
-
-       case VHOST_USER_GET_QUEUE_NUM:
-               msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_ENABLE:
-               user_set_vring_enable(vid, &msg.payload.state);
-               break;
-       case VHOST_USER_SEND_RARP:
-               user_send_rarp(vid, &msg);
-               break;
-
-       default:
-               break;
-
-       }
-}
-
-static int
-create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
-{
-       int fd;
-
-       fd = socket(AF_UNIX, SOCK_STREAM, 0);
-       if (fd < 0)
-               return -1;
-       RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
-               is_server ? "server" : "client", fd);
-
-       if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "vhost-user: can't set nonblocking mode for socket, fd: "
-                       "%d (%s)\n", fd, strerror(errno));
-               close(fd);
-               return -1;
-       }
-
-       memset(un, 0, sizeof(*un));
-       un->sun_family = AF_UNIX;
-       strncpy(un->sun_path, path, sizeof(un->sun_path));
-       un->sun_path[sizeof(un->sun_path) - 1] = '\0';
-
-       return fd;
-}
-
-static int
-vhost_user_create_server(struct vhost_user_socket *vsocket)
-{
-       int fd;
-       int ret;
-       struct sockaddr_un un;
-       const char *path = vsocket->path;
-
-       fd = create_unix_socket(path, &un, vsocket->is_server);
-       if (fd < 0)
-               return -1;
-
-       ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to bind to %s: %s; remove it and try again\n",
-                       path, strerror(errno));
-               goto err;
-       }
-       RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
-
-       ret = listen(fd, MAX_VIRTIO_BACKLOG);
-       if (ret < 0)
-               goto err;
-
-       vsocket->listenfd = fd;
-       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
-                 NULL, vsocket);
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to add listen fd %d to vhost server fdset\n",
-                       fd);
-               goto err;
-       }
-
-       return 0;
-
-err:
-       close(fd);
-       return -1;
-}
-
-struct vhost_user_reconnect {
-       struct sockaddr_un un;
-       int fd;
-       struct vhost_user_socket *vsocket;
-
-       TAILQ_ENTRY(vhost_user_reconnect) next;
-};
-
-TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
-struct vhost_user_reconnect_list {
-       struct vhost_user_reconnect_tailq_list head;
-       pthread_mutex_t mutex;
-};
-
-static struct vhost_user_reconnect_list reconn_list;
-static pthread_t reconn_tid;
-
-static int
-vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
-{
-       int ret, flags;
-
-       ret = connect(fd, un, sz);
-       if (ret < 0 && errno != EISCONN)
-               return -1;
-
-       flags = fcntl(fd, F_GETFL, 0);
-       if (flags < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "can't get flags for connfd %d\n", fd);
-               return -2;
-       }
-       if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                               "can't disable nonblocking on fd %d\n", fd);
-               return -2;
-       }
-       return 0;
-}
-
-static void *
-vhost_user_client_reconnect(void *arg __rte_unused)
-{
-       int ret;
-       struct vhost_user_reconnect *reconn, *next;
-
-       while (1) {
-               pthread_mutex_lock(&reconn_list.mutex);
-
-               /*
-                * An equal implementation of TAILQ_FOREACH_SAFE,
-                * which does not exist on all platforms.
-                */
-               for (reconn = TAILQ_FIRST(&reconn_list.head);
-                    reconn != NULL; reconn = next) {
-                       next = TAILQ_NEXT(reconn, next);
-
-                       ret = vhost_user_connect_nonblock(reconn->fd,
-                                               (struct sockaddr *)&reconn->un,
-                                               sizeof(reconn->un));
-                       if (ret == -2) {
-                               close(reconn->fd);
-                               RTE_LOG(ERR, VHOST_CONFIG,
-                                       "reconnection for fd %d failed\n",
-                                       reconn->fd);
-                               goto remove_fd;
-                       }
-                       if (ret == -1)
-                               continue;
-
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "%s: connected\n", reconn->vsocket->path);
-                       vhost_user_add_connection(reconn->fd, reconn->vsocket);
-remove_fd:
-                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
-                       free(reconn);
-               }
-
-               pthread_mutex_unlock(&reconn_list.mutex);
-               sleep(1);
-       }
-
-       return NULL;
-}
-
-static int
-vhost_user_reconnect_init(void)
-{
-       int ret;
-
-       pthread_mutex_init(&reconn_list.mutex, NULL);
-       TAILQ_INIT(&reconn_list.head);
-
-       ret = pthread_create(&reconn_tid, NULL,
-                            vhost_user_client_reconnect, NULL);
-       if (ret < 0)
-               RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
-
-       return ret;
-}
-
-static int
-vhost_user_create_client(struct vhost_user_socket *vsocket)
-{
-       int fd;
-       int ret;
-       struct sockaddr_un un;
-       const char *path = vsocket->path;
-       struct vhost_user_reconnect *reconn;
-
-       fd = create_unix_socket(path, &un, vsocket->is_server);
-       if (fd < 0)
-               return -1;
-
-       ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
-                                         sizeof(un));
-       if (ret == 0) {
-               vhost_user_add_connection(fd, vsocket);
-               return 0;
-       }
-
-       RTE_LOG(ERR, VHOST_CONFIG,
-               "failed to connect to %s: %s\n",
-               path, strerror(errno));
-
-       if (ret == -2 || !vsocket->reconnect) {
-               close(fd);
-               return -1;
-       }
-
-       RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
-       reconn = malloc(sizeof(*reconn));
-       if (reconn == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to allocate memory for reconnect\n");
-               close(fd);
-               return -1;
-       }
-       reconn->un = un;
-       reconn->fd = fd;
-       reconn->vsocket = vsocket;
-       pthread_mutex_lock(&reconn_list.mutex);
-       TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
-       pthread_mutex_unlock(&reconn_list.mutex);
-
-       return 0;
-}
-
-/*
- * Register a new vhost-user socket; here we could act as server
- * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
- * is set.
- */
-int
-rte_vhost_driver_register(const char *path, uint64_t flags)
-{
-       int ret = -1;
-       struct vhost_user_socket *vsocket;
-
-       if (!path)
-               return -1;
-
-       pthread_mutex_lock(&vhost_user.mutex);
-
-       if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "error: the number of vhost sockets reaches maximum\n");
-               goto out;
-       }
-
-       vsocket = malloc(sizeof(struct vhost_user_socket));
-       if (!vsocket)
-               goto out;
-       memset(vsocket, 0, sizeof(struct vhost_user_socket));
-       vsocket->path = strdup(path);
-       vsocket->connfd = -1;
-
-       if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
-               vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
-               if (vsocket->reconnect && reconn_tid == 0) {
-                       if (vhost_user_reconnect_init() < 0) {
-                               free(vsocket->path);
-                               free(vsocket);
-                               goto out;
-                       }
-               }
-               ret = vhost_user_create_client(vsocket);
-       } else {
-               vsocket->is_server = true;
-               ret = vhost_user_create_server(vsocket);
-       }
-       if (ret < 0) {
-               free(vsocket->path);
-               free(vsocket);
-               goto out;
-       }
-
-       vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
-
-out:
-       pthread_mutex_unlock(&vhost_user.mutex);
-
-       return ret;
-}
-
-static bool
-vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
-{
-       int found = false;
-       struct vhost_user_reconnect *reconn, *next;
-
-       pthread_mutex_lock(&reconn_list.mutex);
-
-       for (reconn = TAILQ_FIRST(&reconn_list.head);
-            reconn != NULL; reconn = next) {
-               next = TAILQ_NEXT(reconn, next);
-
-               if (reconn->vsocket == vsocket) {
-                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
-                       close(reconn->fd);
-                       free(reconn);
-                       found = true;
-                       break;
-               }
-       }
-       pthread_mutex_unlock(&reconn_list.mutex);
-       return found;
-}
-
-/**
- * Unregister the specified vhost socket
- */
-int
-rte_vhost_driver_unregister(const char *path)
-{
-       int i;
-       int count;
-       struct vhost_user_connection *conn;
-
-       pthread_mutex_lock(&vhost_user.mutex);
-
-       for (i = 0; i < vhost_user.vsocket_cnt; i++) {
-               struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
-
-               if (!strcmp(vsocket->path, path)) {
-                       if (vsocket->is_server) {
-                               fdset_del(&vhost_user.fdset, vsocket->listenfd);
-                               close(vsocket->listenfd);
-                               unlink(path);
-                       } else if (vsocket->reconnect) {
-                               vhost_user_remove_reconnect(vsocket);
-                       }
-
-                       conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
-                       if (conn) {
-                               RTE_LOG(INFO, VHOST_CONFIG,
-                                       "free connfd = %d for device '%s'\n",
-                                       vsocket->connfd, path);
-                               close(vsocket->connfd);
-                               vhost_destroy_device(conn->vid);
-                               free(conn);
-                       }
-
-                       free(vsocket->path);
-                       free(vsocket);
-
-                       count = --vhost_user.vsocket_cnt;
-                       vhost_user.vsockets[i] = vhost_user.vsockets[count];
-                       vhost_user.vsockets[count] = NULL;
-                       pthread_mutex_unlock(&vhost_user.mutex);
-
-                       return 0;
-               }
-       }
-       pthread_mutex_unlock(&vhost_user.mutex);
-
-       return -1;
-}
-
-int
-rte_vhost_driver_session_start(void)
-{
-       fdset_event_dispatch(&vhost_user.fdset);
-       return 0;
-}
diff --git a/lib/librte_vhost/vhost-net-user.h b/lib/librte_vhost/vhost-net-user.h

deleted file mode 100644 (file)

index f533239..0000000
--- a/lib/librte_vhost/vhost-net-user.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_USER_H
-#define _VHOST_NET_USER_H
-
-#include <stdint.h>
-#include <linux/vhost.h>
-
-#include "rte_virtio_net.h"
-
-/* refer to hw/virtio/vhost-user.c */
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
-typedef enum VhostUserRequest {
-       VHOST_USER_NONE = 0,
-       VHOST_USER_GET_FEATURES = 1,
-       VHOST_USER_SET_FEATURES = 2,
-       VHOST_USER_SET_OWNER = 3,
-       VHOST_USER_RESET_OWNER = 4,
-       VHOST_USER_SET_MEM_TABLE = 5,
-       VHOST_USER_SET_LOG_BASE = 6,
-       VHOST_USER_SET_LOG_FD = 7,
-       VHOST_USER_SET_VRING_NUM = 8,
-       VHOST_USER_SET_VRING_ADDR = 9,
-       VHOST_USER_SET_VRING_BASE = 10,
-       VHOST_USER_GET_VRING_BASE = 11,
-       VHOST_USER_SET_VRING_KICK = 12,
-       VHOST_USER_SET_VRING_CALL = 13,
-       VHOST_USER_SET_VRING_ERR = 14,
-       VHOST_USER_GET_PROTOCOL_FEATURES = 15,
-       VHOST_USER_SET_PROTOCOL_FEATURES = 16,
-       VHOST_USER_GET_QUEUE_NUM = 17,
-       VHOST_USER_SET_VRING_ENABLE = 18,
-       VHOST_USER_SEND_RARP = 19,
-       VHOST_USER_MAX
-} VhostUserRequest;
-
-typedef struct VhostUserMemoryRegion {
-       uint64_t guest_phys_addr;
-       uint64_t memory_size;
-       uint64_t userspace_addr;
-       uint64_t mmap_offset;
-} VhostUserMemoryRegion;
-
-typedef struct VhostUserMemory {
-       uint32_t nregions;
-       uint32_t padding;
-       VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-} VhostUserMemory;
-
-typedef struct VhostUserLog {
-       uint64_t mmap_size;
-       uint64_t mmap_offset;
-} VhostUserLog;
-
-typedef struct VhostUserMsg {
-       VhostUserRequest request;
-
-#define VHOST_USER_VERSION_MASK     0x3
-#define VHOST_USER_REPLY_MASK       (0x1 << 2)
-       uint32_t flags;
-       uint32_t size; /* the following payload size */
-       union {
-#define VHOST_USER_VRING_IDX_MASK   0xff
-#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
-               uint64_t u64;
-               struct vhost_vring_state state;
-               struct vhost_vring_addr addr;
-               VhostUserMemory memory;
-               VhostUserLog    log;
-       } payload;
-       int fds[VHOST_MEMORY_MAX_NREGIONS];
-} __attribute((packed)) VhostUserMsg;
-
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
-/* The version of the protocol we support */
-#define VHOST_USER_VERSION    0x1
-
-/*****************************************************************************/
-#endif
diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h

deleted file mode 100644 (file)

index 38593a2..0000000
--- a/lib/librte_vhost/vhost-net.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_CDEV_H_
-#define _VHOST_NET_CDEV_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <linux/vhost.h>
-
-#include <rte_log.h>
-
-#include "rte_virtio_net.h"
-
-/* Used to indicate that the device is running on a data core */
-#define VIRTIO_DEV_RUNNING 1
-
-/* Backend value set by guest. */
-#define VIRTIO_DEV_STOPPED -1
-
-#define BUF_VECTOR_MAX 256
-
-/**
- * Structure contains buffer address, length and descriptor index
- * from vring to do scatter RX.
- */
-struct buf_vector {
-       uint64_t buf_addr;
-       uint32_t buf_len;
-       uint32_t desc_idx;
-};
-
-/**
- * Structure contains variables relevant to RX/TX virtqueues.
- */
-struct vhost_virtqueue {
-       struct vring_desc       *desc;
-       struct vring_avail      *avail;
-       struct vring_used       *used;
-       uint32_t                size;
-
-       /* Last index used on the available ring */
-       volatile uint16_t       last_used_idx;
-#define VIRTIO_INVALID_EVENTFD         (-1)
-#define VIRTIO_UNINITIALIZED_EVENTFD   (-2)
-
-       /* Backend value to determine if device should started/stopped */
-       int                     backend;
-       /* Used to notify the guest (trigger interrupt) */
-       int                     callfd;
-       /* Currently unused as polling mode is enabled */
-       int                     kickfd;
-       int                     enabled;
-
-       /* Physical address of used ring, for logging */
-       uint64_t                log_guest_addr;
-} __rte_cache_aligned;
-
-/* Old kernels have no such macro defined */
-#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
- #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
-#endif
-
-
-/*
- * Make an extra wrapper for VIRTIO_NET_F_MQ and
- * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
- * introduced since kernel v3.8. This makes our
- * code buildable for older kernel.
- */
-#ifdef VIRTIO_NET_F_MQ
- #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
- #define VHOST_SUPPORTS_MQ     (1ULL << VIRTIO_NET_F_MQ)
-#else
- #define VHOST_MAX_QUEUE_PAIRS 1
- #define VHOST_SUPPORTS_MQ     0
-#endif
-
-/*
- * Define virtio 1.0 for older kernels
- */
-#ifndef VIRTIO_F_VERSION_1
- #define VIRTIO_F_VERSION_1 32
-#endif
-
-/**
- * Device structure contains all configuration information relating
- * to the device.
- */
-struct virtio_net {
-       /* Frontend (QEMU) memory and memory region information */
-       struct virtio_memory    *mem;
-       uint64_t                features;
-       uint64_t                protocol_features;
-       int                     vid;
-       uint32_t                flags;
-       uint16_t                vhost_hlen;
-       /* to tell if we need broadcast rarp packet */
-       rte_atomic16_t          broadcast_rarp;
-       uint32_t                virt_qp_nb;
-       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
-#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
-       char                    ifname[IF_NAME_SZ];
-       uint64_t                log_size;
-       uint64_t                log_base;
-       uint64_t                log_addr;
-       struct ether_addr       mac;
-
-} __rte_cache_aligned;
-
-/**
- * Information relating to memory regions including offsets to
- * addresses in QEMUs memory file.
- */
-struct virtio_memory_regions {
-       uint64_t guest_phys_address;
-       uint64_t guest_phys_address_end;
-       uint64_t memory_size;
-       uint64_t userspace_address;
-       uint64_t address_offset;
-};
-
-
-/**
- * Memory structure includes region and mapping information.
- */
-struct virtio_memory {
-       /* Base QEMU userspace address of the memory file. */
-       uint64_t base_address;
-       uint64_t mapped_address;
-       uint64_t mapped_size;
-       uint32_t nregions;
-       struct virtio_memory_regions regions[0];
-};
-
-
-/* Macros for printing using RTE_LOG */
-#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
-#define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
-
-#ifdef RTE_LIBRTE_VHOST_DEBUG
-#define VHOST_MAX_PRINT_BUFF 6072
-#define LOG_LEVEL RTE_LOG_DEBUG
-#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
-#define PRINT_PACKET(device, addr, size, header) do { \
-       char *pkt_addr = (char *)(addr); \
-       unsigned int index; \
-       char packet[VHOST_MAX_PRINT_BUFF]; \
-       \
-       if ((header)) \
-               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
-       else \
-               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
-       for (index = 0; index < (size); index++) { \
-               snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
-                       "%02hhx ", pkt_addr[index]); \
-       } \
-       snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
-       \
-       LOG_DEBUG(VHOST_DATA, "%s", packet); \
-} while (0)
-#else
-#define LOG_LEVEL RTE_LOG_INFO
-#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
-#define PRINT_PACKET(device, addr, size, header) do {} while (0)
-#endif
-
-/**
- * Function to convert guest physical addresses to vhost virtual addresses.
- * This is used to convert guest virtio buffer addresses.
- */
-static inline uint64_t __attribute__((always_inline))
-gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
-{
-       struct virtio_memory_regions *region;
-       uint32_t regionidx;
-       uint64_t vhost_va = 0;
-
-       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
-               region = &dev->mem->regions[regionidx];
-               if ((guest_pa >= region->guest_phys_address) &&
-                       (guest_pa <= region->guest_phys_address_end)) {
-                       vhost_va = region->address_offset + guest_pa;
-                       break;
-               }
-       }
-       return vhost_va;
-}
-
-struct virtio_net_device_ops const *notify_ops;
-struct virtio_net *get_device(int vid);
-
-int vhost_new_device(void);
-void vhost_destroy_device(int);
-
-void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
-
-int vhost_get_features(int, uint64_t *);
-int vhost_set_features(int, uint64_t *);
-
-int vhost_set_vring_num(int, struct vhost_vring_state *);
-int vhost_set_vring_addr(int, struct vhost_vring_addr *);
-int vhost_set_vring_base(int, struct vhost_vring_state *);
-int vhost_get_vring_base(int, uint32_t, struct vhost_vring_state *);
-
-int vhost_set_vring_kick(int, struct vhost_vring_file *);
-int vhost_set_vring_call(int, struct vhost_vring_file *);
-
-int vhost_set_backend(int, struct vhost_vring_file *);
-
-int vhost_set_owner(int);
-int vhost_reset_owner(int);
-
-/*
- * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
- */
-void vhost_backend_cleanup(struct virtio_net *dev);
-
-#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c

new file mode 100644 (file)

index 0000000..46095c3
--- /dev/null
+++ b/lib/librte_vhost/vhost.c
@@ -0,0 +1,409 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_virtio_net.h>
+
+#include "vhost.h"
+
+/* Bit position used for the vhost-user protocol-features flag below. */
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+/* Features supported by this lib. */
+#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+                               (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+                               (1ULL << VIRTIO_NET_F_CTRL_RX) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+                               (VHOST_SUPPORTS_MQ)            | \
+                               (1ULL << VIRTIO_F_VERSION_1)   | \
+                               (1ULL << VHOST_F_LOG_ALL)      | \
+                               (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+                               (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+                               (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+                               (1ULL << VIRTIO_NET_F_CSUM)    | \
+                               (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+                               (1ULL << VIRTIO_NET_F_GUEST_TSO6))
+
+/*
+ * Currently enabled feature mask. Starts out as the full supported set
+ * and is adjusted by rte_vhost_feature_enable()/rte_vhost_feature_disable().
+ */
+uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
+
+/* Device table indexed by vid; a NULL entry is a free slot. */
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* device ops to add/remove device to/from data core. */
+struct virtio_net_device_ops const *notify_ops;
+
+/*
+ * Look up the device registered under the given vhost device id.
+ *
+ * Returns the device pointer, or NULL (after logging an error) when vid
+ * lies outside the vhost_devices table or its slot is empty.
+ */
+struct virtio_net *
+get_device(int vid)
+{
+       struct virtio_net *dev = NULL;
+
+       /* vid is supplied by API callers: never index the table blindly. */
+       if (vid >= 0 && vid < MAX_VHOST_DEVICE)
+               dev = vhost_devices[vid];
+
+       if (unlikely(!dev)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) device not found.\n", vid);
+       }
+
+       return dev;
+}
+
+/*
+ * Close the eventfds held by a virtqueue. The kick fd is closed whenever
+ * it is valid; the call fd is closed only when 'destroy' is non-zero.
+ */
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+       if (destroy != 0) {
+               if (vq->callfd >= 0)
+                       close(vq->callfd);
+       }
+
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+}
+
+/*
+ * Run the backend-specific cleanup, then close the eventfds of every
+ * allocated queue pair. 'destroy' is passed through to cleanup_vq().
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+       uint32_t qp;
+
+       vhost_backend_cleanup(dev);
+
+       for (qp = 0; qp < dev->virt_qp_nb; qp++) {
+               uint32_t base = qp * VIRTIO_QNUM;
+
+               cleanup_vq(dev->virtqueue[base + VIRTIO_RXQ], destroy);
+               cleanup_vq(dev->virtqueue[base + VIRTIO_TXQ], destroy);
+       }
+}
+
+/*
+ * Free the virtqueue storage of every queue pair, then the device itself.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+       uint32_t qp = 0;
+
+       /*
+        * Both queues of a pair come from one allocation (see
+        * alloc_vring_queue_pair()), so one rte_free() per pair is enough.
+        */
+       while (qp < dev->virt_qp_nb) {
+               rte_free(dev->virtqueue[qp * VIRTIO_QNUM]);
+               qp++;
+       }
+
+       rte_free(dev);
+}
+
+/*
+ * Put a virtqueue into its initial state: every field zeroed, both
+ * eventfds marked uninitialized and the backend set to -1. Only the
+ * queues of pair 0 start out enabled.
+ */
+static void
+init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+       memset(vq, 0, sizeof(*vq));
+
+       /* A backend of -1 marks an inactive device. */
+       vq->backend = -1;
+       vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+       /* The default (first) queue pair is always enabled. */
+       vq->enabled = (qp_idx == 0) ? 1 : 0;
+}
+
+/*
+ * Initialize the RX and TX virtqueues of queue pair 'qp_idx'.
+ */
+static void
+init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+       struct vhost_virtqueue **pair = &dev->virtqueue[qp_idx * VIRTIO_QNUM];
+
+       init_vring_queue(pair[VIRTIO_RXQ], qp_idx);
+       init_vring_queue(pair[VIRTIO_TXQ], qp_idx);
+}
+
+/*
+ * Re-initialize a virtqueue while keeping its current call fd.
+ */
+static void
+reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+       const int saved_callfd = vq->callfd;
+
+       init_vring_queue(vq, qp_idx);
+       vq->callfd = saved_callfd;
+}
+
+/*
+ * Reset the RX and TX virtqueues of queue pair 'qp_idx'.
+ */
+static void
+reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+       struct vhost_virtqueue **pair = &dev->virtqueue[qp_idx * VIRTIO_QNUM];
+
+       reset_vring_queue(pair[VIRTIO_RXQ], qp_idx);
+       reset_vring_queue(pair[VIRTIO_TXQ], qp_idx);
+}
+
+/*
+ * Allocate and initialize the RX/TX virtqueues of queue pair 'qp_idx'
+ * as one contiguous allocation, and bump the device's queue pair count.
+ *
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+int
+alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+       const uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+       struct vhost_virtqueue *pair;
+
+       pair = rte_malloc(NULL, sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
+       if (pair == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Failed to allocate memory for virt qp:%d.\n", qp_idx);
+               return -1;
+       }
+
+       /* The RX slot holds the start of the allocation. */
+       dev->virtqueue[base_idx + VIRTIO_RXQ] = pair;
+       dev->virtqueue[base_idx + VIRTIO_TXQ] = &pair[VIRTIO_TXQ];
+
+       init_vring_queue_pair(dev, qp_idx);
+       dev->virt_qp_nb++;
+
+       return 0;
+}
+
+/*
+ * Clear the features, protocol features and flags of a device and reset
+ * every allocated queue pair. Other fields such as vid, ifname and
+ * virt_qp_nb are left untouched: they stay the same until the device
+ * is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+       uint32_t qp;
+
+       dev->flags = 0;
+       dev->protocol_features = 0;
+       dev->features = 0;
+
+       for (qp = 0; qp != dev->virt_qp_nb; qp++)
+               reset_vring_queue_pair(dev, qp);
+}
+
+/*
+ * Allocate a zeroed device structure and register it in the first free
+ * slot of the vhost_devices table.
+ *
+ * Returns the new device id (the slot index) on success, or -1 when the
+ * allocation fails or the table has no free slot.
+ */
+int
+vhost_new_device(void)
+{
+       struct virtio_net *dev;
+       int i;
+
+       dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+       if (dev == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Failed to allocate memory for new dev.\n");
+               return -1;
+       }
+
+       for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+               if (vhost_devices[i] == NULL)
+                       break;
+       }
+       if (i == MAX_VHOST_DEVICE) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Failed to find a free slot for new device.\n");
+               /* The device was never registered: release it, don't leak it. */
+               rte_free(dev);
+               return -1;
+       }
+
+       vhost_devices[i] = dev;
+       dev->vid = i;
+
+       return i;
+}
+
+/*
+ * Tear down the device registered under 'vid': invoke the registered
+ * destroy_device callback if the device is still marked running, clean
+ * up and free the device, then clear its vhost_devices slot.
+ * Does nothing when no such device exists.
+ */
+void
+vhost_destroy_device(int vid)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+       if (!dev)
+               return;
+
+       if ((dev->flags & VIRTIO_DEV_RUNNING) != 0) {
+               /* Clear the running flag before invoking the callback. */
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       cleanup_device(dev, 1);
+       free_device(dev);
+
+       vhost_devices[vid] = NULL;
+}
+
+/*
+ * Store the interface name of device 'vid'.
+ *
+ * At most 'if_len' bytes of 'if_name' are copied, truncated so that the
+ * result always fits dev->ifname including its terminating NUL.
+ * Does nothing when no such device exists.
+ */
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+       struct virtio_net *dev;
+       unsigned int len;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return;
+
+       /* Leave room for the terminating NUL. */
+       len = if_len > sizeof(dev->ifname) - 1 ?
+               sizeof(dev->ifname) - 1 : if_len;
+
+       strncpy(dev->ifname, if_name, len);
+       /*
+        * Terminate right after the copied bytes rather than only at the
+        * end of the buffer, so a shorter name never keeps the tail of a
+        * previously stored longer one.
+        */
+       dev->ifname[len] = '\0';
+}
+
+
+/*
+ * Query the NUMA node of the memory holding the device structure.
+ *
+ * Returns the node reported by get_mempolicy(), or -1 when the device
+ * does not exist, the query fails, or the library was built without
+ * RTE_LIBRTE_VHOST_NUMA.
+ */
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+       struct virtio_net *dev;
+       int node;
+       int rc;
+
+       dev = get_device(vid);
+       if (!dev)
+               return -1;
+
+       rc = get_mempolicy(&node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
+       if (rc < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to query numa node: %d\n", vid, rc);
+               return -1;
+       }
+
+       return node;
+#else
+       RTE_SET_USED(vid);
+       return -1;
+#endif
+}
+
+/*
+ * Return the number of queue pairs allocated for device 'vid',
+ * or 0 when no such device exists.
+ */
+uint32_t
+rte_vhost_get_queue_num(int vid)
+{
+       struct virtio_net *dev;
+
+       dev = get_device(vid);
+
+       return dev ? dev->virt_qp_nb : 0;
+}
+
+/*
+ * Copy the interface name of device 'vid' into 'buf'.
+ *
+ * At most 'len' bytes are written and the result is always
+ * NUL-terminated (truncated if needed).
+ *
+ * Returns 0 on success, -1 when the device does not exist or when no
+ * usable buffer is supplied (buf == NULL or len == 0).
+ */
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return -1;
+
+       /* With len == 0 the terminator below would land at buf[SIZE_MAX]. */
+       if (buf == NULL || len == 0)
+               return -1;
+
+       len = RTE_MIN(len, sizeof(dev->ifname));
+
+       strncpy(buf, dev->ifname, len);
+       buf[len - 1] = '\0';
+
+       return 0;
+}
+
+/*
+ * Return the distance between the avail ring index and last_used_idx
+ * of virtqueue 'queue_id' of device 'vid'.
+ *
+ * Returns 0 when the device does not exist, when 'queue_id' does not
+ * name an allocated virtqueue, or when the queue is disabled.
+ */
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+       struct virtio_net *dev;
+       struct vhost_virtqueue *vq;
+
+       dev = get_device(vid);
+       if (!dev)
+               return 0;
+
+       /* Only the first virt_qp_nb * VIRTIO_QNUM slots hold allocated queues. */
+       if (queue_id >= dev->virt_qp_nb * VIRTIO_QNUM)
+               return 0;
+
+       vq = dev->virtqueue[queue_id];
+       if (!vq->enabled)
+               return 0;
+
+       /* The volatile access forces a fresh read of avail->idx. */
+       return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+/*
+ * Disable guest notifications for virtqueue 'queue_id' of device 'vid'
+ * by setting VRING_USED_F_NO_NOTIFY in the used ring flags.
+ *
+ * Enabling notifications is not supported: a non-zero 'enable' is
+ * rejected with an error log.
+ *
+ * Returns 0 on success, -1 when the device does not exist, 'enable' is
+ * non-zero, or 'queue_id' does not name an allocated virtqueue.
+ */
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return -1;
+
+       if (enable) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "guest notification isn't supported.\n");
+               return -1;
+       }
+
+       /* Only the first virt_qp_nb * VIRTIO_QNUM slots hold allocated queues. */
+       if (queue_id >= dev->virt_qp_nb * VIRTIO_QNUM)
+               return -1;
+
+       dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
+       return 0;
+}
+
+/*
+ * Return the currently enabled feature mask.
+ */
+uint64_t
+rte_vhost_feature_get(void)
+{
+       const uint64_t features = VHOST_FEATURES;
+
+       return features;
+}
+
+/*
+ * Clear every bit of 'feature_mask' from the enabled feature mask.
+ * Always returns 0.
+ */
+int
+rte_vhost_feature_disable(uint64_t feature_mask)
+{
+       VHOST_FEATURES &= ~feature_mask;
+
+       return 0;
+}
+
+/*
+ * Add the bits of 'feature_mask' to the enabled feature mask.
+ *
+ * Returns 0 on success, or -1 (leaving the mask unchanged) when
+ * 'feature_mask' contains a bit outside VHOST_SUPPORTED_FEATURES.
+ */
+int
+rte_vhost_feature_enable(uint64_t feature_mask)
+{
+       /* Reject the request if any requested bit is unsupported. */
+       if ((feature_mask & ~VHOST_SUPPORTED_FEATURES) != 0)
+               return -1;
+
+       VHOST_FEATURES |= feature_mask;
+
+       return 0;
+}
+
+/*
+ * Record the device ops used to add/remove a device to/from the data
+ * core. The pointer is stored as is; always returns 0.
+ */
+int
+rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
+{
+       notify_ops = ops;
+       return 0;
+}
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h

new file mode 100644 (file)

index 0000000..c2dfc3c
--- /dev/null
+++ b/lib/librte_vhost/vhost.h
@@ -0,0 +1,242 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+
+#include <rte_log.h>
+
+#include "rte_virtio_net.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+       /* Buffer address (presumably the descriptor's addr field; confirm in the data path) */
+       uint64_t buf_addr;
+       /* Buffer length in bytes */
+       uint32_t buf_len;
+       /* Index of the vring descriptor this entry was taken from */
+       uint32_t desc_idx;
+};
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+       /* Descriptor table, available ring and used ring of the vring */
+       struct vring_desc       *desc;
+       struct vring_avail      *avail;
+       struct vring_used       *used;
+       /* Ring size (presumably the number of descriptors; confirm where it is set) */
+       uint32_t                size;
+
+       /* Last index used on the available ring */
+       volatile uint16_t       last_used_idx;
+/* Sentinel values for the eventfd fields below; valid fds are >= 0 */
+#define VIRTIO_INVALID_EVENTFD         (-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD   (-2)
+
+       /* Backend value to determine if device should be started/stopped */
+       int                     backend;
+       /* Used to notify the guest (trigger interrupt) */
+       int                     callfd;
+       /* Currently unused as polling mode is enabled */
+       int                     kickfd;
+       /* Non-zero when the queue is enabled */
+       int                     enabled;
+
+       /* Physical address of used ring, for logging */
+       uint64_t                log_guest_addr;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macro defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+
+/*
+ * Make an extra wrapper for VIRTIO_NET_F_MQ and
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
+ * introduced since kernel v3.8. This makes our
+ * code buildable for older kernel.
+ */
+#ifdef VIRTIO_NET_F_MQ
+ #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
+ #define VHOST_SUPPORTS_MQ     (1ULL << VIRTIO_NET_F_MQ)
+#else
+ #define VHOST_MAX_QUEUE_PAIRS 1
+ #define VHOST_SUPPORTS_MQ     0
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+       /* Frontend (QEMU) memory and memory region information */
+       struct virtio_memory    *mem;
+       /* Feature and protocol feature masks of the device */
+       uint64_t                features;
+       uint64_t                protocol_features;
+       /* Device id: index of this device in vhost_devices[] */
+       int                     vid;
+       /* Device state flags, e.g. VIRTIO_DEV_RUNNING */
+       uint32_t                flags;
+       /* Size of the virtio net header (presumably depends on negotiated features) */
+       uint16_t                vhost_hlen;
+       /* to tell if we need broadcast rarp packet */
+       rte_atomic16_t          broadcast_rarp;
+       /* Number of queue pairs allocated so far */
+       uint32_t                virt_qp_nb;
+       /* Virtqueues; pair N uses slots N * VIRTIO_QNUM + VIRTIO_RXQ/VIRTIO_TXQ */
+       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+       /* NUL-terminated interface name */
+       char                    ifname[IF_NAME_SZ];
+       /* Dirty page log state (NOTE(review): exact meaning of each field set elsewhere) */
+       uint64_t                log_size;
+       uint64_t                log_base;
+       uint64_t                log_addr;
+       struct ether_addr       mac;
+
+} __rte_cache_aligned;
+
+/**
+ * Information relating to memory regions including offsets to
+ * addresses in QEMUs memory file.
+ */
+struct virtio_memory_regions {
+       /* First guest physical address of the region */
+       uint64_t guest_phys_address;
+       /* Upper bound of the region; gpa_to_vva() treats it as inclusive */
+       uint64_t guest_phys_address_end;
+       uint64_t memory_size;
+       uint64_t userspace_address;
+       /* Added to a guest physical address to get the vhost virtual address */
+       uint64_t address_offset;
+};
+
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct virtio_memory {
+       /* Base QEMU userspace address of the memory file. */
+       uint64_t base_address;
+       /* NOTE(review): presumably address and size of the local mapping; confirm */
+       uint64_t mapped_address;
+       uint64_t mapped_size;
+       /* Number of entries in regions[] */
+       uint32_t nregions;
+       /* Trailing array of memory regions, nregions entries long */
+       struct virtio_memory_regions regions[0];
+};
+
+
+/* Macros for printing using RTE_LOG */
+/* Both vhost log types are mapped onto RTE_LOGTYPE_USER1. */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define LOG_LEVEL RTE_LOG_DEBUG
+#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+/*
+ * Debug builds only: format 'size' bytes starting at 'addr' as a hex dump
+ * into a local buffer of VHOST_MAX_PRINT_BUFF bytes, prefixed with
+ * "Header size" or "Packet size" depending on 'header', and emit it
+ * through LOG_DEBUG.
+ */
+#define PRINT_PACKET(device, addr, size, header) do { \
+       char *pkt_addr = (char *)(addr); \
+       unsigned int index; \
+       char packet[VHOST_MAX_PRINT_BUFF]; \
+       \
+       if ((header)) \
+               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+       else \
+               snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+       for (index = 0; index < (size); index++) { \
+               snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+                       "%02hhx ", pkt_addr[index]); \
+       } \
+       snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+       \
+       LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+/* Non-debug builds: LOG_DEBUG and PRINT_PACKET expand to empty statements. */
+#define LOG_LEVEL RTE_LOG_INFO
+#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+/* Currently enabled feature mask (defined in vhost.c). */
+extern uint64_t VHOST_FEATURES;
+/* Capacity of the device table below. */
+#define MAX_VHOST_DEVICE       1024
+/* Device table indexed by vid (defined in vhost.c). */
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/**
+ * Translate a guest physical address into a vhost virtual address.
+ * This is used to convert guest virtio buffer addresses.
+ *
+ * The device's memory regions are scanned in order; for the first region
+ * whose [guest_phys_address, guest_phys_address_end] range contains
+ * guest_pa, guest_pa plus that region's address_offset is returned.
+ * Returns 0 when no region matches.
+ */
+static inline uint64_t __attribute__((always_inline))
+gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
+{
+       struct virtio_memory *mem = dev->mem;
+       uint32_t i;
+
+       for (i = 0; i < mem->nregions; i++) {
+               struct virtio_memory_regions *reg = &mem->regions[i];
+
+               if (guest_pa < reg->guest_phys_address ||
+                   guest_pa > reg->guest_phys_address_end)
+                       continue;
+
+               return reg->address_offset + guest_pa;
+       }
+
+       return 0;
+}
+
+/*
+ * Device ops registered by the application. The variable itself is
+ * defined in vhost.c; it is only declared here so that including this
+ * header from several source files does not define it more than once.
+ */
+extern struct virtio_net_device_ops const *notify_ops;
+/* Look up a device by id; returns NULL (and logs an error) if none is registered. */
+struct virtio_net *get_device(int vid);
+
+/* Allocate and register a new device; returns its id, or -1 on failure. */
+int vhost_new_device(void);
+/* Run the backend cleanup and close the virtqueue eventfds of a device. */
+void cleanup_device(struct virtio_net *dev, int destroy);
+/* Clear features/flags and reset all allocated queue pairs of a device. */
+void reset_device(struct virtio_net *dev);
+/* Clean up, free and unregister the device with the given id. */
+void vhost_destroy_device(int);
+
+/* Allocate and initialize queue pair 'qp_idx'; returns 0, or -1 on failure. */
+int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx);
+
+/* Store the interface name of the device with the given id. */
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+
+/*
+ * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c

deleted file mode 100644 (file)

index 08a73fd..0000000
--- a/lib/librte_vhost/vhost_rxtx.c
+++ /dev/null
@@ -1,924 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <linux/virtio_net.h>
-
-#include <rte_mbuf.h>
-#include <rte_memcpy.h>
-#include <rte_ether.h>
-#include <rte_ip.h>
-#include <rte_virtio_net.h>
-#include <rte_tcp.h>
-#include <rte_udp.h>
-#include <rte_sctp.h>
-#include <rte_arp.h>
-
-#include "vhost-net.h"
-
-#define MAX_PKT_BURST 32
-#define VHOST_LOG_PAGE 4096
-
-static inline void __attribute__((always_inline))
-vhost_log_page(uint8_t *log_base, uint64_t page)
-{
-       log_base[page / 8] |= 1 << (page % 8);
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
-{
-       uint64_t page;
-
-       if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
-                  !dev->log_base || !len))
-               return;
-
-       if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
-               return;
-
-       /* To make sure guest memory updates are committed before logging */
-       rte_smp_wmb();
-
-       page = addr / VHOST_LOG_PAGE;
-       while (page * VHOST_LOG_PAGE < addr + len) {
-               vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
-               page += 1;
-       }
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                    uint64_t offset, uint64_t len)
-{
-       vhost_log_write(dev, vq->log_guest_addr + offset, len);
-}
-
-static bool
-is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
-{
-       return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
-}
-
-static void
-virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
-{
-       if (m_buf->ol_flags & PKT_TX_L4_MASK) {
-               net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-               net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
-
-               switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
-               case PKT_TX_TCP_CKSUM:
-                       net_hdr->csum_offset = (offsetof(struct tcp_hdr,
-                                               cksum));
-                       break;
-               case PKT_TX_UDP_CKSUM:
-                       net_hdr->csum_offset = (offsetof(struct udp_hdr,
-                                               dgram_cksum));
-                       break;
-               case PKT_TX_SCTP_CKSUM:
-                       net_hdr->csum_offset = (offsetof(struct sctp_hdr,
-                                               cksum));
-                       break;
-               }
-       }
-
-       if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
-               if (m_buf->ol_flags & PKT_TX_IPV4)
-                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
-               else
-                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
-               net_hdr->gso_size = m_buf->tso_segsz;
-               net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
-                                       + m_buf->l4_len;
-       }
-}
-
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-                   struct virtio_net_hdr_mrg_rxbuf hdr)
-{
-       if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-               *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-       else
-               *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                 struct rte_mbuf *m, uint16_t desc_idx)
-{
-       uint32_t desc_avail, desc_offset;
-       uint32_t mbuf_avail, mbuf_offset;
-       uint32_t cpy_len;
-       struct vring_desc *desc;
-       uint64_t desc_addr;
-       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
-       desc = &vq->desc[desc_idx];
-       desc_addr = gpa_to_vva(dev, desc->addr);
-       /*
-        * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-        * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-        * otherwise stores offset on the stack instead of in a register.
-        */
-       if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-               return -1;
-
-       rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-       virtio_enqueue_offload(m, &virtio_hdr.hdr);
-       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-       vhost_log_write(dev, desc->addr, dev->vhost_hlen);
-       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-       desc_offset = dev->vhost_hlen;
-       desc_avail  = desc->len - dev->vhost_hlen;
-
-       mbuf_avail  = rte_pktmbuf_data_len(m);
-       mbuf_offset = 0;
-       while (mbuf_avail != 0 || m->next != NULL) {
-               /* done with current mbuf, fetch next */
-               if (mbuf_avail == 0) {
-                       m = m->next;
-
-                       mbuf_offset = 0;
-                       mbuf_avail  = rte_pktmbuf_data_len(m);
-               }
-
-               /* done with current desc buf, fetch next */
-               if (desc_avail == 0) {
-                       if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-                               /* Room in vring buffer is not enough */
-                               return -1;
-                       }
-                       if (unlikely(desc->next >= vq->size))
-                               return -1;
-
-                       desc = &vq->desc[desc->next];
-                       desc_addr = gpa_to_vva(dev, desc->addr);
-                       if (unlikely(!desc_addr))
-                               return -1;
-
-                       desc_offset = 0;
-                       desc_avail  = desc->len;
-               }
-
-               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-                       cpy_len);
-               vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-                            cpy_len, 0);
-
-               mbuf_avail  -= cpy_len;
-               mbuf_offset += cpy_len;
-               desc_avail  -= cpy_len;
-               desc_offset += cpy_len;
-       }
-
-       return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-             struct rte_mbuf **pkts, uint32_t count)
-{
-       struct vhost_virtqueue *vq;
-       uint16_t avail_idx, free_entries, start_idx;
-       uint16_t desc_indexes[MAX_PKT_BURST];
-       uint16_t used_idx;
-       uint32_t i;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-                       dev->vid, __func__, queue_id);
-               return 0;
-       }
-
-       vq = dev->virtqueue[queue_id];
-       if (unlikely(vq->enabled == 0))
-               return 0;
-
-       avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-       start_idx = vq->last_used_idx;
-       free_entries = avail_idx - start_idx;
-       count = RTE_MIN(count, free_entries);
-       count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-       if (count == 0)
-               return 0;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-               dev->vid, start_idx, start_idx + count);
-
-       /* Retrieve all of the desc indexes first to avoid caching issues. */
-       rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-       for (i = 0; i < count; i++) {
-               used_idx = (start_idx + i) & (vq->size - 1);
-               desc_indexes[i] = vq->avail->ring[used_idx];
-               vq->used->ring[used_idx].id = desc_indexes[i];
-               vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-                                              dev->vhost_hlen;
-               vhost_log_used_vring(dev, vq,
-                       offsetof(struct vring_used, ring[used_idx]),
-                       sizeof(vq->used->ring[used_idx]));
-       }
-
-       rte_prefetch0(&vq->desc[desc_indexes[0]]);
-       for (i = 0; i < count; i++) {
-               uint16_t desc_idx = desc_indexes[i];
-               int err;
-
-               err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-               if (unlikely(err)) {
-                       used_idx = (start_idx + i) & (vq->size - 1);
-                       vq->used->ring[used_idx].len = dev->vhost_hlen;
-                       vhost_log_used_vring(dev, vq,
-                               offsetof(struct vring_used, ring[used_idx]),
-                               sizeof(vq->used->ring[used_idx]));
-               }
-
-               if (i + 1 < count)
-                       rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-       }
-
-       rte_smp_wmb();
-
-       *(volatile uint16_t *)&vq->used->idx += count;
-       vq->last_used_idx += count;
-       vhost_log_used_vring(dev, vq,
-               offsetof(struct vring_used, idx),
-               sizeof(vq->used->idx));
-
-       /* flush used->idx update before we read avail->flags. */
-       rte_mb();
-
-       /* Kick the guest if necessary. */
-       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-                       && (vq->callfd >= 0))
-               eventfd_write(vq->callfd, (eventfd_t)1);
-       return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-            uint32_t *allocated, uint32_t *vec_idx,
-            struct buf_vector *buf_vec)
-{
-       uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-       uint32_t vec_id = *vec_idx;
-       uint32_t len    = *allocated;
-
-       while (1) {
-               if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-                       return -1;
-
-               len += vq->desc[idx].len;
-               buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-               buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-               buf_vec[vec_id].desc_idx = idx;
-               vec_id++;
-
-               if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-                       break;
-
-               idx = vq->desc[idx].next;
-       }
-
-       *allocated = len;
-       *vec_idx   = vec_id;
-
-       return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-                           uint16_t *end, struct buf_vector *buf_vec)
-{
-       uint16_t cur_idx;
-       uint16_t avail_idx;
-       uint32_t allocated = 0;
-       uint32_t vec_idx = 0;
-       uint16_t tries = 0;
-
-       cur_idx  = vq->last_used_idx;
-
-       while (1) {
-               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-               if (unlikely(cur_idx == avail_idx))
-                       return -1;
-
-               if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-                                         &vec_idx, buf_vec) < 0))
-                       return -1;
-
-               cur_idx++;
-               tries++;
-
-               if (allocated >= size)
-                       break;
-
-               /*
-                * if we tried all available ring items, and still
-                * can't get enough buf, it means something abnormal
-                * happened.
-                */
-               if (unlikely(tries >= vq->size))
-                       return -1;
-       }
-
-       *end = cur_idx;
-       return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                           uint16_t end_idx, struct rte_mbuf *m,
-                           struct buf_vector *buf_vec)
-{
-       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-       uint32_t vec_idx = 0;
-       uint16_t start_idx = vq->last_used_idx;
-       uint16_t cur_idx = start_idx;
-       uint64_t desc_addr;
-       uint32_t mbuf_offset, mbuf_avail;
-       uint32_t desc_offset, desc_avail;
-       uint32_t cpy_len;
-       uint16_t desc_idx, used_idx;
-
-       if (unlikely(m == NULL))
-               return 0;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-               dev->vid, cur_idx, end_idx);
-
-       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-       if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
-               return 0;
-
-       rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-       virtio_hdr.num_buffers = end_idx - start_idx;
-       LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-               dev->vid, virtio_hdr.num_buffers);
-
-       virtio_enqueue_offload(m, &virtio_hdr.hdr);
-       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-       vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-       desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-       desc_offset = dev->vhost_hlen;
-
-       mbuf_avail  = rte_pktmbuf_data_len(m);
-       mbuf_offset = 0;
-       while (mbuf_avail != 0 || m->next != NULL) {
-               /* done with current desc buf, get the next one */
-               if (desc_avail == 0) {
-                       desc_idx = buf_vec[vec_idx].desc_idx;
-
-                       if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-                               /* Update used ring with desc information */
-                               used_idx = cur_idx++ & (vq->size - 1);
-                               vq->used->ring[used_idx].id  = desc_idx;
-                               vq->used->ring[used_idx].len = desc_offset;
-                               vhost_log_used_vring(dev, vq,
-                                       offsetof(struct vring_used,
-                                                ring[used_idx]),
-                                       sizeof(vq->used->ring[used_idx]));
-                       }
-
-                       vec_idx++;
-                       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-                       if (unlikely(!desc_addr))
-                               return 0;
-
-                       /* Prefetch buffer address. */
-                       rte_prefetch0((void *)(uintptr_t)desc_addr);
-                       desc_offset = 0;
-                       desc_avail  = buf_vec[vec_idx].buf_len;
-               }
-
-               /* done with current mbuf, get the next one */
-               if (mbuf_avail == 0) {
-                       m = m->next;
-
-                       mbuf_offset = 0;
-                       mbuf_avail  = rte_pktmbuf_data_len(m);
-               }
-
-               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-                       cpy_len);
-               vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-                       cpy_len);
-               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-                       cpy_len, 0);
-
-               mbuf_avail  -= cpy_len;
-               mbuf_offset += cpy_len;
-               desc_avail  -= cpy_len;
-               desc_offset += cpy_len;
-       }
-
-       used_idx = cur_idx & (vq->size - 1);
-       vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-       vq->used->ring[used_idx].len = desc_offset;
-       vhost_log_used_vring(dev, vq,
-               offsetof(struct vring_used, ring[used_idx]),
-               sizeof(vq->used->ring[used_idx]));
-
-       return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-       struct rte_mbuf **pkts, uint32_t count)
-{
-       struct vhost_virtqueue *vq;
-       uint32_t pkt_idx = 0, nr_used = 0;
-       uint16_t end;
-       struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-                       dev->vid, __func__, queue_id);
-               return 0;
-       }
-
-       vq = dev->virtqueue[queue_id];
-       if (unlikely(vq->enabled == 0))
-               return 0;
-
-       count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-       if (count == 0)
-               return 0;
-
-       for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-               uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
-               if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-                                                        &end, buf_vec) < 0)) {
-                       LOG_DEBUG(VHOST_DATA,
-                               "(%d) failed to get enough desc from vring\n",
-                               dev->vid);
-                       break;
-               }
-
-               nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-                                                     pkts[pkt_idx], buf_vec);
-               rte_smp_wmb();
-
-               *(volatile uint16_t *)&vq->used->idx += nr_used;
-               vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-                       sizeof(vq->used->idx));
-               vq->last_used_idx += nr_used;
-       }
-
-       if (likely(pkt_idx)) {
-               /* flush used->idx update before we read avail->flags. */
-               rte_mb();
-
-               /* Kick the guest if necessary. */
-               if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-                               && (vq->callfd >= 0))
-                       eventfd_write(vq->callfd, (eventfd_t)1);
-       }
-
-       return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-       struct rte_mbuf **pkts, uint16_t count)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (!dev)
-               return 0;
-
-       if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-               return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-       else
-               return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
-static void
-parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
-{
-       struct ipv4_hdr *ipv4_hdr;
-       struct ipv6_hdr *ipv6_hdr;
-       void *l3_hdr = NULL;
-       struct ether_hdr *eth_hdr;
-       uint16_t ethertype;
-
-       eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
-
-       m->l2_len = sizeof(struct ether_hdr);
-       ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
-
-       if (ethertype == ETHER_TYPE_VLAN) {
-               struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
-
-               m->l2_len += sizeof(struct vlan_hdr);
-               ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
-       }
-
-       l3_hdr = (char *)eth_hdr + m->l2_len;
-
-       switch (ethertype) {
-       case ETHER_TYPE_IPv4:
-               ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
-               *l4_proto = ipv4_hdr->next_proto_id;
-               m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
-               *l4_hdr = (char *)l3_hdr + m->l3_len;
-               m->ol_flags |= PKT_TX_IPV4;
-               break;
-       case ETHER_TYPE_IPv6:
-               ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
-               *l4_proto = ipv6_hdr->proto;
-               m->l3_len = sizeof(struct ipv6_hdr);
-               *l4_hdr = (char *)l3_hdr + m->l3_len;
-               m->ol_flags |= PKT_TX_IPV6;
-               break;
-       default:
-               m->l3_len = 0;
-               *l4_proto = 0;
-               break;
-       }
-}
-
-static inline void __attribute__((always_inline))
-vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
-{
-       uint16_t l4_proto = 0;
-       void *l4_hdr = NULL;
-       struct tcp_hdr *tcp_hdr = NULL;
-
-       parse_ethernet(m, &l4_proto, &l4_hdr);
-       if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-               if (hdr->csum_start == (m->l2_len + m->l3_len)) {
-                       switch (hdr->csum_offset) {
-                       case (offsetof(struct tcp_hdr, cksum)):
-                               if (l4_proto == IPPROTO_TCP)
-                                       m->ol_flags |= PKT_TX_TCP_CKSUM;
-                               break;
-                       case (offsetof(struct udp_hdr, dgram_cksum)):
-                               if (l4_proto == IPPROTO_UDP)
-                                       m->ol_flags |= PKT_TX_UDP_CKSUM;
-                               break;
-                       case (offsetof(struct sctp_hdr, cksum)):
-                               if (l4_proto == IPPROTO_SCTP)
-                                       m->ol_flags |= PKT_TX_SCTP_CKSUM;
-                               break;
-                       default:
-                               break;
-                       }
-               }
-       }
-
-       if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
-               switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
-               case VIRTIO_NET_HDR_GSO_TCPV4:
-               case VIRTIO_NET_HDR_GSO_TCPV6:
-                       tcp_hdr = (struct tcp_hdr *)l4_hdr;
-                       m->ol_flags |= PKT_TX_TCP_SEG;
-                       m->tso_segsz = hdr->gso_size;
-                       m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
-                       break;
-               default:
-                       RTE_LOG(WARNING, VHOST_DATA,
-                               "unsupported gso type %u.\n", hdr->gso_type);
-                       break;
-               }
-       }
-}
-
-#define RARP_PKT_SIZE  64
-
-static int
-make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
-{
-       struct ether_hdr *eth_hdr;
-       struct arp_hdr  *rarp;
-
-       if (rarp_mbuf->buf_len < 64) {
-               RTE_LOG(WARNING, VHOST_DATA,
-                       "failed to make RARP; mbuf size too small %u (< %d)\n",
-                       rarp_mbuf->buf_len, RARP_PKT_SIZE);
-               return -1;
-       }
-
-       /* Ethernet header. */
-       eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
-       memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
-       ether_addr_copy(mac, &eth_hdr->s_addr);
-       eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
-
-       /* RARP header. */
-       rarp = (struct arp_hdr *)(eth_hdr + 1);
-       rarp->arp_hrd = htons(ARP_HRD_ETHER);
-       rarp->arp_pro = htons(ETHER_TYPE_IPv4);
-       rarp->arp_hln = ETHER_ADDR_LEN;
-       rarp->arp_pln = 4;
-       rarp->arp_op  = htons(ARP_OP_REVREQUEST);
-
-       ether_addr_copy(mac, &rarp->arp_data.arp_sha);
-       ether_addr_copy(mac, &rarp->arp_data.arp_tha);
-       memset(&rarp->arp_data.arp_sip, 0x00, 4);
-       memset(&rarp->arp_data.arp_tip, 0x00, 4);
-
-       rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
-
-       return 0;
-}
-
-static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                 struct rte_mbuf *m, uint16_t desc_idx,
-                 struct rte_mempool *mbuf_pool)
-{
-       struct vring_desc *desc;
-       uint64_t desc_addr;
-       uint32_t desc_avail, desc_offset;
-       uint32_t mbuf_avail, mbuf_offset;
-       uint32_t cpy_len;
-       struct rte_mbuf *cur = m, *prev = m;
-       struct virtio_net_hdr *hdr;
-       /* A counter to avoid desc dead loop chain */
-       uint32_t nr_desc = 1;
-
-       desc = &vq->desc[desc_idx];
-       if (unlikely(desc->len < dev->vhost_hlen))
-               return -1;
-
-       desc_addr = gpa_to_vva(dev, desc->addr);
-       if (unlikely(!desc_addr))
-               return -1;
-
-       hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
-       rte_prefetch0(hdr);
-
-       /*
-        * A virtio driver normally uses at least 2 desc buffers
-        * for Tx: the first for storing the header, and others
-        * for storing the data.
-        */
-       if (likely((desc->len == dev->vhost_hlen) &&
-                  (desc->flags & VRING_DESC_F_NEXT) != 0)) {
-               desc = &vq->desc[desc->next];
-
-               desc_addr = gpa_to_vva(dev, desc->addr);
-               if (unlikely(!desc_addr))
-                       return -1;
-
-               rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-               desc_offset = 0;
-               desc_avail  = desc->len;
-               nr_desc    += 1;
-
-               PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
-       } else {
-               desc_avail  = desc->len - dev->vhost_hlen;
-               desc_offset = dev->vhost_hlen;
-       }
-
-       mbuf_offset = 0;
-       mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
-       while (1) {
-               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-               rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
-                       (void *)((uintptr_t)(desc_addr + desc_offset)),
-                       cpy_len);
-
-               mbuf_avail  -= cpy_len;
-               mbuf_offset += cpy_len;
-               desc_avail  -= cpy_len;
-               desc_offset += cpy_len;
-
-               /* This desc reaches to its end, get the next one */
-               if (desc_avail == 0) {
-                       if ((desc->flags & VRING_DESC_F_NEXT) == 0)
-                               break;
-
-                       if (unlikely(desc->next >= vq->size ||
-                                    ++nr_desc > vq->size))
-                               return -1;
-                       desc = &vq->desc[desc->next];
-
-                       desc_addr = gpa_to_vva(dev, desc->addr);
-                       if (unlikely(!desc_addr))
-                               return -1;
-
-                       rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-                       desc_offset = 0;
-                       desc_avail  = desc->len;
-
-                       PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
-               }
-
-               /*
-                * This mbuf reaches to its end, get a new one
-                * to hold more data.
-                */
-               if (mbuf_avail == 0) {
-                       cur = rte_pktmbuf_alloc(mbuf_pool);
-                       if (unlikely(cur == NULL)) {
-                               RTE_LOG(ERR, VHOST_DATA, "Failed to "
-                                       "allocate memory for mbuf.\n");
-                               return -1;
-                       }
-
-                       prev->next = cur;
-                       prev->data_len = mbuf_offset;
-                       m->nb_segs += 1;
-                       m->pkt_len += mbuf_offset;
-                       prev = cur;
-
-                       mbuf_offset = 0;
-                       mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
-               }
-       }
-
-       prev->data_len = mbuf_offset;
-       m->pkt_len    += mbuf_offset;
-
-       if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
-               vhost_dequeue_offload(hdr, m);
-
-       return 0;
-}
-
-uint16_t
-rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
-       struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
-{
-       struct virtio_net *dev;
-       struct rte_mbuf *rarp_mbuf = NULL;
-       struct vhost_virtqueue *vq;
-       uint32_t desc_indexes[MAX_PKT_BURST];
-       uint32_t used_idx;
-       uint32_t i = 0;
-       uint16_t free_entries;
-       uint16_t avail_idx;
-
-       dev = get_device(vid);
-       if (!dev)
-               return 0;
-
-       if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
-               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-                       dev->vid, __func__, queue_id);
-               return 0;
-       }
-
-       vq = dev->virtqueue[queue_id];
-       if (unlikely(vq->enabled == 0))
-               return 0;
-
-       /*
-        * Construct a RARP broadcast packet, and inject it to the "pkts"
-        * array, to looks like that guest actually send such packet.
-        *
-        * Check user_send_rarp() for more information.
-        */
-       if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
-                                        &dev->broadcast_rarp.cnt, 1, 0))) {
-               rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
-               if (rarp_mbuf == NULL) {
-                       RTE_LOG(ERR, VHOST_DATA,
-                               "Failed to allocate memory for mbuf.\n");
-                       return 0;
-               }
-
-               if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
-                       rte_pktmbuf_free(rarp_mbuf);
-                       rarp_mbuf = NULL;
-               } else {
-                       count -= 1;
-               }
-       }
-
-       avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
-       free_entries = avail_idx - vq->last_used_idx;
-       if (free_entries == 0)
-               goto out;
-
-       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-
-       /* Prefetch available ring to retrieve head indexes. */
-       used_idx = vq->last_used_idx & (vq->size - 1);
-       rte_prefetch0(&vq->avail->ring[used_idx]);
-       rte_prefetch0(&vq->used->ring[used_idx]);
-
-       count = RTE_MIN(count, MAX_PKT_BURST);
-       count = RTE_MIN(count, free_entries);
-       LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
-                       dev->vid, count);
-
-       /* Retrieve all of the head indexes first to avoid caching issues. */
-       for (i = 0; i < count; i++) {
-               used_idx = (vq->last_used_idx + i) & (vq->size - 1);
-               desc_indexes[i] = vq->avail->ring[used_idx];
-
-               vq->used->ring[used_idx].id  = desc_indexes[i];
-               vq->used->ring[used_idx].len = 0;
-               vhost_log_used_vring(dev, vq,
-                               offsetof(struct vring_used, ring[used_idx]),
-                               sizeof(vq->used->ring[used_idx]));
-       }
-
-       /* Prefetch descriptor index. */
-       rte_prefetch0(&vq->desc[desc_indexes[0]]);
-       for (i = 0; i < count; i++) {
-               int err;
-
-               if (likely(i + 1 < count))
-                       rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
-
-               pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
-               if (unlikely(pkts[i] == NULL)) {
-                       RTE_LOG(ERR, VHOST_DATA,
-                               "Failed to allocate memory for mbuf.\n");
-                       break;
-               }
-               err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
-                                       mbuf_pool);
-               if (unlikely(err)) {
-                       rte_pktmbuf_free(pkts[i]);
-                       break;
-               }
-       }
-
-       rte_smp_wmb();
-       rte_smp_rmb();
-       vq->used->idx += i;
-       vq->last_used_idx += i;
-       vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-                       sizeof(vq->used->idx));
-
-       /* Kick guest if required. */
-       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-                       && (vq->callfd >= 0))
-               eventfd_write(vq->callfd, (eventfd_t)1);
-
-out:
-       if (unlikely(rarp_mbuf != NULL)) {
-               /*
-                * Inject it to the head of "pkts" array, so that switch's mac
-                * learning table will get updated first.
-                */
-               memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
-               pkts[0] = rarp_mbuf;
-               i += 1;
-       }
-
-       return i;
-}
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c

new file mode 100644 (file)

index 0000000..c4714b7
--- /dev/null
+++ b/lib/librte_vhost/vhost_user.c
@@ -0,0 +1,1040 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "vhost.h"
+#include "vhost_user.h"
+
+static const char *vhost_message_str[VHOST_USER_MAX] = { /* request -> name */
+       [VHOST_USER_NONE]                  = "VHOST_USER_NONE",
+       [VHOST_USER_GET_FEATURES]          = "VHOST_USER_GET_FEATURES",
+       [VHOST_USER_SET_FEATURES]          = "VHOST_USER_SET_FEATURES",
+       [VHOST_USER_SET_OWNER]             = "VHOST_USER_SET_OWNER",
+       [VHOST_USER_RESET_OWNER]           = "VHOST_USER_RESET_OWNER",
+       [VHOST_USER_SET_MEM_TABLE]         = "VHOST_USER_SET_MEM_TABLE",
+       [VHOST_USER_SET_LOG_BASE]          = "VHOST_USER_SET_LOG_BASE",
+       [VHOST_USER_SET_LOG_FD]            = "VHOST_USER_SET_LOG_FD",
+       [VHOST_USER_SET_VRING_NUM]         = "VHOST_USER_SET_VRING_NUM",
+       [VHOST_USER_SET_VRING_ADDR]        = "VHOST_USER_SET_VRING_ADDR",
+       [VHOST_USER_SET_VRING_BASE]        = "VHOST_USER_SET_VRING_BASE",
+       [VHOST_USER_GET_VRING_BASE]        = "VHOST_USER_GET_VRING_BASE",
+       [VHOST_USER_SET_VRING_KICK]        = "VHOST_USER_SET_VRING_KICK",
+       [VHOST_USER_SET_VRING_CALL]        = "VHOST_USER_SET_VRING_CALL",
+       [VHOST_USER_SET_VRING_ERR]         = "VHOST_USER_SET_VRING_ERR",
+       [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+       [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+       [VHOST_USER_GET_QUEUE_NUM]         = "VHOST_USER_GET_QUEUE_NUM",
+       [VHOST_USER_SET_VRING_ENABLE]      = "VHOST_USER_SET_VRING_ENABLE",
+       [VHOST_USER_SEND_RARP]             = "VHOST_USER_SEND_RARP",
+};
+
+struct orig_region_map { /* per-region record released by free_mem_region() */
+       int fd;                  /* close()d together with the mapping */
+       uint64_t mapped_address; /* munmap() base; 0 means nothing to release */
+       uint64_t mapped_size;    /* length passed to munmap() */
+       uint64_t blksz;          /* NOTE(review): presumably get_blk_size(fd) */
+};
+
+#define orig_region(ptr, nregions) /* records past header + nregions entries */ \
+       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
+               sizeof(struct virtio_memory) + \
+               sizeof(struct virtio_memory_regions) * (nregions)))
+
+static uint64_t
+get_blk_size(int fd)
+{
+       struct stat st; /* only st_blksize is consumed */
+
+       if (fstat(fd, &st) == -1)
+               return (uint64_t)-1;    /* error sentinel: all bits set */
+       return (uint64_t)st.st_blksize;
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+       struct orig_region_map *map;    /* walks the records behind dev->mem */
+       unsigned int i;
+
+       if (dev == NULL || dev->mem == NULL)
+               return;
+
+       map = orig_region(dev->mem, dev->mem->nregions);
+       for (i = 0; i < dev->mem->nregions; i++, map++) {
+               if (map->mapped_address == 0)
+                       continue;       /* nothing mapped: skip munmap/close */
+               munmap((void *)(uintptr_t)map->mapped_address,
+                      map->mapped_size);
+               close(map->fd);
+       }
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+       if (dev->mem != NULL) {         /* regions first, then their table */
+               free_mem_region(dev);
+               free(dev->mem);
+               dev->mem = NULL;
+       }
+       if (dev->log_addr != 0) {       /* zeroed below so it is unmapped once */
+               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+               dev->log_addr = 0;
+       }
+}
+
+/*
+ * Owner hook for device 'vid'.
+ *
+ * Nothing is recorded at the moment; the call only checks that
+ * get_device() knows the vid.
+ * Returns 0 on success, -1 when get_device() yields NULL.
+ */
+static int
+vhost_set_owner(int vid)
+{
+       if (get_device(vid) == NULL)
+               return -1;
+
+       return 0;
+}
+
+static int
+vhost_reset_owner(int vid)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (!dev)
+               return -1;
+
+       /* Clear the running flag before invoking the destroy_device callback. */
+       if ((dev->flags & VIRTIO_DEV_RUNNING) != 0) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       cleanup_device(dev, 0);
+       reset_device(dev);
+       return 0;
+}
+
+/*
+ * Feature query: store the feature bits this backend supports
+ * (VHOST_FEATURES) in *pu.
+ *
+ * Returns 0 on success, -1 if get_device() does not know the vid.
+ */
+static int
+vhost_get_features(int vid, uint64_t *pu)
+{
+       if (get_device(vid) == NULL)
+               return -1;
+
+       *pu = VHOST_FEATURES;
+
+       return 0;
+}
+
+/*
+ * Store the negotiated feature bits for 'vid' and pick the matching header
+ * size. Returns 0, or -1 for an unknown vid / a bit outside VHOST_FEATURES.
+ */
+static int
+vhost_set_features(int vid, uint64_t *pu)
+{
+       const uint64_t mrg_rxbuf = 1ULL << VIRTIO_NET_F_MRG_RXBUF;
+       const uint64_t version_1 = 1ULL << VIRTIO_F_VERSION_1;
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL || (*pu & ~VHOST_FEATURES))
+               return -1;
+
+       dev->features = *pu;
+       /* Either bit selects the struct virtio_net_hdr_mrg_rxbuf layout. */
+       if (dev->features & (mrg_rxbuf | version_1))
+               dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+       else
+               dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+
+       LOG_DEBUG(VHOST_CONFIG,
+               "(%d) mergeable RX buffers %s, virtio 1 %s\n",
+               dev->vid,
+               (dev->features & mrg_rxbuf) ? "on" : "off",
+               (dev->features & version_1) ? "on" : "off");
+
+       return 0;
+}
+
+/*
+ * Record the descriptor ring size (state->num) sent by the virtio device for
+ * queue state->index (per the original note: txq is 1, rxq is 0).
+ */
+static int
+vhost_set_vring_num(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (!dev)
+               return -1;
+
+       /* NOTE(review): state->index is not range-checked here -- confirm. */
+       dev->virtqueue[state->index]->size = state->num;
+
+       return 0;
+}
+
+/*
+ * Move the virtio_net device and the vhost_virtqueue pair starting at 'index'
+ * onto the NUMA node that holds the vring descriptors; returns the live dev.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+       int oldnode, newnode;
+       struct virtio_net *old_dev;
+       struct vhost_virtqueue *old_vq, *vq;
+       int ret;
+
+       /*
+        * Both queues of a pair are copied in one go below (VIRTIO_QNUM
+        * entries), so only act on the first queue of each pair.
+        */
+       if (index % VIRTIO_QNUM != 0)
+               return dev;
+
+       old_dev = dev;
+       vq = old_vq = dev->virtqueue[index];
+
+       ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+                           MPOL_F_NODE | MPOL_F_ADDR); /* node of the desc ring */
+
+       /* oldnode: node of the vq structure itself; relocate vq if it differs */
+       ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+                            MPOL_F_NODE | MPOL_F_ADDR);
+       if (ret) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Unable to get vq numa information.\n");
+               return dev;
+       }
+       if (oldnode != newnode) {
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "reallocate vq from %d to %d node\n", oldnode, newnode);
+               vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
+                                      newnode);
+               if (!vq)
+                       return dev; /* allocation failed: keep old placement */
+
+               memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
+               rte_free(old_vq); /* old_vq must not be used past this point */
+       }
+
+       /* oldnode is reused: now the node of the device structure itself */
+       ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+                           MPOL_F_NODE | MPOL_F_ADDR);
+       if (ret) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "Unable to get dev numa information.\n");
+               goto out; /* vq may already have moved; still rewire it */
+       }
+       if (oldnode != newnode) {
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "reallocate dev from %d to %d node\n",
+                       oldnode, newnode);
+               dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+               if (!dev) {
+                       dev = old_dev; /* fall back to the existing device */
+                       goto out;
+               }
+
+               memcpy(dev, old_dev, sizeof(*dev));
+               rte_free(old_dev);
+       }
+
+out: /* also reached by falling through: publish the final vq/dev pointers */
+       dev->virtqueue[index] = vq;
+       dev->virtqueue[index + 1] = vq + 1;
+       vhost_devices[dev->vid] = dev;
+
+       return dev;
+}
+#else /* !RTE_LIBRTE_VHOST_NUMA: nothing to relocate, dev is returned as is */
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+       return dev;
+}
+#endif /* RTE_LIBRTE_VHOST_NUMA */
+
+/*
+ * Converts QEMU virtual address to Vhost virtual address. This function is
+ * used to convert the ring addresses to our address space.
+ *
+ * dev:     device whose memory table (dev->mem) is searched; the caller must
+ *          have checked that dev->mem is set.
+ * qemu_va: address in QEMU's virtual address space.
+ *
+ * Returns the matching address in this process, or 0 when no region of the
+ * memory table contains qemu_va.
+ */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
+{
+       struct virtio_memory_regions *region;
+       uint32_t regionidx;
+
+       /* Find the region where the address lives. */
+       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+               region = &dev->mem->regions[regionidx];
+
+               /*
+                * A region spans [userspace_address, userspace_address +
+                * memory_size): the end is exclusive. Accepting the end
+                * address itself would translate a pointer to the first byte
+                * past the region, which is not backed by this mapping.
+                */
+               if ((qemu_va >= region->userspace_address) &&
+                       (qemu_va < region->userspace_address +
+                       region->memory_size)) {
+                       return qemu_va + region->guest_phys_address +
+                               region->address_offset -
+                               region->userspace_address;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ *
+ * vid:  id of the vhost device.
+ * addr: ring addresses in QEMU virtual address space, plus the queue index
+ *       and the guest address used for dirty logging of the used ring.
+ *
+ * Returns 0 on success, -1 when the device is unknown, has no memory table
+ * yet, or one of the three ring addresses cannot be translated.
+ */
+static int
+vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
+{
+       struct virtio_net *dev;
+       struct vhost_virtqueue *vq;
+
+       dev = get_device(vid);
+       if ((dev == NULL) || (dev->mem == NULL))
+               return -1;
+
+       /* addr->index refers to the queue index. The txq 1, rxq is 0. */
+       vq = dev->virtqueue[addr->index];
+
+       /* The addresses are converted from QEMU virtual to Vhost virtual. */
+       vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
+                       addr->desc_user_addr);
+       /* qva_to_vva() yields 0 when no memory region holds the address. */
+       if (vq->desc == 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to find desc ring address.\n",
+                       dev->vid);
+               return -1;
+       }
+
+       /*
+        * numa_realloc() needs vq->desc and may replace both the device and
+        * the virtqueue, so both pointers are reloaded before further use.
+        */
+       dev = numa_realloc(dev, addr->index);
+       vq = dev->virtqueue[addr->index];
+
+       vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
+                       addr->avail_user_addr);
+       if (vq->avail == 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to find avail ring address.\n",
+                       dev->vid);
+               return -1;
+       }
+
+       vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
+                       addr->used_user_addr);
+       if (vq->used == 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to find used ring address.\n",
+                       dev->vid);
+               return -1;
+       }
+
+       /* Resynchronise our index with the one stored in the used ring. */
+       if (vq->last_used_idx != vq->used->idx) {
+               RTE_LOG(WARNING, VHOST_CONFIG,
+                       "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
+                       "some packets maybe resent for Tx and dropped for Rx\n",
+                       vq->last_used_idx, vq->used->idx);
+               vq->last_used_idx     = vq->used->idx;
+       }
+
+       vq->log_guest_addr = addr->log_guest_addr;
+
+       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+                       dev->vid, vq->desc);
+       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+                       dev->vid, vq->avail);
+       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+                       dev->vid, vq->used);
+       LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+                       dev->vid, vq->log_guest_addr);
+
+       return 0;
+}
+
+/*
+ * Store the index sent by the virtio device as the last used index of the
+ * selected virtqueue.
+ *
+ * Returns 0 on success, -1 when no device matches 'vid'.
+ */
+static int
+vhost_set_vring_base(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_virtqueue *vq;
+
+       if (!dev)
+               return -1;
+
+       /* state->index selects the queue: rxq is 0, txq is 1. */
+       vq = dev->virtqueue[state->index];
+       vq->last_used_idx = state->num;
+
+       return 0;
+}
+
+/*
+ * Report the last used index of virtqueue 'index' back to the virtio
+ * device through 'state' (both state->index and state->num are filled in).
+ *
+ * Returns 0 on success, -1 when no device matches 'vid'.
+ */
+static int
+vhost_get_vring_base(int vid, uint32_t index,
+       struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (!dev)
+               return -1;
+
+       /* The index selects the queue: rxq is 0, txq is 1. */
+       state->index = index;
+       state->num = dev->virtqueue[state->index]->last_used_idx;
+
+       return 0;
+}
+
+/*
+ * Install the eventfd used to interrupt the guest for one virtqueue. The fd
+ * has been copied into our process space by the time we get here.
+ *
+ * Returns 0 on success, -1 when the device is unknown or the queue pair
+ * could not be allocated.
+ */
+static int
+vhost_set_vring_call(int vid, struct vhost_vring_file *file)
+{
+       uint32_t qp_idx = file->index / VIRTIO_QNUM;
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_virtqueue *vq;
+
+       if (!dev)
+               return -1;
+
+       /*
+        * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
+        * we get, so we do vring queue pair allocation here.
+        */
+       if (qp_idx >= dev->virt_qp_nb &&
+           alloc_vring_queue_pair(dev, qp_idx) < 0)
+               return -1;
+
+       /* file->index selects the queue: rxq is 0, txq is 1. */
+       vq = dev->virtqueue[file->index];
+       assert(vq != NULL);
+
+       /* Release the eventfd left by an earlier message before replacing it. */
+       if (vq->callfd >= 0)
+               close(vq->callfd);
+       vq->callfd = file->fd;
+
+       return 0;
+}
+
+/*
+ * Install the eventfd the virtio device uses to notify us about one
+ * virtqueue. The fd has been copied into our process space already.
+ *
+ * Returns 0 on success, -1 when no device matches 'vid'.
+ */
+static int
+vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
+{
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_virtqueue *vq;
+
+       if (!dev)
+               return -1;
+
+       /* file->index selects the queue: rxq is 0, txq is 1. */
+       vq = dev->virtqueue[file->index];
+
+       /* Release the eventfd left by an earlier message before replacing it. */
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+       vq->kickfd = file->fd;
+
+       return 0;
+}
+
+/*
+ * Handle VHOST_USER_SET_MEM_TABLE: replace the guest memory map of a device.
+ *
+ * A running device is first removed from the data plane and any previous
+ * memory table is released. Every region described in the message is then
+ * mmap()ed from the file descriptor that came with it, and the translation
+ * data (guest physical range, QEMU virtual address, offset into our mapping)
+ * is recorded in dev->mem.
+ *
+ * vid:  id of the vhost device.
+ * pmsg: received message; payload.memory describes the regions and fds[]
+ *       holds one file descriptor per region.
+ *
+ * Returns 0 on success. Returns -1 when the device is unknown, the region
+ * count is out of range, memory cannot be allocated or a region cannot be
+ * mapped; in the last case the regions mapped so far are unmapped again and
+ * dev->mem is left NULL.
+ */
+static int
+user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
+{
+       struct VhostUserMemory memory = pmsg->payload.memory;
+       struct virtio_memory_regions *pregion;
+       uint64_t mapped_address, mapped_size;
+       struct virtio_net *dev;
+       unsigned int idx = 0;
+       struct orig_region_map *pregion_orig;
+       uint64_t alignment;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /*
+        * nregions is supplied by the peer. Both memory.regions[] and
+        * pmsg->fds[] hold VHOST_MEMORY_MAX_NREGIONS entries, so a larger
+        * count would make the loop below read past the end of both arrays.
+        * Reject it before touching the current device state.
+        */
+       if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) too many memory regions (%u)\n",
+                       dev->vid, memory.nregions);
+               return -1;
+       }
+
+       /* Remove from the data plane. */
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       /* Release the previous memory table, if any. */
+       if (dev->mem) {
+               free_mem_region(dev);
+               free(dev->mem);
+               dev->mem = NULL;
+       }
+
+       /*
+        * One allocation holds the table header, the translated regions and
+        * the bookkeeping of the original mappings (see orig_region()).
+        */
+       dev->mem = calloc(1,
+               sizeof(struct virtio_memory) +
+               sizeof(struct virtio_memory_regions) * memory.nregions +
+               sizeof(struct orig_region_map) * memory.nregions);
+       if (dev->mem == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to allocate memory for dev->mem\n",
+                       dev->vid);
+               return -1;
+       }
+       dev->mem->nregions = memory.nregions;
+
+       pregion_orig = orig_region(dev->mem, memory.nregions);
+       for (idx = 0; idx < memory.nregions; idx++) {
+               pregion = &dev->mem->regions[idx];
+               pregion->guest_phys_address =
+                       memory.regions[idx].guest_phys_addr;
+               pregion->guest_phys_address_end =
+                       memory.regions[idx].guest_phys_addr +
+                       memory.regions[idx].memory_size;
+               pregion->memory_size =
+                       memory.regions[idx].memory_size;
+               pregion->userspace_address =
+                       memory.regions[idx].userspace_addr;
+
+               /*
+                * The file is mapped from offset 0, so the mapping has to
+                * cover the offset in front of the region as well.
+                */
+               mapped_size = memory.regions[idx].memory_size +
+                       memory.regions[idx].mmap_offset;
+
+               /* mmap() without flag of MAP_ANONYMOUS, should be called
+                * with length argument aligned with hugepagesz at older
+                * longterm version Linux, like 2.6.32 and 3.2.72, or
+                * mmap() will fail with EINVAL.
+                *
+                * to avoid failure, make sure in caller to keep length
+                * aligned.
+                */
+               alignment = get_blk_size(pmsg->fds[idx]);
+               if (alignment == (uint64_t)-1) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "couldn't get hugepage size through fstat\n");
+                       goto err_mmap;
+               }
+               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
+
+               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
+                       mapped_size,
+                       PROT_READ | PROT_WRITE, MAP_SHARED,
+                       pmsg->fds[idx],
+                       0);
+
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
+                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
+                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
+                       mapped_size, memory.regions[idx].mmap_offset,
+                       alignment);
+
+               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "mmap qemu guest failed.\n");
+                       goto err_mmap;
+               }
+
+               /* Remember the raw mapping so it can be undone later. */
+               pregion_orig[idx].mapped_address = mapped_address;
+               pregion_orig[idx].mapped_size = mapped_size;
+               pregion_orig[idx].blksz = alignment;
+               pregion_orig[idx].fd = pmsg->fds[idx];
+
+               /* The region itself starts mmap_offset bytes into the mapping. */
+               mapped_address +=  memory.regions[idx].mmap_offset;
+
+               /* Added to a guest physical address, this gives our address. */
+               pregion->address_offset = mapped_address -
+                       pregion->guest_phys_address;
+
+               if (memory.regions[idx].guest_phys_addr == 0) {
+                       dev->mem->base_address =
+                               memory.regions[idx].userspace_addr;
+                       dev->mem->mapped_address =
+                               pregion->address_offset;
+               }
+
+               LOG_DEBUG(VHOST_CONFIG,
+                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
+                       idx,
+                       (void *)(uintptr_t)pregion->guest_phys_address,
+                       (void *)(uintptr_t)pregion->userspace_address,
+                        pregion->memory_size);
+       }
+
+       return 0;
+
+err_mmap:
+       /* Undo the regions that were mapped before the failing one. */
+       while (idx--) {
+               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
+                               pregion_orig[idx].mapped_size);
+               close(pregion_orig[idx].fd);
+       }
+       free(dev->mem);
+       dev->mem = NULL;
+       return -1;
+}
+
+/*
+ * A virtqueue is ready once it exists, its descriptor ring is mapped and
+ * both its kick and call eventfds have been set.
+ *
+ * Returns 1 when ready, 0 otherwise.
+ */
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+       if (!vq || !vq->desc)
+               return 0;
+
+       if (vq->kickfd == VIRTIO_UNINITIALIZED_EVENTFD)
+               return 0;
+
+       return vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
+}
+
+/*
+ * Check whether every allocated queue pair of the device has both its Rx
+ * and its Tx virtqueue ready, logging the outcome.
+ *
+ * Returns 1 when all pairs are ready, 0 as soon as one queue is not.
+ */
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+       uint32_t qp;
+
+       for (qp = 0; qp < dev->virt_qp_nb; qp++) {
+               uint32_t base = qp * VIRTIO_QNUM;
+
+               if (vq_is_ready(dev->virtqueue[base + VIRTIO_RXQ]) &&
+                   vq_is_ready(dev->virtqueue[base + VIRTIO_TXQ]))
+                       continue;
+
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "virtio is not ready for processing.\n");
+               return 0;
+       }
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "virtio is now ready for processing.\n");
+       return 1;
+}
+
+/*
+ * Handle VHOST_USER_SET_VRING_CALL: decode the queue index and the optional
+ * eventfd from the message and hand them to vhost_set_vring_call().
+ */
+static void
+user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
+{
+       uint64_t req = pmsg->payload.u64;
+       struct vhost_vring_file file;
+
+       file.index = req & VHOST_USER_VRING_IDX_MASK;
+       /* The NOFD bit tells us that no descriptor came with the message. */
+       file.fd = (req & VHOST_USER_VRING_NOFD_MASK) ?
+                       VIRTIO_INVALID_EVENTFD : pmsg->fds[0];
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring call idx:%d file:%d\n", file.index, file.fd);
+       vhost_set_vring_call(vid, &file);
+}
+
+/*
+ * Handle VHOST_USER_SET_VRING_KICK: install the kick eventfd, then check
+ * whether the device has become ready for packet processing. If it has and
+ * it is not running yet, the application is told through new_device() and
+ * the device is flagged as running when that callback succeeds.
+ */
+static void
+user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
+{
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_vring_file file;
+       uint64_t req;
+
+       if (dev == NULL)
+               return;
+
+       req = pmsg->payload.u64;
+       file.index = req & VHOST_USER_VRING_IDX_MASK;
+       /* The NOFD bit tells us that no descriptor came with the message. */
+       file.fd = (req & VHOST_USER_VRING_NOFD_MASK) ?
+                       VIRTIO_INVALID_EVENTFD : pmsg->fds[0];
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring kick idx:%d file:%d\n", file.index, file.fd);
+       vhost_set_vring_kick(vid, &file);
+
+       if (!virtio_is_ready(dev) || (dev->flags & VIRTIO_DEV_RUNNING))
+               return;
+
+       if (notify_ops->new_device(vid) == 0)
+               dev->flags |= VIRTIO_DEV_RUNNING;
+}
+
+/*
+ * Handle VHOST_USER_GET_VRING_BASE, which qemu sends when virtio is
+ * stopped: take the device out of the data plane, report the last used
+ * index of the queue in 'state' and drop the queue's kick eventfd.
+ *
+ * Returns 0 on success, -1 when no device matches 'vid'.
+ */
+static int
+user_get_vring_base(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_virtqueue *vq;
+
+       if (!dev)
+               return -1;
+
+       /* We have to stop the queue (virtio) if it is running. */
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       /* Here we are safe to get the last used index */
+       vhost_get_vring_base(vid, state->index, state);
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring base idx:%d file:%d\n", state->index, state->num);
+
+       /*
+        * Based on current qemu vhost-user implementation, this message is
+        * sent and only sent in vhost_vring_stop.
+        * TODO: cleanup the vring, it isn't usable since here.
+        */
+       vq = dev->virtqueue[state->index];
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+       return 0;
+}
+
+/*
+ * Handle VHOST_USER_SET_VRING_ENABLE, sent by qemu once the virtio queues
+ * are ready to work: notify the application through the optional
+ * vring_state_changed() callback, then record the new state in the queue.
+ *
+ * Returns 0 on success, -1 when no device matches 'vid'.
+ */
+static int
+user_set_vring_enable(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+       int enable;
+
+       if (!dev)
+               return -1;
+
+       enable = (int)state->num;
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "set queue enable: %d to qp idx: %d\n",
+               enable, state->index);
+
+       /* The callback is optional. */
+       if (notify_ops->vring_state_changed)
+               notify_ops->vring_state_changed(vid, state->index, enable);
+
+       dev->virtqueue[state->index]->enabled = enable;
+
+       return 0;
+}
+
+/*
+ * Handle VHOST_USER_SET_PROTOCOL_FEATURES: remember the protocol features
+ * chosen by the peer. The request is silently ignored when the device is
+ * unknown or when it asks for a bit outside VHOST_USER_PROTOCOL_FEATURES.
+ */
+static void
+user_set_protocol_features(int vid, uint64_t protocol_features)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return;
+
+       /* Refuse any feature bit we did not advertise. */
+       if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+               return;
+
+       dev->protocol_features = protocol_features;
+}
+
+/*
+ * Handle VHOST_USER_SET_LOG_BASE: map the dirty-page log shared by the peer.
+ *
+ * The log occupies 'mmap_size' bytes starting 'mmap_offset' bytes into the
+ * file behind msg->fds[0]. The file is mapped from offset 0 (see below), so
+ * the mapping must span mmap_offset + mmap_size bytes for the whole log to be
+ * addressable through dev->log_base. The descriptor is closed once it is no
+ * longer needed, on the failure paths that follow the fd check as well.
+ *
+ * Returns 0 on success, -1 when the device is unknown, the descriptor or the
+ * message size is invalid, the requested range overflows, or mmap() fails.
+ */
+static int
+user_set_log_base(int vid, struct VhostUserMsg *msg)
+{
+       struct virtio_net *dev;
+       int fd = msg->fds[0];
+       uint64_t size, off;
+       void *addr;
+
+       dev = get_device(vid);
+       if (!dev)
+               return -1;
+
+       if (fd < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+               return -1;
+       }
+
+       if (msg->size != sizeof(VhostUserLog)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "invalid log base msg size: %"PRId32" != %d\n",
+                       msg->size, (int)sizeof(VhostUserLog));
+               /* The descriptor is ours; do not leak it on rejection. */
+               close(fd);
+               return -1;
+       }
+
+       size = msg->payload.log.mmap_size;
+       off  = msg->payload.log.mmap_offset;
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "log mmap size: %"PRId64", offset: %"PRId64"\n",
+               size, off);
+
+       /* Both values come from the peer: refuse a range that wraps around. */
+       if (size + off < size) {
+               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+               close(fd);
+               return -1;
+       }
+
+       /*
+        * mmap from 0 to workaround a hugepage mmap bug: mmap will
+        * fail when offset is not page size aligned.
+        *
+        * Since the log starts 'off' bytes into the file, the mapping has
+        * to cover off + size bytes.
+        */
+       addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       close(fd);
+       if (addr == MAP_FAILED) {
+               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+               return -1;
+       }
+
+       /*
+        * Free previously mapped log memory on occasionally
+        * multiple VHOST_USER_SET_LOG_BASE.
+        *
+        * log_base - log_addr is the offset of the previous mapping, so
+        * the length below is the full span that was mapped for it.
+        */
+       if (dev->log_addr) {
+               munmap((void *)(uintptr_t)dev->log_addr,
+                       dev->log_size + (dev->log_base - dev->log_addr));
+       }
+       dev->log_addr = (uint64_t)(uintptr_t)addr;
+       dev->log_base = dev->log_addr + off;
+       dev->log_size = size;
+
+       return 0;
+}
+
+/*
+ * A RARP packet is broadcast after a migration to tell switches about the
+ * new location of the VM, so that packets from outside are not lost.
+ *
+ * No packet is actually sent here: the MAC address carried by the message is
+ * stored in the device and the 'broadcast_rarp' flag is raised so that
+ * rte_vhost_dequeue_burst() injects the packet.
+ *
+ * Returns 0 on success, -1 when no device matches 'vid'.
+ */
+static int
+user_send_rarp(int vid, struct VhostUserMsg *msg)
+{
+       struct virtio_net *dev = get_device(vid);
+       uint8_t *mac;
+
+       if (dev == NULL)
+               return -1;
+
+       /* The MAC address is carried in the first six bytes of the payload. */
+       mac = (uint8_t *)&msg->payload.u64;
+       RTE_LOG(DEBUG, VHOST_CONFIG,
+               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+       memcpy(dev->mac.addr_bytes, mac, 6);
+
+       /*
+        * Raise the flag that makes rte_vhost_dequeue_burst() inject a
+        * RARP broadcast packet.
+        *
+        * The rte_smp_wmb() makes sure the mac is copied before the flag
+        * is set.
+        */
+       rte_smp_wmb();
+       rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+       return 0;
+}
+
+/*
+ * Read one vhost-user message from 'sockfd' into 'msg': the fixed header
+ * first, then the payload whose length the header announces.
+ *
+ * Returns the byte count of the last read on success, 0 or a negative
+ * value when a read returns that, and -1 when the announced payload is
+ * larger than the payload buffer or is only partially received.
+ */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+       int nread;
+
+       nread = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+               msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+       if (nread <= 0)
+               return nread;
+
+       /* A message without payload is complete at this point. */
+       if (!msg || !msg->size)
+               return nread;
+
+       if (msg->size > sizeof(msg->payload)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "invalid msg size: %d\n", msg->size);
+               return -1;
+       }
+
+       nread = read(sockfd, &msg->payload, msg->size);
+       if (nread <= 0)
+               return nread;
+
+       if (nread != (int)msg->size) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "read control message failed\n");
+               return -1;
+       }
+
+       return nread;
+}
+
+/*
+ * Send 'msg' back to the peer as a reply: the version field of the flags is
+ * set to VHOST_USER_VERSION and the reply bit is raised before the header
+ * and msg->size bytes of payload are written. No descriptors are passed.
+ *
+ * Returns 0 when 'msg' is NULL, otherwise whatever send_fd_message() returns.
+ */
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+       if (msg == NULL)
+               return 0;
+
+       msg->flags = (msg->flags & ~VHOST_USER_VERSION_MASK) |
+                    VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
+
+       return send_fd_message(sockfd, (char *)msg,
+               VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+}
+
+/*
+ * Read one vhost-user message from 'fd' and dispatch it for device 'vid'.
+ *
+ * Requests that expect an answer (GET_FEATURES, GET_PROTOCOL_FEATURES,
+ * SET_LOG_BASE, GET_VRING_BASE, GET_QUEUE_NUM) are replied to on the same
+ * socket, reusing the received message as the reply buffer.
+ *
+ * Returns 0 once a message has been dispatched, whatever the result of the
+ * individual handler. Returns -1 when the message cannot be read, the peer
+ * closed the connection, or the request code is not below VHOST_USER_MAX.
+ */
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+       struct VhostUserMsg msg;
+       uint64_t features = 0;
+       int ret;
+
+       ret = read_vhost_message(fd, &msg);
+       if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+               if (ret < 0)
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost read message failed\n");
+               else if (ret == 0)
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "vhost peer closed\n");
+               else
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost read incorrect message\n");
+
+               return -1;
+       }
+
+       /* msg.request was range-checked above, so the lookup is in bounds. */
+       RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+               vhost_message_str[msg.request]);
+       switch (msg.request) {
+       case VHOST_USER_GET_FEATURES:
+               ret = vhost_get_features(vid, &features);
+               msg.payload.u64 = features;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+       case VHOST_USER_SET_FEATURES:
+               features = msg.payload.u64;
+               vhost_set_features(vid, &features);
+               break;
+
+       case VHOST_USER_GET_PROTOCOL_FEATURES:
+               msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+       case VHOST_USER_SET_PROTOCOL_FEATURES:
+               user_set_protocol_features(vid, msg.payload.u64);
+               break;
+
+       case VHOST_USER_SET_OWNER:
+               vhost_set_owner(vid);
+               break;
+       case VHOST_USER_RESET_OWNER:
+               vhost_reset_owner(vid);
+               break;
+
+       case VHOST_USER_SET_MEM_TABLE:
+               user_set_mem_table(vid, &msg);
+               break;
+
+       case VHOST_USER_SET_LOG_BASE:
+               user_set_log_base(vid, &msg);
+
+               /* it needs a reply */
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+       case VHOST_USER_SET_LOG_FD:
+               /* Unsupported: just release the descriptor that came along. */
+               close(msg.fds[0]);
+               RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+               break;
+
+       case VHOST_USER_SET_VRING_NUM:
+               vhost_set_vring_num(vid, &msg.payload.state);
+               break;
+       case VHOST_USER_SET_VRING_ADDR:
+               vhost_set_vring_addr(vid, &msg.payload.addr);
+               break;
+       case VHOST_USER_SET_VRING_BASE:
+               vhost_set_vring_base(vid, &msg.payload.state);
+               break;
+
+       case VHOST_USER_GET_VRING_BASE:
+               ret = user_get_vring_base(vid, &msg.payload.state);
+               msg.size = sizeof(msg.payload.state);
+               send_vhost_message(fd, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_KICK:
+               user_set_vring_kick(vid, &msg);
+               break;
+       case VHOST_USER_SET_VRING_CALL:
+               user_set_vring_call(vid, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_ERR:
+               /* Unsupported: release the descriptor if one was sent. */
+               if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+                       close(msg.fds[0]);
+               RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+               break;
+
+       case VHOST_USER_GET_QUEUE_NUM:
+               msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(fd, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_ENABLE:
+               user_set_vring_enable(vid, &msg.payload.state);
+               break;
+       case VHOST_USER_SEND_RARP:
+               user_send_rarp(vid, &msg);
+               break;
+
+       default:
+               /* Includes VHOST_USER_NONE: nothing to do. */
+               break;
+
+       }
+
+       return 0;
+}
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h

new file mode 100644 (file)

index 0000000..ba78d32
--- /dev/null
+++ b/lib/librte_vhost/vhost_user.h
@@ -0,0 +1,128 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_virtio_net.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+/*
+ * Capacity of the region table in VhostUserMemory and of the descriptor
+ * array in VhostUserMsg.
+ */
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/* Bit positions of the individual vhost-user protocol features. */
+#define VHOST_USER_PROTOCOL_F_MQ       0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD        1
+#define VHOST_USER_PROTOCOL_F_RARP     2
+
+/* Mask of all the protocol features listed above. */
+#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP))
+
+/*
+ * Request codes carried in the 'request' field of VhostUserMsg. The values
+ * are explicit because they are exchanged with the peer.
+ */
+typedef enum VhostUserRequest {
+       VHOST_USER_NONE = 0,
+       VHOST_USER_GET_FEATURES = 1,
+       VHOST_USER_SET_FEATURES = 2,
+       VHOST_USER_SET_OWNER = 3,
+       VHOST_USER_RESET_OWNER = 4,
+       VHOST_USER_SET_MEM_TABLE = 5,
+       VHOST_USER_SET_LOG_BASE = 6,
+       VHOST_USER_SET_LOG_FD = 7,
+       VHOST_USER_SET_VRING_NUM = 8,
+       VHOST_USER_SET_VRING_ADDR = 9,
+       VHOST_USER_SET_VRING_BASE = 10,
+       VHOST_USER_GET_VRING_BASE = 11,
+       VHOST_USER_SET_VRING_KICK = 12,
+       VHOST_USER_SET_VRING_CALL = 13,
+       VHOST_USER_SET_VRING_ERR = 14,
+       VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+       VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+       VHOST_USER_GET_QUEUE_NUM = 17,
+       VHOST_USER_SET_VRING_ENABLE = 18,
+       VHOST_USER_SEND_RARP = 19,
+       /* One past the last valid request; used as the upper bound check. */
+       VHOST_USER_MAX
+} VhostUserRequest;
+
+/* Description of one guest memory region, as sent with SET_MEM_TABLE. */
+typedef struct VhostUserMemoryRegion {
+       uint64_t guest_phys_addr;       /* guest physical start address */
+       uint64_t memory_size;           /* size of the region in bytes */
+       uint64_t userspace_addr;        /* start address in QEMU's address space */
+       uint64_t mmap_offset;           /* offset of the region within its file */
+} VhostUserMemoryRegion;
+
+/* Payload of SET_MEM_TABLE: the guest memory map. */
+typedef struct VhostUserMemory {
+       uint32_t nregions;      /* number of entries used in regions[] */
+       uint32_t padding;
+       VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+/* Payload of SET_LOG_BASE: location of the log within the shared file. */
+typedef struct VhostUserLog {
+       uint64_t mmap_size;     /* size of the log area in bytes */
+       uint64_t mmap_offset;   /* offset of the log area within the file */
+} VhostUserLog;
+
+/*
+ * One vhost-user message: a fixed header (request, flags, size) followed by
+ * a payload whose member in use depends on the request. The structure is
+ * packed. 'fds' sits after the payload and is handed to read_fd_message()
+ * as the place to store descriptors that accompany a message.
+ */
+typedef struct VhostUserMsg {
+       VhostUserRequest request;
+
+/* Bits of 'flags': the protocol version, and the marker for a reply. */
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+       uint32_t flags;
+       uint32_t size; /* the following payload size */
+       union {
+/* Bits of 'u64' for vring kick/call/err: queue index, and "no fd sent". */
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
+               uint64_t u64;
+               struct vhost_vring_state state;
+               struct vhost_vring_addr addr;
+               VhostUserMemory memory;
+               VhostUserLog    log;
+       } payload;
+       int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+/* Size of the fixed header, i.e. of everything in front of the payload. */
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION    0x1
+
+
+/* vhost_user.c */
+/* Read one message from 'fd' and process it for device 'vid'. */
+int vhost_user_msg_handler(int vid, int fd);
+
+/* socket.c */
+/*
+ * Receive / send 'buflen' bytes of 'buf' on 'sockfd' together with up to
+ * 'fd_num' file descriptors in 'fds'.
+ * NOTE(review): summary inferred from the parameter names and the call
+ * sites in vhost_user.c; confirm against the definitions in socket.c.
+ */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
diff --git a/lib/librte_vhost/virtio-net-user.c b/lib/librte_vhost/virtio-net-user.c

deleted file mode 100644 (file)

index e7c4347..0000000
--- a/lib/librte_vhost/virtio-net-user.c
+++ /dev/null
@@ -1,470 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-
-#include "virtio-net-user.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-
-struct orig_region_map {
-       int fd;
-       uint64_t mapped_address;
-       uint64_t mapped_size;
-       uint64_t blksz;
-};
-
-#define orig_region(ptr, nregions) \
-       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
-               sizeof(struct virtio_memory) + \
-               sizeof(struct virtio_memory_regions) * (nregions)))
-
-static uint64_t
-get_blk_size(int fd)
-{
-       struct stat stat;
-       int ret;
-
-       ret = fstat(fd, &stat);
-       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
-}
-
-static void
-free_mem_region(struct virtio_net *dev)
-{
-       struct orig_region_map *region;
-       unsigned int idx;
-
-       if (!dev || !dev->mem)
-               return;
-
-       region = orig_region(dev->mem, dev->mem->nregions);
-       for (idx = 0; idx < dev->mem->nregions; idx++) {
-               if (region[idx].mapped_address) {
-                       munmap((void *)(uintptr_t)region[idx].mapped_address,
-                                       region[idx].mapped_size);
-                       close(region[idx].fd);
-               }
-       }
-}
-
-void
-vhost_backend_cleanup(struct virtio_net *dev)
-{
-       if (dev->mem) {
-               free_mem_region(dev);
-               free(dev->mem);
-               dev->mem = NULL;
-       }
-       if (dev->log_addr) {
-               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
-               dev->log_addr = 0;
-       }
-}
-
-int
-user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
-{
-       struct VhostUserMemory memory = pmsg->payload.memory;
-       struct virtio_memory_regions *pregion;
-       uint64_t mapped_address, mapped_size;
-       struct virtio_net *dev;
-       unsigned int idx = 0;
-       struct orig_region_map *pregion_orig;
-       uint64_t alignment;
-
-       /* unmap old memory regions one by one*/
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* Remove from the data plane. */
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       if (dev->mem) {
-               free_mem_region(dev);
-               free(dev->mem);
-               dev->mem = NULL;
-       }
-
-       dev->mem = calloc(1,
-               sizeof(struct virtio_memory) +
-               sizeof(struct virtio_memory_regions) * memory.nregions +
-               sizeof(struct orig_region_map) * memory.nregions);
-       if (dev->mem == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to allocate memory for dev->mem\n",
-                       dev->vid);
-               return -1;
-       }
-       dev->mem->nregions = memory.nregions;
-
-       pregion_orig = orig_region(dev->mem, memory.nregions);
-       for (idx = 0; idx < memory.nregions; idx++) {
-               pregion = &dev->mem->regions[idx];
-               pregion->guest_phys_address =
-                       memory.regions[idx].guest_phys_addr;
-               pregion->guest_phys_address_end =
-                       memory.regions[idx].guest_phys_addr +
-                       memory.regions[idx].memory_size;
-               pregion->memory_size =
-                       memory.regions[idx].memory_size;
-               pregion->userspace_address =
-                       memory.regions[idx].userspace_addr;
-
-               /* This is ugly */
-               mapped_size = memory.regions[idx].memory_size +
-                       memory.regions[idx].mmap_offset;
-
-               /* mmap() without flag of MAP_ANONYMOUS, should be called
-                * with length argument aligned with hugepagesz at older
-                * longterm version Linux, like 2.6.32 and 3.2.72, or
-                * mmap() will fail with EINVAL.
-                *
-                * to avoid failure, make sure in caller to keep length
-                * aligned.
-                */
-               alignment = get_blk_size(pmsg->fds[idx]);
-               if (alignment == (uint64_t)-1) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "couldn't get hugepage size through fstat\n");
-                       goto err_mmap;
-               }
-               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
-
-               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
-                       mapped_size,
-                       PROT_READ | PROT_WRITE, MAP_SHARED,
-                       pmsg->fds[idx],
-                       0);
-
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
-                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
-                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
-                       mapped_size, memory.regions[idx].mmap_offset,
-                       alignment);
-
-               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "mmap qemu guest failed.\n");
-                       goto err_mmap;
-               }
-
-               pregion_orig[idx].mapped_address = mapped_address;
-               pregion_orig[idx].mapped_size = mapped_size;
-               pregion_orig[idx].blksz = alignment;
-               pregion_orig[idx].fd = pmsg->fds[idx];
-
-               mapped_address +=  memory.regions[idx].mmap_offset;
-
-               pregion->address_offset = mapped_address -
-                       pregion->guest_phys_address;
-
-               if (memory.regions[idx].guest_phys_addr == 0) {
-                       dev->mem->base_address =
-                               memory.regions[idx].userspace_addr;
-                       dev->mem->mapped_address =
-                               pregion->address_offset;
-               }
-
-               LOG_DEBUG(VHOST_CONFIG,
-                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
-                       idx,
-                       (void *)(uintptr_t)pregion->guest_phys_address,
-                       (void *)(uintptr_t)pregion->userspace_address,
-                        pregion->memory_size);
-       }
-
-       return 0;
-
-err_mmap:
-       while (idx--) {
-               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
-                               pregion_orig[idx].mapped_size);
-               close(pregion_orig[idx].fd);
-       }
-       free(dev->mem);
-       dev->mem = NULL;
-       return -1;
-}
-
-static int
-vq_is_ready(struct vhost_virtqueue *vq)
-{
-       return vq && vq->desc   &&
-              vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
-              vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
-}
-
-static int
-virtio_is_ready(struct virtio_net *dev)
-{
-       struct vhost_virtqueue *rvq, *tvq;
-       uint32_t i;
-
-       for (i = 0; i < dev->virt_qp_nb; i++) {
-               rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
-               tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
-
-               if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "virtio is not ready for processing.\n");
-                       return 0;
-               }
-       }
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "virtio is now ready for processing.\n");
-       return 1;
-}
-
-void
-user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
-{
-       struct vhost_vring_file file;
-
-       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
-       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
-               file.fd = VIRTIO_INVALID_EVENTFD;
-       else
-               file.fd = pmsg->fds[0];
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring call idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_call(vid, &file);
-}
-
-
-/*
- *  In vhost-user, when we receive kick message, will test whether virtio
- *  device is ready for packet processing.
- */
-void
-user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
-{
-       struct vhost_vring_file file;
-       struct virtio_net *dev = get_device(vid);
-
-       if (!dev)
-               return;
-
-       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
-       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
-               file.fd = VIRTIO_INVALID_EVENTFD;
-       else
-               file.fd = pmsg->fds[0];
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring kick idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_kick(vid, &file);
-
-       if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
-               if (notify_ops->new_device(vid) == 0)
-                       dev->flags |= VIRTIO_DEV_RUNNING;
-       }
-}
-
-/*
- * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
- */
-int
-user_get_vring_base(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-       /* We have to stop the queue (virtio) if it is running. */
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       /* Here we are safe to get the last used index */
-       vhost_get_vring_base(vid, state->index, state);
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring base idx:%d file:%d\n", state->index, state->num);
-       /*
-        * Based on current qemu vhost-user implementation, this message is
-        * sent and only sent in vhost_vring_stop.
-        * TODO: cleanup the vring, it isn't usable since here.
-        */
-       if (dev->virtqueue[state->index]->kickfd >= 0)
-               close(dev->virtqueue[state->index]->kickfd);
-
-       dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
-       return 0;
-}
-
-/*
- * when virtio queues are ready to work, qemu will send us to
- * enable the virtio queue pair.
- */
-int
-user_set_vring_enable(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-       int enable = (int)state->num;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "set queue enable: %d to qp idx: %d\n",
-               enable, state->index);
-
-       if (notify_ops->vring_state_changed)
-               notify_ops->vring_state_changed(vid, state->index, enable);
-
-       dev->virtqueue[state->index]->enabled = enable;
-
-       return 0;
-}
-
-void
-user_set_protocol_features(int vid, uint64_t protocol_features)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
-               return;
-
-       dev->protocol_features = protocol_features;
-}
-
-int
-user_set_log_base(int vid, struct VhostUserMsg *msg)
-{
-       struct virtio_net *dev;
-       int fd = msg->fds[0];
-       uint64_t size, off;
-       void *addr;
-
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
-       if (fd < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
-               return -1;
-       }
-
-       if (msg->size != sizeof(VhostUserLog)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "invalid log base msg size: %"PRId32" != %d\n",
-                       msg->size, (int)sizeof(VhostUserLog));
-               return -1;
-       }
-
-       size = msg->payload.log.mmap_size;
-       off  = msg->payload.log.mmap_offset;
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "log mmap size: %"PRId64", offset: %"PRId64"\n",
-               size, off);
-
-       /*
-        * mmap from 0 to workaround a hugepage mmap bug: mmap will
-        * fail when offset is not page size aligned.
-        */
-       addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-       close(fd);
-       if (addr == MAP_FAILED) {
-               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
-               return -1;
-       }
-
-       /*
-        * Free previously mapped log memory on occasionally
-        * multiple VHOST_USER_SET_LOG_BASE.
-        */
-       if (dev->log_addr) {
-               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
-       }
-       dev->log_addr = (uint64_t)(uintptr_t)addr;
-       dev->log_base = dev->log_addr + off;
-       dev->log_size = size;
-
-       return 0;
-}
-
-/*
- * An rarp packet is constructed and broadcasted to notify switches about
- * the new location of the migrated VM, so that packets from outside will
- * not be lost after migration.
- *
- * However, we don't actually "send" a rarp packet here, instead, we set
- * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
- */
-int
-user_send_rarp(int vid, struct VhostUserMsg *msg)
-{
-       struct virtio_net *dev;
-       uint8_t *mac = (uint8_t *)&msg->payload.u64;
-
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
-       RTE_LOG(DEBUG, VHOST_CONFIG,
-               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
-               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
-       memcpy(dev->mac.addr_bytes, mac, 6);
-
-       /*
-        * Set the flag to inject a RARP broadcast packet at
-        * rte_vhost_dequeue_burst().
-        *
-        * rte_smp_wmb() is for making sure the mac is copied
-        * before the flag is set.
-        */
-       rte_smp_wmb();
-       rte_atomic16_set(&dev->broadcast_rarp, 1);
-
-       return 0;
-}
diff --git a/lib/librte_vhost/virtio-net-user.h b/lib/librte_vhost/virtio-net-user.h

deleted file mode 100644 (file)

index e1b967b..0000000
--- a/lib/librte_vhost/virtio-net-user.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VIRTIO_NET_USER_H
-#define _VIRTIO_NET_USER_H
-
-#include "vhost-net.h"
-#include "vhost-net-user.h"
-
-#define VHOST_USER_PROTOCOL_F_MQ       0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD        1
-#define VHOST_USER_PROTOCOL_F_RARP     2
-
-#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
-                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP))
-
-int user_set_mem_table(int, struct VhostUserMsg *);
-
-void user_set_vring_call(int, struct VhostUserMsg *);
-
-void user_set_vring_kick(int, struct VhostUserMsg *);
-
-void user_set_protocol_features(int vid, uint64_t protocol_features);
-int user_set_log_base(int vid, struct VhostUserMsg *);
-int user_send_rarp(int vid, struct VhostUserMsg *);
-
-int user_get_vring_base(int, struct vhost_vring_state *);
-
-int user_set_vring_enable(int vid, struct vhost_vring_state *state);
-
-#endif
diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c

deleted file mode 100644 (file)

index 1785695..0000000
--- a/lib/librte_vhost/virtio-net.c
+++ /dev/null
@@ -1,847 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/vhost.h>
-#include <linux/virtio_net.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#ifdef RTE_LIBRTE_VHOST_NUMA
-#include <numaif.h>
-#endif
-
-#include <sys/socket.h>
-
-#include <rte_ethdev.h>
-#include <rte_log.h>
-#include <rte_string_fns.h>
-#include <rte_memory.h>
-#include <rte_malloc.h>
-#include <rte_virtio_net.h>
-
-#include "vhost-net.h"
-
-#define MAX_VHOST_DEVICE       1024
-static struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
-
-/* device ops to add/remove device to/from data core. */
-struct virtio_net_device_ops const *notify_ops;
-
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
-/* Features supported by this lib. */
-#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
-                               (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
-                               (1ULL << VIRTIO_NET_F_CTRL_RX) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
-                               (VHOST_SUPPORTS_MQ)            | \
-                               (1ULL << VIRTIO_F_VERSION_1)   | \
-                               (1ULL << VHOST_F_LOG_ALL)      | \
-                               (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
-                               (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
-                               (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
-                               (1ULL << VIRTIO_NET_F_CSUM)    | \
-                               (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_TSO6))
-
-static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
-
-
-/*
- * Converts QEMU virtual address to Vhost virtual address. This function is
- * used to convert the ring addresses to our address space.
- */
-static uint64_t
-qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
-{
-       struct virtio_memory_regions *region;
-       uint64_t vhost_va = 0;
-       uint32_t regionidx = 0;
-
-       /* Find the region where the address lives. */
-       for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
-               region = &dev->mem->regions[regionidx];
-               if ((qemu_va >= region->userspace_address) &&
-                       (qemu_va <= region->userspace_address +
-                       region->memory_size)) {
-                       vhost_va = qemu_va + region->guest_phys_address +
-                               region->address_offset -
-                               region->userspace_address;
-                       break;
-               }
-       }
-       return vhost_va;
-}
-
-struct virtio_net *
-get_device(int vid)
-{
-       struct virtio_net *dev = vhost_devices[vid];
-
-       if (unlikely(!dev)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) device not found.\n", vid);
-       }
-
-       return dev;
-}
-
-static void
-cleanup_vq(struct vhost_virtqueue *vq, int destroy)
-{
-       if ((vq->callfd >= 0) && (destroy != 0))
-               close(vq->callfd);
-       if (vq->kickfd >= 0)
-               close(vq->kickfd);
-}
-
-/*
- * Unmap any memory, close any file descriptors and
- * free any memory owned by a device.
- */
-static void
-cleanup_device(struct virtio_net *dev, int destroy)
-{
-       uint32_t i;
-
-       vhost_backend_cleanup(dev);
-
-       for (i = 0; i < dev->virt_qp_nb; i++) {
-               cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
-               cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
-       }
-}
-
-/*
- * Release virtqueues and device memory.
- */
-static void
-free_device(struct virtio_net *dev)
-{
-       uint32_t i;
-
-       for (i = 0; i < dev->virt_qp_nb; i++)
-               rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
-
-       rte_free(dev);
-}
-
-static void
-init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
-{
-       memset(vq, 0, sizeof(struct vhost_virtqueue));
-
-       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
-       vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
-       /* Backends are set to -1 indicating an inactive device. */
-       vq->backend = -1;
-
-       /* always set the default vq pair to enabled */
-       if (qp_idx == 0)
-               vq->enabled = 1;
-}
-
-static void
-init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
-       uint32_t base_idx = qp_idx * VIRTIO_QNUM;
-
-       init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
-       init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
-}
-
-static void
-reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
-{
-       int callfd;
-
-       callfd = vq->callfd;
-       init_vring_queue(vq, qp_idx);
-       vq->callfd = callfd;
-}
-
-static void
-reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
-       uint32_t base_idx = qp_idx * VIRTIO_QNUM;
-
-       reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
-       reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
-}
-
-static int
-alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
-       struct vhost_virtqueue *virtqueue = NULL;
-       uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
-       uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
-
-       virtqueue = rte_malloc(NULL,
-                              sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
-       if (virtqueue == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Failed to allocate memory for virt qp:%d.\n", qp_idx);
-               return -1;
-       }
-
-       dev->virtqueue[virt_rx_q_idx] = virtqueue;
-       dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
-
-       init_vring_queue_pair(dev, qp_idx);
-
-       dev->virt_qp_nb += 1;
-
-       return 0;
-}
-
-/*
- * Reset some variables in device structure, while keeping few
- * others untouched, such as vid, ifname, virt_qp_nb: they
- * should be same unless the device is removed.
- */
-static void
-reset_device(struct virtio_net *dev)
-{
-       uint32_t i;
-
-       dev->features = 0;
-       dev->protocol_features = 0;
-       dev->flags = 0;
-
-       for (i = 0; i < dev->virt_qp_nb; i++)
-               reset_vring_queue_pair(dev, i);
-}
-
-/*
- * Function is called from the CUSE open function. The device structure is
- * initialised and a new entry is added to the device configuration linked
- * list.
- */
-int
-vhost_new_device(void)
-{
-       struct virtio_net *dev;
-       int i;
-
-       dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
-       if (dev == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Failed to allocate memory for new dev.\n");
-               return -1;
-       }
-
-       for (i = 0; i < MAX_VHOST_DEVICE; i++) {
-               if (vhost_devices[i] == NULL)
-                       break;
-       }
-       if (i == MAX_VHOST_DEVICE) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Failed to find a free slot for new device.\n");
-               return -1;
-       }
-
-       vhost_devices[i] = dev;
-       dev->vid = i;
-
-       return i;
-}
-
-/*
- * Function is called from the CUSE release function. This function will
- * cleanup the device and remove it from device configuration linked list.
- */
-void
-vhost_destroy_device(int vid)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return;
-
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       cleanup_device(dev, 1);
-       free_device(dev);
-
-       vhost_devices[vid] = NULL;
-}
-
-void
-vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
-{
-       struct virtio_net *dev;
-       unsigned int len;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return;
-
-       len = if_len > sizeof(dev->ifname) ?
-               sizeof(dev->ifname) : if_len;
-
-       strncpy(dev->ifname, if_name, len);
-       dev->ifname[sizeof(dev->ifname) - 1] = '\0';
-}
-
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_OWNER
- * This function just returns success at the moment unless
- * the device hasn't been initialised.
- */
-int
-vhost_set_owner(int vid)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_RESET_OWNER
- */
-int
-vhost_reset_owner(int vid)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       cleanup_device(dev, 0);
-       reset_device(dev);
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_GET_FEATURES
- * The features that we support are requested.
- */
-int
-vhost_get_features(int vid, uint64_t *pu)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* Send our supported features. */
-       *pu = VHOST_FEATURES;
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_FEATURES
- * We receive the negotiated features supported by us and the virtio device.
- */
-int
-vhost_set_features(int vid, uint64_t *pu)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-       if (*pu & ~VHOST_FEATURES)
-               return -1;
-
-       dev->features = *pu;
-       if (dev->features &
-               ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
-               dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
-       } else {
-               dev->vhost_hlen = sizeof(struct virtio_net_hdr);
-       }
-       LOG_DEBUG(VHOST_CONFIG,
-               "(%d) mergeable RX buffers %s, virtio 1 %s\n",
-               dev->vid,
-               (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
-               (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_NUM
- * The virtio device sends us the size of the descriptor ring.
- */
-int
-vhost_set_vring_num(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[state->index]->size = state->num;
-
-       return 0;
-}
-
-/*
- * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
- * same numa node as the memory of vring descriptor.
- */
-#ifdef RTE_LIBRTE_VHOST_NUMA
-static struct virtio_net*
-numa_realloc(struct virtio_net *dev, int index)
-{
-       int oldnode, newnode;
-       struct virtio_net *old_dev;
-       struct vhost_virtqueue *old_vq, *vq;
-       int ret;
-
-       /*
-        * vq is allocated on pairs, we should try to do realloc
-        * on first queue of one queue pair only.
-        */
-       if (index % VIRTIO_QNUM != 0)
-               return dev;
-
-       old_dev = dev;
-       vq = old_vq = dev->virtqueue[index];
-
-       ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
-                           MPOL_F_NODE | MPOL_F_ADDR);
-
-       /* check if we need to reallocate vq */
-       ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
-                            MPOL_F_NODE | MPOL_F_ADDR);
-       if (ret) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Unable to get vq numa information.\n");
-               return dev;
-       }
-       if (oldnode != newnode) {
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "reallocate vq from %d to %d node\n", oldnode, newnode);
-               vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
-                                      newnode);
-               if (!vq)
-                       return dev;
-
-               memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
-               rte_free(old_vq);
-       }
-
-       /* check if we need to reallocate dev */
-       ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
-                           MPOL_F_NODE | MPOL_F_ADDR);
-       if (ret) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "Unable to get dev numa information.\n");
-               goto out;
-       }
-       if (oldnode != newnode) {
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "reallocate dev from %d to %d node\n",
-                       oldnode, newnode);
-               dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
-               if (!dev) {
-                       dev = old_dev;
-                       goto out;
-               }
-
-               memcpy(dev, old_dev, sizeof(*dev));
-               rte_free(old_dev);
-       }
-
-out:
-       dev->virtqueue[index] = vq;
-       dev->virtqueue[index + 1] = vq + 1;
-       vhost_devices[dev->vid] = dev;
-
-       return dev;
-}
-#else
-static struct virtio_net*
-numa_realloc(struct virtio_net *dev, int index __rte_unused)
-{
-       return dev;
-}
-#endif
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_ADDR
- * The virtio device sends us the desc, used and avail ring addresses.
- * This function then converts these to our address space.
- */
-int
-vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if ((dev == NULL) || (dev->mem == NULL))
-               return -1;
-
-       /* addr->index refers to the queue index. The txq 1, rxq is 0. */
-       vq = dev->virtqueue[addr->index];
-
-       /* The addresses are converted from QEMU virtual to Vhost virtual. */
-       vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
-                       addr->desc_user_addr);
-       if (vq->desc == 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to find desc ring address.\n",
-                       dev->vid);
-               return -1;
-       }
-
-       dev = numa_realloc(dev, addr->index);
-       vq = dev->virtqueue[addr->index];
-
-       vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
-                       addr->avail_user_addr);
-       if (vq->avail == 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to find avail ring address.\n",
-                       dev->vid);
-               return -1;
-       }
-
-       vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
-                       addr->used_user_addr);
-       if (vq->used == 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to find used ring address.\n",
-                       dev->vid);
-               return -1;
-       }
-
-       if (vq->last_used_idx != vq->used->idx) {
-               RTE_LOG(WARNING, VHOST_CONFIG,
-                       "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
-                       "some packets maybe resent for Tx and dropped for Rx\n",
-                       vq->last_used_idx, vq->used->idx);
-               vq->last_used_idx     = vq->used->idx;
-       }
-
-       vq->log_guest_addr = addr->log_guest_addr;
-
-       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
-                       dev->vid, vq->desc);
-       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
-                       dev->vid, vq->avail);
-       LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
-                       dev->vid, vq->used);
-       LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
-                       dev->vid, vq->log_guest_addr);
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_BASE
- * The virtio device sends us the available ring last used index.
- */
-int
-vhost_set_vring_base(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[state->index]->last_used_idx = state->num;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_GET_VRING_BASE
- * We send the virtio device our available ring last used index.
- */
-int
-vhost_get_vring_base(int vid, uint32_t index,
-       struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       state->index = index;
-       /* State->index refers to the queue index. The txq is 1, rxq is 0. */
-       state->num = dev->virtqueue[state->index]->last_used_idx;
-
-       return 0;
-}
-
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_CALL
- * The virtio device sends an eventfd to interrupt the guest. This fd gets
- * copied into our process space.
- */
-int
-vhost_set_vring_call(int vid, struct vhost_vring_file *file)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-       uint32_t cur_qp_idx = file->index / VIRTIO_QNUM;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /*
-        * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
-        * we get, so we do vring queue pair allocation here.
-        */
-       if (cur_qp_idx + 1 > dev->virt_qp_nb) {
-               if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
-                       return -1;
-       }
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[file->index];
-       assert(vq != NULL);
-
-       if (vq->callfd >= 0)
-               close(vq->callfd);
-
-       vq->callfd = file->fd;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_KICK
- * The virtio device sends an eventfd that it can use to notify us.
- * This fd gets copied into our process space.
- */
-int
-vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       vq = dev->virtqueue[file->index];
-
-       if (vq->kickfd >= 0)
-               close(vq->kickfd);
-
-       vq->kickfd = file->fd;
-
-       return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_NET_SET_BACKEND
- * To complete device initialisation when the virtio driver is loaded,
- * we are provided with a valid fd for a tap device (not used by us).
- * If this happens then we can add the device to a data core.
- * When the virtio driver is removed we get fd=-1.
- * At that point we remove the device from the data core.
- * The device will still exist in the device configuration linked list.
- */
-int
-vhost_set_backend(int vid, struct vhost_vring_file *file)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* file->index refers to the queue index. The txq is 1, rxq is 0. */
-       dev->virtqueue[file->index]->backend = file->fd;
-
-       /*
-        * If the device isn't already running and both backend fds are set,
-        * we add the device.
-        */
-       if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
-               if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED &&
-                   dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) {
-                       if (notify_ops->new_device(vid) < 0)
-                               return -1;
-                       dev->flags |= VIRTIO_DEV_RUNNING;
-               }
-       } else if (file->fd == VIRTIO_DEV_STOPPED) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       return 0;
-}
-
-int
-rte_vhost_get_numa_node(int vid)
-{
-#ifdef RTE_LIBRTE_VHOST_NUMA
-       struct virtio_net *dev = get_device(vid);
-       int numa_node;
-       int ret;
-
-       if (dev == NULL)
-               return -1;
-
-       ret = get_mempolicy(&numa_node, NULL, 0, dev,
-                           MPOL_F_NODE | MPOL_F_ADDR);
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to query numa node: %d\n", vid, ret);
-               return -1;
-       }
-
-       return numa_node;
-#else
-       RTE_SET_USED(vid);
-       return -1;
-#endif
-}
-
-uint32_t
-rte_vhost_get_queue_num(int vid)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return 0;
-
-       return dev->virt_qp_nb;
-}
-
-int
-rte_vhost_get_ifname(int vid, char *buf, size_t len)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-
-       len = RTE_MIN(len, sizeof(dev->ifname));
-
-       strncpy(buf, dev->ifname, len);
-       buf[len - 1] = '\0';
-
-       return 0;
-}
-
-uint16_t
-rte_vhost_avail_entries(int vid, uint16_t queue_id)
-{
-       struct virtio_net *dev;
-       struct vhost_virtqueue *vq;
-
-       dev = get_device(vid);
-       if (!dev)
-               return 0;
-
-       vq = dev->virtqueue[queue_id];
-       if (!vq->enabled)
-               return 0;
-
-       return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
-}
-
-int
-rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-
-       if (enable) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "guest notification isn't supported.\n");
-               return -1;
-       }
-
-       dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
-       return 0;
-}
-
-uint64_t rte_vhost_feature_get(void)
-{
-       return VHOST_FEATURES;
-}
-
-int rte_vhost_feature_disable(uint64_t feature_mask)
-{
-       VHOST_FEATURES = VHOST_FEATURES & ~feature_mask;
-       return 0;
-}
-
-int rte_vhost_feature_enable(uint64_t feature_mask)
-{
-       if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
-               VHOST_FEATURES = VHOST_FEATURES | feature_mask;
-               return 0;
-       }
-       return -1;
-}
-
-/*
- * Register ops so that we can add/remove device to data core.
- */
-int
-rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
-{
-       notify_ops = ops;
-
-       return 0;
-}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c

new file mode 100644 (file)

index 0000000..8a151af
--- /dev/null
+++ b/lib/librte_vhost/virtio_net.c
@@ -0,0 +1,924 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
+
+#include <rte_mbuf.h>
+#include <rte_memcpy.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_virtio_net.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
+#include <rte_arp.h>
+
+#include "vhost.h"
+
+#define MAX_PKT_BURST 32
+#define VHOST_LOG_PAGE 4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+       log_base[page / 8] |= 1 << (page % 8); /* one bit per page, 8 per byte */
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{      /* mark the pages overlapping [addr, addr + len) in the log bitmap */
+       uint64_t page;
+
+       if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+                  !dev->log_base || !len))
+               return;
+       /* Skip the write if the last page's bitmap byte is past log_size. */
+       if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+               return;
+
+       /* To make sure guest memory updates are committed before logging */
+       rte_smp_wmb();
+
+       page = addr / VHOST_LOG_PAGE;
+       while (page * VHOST_LOG_PAGE < addr + len) {
+               vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+               page += 1;
+       }
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                    uint64_t offset, uint64_t len)
+{      /* log 'len' bytes at 'offset' from the queue's log_guest_addr */
+       vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+static bool
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
+{      /* low bit of idx must equal is_tx; idx must be in range */
+       return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
+}
+
+static void
+virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
+{      /* translate the mbuf's TX offload flags into virtio_net_hdr fields */
+       if (m_buf->ol_flags & PKT_TX_L4_MASK) {
+               net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+               net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
+
+               switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
+               case PKT_TX_TCP_CKSUM:
+                       net_hdr->csum_offset = (offsetof(struct tcp_hdr,
+                                               cksum));
+                       break;
+               case PKT_TX_UDP_CKSUM:
+                       net_hdr->csum_offset = (offsetof(struct udp_hdr,
+                                               dgram_cksum));
+                       break;
+               case PKT_TX_SCTP_CKSUM:
+                       net_hdr->csum_offset = (offsetof(struct sctp_hdr,
+                                               cksum));
+                       break;
+               }
+       }
+       /* PKT_TX_TCP_SEG set: fill in GSO type, segment size, header length. */
+       if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
+               if (m_buf->ol_flags & PKT_TX_IPV4)
+                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+               else
+                       net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+               net_hdr->gso_size = m_buf->tso_segsz;
+               net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+                                       + m_buf->l4_len;
+       }
+}
+
+static inline void
+copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
+                   struct virtio_net_hdr_mrg_rxbuf hdr)
+{      /* store the mergeable header, or only its plain part, per vhost_hlen */
+       if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
+               *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
+       else
+               *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+}
+
+static inline int __attribute__((always_inline))
+copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                 struct rte_mbuf *m, uint16_t desc_idx)
+{      /* write net header + mbuf chain 'm' into desc chain; 0 ok, -1 error */
+       uint32_t desc_avail, desc_offset;
+       uint32_t mbuf_avail, mbuf_offset;
+       uint32_t cpy_len;
+       uint32_t nr_desc = 1;   /* descs walked so far; stops looped chains */
+       struct vring_desc *desc;
+       uint64_t desc_addr;
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+
+       if (unlikely(desc_idx >= vq->size))     /* index read from avail ring */
+               return -1;
+       desc = &vq->desc[desc_idx];
+       desc_addr = gpa_to_vva(dev, desc->addr);
+       /*
+        * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
+        * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
+        * otherwise stores offset on the stack instead of in a register.
+        */
+       if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
+               return -1;
+
+       rte_prefetch0((void *)(uintptr_t)desc_addr);
+       virtio_enqueue_offload(m, &virtio_hdr.hdr);
+       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+       vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+       desc_offset = dev->vhost_hlen;
+       desc_avail  = desc->len - dev->vhost_hlen;
+
+       mbuf_avail  = rte_pktmbuf_data_len(m);
+       mbuf_offset = 0;
+       while (mbuf_avail != 0 || m->next != NULL) {
+               /* done with current mbuf, fetch next */
+               if (mbuf_avail == 0) {
+                       m = m->next;
+                       mbuf_offset = 0;
+                       mbuf_avail  = rte_pktmbuf_data_len(m);
+               }
+
+               /* done with current desc buf, fetch next */
+               if (desc_avail == 0) {
+                       if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
+                               /* Room in vring buffer is not enough */
+                               return -1;
+                       }
+                       if (unlikely(desc->next >= vq->size ||
+                                    ++nr_desc > vq->size))
+                               return -1;
+                       desc = &vq->desc[desc->next];
+                       desc_addr = gpa_to_vva(dev, desc->addr);
+                       if (unlikely(!desc_addr))
+                               return -1;
+                       desc_offset = 0;
+                       desc_avail  = desc->len;
+               }
+
+               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+                       cpy_len);
+               vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
+               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+                            cpy_len, 0);
+
+               mbuf_avail  -= cpy_len;
+               mbuf_offset += cpy_len;
+               desc_avail  -= cpy_len;
+               desc_offset += cpy_len;
+       }
+
+       return 0;
+}
+
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that are successfully
+ * added to the RX queue. This function works when the mbuf is scattered, but
+ * it doesn't support the mergeable feature.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
+             struct rte_mbuf **pkts, uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       uint16_t avail_idx, free_entries, start_idx;
+       uint16_t desc_indexes[MAX_PKT_BURST];
+       uint16_t used_idx;
+       uint32_t i;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+                       dev->vid, __func__, queue_id);
+               return 0;
+       }
+
+       vq = dev->virtqueue[queue_id];
+       if (unlikely(vq->enabled == 0))
+               return 0;
+       /* Clamp count to the entries made available and to MAX_PKT_BURST. */
+       avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+       start_idx = vq->last_used_idx;
+       free_entries = avail_idx - start_idx;
+       count = RTE_MIN(count, free_entries);
+       count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
+       if (count == 0)
+               return 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
+               dev->vid, start_idx, start_idx + count);
+
+       /* Retrieve all of the desc indexes first to avoid caching issues. */
+       rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
+       for (i = 0; i < count; i++) {
+               used_idx = (start_idx + i) & (vq->size - 1);
+               desc_indexes[i] = vq->avail->ring[used_idx];
+               vq->used->ring[used_idx].id = desc_indexes[i];
+               vq->used->ring[used_idx].len = pkts[i]->pkt_len +
+                                              dev->vhost_hlen;
+               vhost_log_used_vring(dev, vq,
+                       offsetof(struct vring_used, ring[used_idx]),
+                       sizeof(vq->used->ring[used_idx]));
+       }
+       /* Copy the packets; a failed copy reports only the header length. */
+       rte_prefetch0(&vq->desc[desc_indexes[0]]);
+       for (i = 0; i < count; i++) {
+               uint16_t desc_idx = desc_indexes[i];
+               int err;
+
+               err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
+               if (unlikely(err)) {
+                       used_idx = (start_idx + i) & (vq->size - 1);
+                       vq->used->ring[used_idx].len = dev->vhost_hlen;
+                       vhost_log_used_vring(dev, vq,
+                               offsetof(struct vring_used, ring[used_idx]),
+                               sizeof(vq->used->ring[used_idx]));
+               }
+
+               if (i + 1 < count)
+                       rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+       }
+       /* Order the writes above before the used->idx update below. */
+       rte_smp_wmb();
+
+       *(volatile uint16_t *)&vq->used->idx += count;
+       vq->last_used_idx += count;
+       vhost_log_used_vring(dev, vq,
+               offsetof(struct vring_used, idx),
+               sizeof(vq->used->idx));
+
+       /* flush used->idx update before we read avail->flags. */
+       rte_mb();
+
+       /* Kick the guest if necessary. */
+       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                       && (vq->callfd >= 0))
+               eventfd_write(vq->callfd, (eventfd_t)1);
+       return count;
+}
+
+static inline int
+fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
+            uint32_t *allocated, uint32_t *vec_idx,
+            struct buf_vector *buf_vec)
+{
+       uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
+       uint32_t vec_id = *vec_idx;
+       uint32_t len    = *allocated;
+       /* Append each desc of the chain to buf_vec, adding its length to len. */
+       while (1) {
+               if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
+                       return -1;
+
+               len += vq->desc[idx].len;
+               buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
+               buf_vec[vec_id].buf_len  = vq->desc[idx].len;
+               buf_vec[vec_id].desc_idx = idx;
+               vec_id++;
+
+               if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
+                       break;
+
+               idx = vq->desc[idx].next;
+       }
+       /* Outputs are written only after the whole chain has been walked. */
+       *allocated = len;
+       *vec_idx   = vec_id;
+
+       return 0;
+}
+
+/*
+ * Returns -1 on fail, 0 on success
+ */
+static inline int
+reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
+                           uint16_t *end, struct buf_vector *buf_vec)
+{
+       uint16_t cur_idx;
+       uint16_t avail_idx;
+       uint32_t allocated = 0;
+       uint32_t vec_idx = 0;
+       uint16_t tries = 0;
+
+       cur_idx  = vq->last_used_idx;
+       /* Take avail entries one by one until they can hold 'size' bytes. */
+       while (1) {
+               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+               if (unlikely(cur_idx == avail_idx))
+                       return -1;
+
+               if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
+                                         &vec_idx, buf_vec) < 0))
+                       return -1;
+
+               cur_idx++;
+               tries++;
+
+               if (allocated >= size)
+                       break;
+
+               /*
+                * if we tried all available ring items, and still
+                * can't get enough buf, it means something abnormal
+                * happened.
+                */
+               if (unlikely(tries >= vq->size))
+                       return -1;
+       }
+       /* 'end' receives the index one past the last avail entry taken. */
+       *end = cur_idx;
+       return 0;
+}
+
+static inline uint32_t __attribute__((always_inline))
+copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                           uint16_t end_idx, struct rte_mbuf *m,
+                           struct buf_vector *buf_vec)
+{      /* copy 'm' across the buffers in buf_vec; returns used entries or 0 */
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+       uint32_t vec_idx = 0;
+       uint16_t start_idx = vq->last_used_idx;
+       uint16_t cur_idx = start_idx;
+       uint64_t desc_addr;
+       uint32_t mbuf_offset, mbuf_avail;
+       uint32_t desc_offset, desc_avail;
+       uint32_t cpy_len;
+       uint16_t desc_idx, used_idx;
+
+       if (unlikely(m == NULL))
+               return 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
+               dev->vid, cur_idx, end_idx);
+
+       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+       if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+               return 0;
+
+       rte_prefetch0((void *)(uintptr_t)desc_addr);
+       /* The header carries the ring entry count, end_idx - start_idx. */
+       virtio_hdr.num_buffers = end_idx - start_idx;
+       LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
+               dev->vid, virtio_hdr.num_buffers);
+
+       virtio_enqueue_offload(m, &virtio_hdr.hdr);
+       copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+       vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
+       PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+       desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+       desc_offset = dev->vhost_hlen;
+
+       mbuf_avail  = rte_pktmbuf_data_len(m);
+       mbuf_offset = 0;
+       while (mbuf_avail != 0 || m->next != NULL) {
+               /* done with current desc buf, get the next one */
+               if (desc_avail == 0) {
+                       desc_idx = buf_vec[vec_idx].desc_idx;
+
+                       if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
+                               /* Update used ring with desc information */
+                               used_idx = cur_idx++ & (vq->size - 1);
+                               vq->used->ring[used_idx].id  = desc_idx;
+                               vq->used->ring[used_idx].len = desc_offset;
+                               vhost_log_used_vring(dev, vq,
+                                       offsetof(struct vring_used,
+                                                ring[used_idx]),
+                                       sizeof(vq->used->ring[used_idx]));
+                       }
+
+                       vec_idx++;
+                       desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+                       if (unlikely(!desc_addr))
+                               return 0;
+
+                       /* Prefetch buffer address. */
+                       rte_prefetch0((void *)(uintptr_t)desc_addr);
+                       desc_offset = 0;
+                       desc_avail  = buf_vec[vec_idx].buf_len;
+               }
+
+               /* done with current mbuf, get the next one */
+               if (mbuf_avail == 0) {
+                       m = m->next;
+
+                       mbuf_offset = 0;
+                       mbuf_avail  = rte_pktmbuf_data_len(m);
+               }
+
+               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+               rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+                       rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+                       cpy_len);
+               vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
+                       cpy_len);
+               PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+                       cpy_len, 0);
+
+               mbuf_avail  -= cpy_len;
+               mbuf_offset += cpy_len;
+               desc_avail  -= cpy_len;
+               desc_offset += cpy_len;
+       }
+       /* Record the last buffer used, with the bytes written into it. */
+       used_idx = cur_idx & (vq->size - 1);
+       vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
+       vq->used->ring[used_idx].len = desc_offset;
+       vhost_log_used_vring(dev, vq,
+               offsetof(struct vring_used, ring[used_idx]),
+               sizeof(vq->used->ring[used_idx]));
+
+       return end_idx - start_idx;
+}
+
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
+       struct rte_mbuf **pkts, uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       uint32_t pkt_idx = 0, nr_used = 0;
+       uint16_t end;
+       struct buf_vector buf_vec[BUF_VECTOR_MAX];
+
+       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+       if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+                       dev->vid, __func__, queue_id);
+               return 0;
+       }
+
+       vq = dev->virtqueue[queue_id];
+       if (unlikely(vq->enabled == 0))
+               return 0;
+
+       count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+       if (count == 0)
+               return 0;
+       /* Per packet: reserve buffers, copy into them, then advance used->idx. */
+       for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+               uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+
+               if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
+                                                        &end, buf_vec) < 0)) {
+                       LOG_DEBUG(VHOST_DATA,
+                               "(%d) failed to get enough desc from vring\n",
+                               dev->vid);
+                       break;
+               }
+
+               nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
+                                                     pkts[pkt_idx], buf_vec);
+               rte_smp_wmb();
+
+               *(volatile uint16_t *)&vq->used->idx += nr_used;
+               vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+                       sizeof(vq->used->idx));
+               vq->last_used_idx += nr_used;
+       }
+
+       if (likely(pkt_idx)) {
+               /* flush used->idx update before we read avail->flags. */
+               rte_mb();
+
+               /* Kick the guest if necessary. */
+               if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                               && (vq->callfd >= 0))
+                       eventfd_write(vq->callfd, (eventfd_t)1);
+       }
+
+       return pkt_idx;
+}
+
+/*
+ * Public enqueue entry point: look the device up by vid and hand the burst
+ * to virtio_dev_merge_rx() when the VIRTIO_NET_F_MRG_RXBUF bit is set in
+ * dev->features, or to virtio_dev_rx() otherwise.
+ *
+ * Returns 0 when no device matches vid, else whatever the selected path
+ * returns.
+ */
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+       struct rte_mbuf **pkts, uint16_t count)
+{
+       struct virtio_net *dev = get_device(vid);
+       int mergeable;
+
+       if (dev == NULL)
+               return 0;
+
+       mergeable = (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) != 0;
+       if (mergeable)
+               return virtio_dev_merge_rx(dev, queue_id, pkts, count);
+
+       return virtio_dev_rx(dev, queue_id, pkts, count);
+}
+
+/*
+ * Parse the L2/L3 headers at the start of mbuf m.
+ *
+ * Sets m->l2_len (Ethernet header plus one VLAN tag if the ethertype is
+ * ETHER_TYPE_VLAN) and m->l3_len, and stores the L4 protocol number in
+ * *l4_proto. For IPv4 and IPv6 it also ORs PKT_TX_IPV4 / PKT_TX_IPV6 into
+ * m->ol_flags and points *l4_hdr just past the L3 header. For any other
+ * ethertype m->l3_len and *l4_proto are set to 0 and *l4_hdr is left as
+ * the caller initialised it.
+ *
+ * The packet length is not checked against the headers that are read.
+ */
+static void
+parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
+{
+       struct ether_hdr *eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
+       uint16_t ethertype;
+       char *l3;
+
+       m->l2_len = sizeof(struct ether_hdr);
+       ethertype = rte_be_to_cpu_16(eth->ether_type);
+
+       /* A single VLAN tag is skipped; the inner ethertype takes over. */
+       if (ethertype == ETHER_TYPE_VLAN) {
+               struct vlan_hdr *vlan = (struct vlan_hdr *)(eth + 1);
+
+               m->l2_len += sizeof(struct vlan_hdr);
+               ethertype = rte_be_to_cpu_16(vlan->eth_proto);
+       }
+
+       l3 = (char *)eth + m->l2_len;
+
+       if (ethertype == ETHER_TYPE_IPv4) {
+               struct ipv4_hdr *ip4 = (struct ipv4_hdr *)l3;
+
+               *l4_proto = ip4->next_proto_id;
+               /* IHL is the low nibble, counted in 32-bit words. */
+               m->l3_len = (ip4->version_ihl & 0x0f) * 4;
+               *l4_hdr = l3 + m->l3_len;
+               m->ol_flags |= PKT_TX_IPV4;
+       } else if (ethertype == ETHER_TYPE_IPv6) {
+               struct ipv6_hdr *ip6 = (struct ipv6_hdr *)l3;
+
+               *l4_proto = ip6->proto;
+               m->l3_len = sizeof(struct ipv6_hdr);
+               *l4_hdr = l3 + m->l3_len;
+               m->ol_flags |= PKT_TX_IPV6;
+       } else {
+               m->l3_len = 0;
+               *l4_proto = 0;
+       }
+}
+
+/*
+ * Turn the offload requests in virtio-net header hdr into Tx offload flags
+ * on mbuf m.
+ *
+ * parse_ethernet() is run first to fill m->l2_len / m->l3_len and to find
+ * the L4 protocol and header.
+ *
+ * Checksum: when hdr->flags equals VIRTIO_NET_HDR_F_NEEDS_CSUM and
+ * csum_start matches l2_len + l3_len, csum_offset selects the TCP, UDP or
+ * SCTP checksum flag, provided the parsed L4 protocol agrees.
+ *
+ * GSO: for TCPv4/TCPv6 (ECN bit ignored) PKT_TX_TCP_SEG, tso_segsz and
+ * l4_len are set, but only if the frame really parsed as TCP. Other GSO
+ * types are reported with a warning and otherwise ignored.
+ */
+static inline void __attribute__((always_inline))
+vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
+{
+       uint16_t l4_proto = 0;
+       void *l4_hdr = NULL;
+       struct tcp_hdr *tcp_hdr = NULL;
+
+       parse_ethernet(m, &l4_proto, &l4_hdr);
+       if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+               if (hdr->csum_start == (m->l2_len + m->l3_len)) {
+                       switch (hdr->csum_offset) {
+                       case (offsetof(struct tcp_hdr, cksum)):
+                               if (l4_proto == IPPROTO_TCP)
+                                       m->ol_flags |= PKT_TX_TCP_CKSUM;
+                               break;
+                       case (offsetof(struct udp_hdr, dgram_cksum)):
+                               if (l4_proto == IPPROTO_UDP)
+                                       m->ol_flags |= PKT_TX_UDP_CKSUM;
+                               break;
+                       case (offsetof(struct sctp_hdr, cksum)):
+                               if (l4_proto == IPPROTO_SCTP)
+                                       m->ol_flags |= PKT_TX_SCTP_CKSUM;
+                               break;
+                       default:
+                               break;
+                       }
+               }
+       }
+
+       if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+               switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+               case VIRTIO_NET_HDR_GSO_TCPV4:
+               case VIRTIO_NET_HDR_GSO_TCPV6:
+                       /*
+                        * gso_type comes from the virtio-net header and
+                        * need not match the frame. parse_ethernet() leaves
+                        * l4_hdr NULL for non-IPv4/IPv6 frames, so do not
+                        * dereference it unless the frame parsed as TCP.
+                        */
+                       if (unlikely(l4_hdr == NULL ||
+                                    l4_proto != IPPROTO_TCP))
+                               break;
+
+                       tcp_hdr = (struct tcp_hdr *)l4_hdr;
+                       m->ol_flags |= PKT_TX_TCP_SEG;
+                       m->tso_segsz = hdr->gso_size;
+                       /* data offset: high nibble, in 32-bit words. */
+                       m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+                       break;
+               default:
+                       RTE_LOG(WARNING, VHOST_DATA,
+                               "unsupported gso type %u.\n", hdr->gso_type);
+                       break;
+               }
+       }
+}
+
+#define RARP_PKT_SIZE  64
+
+/*
+ * Fill rarp_mbuf with a broadcast RARP request for "mac".
+ *
+ * The frame is an Ethernet header (broadcast destination, source = mac,
+ * ethertype RARP) followed by an ARP body with opcode ARP_OP_REVREQUEST,
+ * both hardware addresses set to mac and both IP addresses zero. pkt_len
+ * and data_len are set to RARP_PKT_SIZE; any bytes past the ARP body are
+ * zero.
+ *
+ * Returns 0 on success, or -1 (after logging a warning) when the mbuf's
+ * tailroom cannot hold RARP_PKT_SIZE bytes.
+ */
+static int
+make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
+{
+       struct ether_hdr *eth_hdr;
+       struct arp_hdr  *rarp;
+       /* Room after the data pointer; buf_len alone also counts headroom. */
+       uint16_t room = rte_pktmbuf_tailroom(rarp_mbuf);
+
+       if (room < RARP_PKT_SIZE) {
+               RTE_LOG(WARNING, VHOST_DATA,
+                       "failed to make RARP; mbuf size too small %u (< %d)\n",
+                       room, RARP_PKT_SIZE);
+               return -1;
+       }
+
+       /* Ethernet header. */
+       eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
+       /* Clear the whole frame so no stale mbuf bytes are sent as padding. */
+       memset(eth_hdr, 0, RARP_PKT_SIZE);
+       memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
+       ether_addr_copy(mac, &eth_hdr->s_addr);
+       eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
+
+       /* RARP header. */
+       rarp = (struct arp_hdr *)(eth_hdr + 1);
+       rarp->arp_hrd = htons(ARP_HRD_ETHER);
+       rarp->arp_pro = htons(ETHER_TYPE_IPv4);
+       rarp->arp_hln = ETHER_ADDR_LEN;
+       rarp->arp_pln = 4;
+       rarp->arp_op  = htons(ARP_OP_REVREQUEST);
+
+       ether_addr_copy(mac, &rarp->arp_data.arp_sha);
+       ether_addr_copy(mac, &rarp->arp_data.arp_tha);
+       memset(&rarp->arp_data.arp_sip, 0x00, 4);
+       memset(&rarp->arp_data.arp_tip, 0x00, 4);
+
+       rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
+
+       return 0;
+}
+
+/*
+ * Copy one Tx descriptor chain, starting at desc_idx, into the mbuf chain
+ * headed by m. Extra segments are allocated from mbuf_pool whenever the
+ * current segment is full.
+ *
+ * The first dev->vhost_hlen bytes of the chain are taken as the virtio-net
+ * header and are not copied. If that header carries any flags or a GSO
+ * type, vhost_dequeue_offload() is applied to m after the copy.
+ *
+ * Every index used to step through vq->desc (desc_idx and each desc->next)
+ * is range-checked against vq->size, and the number of descriptors walked
+ * is capped at vq->size so a looping chain terminates.
+ *
+ * Returns 0 on success. Returns -1 on an out-of-range index, a first
+ * descriptor shorter than the header, a guest address gpa_to_vva() cannot
+ * translate, an over-long chain, or mbuf allocation failure. Segments
+ * already linked to m stay attached on failure; the caller frees m.
+ */
+static inline int __attribute__((always_inline))
+copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+                 struct rte_mbuf *m, uint16_t desc_idx,
+                 struct rte_mempool *mbuf_pool)
+{
+       struct vring_desc *desc;
+       uint64_t desc_addr;
+       uint32_t desc_avail, desc_offset;
+       uint32_t mbuf_avail, mbuf_offset;
+       uint32_t cpy_len;
+       struct rte_mbuf *cur = m, *prev = m;
+       struct virtio_net_hdr *hdr;
+       /* A counter to avoid desc dead loop chain */
+       uint32_t nr_desc = 1;
+
+       /* The head index was read from the avail ring; do not trust it. */
+       if (unlikely(desc_idx >= vq->size))
+               return -1;
+
+       desc = &vq->desc[desc_idx];
+       if (unlikely(desc->len < dev->vhost_hlen))
+               return -1;
+
+       desc_addr = gpa_to_vva(dev, desc->addr);
+       if (unlikely(!desc_addr))
+               return -1;
+
+       hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+       rte_prefetch0(hdr);
+
+       /*
+        * A virtio driver normally uses at least 2 desc buffers
+        * for Tx: the first for storing the header, and others
+        * for storing the data.
+        */
+       if (likely((desc->len == dev->vhost_hlen) &&
+                  (desc->flags & VRING_DESC_F_NEXT) != 0)) {
+               /* Same range check as the chain walk in the loop below. */
+               if (unlikely(desc->next >= vq->size))
+                       return -1;
+               desc = &vq->desc[desc->next];
+
+               desc_addr = gpa_to_vva(dev, desc->addr);
+               if (unlikely(!desc_addr))
+                       return -1;
+
+               rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+               desc_offset = 0;
+               desc_avail  = desc->len;
+               nr_desc    += 1;
+
+               PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+       } else {
+               /* Header and data share the first descriptor. */
+               desc_avail  = desc->len - dev->vhost_hlen;
+               desc_offset = dev->vhost_hlen;
+       }
+
+       mbuf_offset = 0;
+       mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
+       while (1) {
+               /* Copy as much as both the descriptor and the mbuf allow. */
+               cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+               rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
+                       (void *)((uintptr_t)(desc_addr + desc_offset)),
+                       cpy_len);
+
+               mbuf_avail  -= cpy_len;
+               mbuf_offset += cpy_len;
+               desc_avail  -= cpy_len;
+               desc_offset += cpy_len;
+
+               /* This desc reaches to its end, get the next one */
+               if (desc_avail == 0) {
+                       if ((desc->flags & VRING_DESC_F_NEXT) == 0)
+                               break;
+
+                       if (unlikely(desc->next >= vq->size ||
+                                    ++nr_desc > vq->size))
+                               return -1;
+                       desc = &vq->desc[desc->next];
+
+                       desc_addr = gpa_to_vva(dev, desc->addr);
+                       if (unlikely(!desc_addr))
+                               return -1;
+
+                       rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+                       desc_offset = 0;
+                       desc_avail  = desc->len;
+
+                       PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+               }
+
+               /*
+                * This mbuf reaches to its end, get a new one
+                * to hold more data.
+                */
+               if (mbuf_avail == 0) {
+                       cur = rte_pktmbuf_alloc(mbuf_pool);
+                       if (unlikely(cur == NULL)) {
+                               RTE_LOG(ERR, VHOST_DATA, "Failed to "
+                                       "allocate memory for mbuf.\n");
+                               return -1;
+                       }
+
+                       /* Close the full segment and account for it. */
+                       prev->next = cur;
+                       prev->data_len = mbuf_offset;
+                       m->nb_segs += 1;
+                       m->pkt_len += mbuf_offset;
+                       prev = cur;
+
+                       mbuf_offset = 0;
+                       mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+               }
+       }
+
+       /* Account for the last, partially filled segment. */
+       prev->data_len = mbuf_offset;
+       m->pkt_len    += mbuf_offset;
+
+       if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
+               vhost_dequeue_offload(hdr, m);
+
+       return 0;
+}
+
+/*
+ * Dequeue up to "count" packets sent by the guest on virtqueue queue_id of
+ * device vid, storing newly allocated mbufs from mbuf_pool into pkts[].
+ *
+ * If dev->broadcast_rarp.cnt was set, a RARP broadcast built from dev->mac
+ * is placed at pkts[0], ahead of the dequeued packets, and uses one of the
+ * "count" slots.
+ *
+ * At most MAX_PKT_BURST descriptor chains are handled per call. Processing
+ * stops at the first chain whose mbuf allocation or copy fails; used->idx
+ * is advanced only by the number of chains copied successfully.
+ *
+ * Returns the number of mbufs stored in pkts[] (including the RARP packet,
+ * if any). Returns 0 for an unknown vid, an invalid or disabled queue,
+ * count == 0, or when the RARP mbuf cannot be allocated.
+ */
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+       struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+       struct virtio_net *dev;
+       struct rte_mbuf *rarp_mbuf = NULL;
+       struct vhost_virtqueue *vq;
+       uint32_t desc_indexes[MAX_PKT_BURST];
+       uint32_t used_idx;
+       uint32_t i = 0;
+       uint16_t free_entries;
+       uint16_t avail_idx;
+
+       dev = get_device(vid);
+       if (!dev)
+               return 0;
+
+       if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
+               RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+                       dev->vid, __func__, queue_id);
+               return 0;
+       }
+
+       vq = dev->virtqueue[queue_id];
+       if (unlikely(vq->enabled == 0))
+               return 0;
+
+       /*
+        * Nothing can be stored in a zero-length array. Return before the
+        * RARP handling: its "count -= 1" would wrap a uint16_t zero to
+        * 65535 and packets would then be written into pkts[].
+        */
+       if (unlikely(count == 0))
+               return 0;
+
+       /*
+        * Construct a RARP broadcast packet, and inject it to the "pkts"
+        * array, so that it looks like the guest actually sent such a
+        * packet.
+        *
+        * Check user_send_rarp() for more information.
+        */
+       if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
+                                        &dev->broadcast_rarp.cnt, 1, 0))) {
+               rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+               if (rarp_mbuf == NULL) {
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       return 0;
+               }
+
+               if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
+                       rte_pktmbuf_free(rarp_mbuf);
+                       rarp_mbuf = NULL;
+               } else {
+                       /* Reserve one output slot for the RARP packet. */
+                       count -= 1;
+               }
+       }
+
+       avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
+       free_entries = avail_idx - vq->last_used_idx;
+       if (free_entries == 0)
+               goto out;
+
+       LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+       /* Prefetch available ring to retrieve head indexes. */
+       used_idx = vq->last_used_idx & (vq->size - 1);
+       rte_prefetch0(&vq->avail->ring[used_idx]);
+       rte_prefetch0(&vq->used->ring[used_idx]);
+
+       count = RTE_MIN(count, MAX_PKT_BURST);
+       count = RTE_MIN(count, free_entries);
+       LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+                       dev->vid, count);
+
+       /*
+        * count is 0 here when the RARP packet took the only slot. Skip
+        * the ring work instead of reading desc_indexes[0] uninitialised.
+        */
+       if (unlikely(count == 0))
+               goto out;
+
+       /* Retrieve all of the head indexes first to avoid caching issues. */
+       for (i = 0; i < count; i++) {
+               used_idx = (vq->last_used_idx + i) & (vq->size - 1);
+               desc_indexes[i] = vq->avail->ring[used_idx];
+
+               vq->used->ring[used_idx].id  = desc_indexes[i];
+               vq->used->ring[used_idx].len = 0;
+               vhost_log_used_vring(dev, vq,
+                               offsetof(struct vring_used, ring[used_idx]),
+                               sizeof(vq->used->ring[used_idx]));
+       }
+
+       /* Prefetch descriptor index. */
+       rte_prefetch0(&vq->desc[desc_indexes[0]]);
+       for (i = 0; i < count; i++) {
+               int err;
+
+               if (likely(i + 1 < count))
+                       rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
+
+               pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+               if (unlikely(pkts[i] == NULL)) {
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       break;
+               }
+               err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
+                                       mbuf_pool);
+               if (unlikely(err)) {
+                       rte_pktmbuf_free(pkts[i]);
+                       break;
+               }
+       }
+
+       /* i is now the number of chains copied successfully. */
+       rte_smp_wmb();
+       rte_smp_rmb();
+       vq->used->idx += i;
+       vq->last_used_idx += i;
+       vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+                       sizeof(vq->used->idx));
+
+       /* Kick guest if required. */
+       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+                       && (vq->callfd >= 0))
+               eventfd_write(vq->callfd, (eventfd_t)1);
+
+out:
+       if (unlikely(rarp_mbuf != NULL)) {
+               /*
+                * Inject it to the head of "pkts" array, so that switch's mac
+                * learning table will get updated first.
+                */
+               memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
+               pkts[0] = rarp_mbuf;
+               i += 1;
+       }
+
+       return i;
+}
author	Yuanhan Liu <yuanhan.liu@linux.intel.com>
	Thu, 18 Aug 2016 08:48:39 +0000 (16:48 +0800)
committer	Yuanhan Liu <yuanhan.liu@linux.intel.com>
	Tue, 13 Sep 2016 03:25:08 +0000 (05:25 +0200)
lib/librte_vhost/Makefile		patch \| blob \| history
lib/librte_vhost/socket.c	[new file with mode: 0644]	patch \| blob
lib/librte_vhost/vhost-net-user.c	[deleted file]	patch \| blob \| history
lib/librte_vhost/vhost-net-user.h	[deleted file]	patch \| blob \| history
lib/librte_vhost/vhost-net.h	[deleted file]	patch \| blob \| history
lib/librte_vhost/vhost.c	[new file with mode: 0644]	patch \| blob
lib/librte_vhost/vhost.h	[new file with mode: 0644]	patch \| blob
lib/librte_vhost/vhost_rxtx.c	[deleted file]	patch \| blob \| history
lib/librte_vhost/vhost_user.c	[new file with mode: 0644]	patch \| blob
lib/librte_vhost/vhost_user.h	[new file with mode: 0644]	patch \| blob
lib/librte_vhost/virtio-net-user.c	[deleted file]	patch \| blob \| history
lib/librte_vhost/virtio-net-user.h	[deleted file]	patch \| blob \| history
lib/librte_vhost/virtio-net.c	[deleted file]	patch \| blob \| history
lib/librte_vhost/virtio_net.c	[new file with mode: 0644]	patch \| blob