vhost: remove sub-directory
authorYuanhan Liu <yuanhan.liu@linux.intel.com>
Thu, 18 Aug 2016 08:48:38 +0000 (16:48 +0800)
committerYuanhan Liu <yuanhan.liu@linux.intel.com>
Tue, 13 Sep 2016 03:25:08 +0000 (05:25 +0200)
We now have only one vhost implementation, so the sub source directory is
no longer needed. Remove it by moving its files to the upper directory.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
13 files changed:
lib/librte_vhost/Makefile
lib/librte_vhost/fd_man.c [new file with mode: 0644]
lib/librte_vhost/fd_man.h [new file with mode: 0644]
lib/librte_vhost/vhost-net-user.c [new file with mode: 0644]
lib/librte_vhost/vhost-net-user.h [new file with mode: 0644]
lib/librte_vhost/vhost_user/fd_man.c [deleted file]
lib/librte_vhost/vhost_user/fd_man.h [deleted file]
lib/librte_vhost/vhost_user/vhost-net-user.c [deleted file]
lib/librte_vhost/vhost_user/vhost-net-user.h [deleted file]
lib/librte_vhost/vhost_user/virtio-net-user.c [deleted file]
lib/librte_vhost/vhost_user/virtio-net-user.h [deleted file]
lib/librte_vhost/virtio-net-user.c [new file with mode: 0644]
lib/librte_vhost/virtio-net-user.h [new file with mode: 0644]

index fb4e7f8..277390f 100644 (file)
@@ -48,9 +48,9 @@ endif
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_user/vhost-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_user/virtio-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_user/fd_man.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost-net-user.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net-user.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += fd_man.c
 
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
diff --git a/lib/librte_vhost/fd_man.c b/lib/librte_vhost/fd_man.c
new file mode 100644 (file)
index 0000000..2d3eeb7
--- /dev/null
@@ -0,0 +1,299 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "fd_man.h"
+
+/**
+ * Look up the slot of the fdset that currently holds the given fd.
+ * Empty slots are marked with fd == -1, so passing -1 locates the first
+ * unused slot.
+ * @return
+ *   slot index, or -1 when pfdset is NULL or no slot holds that fd.
+ */
+static int
+fdset_find_fd(struct fdset *pfdset, int fd)
+{
+       int slot;
+
+       if (pfdset == NULL)
+               return -1;
+
+       for (slot = 0; slot < MAX_FDS; slot++) {
+               if (pfdset->fd[slot].fd == fd)
+                       return slot;
+       }
+
+       return -1;
+}
+
+/* Return the index of the first unused entry (fd == -1), or -1 if none. */
+static int
+fdset_find_free_slot(struct fdset *pfdset)
+{
+       const int unused_fd = -1;
+
+       return fdset_find_fd(pfdset, unused_fd);
+}
+
+/**
+ * Store fd together with its callbacks and context in slot idx.
+ * No locking is done here; fdset_add() calls this with fd_mutex held.
+ * @return
+ *   0 on success, -1 if pfdset is NULL, idx is outside [0, MAX_FDS) or
+ *   fd is outside [0, FD_SETSIZE).
+ */
+static int
+fdset_add_fd(struct fdset  *pfdset, int idx, int fd,
+       fd_cb rcb, fd_cb wcb, void *dat)
+{
+       struct fdentry *pfdentry;
+
+       if (pfdset == NULL)
+               return -1;
+
+       /* A negative index would address memory before the entry array. */
+       if (idx < 0 || idx >= MAX_FDS)
+               return -1;
+
+       /*
+        * The fd is later handed to FD_SET()/FD_ISSET(), which are only
+        * defined for descriptors in the range [0, FD_SETSIZE).
+        */
+       if (fd < 0 || fd >= FD_SETSIZE)
+               return -1;
+
+       pfdentry = &pfdset->fd[idx];
+       pfdentry->fd = fd;
+       pfdentry->rcb = rcb;
+       pfdentry->wcb = wcb;
+       pfdentry->dat = dat;
+
+       return 0;
+}
+
+/**
+ * Add every registered fd to the read set (when it has a read callback)
+ * and/or the write set (when it has a write callback). Either set pointer
+ * may be NULL, in which case that set is skipped.
+ * @return
+ *  the highest fd placed in either set, or -1 if nothing was added or
+ *  pfdset is NULL.
+ */
+static int
+fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset)
+{
+       int slot;
+       int highest = -1;
+
+       if (pfdset == NULL)
+               return -1;
+
+       for (slot = 0; slot < MAX_FDS; slot++) {
+               struct fdentry *entry = &pfdset->fd[slot];
+               int watched = 0;
+
+               /* skip unused slots */
+               if (entry->fd == -1)
+                       continue;
+
+               if (entry->rcb != NULL && rfset != NULL) {
+                       FD_SET(entry->fd, rfset);
+                       watched = 1;
+               }
+               if (entry->wcb != NULL && wfset != NULL) {
+                       FD_SET(entry->fd, wfset);
+                       watched = 1;
+               }
+
+               if (watched && entry->fd > highest)
+                       highest = entry->fd;
+       }
+
+       return highest;
+}
+
+/**
+ * Reset every entry of the fdset to the empty state and zero the count.
+ * A NULL pfdset is ignored.
+ *
+ * fd_mutex is not touched here.
+ * NOTE(review): callers presumably initialize the mutex themselves (the
+ * static instance uses PTHREAD_MUTEX_INITIALIZER) - confirm for dynamically
+ * allocated sets.
+ */
+void
+fdset_init(struct fdset *pfdset)
+{
+       int i;
+
+       if (pfdset == NULL)
+               return;
+
+       for (i = 0; i < MAX_FDS; i++) {
+               pfdset->fd[i].fd = -1;
+               pfdset->fd[i].rcb = NULL;
+               pfdset->fd[i].wcb = NULL;
+               pfdset->fd[i].dat = NULL;
+               /*
+                * fdset_del() spins while busy is non-zero, so it must not
+                * be left holding whatever the memory contained before.
+                */
+               pfdset->fd[i].busy = 0;
+       }
+       pfdset->num = 0;
+}
+
+/**
+ * Register fd in the fdset with its read/write handlers and a context
+ * pointer that is passed back to the handlers.
+ * @return
+ *   0 on success, -1 if pfdset is NULL or fd is -1, -2 if there is no free
+ *   slot or the fd could not be stored.
+ */
+int
+fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat)
+{
+       int slot;
+       int status = -2;
+
+       if (pfdset == NULL || fd == -1)
+               return -1;
+
+       pthread_mutex_lock(&pfdset->fd_mutex);
+
+       /* Claim the first unused slot, if any. */
+       slot = fdset_find_free_slot(pfdset);
+       if (slot != -1 &&
+           fdset_add_fd(pfdset, slot, fd, rcb, wcb, dat) >= 0) {
+               pfdset->num++;
+               status = 0;
+       }
+
+       pthread_mutex_unlock(&pfdset->fd_mutex);
+
+       return status;
+}
+
+/**
+ *  Unregister fd from the fdset.
+ *  If a callback is currently running for the entry (busy flag set), keep
+ *  retrying until it has finished.
+ *  Returns the context registered for fd, or NULL if pfdset is NULL, fd is
+ *  -1 or fd is not in the set.
+ */
+void *
+fdset_del(struct fdset *pfdset, int fd)
+{
+       void *ctx = NULL;
+       int pending = 1;
+
+       if (pfdset == NULL || fd == -1)
+               return NULL;
+
+       while (pending) {
+               int slot;
+
+               pthread_mutex_lock(&pfdset->fd_mutex);
+
+               slot = fdset_find_fd(pfdset, fd);
+               if (slot == -1) {
+                       /* not (or no longer) registered */
+                       pending = 0;
+               } else if (pfdset->fd[slot].busy == 0) {
+                       /* no r/wcb is executing: safe to clear the entry */
+                       struct fdentry *entry = &pfdset->fd[slot];
+
+                       ctx = entry->dat;
+                       entry->fd = -1;
+                       entry->rcb = NULL;
+                       entry->wcb = NULL;
+                       entry->dat = NULL;
+                       pfdset->num--;
+                       pending = 0;
+               }
+
+               /* drop the lock on every pass so the callback can finish */
+               pthread_mutex_unlock(&pfdset->fd_mutex);
+       }
+
+       return ctx;
+}
+
+/**
+ *  Unregister whatever fd occupies the given slot of the fdset.
+ *  Out-of-range indexes and a NULL pfdset are ignored. Unlike fdset_del(),
+ *  the busy flag is not consulted.
+ */
+static void
+fdset_del_slot(struct fdset *pfdset, int index)
+{
+       if (pfdset == NULL || index < 0 || index >= MAX_FDS)
+               return;
+
+       pthread_mutex_lock(&pfdset->fd_mutex);
+
+       /*
+        * Only account for the removal if the slot is really in use;
+        * otherwise num would be decremented twice when another thread
+        * already emptied the slot.
+        */
+       if (pfdset->fd[index].fd != -1) {
+               pfdset->fd[index].fd = -1;
+               pfdset->fd[index].rcb = pfdset->fd[index].wcb = NULL;
+               pfdset->fd[index].dat = NULL;
+               pfdset->num--;
+       }
+
+       pthread_mutex_unlock(&pfdset->fd_mutex);
+}
+
+/**
+ * Poll every registered fd with select() and call the matching r/w handler
+ * when there is an event on the fd. The loop never terminates; the function
+ * only returns immediately when pfdset is NULL. select() uses a one second
+ * timeout so that fds registered meanwhile are picked up on the next pass.
+ *
+ * Before the callback is called, we set the flag to busy status; If other
+ * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it
+ * will wait until the flag is reset to zero(which indicates the callback is
+ * finished), then it could free the context after fdset_del.
+ *
+ * A callback asks for its own entry to be dropped by setting *remove; the
+ * entry is removed before the busy flag is cleared.
+ */
+void
+fdset_event_dispatch(struct fdset *pfdset)
+{
+       fd_set rfds, wfds;
+       int i, maxfds;
+       struct fdentry *pfdentry;
+       int num = MAX_FDS;
+       fd_cb rcb, wcb;
+       void *dat;
+       int fd;
+       int remove1, remove2;
+       int ret;
+
+       if (pfdset == NULL)
+               return;
+
+       while (1) {
+               struct timeval tv;
+               tv.tv_sec = 1;
+               tv.tv_usec = 0;
+               FD_ZERO(&rfds);
+               FD_ZERO(&wfds);
+               pthread_mutex_lock(&pfdset->fd_mutex);
+
+               maxfds = fdset_fill(&rfds, &wfds, pfdset);
+
+               pthread_mutex_unlock(&pfdset->fd_mutex);
+
+               /*
+                * When select is blocked, other threads might unregister
+                * listenfds from and register new listenfds into fdset.
+                * When select returns, the entries for listenfds in the fdset
+                * might have been updated. It is ok if there is unwanted call
+                * for new listenfds.
+                */
+               ret = select(maxfds + 1, &rfds, &wfds, NULL, &tv);
+               if (ret <= 0)
+                       continue;
+
+               for (i = 0; i < num; i++) {
+                       remove1 = remove2 = 0;
+
+                       /* Snapshot the entry and mark it busy under the lock. */
+                       pthread_mutex_lock(&pfdset->fd_mutex);
+                       pfdentry = &pfdset->fd[i];
+                       fd = pfdentry->fd;
+                       rcb = pfdentry->rcb;
+                       wcb = pfdentry->wcb;
+                       dat = pfdentry->dat;
+                       pfdentry->busy = 1;
+                       pthread_mutex_unlock(&pfdset->fd_mutex);
+
+                       if (fd >= 0 && FD_ISSET(fd, &rfds) && rcb)
+                               rcb(fd, dat, &remove1);
+                       if (fd >= 0 && FD_ISSET(fd, &wfds) && wcb)
+                               wcb(fd, dat, &remove2);
+
+                       /*
+                        * When we are to clean up the fd from fdset,
+                        * because the fd is closed in the cb,
+                        * the old fd val could be reused by when creates new
+                        * listen fd in another thread, we couldn't call
+                        * fd_set_del; the slot index is used instead.
+                        *
+                        * Do the removal while the entry is still busy: a
+                        * concurrent fdset_del() for this fd keeps waiting
+                        * and then no longer finds the fd, instead of
+                        * returning a context the callback already released.
+                        */
+                       if (remove1 || remove2)
+                               fdset_del_slot(pfdset, i);
+
+                       /*
+                        * fdset_del needs to check busy flag (it reads it
+                        * under fd_mutex, so clear it under the same lock).
+                        * We don't allow fdset_del to be called in callback
+                        * directly.
+                        */
+                       pthread_mutex_lock(&pfdset->fd_mutex);
+                       pfdentry->busy = 0;
+                       pthread_mutex_unlock(&pfdset->fd_mutex);
+               }
+       }
+}
diff --git a/lib/librte_vhost/fd_man.h b/lib/librte_vhost/fd_man.h
new file mode 100644 (file)
index 0000000..bd66ed1
--- /dev/null
@@ -0,0 +1,67 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FD_MAN_H_
+#define _FD_MAN_H_
+#include <stdint.h>
+#include <pthread.h>
+
+/* Number of entries an fdset can hold. */
+#define MAX_FDS 1024
+
+/*
+ * Event callback type. fd is the descriptor that became ready and dat is
+ * the context pointer that was registered with it. Storing a non-zero
+ * value in *remove asks the dispatcher to unregister the entry once the
+ * callback has returned.
+ */
+typedef void (*fd_cb)(int fd, void *dat, int *remove);
+
+/* One registered file descriptor and its handlers. */
+struct fdentry {
+       int fd;         /* -1 indicates this entry is empty */
+       fd_cb rcb;      /* callback when this fd is readable. */
+       fd_cb wcb;      /* callback when this fd is writeable.*/
+       void *dat;      /* fd context */
+       int busy;       /* whether this entry is being used in cb. */
+};
+
+/* Fixed-size table of registered fds; fd_mutex guards the entries. */
+struct fdset {
+       struct fdentry fd[MAX_FDS];
+       pthread_mutex_t fd_mutex;
+       int num;        /* current fd number of this fdset */
+};
+
+
+/* Mark every entry of the set as empty and reset the fd count. */
+void fdset_init(struct fdset *pfdset);
+
+/*
+ * Register fd with its read/write callbacks and context.
+ * Returns 0 on success, -1 on invalid arguments, -2 if the fd could not
+ * be stored (e.g. no free entry).
+ */
+int fdset_add(struct fdset *pfdset, int fd,
+       fd_cb rcb, fd_cb wcb, void *dat);
+
+/*
+ * Unregister fd, waiting for a running callback on it to finish.
+ * Returns the context registered with fd, or NULL if it was not found.
+ */
+void *fdset_del(struct fdset *pfdset, int fd);
+
+/* Run the select()-based dispatch loop on the set; loops indefinitely. */
+void fdset_event_dispatch(struct fdset *pfdset);
+
+#endif
diff --git a/lib/librte_vhost/vhost-net-user.c b/lib/librte_vhost/vhost-net-user.c
new file mode 100644 (file)
index 0000000..b35594d
--- /dev/null
@@ -0,0 +1,795 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+#include <rte_virtio_net.h>
+
+#include "fd_man.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+#include "virtio-net-user.h"
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+       char *path;             /* unix socket path; also used as ifname */
+       int listenfd;           /* listening fd when acting as server */
+       int connfd;             /* connected fd, -1 when not connected */
+       bool is_server;         /* true: bind/listen; false: connect */
+       bool reconnect;         /* re-create the client after a disconnect */
+};
+
+/* Per-connection context handed to the message handler callback. */
+struct vhost_user_connection {
+       struct vhost_user_socket *vsocket;      /* socket this came from */
+       int vid;                /* device id returned by vhost_new_device() */
+};
+
+/* Capacity of the vsockets table below. */
+#define MAX_VHOST_SOCKET 1024
+/* Global vhost-user state: known sockets plus the fd event set. */
+struct vhost_user {
+       struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+       struct fdset fdset;     /* listen and connection fds being polled */
+       int vsocket_cnt;        /* presumably used entries in vsockets - confirm */
+       pthread_mutex_t mutex;  /* presumably guards vsockets/vsocket_cnt - confirm */
+};
+
+/* Backlog passed to listen() on server sockets. */
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_msg_handler(int fd, void *dat, int *remove);
+static int vhost_user_create_client(struct vhost_user_socket *vsocket);
+
+/*
+ * The single vhost-user instance. Every fdset entry starts out empty
+ * (fd == -1, no callbacks, no context, not busy); the range designator
+ * "[0 ... N]" is a GNU C extension.
+ */
+static struct vhost_user vhost_user = {
+       .fdset = {
+               .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+               .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+               .num = 0
+       },
+       .vsocket_cnt = 0,
+       .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/*
+ * Printable name of each vhost-user request, indexed by request code.
+ * Used for logging received messages; a request code without an entry
+ * here would yield a NULL string.
+ */
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+       [VHOST_USER_NONE] = "VHOST_USER_NONE",
+       [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+       [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+       [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+       [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+       [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+       [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+       [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+       [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+       [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+       [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+       [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+       [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+       [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+       [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
+       [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
+       [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
+       [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
+       [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
+       [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
+};
+
+/*
+ * Receive up to buflen bytes from sockfd into buf and collect the file
+ * descriptors passed as SCM_RIGHTS ancillary data into fds, which must have
+ * room for fd_num entries. Entries for which no descriptor was received
+ * are set to -1.
+ *
+ * Return bytes# of read on success, 0 when recvmsg() returned 0, or a
+ * negative val on failure (including a truncated message).
+ */
+static int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+       struct iovec iov;
+       struct msghdr msgh;
+       size_t fdsize = fd_num * sizeof(int);
+       char control[CMSG_SPACE(fdsize)];
+       struct cmsghdr *cmsg;
+       size_t got;
+       int i;
+       int ret;
+
+       /*
+        * Some requests close fds[0] unconditionally; make sure a message
+        * without descriptors never leaves stack garbage in the array.
+        */
+       for (i = 0; i < fd_num; i++)
+               fds[i] = -1;
+
+       memset(&msgh, 0, sizeof(msgh));
+       iov.iov_base = buf;
+       iov.iov_len  = buflen;
+
+       msgh.msg_iov = &iov;
+       msgh.msg_iovlen = 1;
+       msgh.msg_control = control;
+       msgh.msg_controllen = sizeof(control);
+
+       ret = recvmsg(sockfd, &msgh, 0);
+       if (ret <= 0) {
+               RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+               return ret;
+       }
+
+       if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+               RTE_LOG(ERR, VHOST_CONFIG, "truncated msg\n");
+               return -1;
+       }
+
+       for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+               cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+               if ((cmsg->cmsg_level == SOL_SOCKET) &&
+                       (cmsg->cmsg_type == SCM_RIGHTS)) {
+                       if (cmsg->cmsg_len < CMSG_LEN(0))
+                               break;
+                       /* Copy only the descriptors that really arrived. */
+                       got = cmsg->cmsg_len - CMSG_LEN(0);
+                       if (got > fdsize)
+                               got = fdsize;
+                       memcpy(fds, CMSG_DATA(cmsg), got);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Read one vhost-user message from sockfd: first the fixed-size header
+ * (plus any passed fds, stored in msg->fds), then msg->size bytes of
+ * payload if the header announces one.
+ *
+ * Return bytes# of the last read on success, 0 if the peer closed the
+ * connection, or negative val on failure (NULL msg, short header, payload
+ * larger than the payload buffer, short payload, or a read error).
+ */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+       int ret;
+
+       if (msg == NULL)
+               return -1;
+
+       ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+               msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+       if (ret <= 0)
+               return ret;
+
+       /* A partial header would leave msg->size and msg->request undefined. */
+       if ((size_t)ret != VHOST_USER_HDR_SIZE) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "unexpected header size read\n");
+               return -1;
+       }
+
+       if (msg->size) {
+               if (msg->size > sizeof(msg->payload)) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "invalid msg size: %d\n", msg->size);
+                       return -1;
+               }
+               ret = read(sockfd, &msg->payload, msg->size);
+               if (ret <= 0)
+                       return ret;
+               if (ret != (int)msg->size) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "read control message failed\n");
+                       return -1;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Send buflen bytes from buf on sockfd. When fds is non-NULL and fd_num is
+ * positive, the descriptors are attached as SCM_RIGHTS ancillary data.
+ * sendmsg() is retried while it fails with EINTR.
+ * Returns the sendmsg() result: bytes sent, or a negative value on error.
+ */
+static int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+       size_t fdsize = fd_num * sizeof(int);
+       char control[CMSG_SPACE(fdsize)];
+       struct iovec iov;
+       struct msghdr msgh;
+       int ret;
+
+       iov.iov_base = buf;
+       iov.iov_len = buflen;
+
+       memset(&msgh, 0, sizeof(msgh));
+       msgh.msg_iov = &iov;
+       msgh.msg_iovlen = 1;
+
+       if (fds == NULL || fd_num <= 0) {
+               /* plain data, no ancillary payload */
+               msgh.msg_control = NULL;
+               msgh.msg_controllen = 0;
+       } else {
+               struct cmsghdr *cmsg;
+
+               msgh.msg_control = control;
+               msgh.msg_controllen = sizeof(control);
+               cmsg = CMSG_FIRSTHDR(&msgh);
+               cmsg->cmsg_len = CMSG_LEN(fdsize);
+               cmsg->cmsg_level = SOL_SOCKET;
+               cmsg->cmsg_type = SCM_RIGHTS;
+               memcpy(CMSG_DATA(cmsg), fds, fdsize);
+       }
+
+       for (;;) {
+               ret = sendmsg(sockfd, &msgh, 0);
+               if (ret >= 0 || errno != EINTR)
+                       break;
+       }
+
+       if (ret < 0)
+               RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
+
+       return ret;
+}
+
+/*
+ * Send msg (header plus msg->size payload bytes) back to the peer as a
+ * reply: the version bits of msg->flags are replaced by VHOST_USER_VERSION
+ * and the reply bit is set. A NULL msg is a no-op returning 0; otherwise
+ * the result of send_fd_message() is returned.
+ */
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+       if (msg == NULL)
+               return 0;
+
+       msg->flags = (msg->flags & ~VHOST_USER_VERSION_MASK) |
+                    VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
+
+       return send_fd_message(sockfd, (char *)msg,
+               VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+}
+
+
+/*
+ * Set up a newly connected fd: create a vhost device for it, name the
+ * device after the socket path and register the fd with the message
+ * handler. On any failure the fd is closed and everything allocated here
+ * is released again.
+ */
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+       int vid;
+       size_t size;
+       struct vhost_user_connection *conn;
+       int ret;
+
+       conn = malloc(sizeof(*conn));
+       if (conn == NULL) {
+               close(fd);
+               return;
+       }
+
+       vid = vhost_new_device();
+       if (vid == -1) {
+               close(fd);
+               free(conn);
+               return;
+       }
+
+       size = strnlen(vsocket->path, PATH_MAX);
+       vhost_set_ifname(vid, vsocket->path, size);
+
+       RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+       vsocket->connfd = fd;
+       conn->vsocket = vsocket;
+       conn->vid = vid;
+       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler,
+                       NULL, conn);
+       if (ret < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to add fd %d into vhost server fdset\n",
+                       fd);
+               vsocket->connfd = -1;
+               /* Nobody will ever handle this device; do not leak it. */
+               vhost_destroy_device(vid);
+               free(conn);
+               close(fd);
+       }
+}
+
+/*
+ * fdset read callback for a listening socket: accept the pending client
+ * and hand the connected fd over to vhost_user_add_connection().
+ * dat is the vhost_user_socket the listening fd belongs to.
+ */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+       struct vhost_user_socket *vsocket = dat;
+       int connfd = accept(fd, NULL, NULL);
+
+       if (connfd >= 0) {
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "new vhost user connection is %d\n", connfd);
+               vhost_user_add_connection(connfd, vsocket);
+       }
+}
+
+/*
+ * fdset read callback when there is message on the connfd.
+ *
+ * Reads one vhost-user message and dispatches it on msg.request. If the
+ * read fails, the peer closed the socket or the request code is out of
+ * range, the connection is torn down: connfd is closed, *remove is set so
+ * the dispatcher drops the fd, the connection context is freed and the
+ * device destroyed; a client socket with reconnect enabled is re-created.
+ *
+ * dat is the struct vhost_user_connection registered with the fd.
+ */
+static void
+vhost_user_msg_handler(int connfd, void *dat, int *remove)
+{
+       int vid;
+       struct vhost_user_connection *conn = dat;
+       struct VhostUserMsg msg;
+       uint64_t features;
+       int ret;
+
+       vid = conn->vid;
+       ret = read_vhost_message(connfd, &msg);
+       if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+               /* keep the socket pointer: conn is freed below */
+               struct vhost_user_socket *vsocket = conn->vsocket;
+
+               if (ret < 0)
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost read message failed\n");
+               else if (ret == 0)
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "vhost peer closed\n");
+               else
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost read incorrect message\n");
+
+               vsocket->connfd = -1;
+               close(connfd);
+               *remove = 1;
+               free(conn);
+               vhost_destroy_device(vid);
+
+               if (vsocket->reconnect)
+                       vhost_user_create_client(vsocket);
+
+               return;
+       }
+
+       RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+               vhost_message_str[msg.request]);
+       switch (msg.request) {
+       /* GET_* requests fill in the payload and send msg back as reply */
+       case VHOST_USER_GET_FEATURES:
+               ret = vhost_get_features(vid, &features);
+               msg.payload.u64 = features;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(connfd, &msg);
+               break;
+       case VHOST_USER_SET_FEATURES:
+               features = msg.payload.u64;
+               vhost_set_features(vid, &features);
+               break;
+
+       case VHOST_USER_GET_PROTOCOL_FEATURES:
+               msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(connfd, &msg);
+               break;
+       case VHOST_USER_SET_PROTOCOL_FEATURES:
+               user_set_protocol_features(vid, msg.payload.u64);
+               break;
+
+       case VHOST_USER_SET_OWNER:
+               vhost_set_owner(vid);
+               break;
+       case VHOST_USER_RESET_OWNER:
+               vhost_reset_owner(vid);
+               break;
+
+       case VHOST_USER_SET_MEM_TABLE:
+               user_set_mem_table(vid, &msg);
+               break;
+
+       case VHOST_USER_SET_LOG_BASE:
+               user_set_log_base(vid, &msg);
+
+               /* it needs a reply */
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(connfd, &msg);
+               break;
+       case VHOST_USER_SET_LOG_FD:
+               /* request is not supported: just release the passed fd */
+               close(msg.fds[0]);
+               RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+               break;
+
+       case VHOST_USER_SET_VRING_NUM:
+               vhost_set_vring_num(vid, &msg.payload.state);
+               break;
+       case VHOST_USER_SET_VRING_ADDR:
+               vhost_set_vring_addr(vid, &msg.payload.addr);
+               break;
+       case VHOST_USER_SET_VRING_BASE:
+               vhost_set_vring_base(vid, &msg.payload.state);
+               break;
+
+       case VHOST_USER_GET_VRING_BASE:
+               ret = user_get_vring_base(vid, &msg.payload.state);
+               msg.size = sizeof(msg.payload.state);
+               send_vhost_message(connfd, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_KICK:
+               user_set_vring_kick(vid, &msg);
+               break;
+       case VHOST_USER_SET_VRING_CALL:
+               user_set_vring_call(vid, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_ERR:
+               /* an fd was passed unless the NOFD bit is set; release it */
+               if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+                       close(msg.fds[0]);
+               RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+               break;
+
+       case VHOST_USER_GET_QUEUE_NUM:
+               msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+               msg.size = sizeof(msg.payload.u64);
+               send_vhost_message(connfd, &msg);
+               break;
+
+       case VHOST_USER_SET_VRING_ENABLE:
+               user_set_vring_enable(vid, &msg.payload.state);
+               break;
+       case VHOST_USER_SEND_RARP:
+               user_send_rarp(vid, &msg);
+               break;
+
+       default:
+               /* remaining request codes are silently ignored */
+               break;
+
+       }
+}
+
+/*
+ * Create an AF_UNIX stream socket and fill *un with the address for path
+ * (truncated to fit sun_path, always NUL terminated). Client sockets are
+ * switched to non-blocking mode; server sockets are left blocking.
+ * Returns the new fd, or -1 on failure.
+ */
+static int
+create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
+{
+       const char *role = is_server ? "server" : "client";
+       int sock;
+
+       sock = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (sock < 0)
+               return -1;
+       RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+               role, sock);
+
+       if (!is_server) {
+               if (fcntl(sock, F_SETFL, O_NONBLOCK)) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "vhost-user: can't set nonblocking mode for socket, fd: "
+                               "%d (%s)\n", sock, strerror(errno));
+                       close(sock);
+                       return -1;
+               }
+       }
+
+       memset(un, 0, sizeof(*un));
+       un->sun_family = AF_UNIX;
+       /* the final byte stays 0 from the memset, so the path is terminated */
+       strncpy(un->sun_path, path, sizeof(un->sun_path) - 1);
+
+       return sock;
+}
+
+/*
+ * Create the listening side of a vhost-user socket: bind to vsocket->path,
+ * listen, and register the listening fd so that incoming connections are
+ * accepted by vhost_user_server_new_connection().
+ * Returns 0 on success; on failure the fd is closed and -1 is returned.
+ */
+static int
+vhost_user_create_server(struct vhost_user_socket *vsocket)
+{
+       struct sockaddr_un addr;
+       const char *sock_path = vsocket->path;
+       int listenfd;
+
+       listenfd = create_unix_socket(sock_path, &addr, vsocket->is_server);
+       if (listenfd < 0)
+               return -1;
+
+       if (bind(listenfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to bind to %s: %s; remove it and try again\n",
+                       sock_path, strerror(errno));
+               goto fail;
+       }
+       RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", sock_path);
+
+       if (listen(listenfd, MAX_VIRTIO_BACKLOG) < 0)
+               goto fail;
+
+       vsocket->listenfd = listenfd;
+       if (fdset_add(&vhost_user.fdset, listenfd,
+                     vhost_user_server_new_connection, NULL, vsocket) < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to add listen fd %d to vhost server fdset\n",
+                       listenfd);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       close(listenfd);
+       return -1;
+}
+
+/* One client socket that is waiting to be (re)connected. */
+struct vhost_user_reconnect {
+       struct sockaddr_un un;          /* address to connect to */
+       int fd;                         /* socket used for the attempts */
+       struct vhost_user_socket *vsocket;      /* owning vhost-user socket */
+
+       TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+/* Queue of pending reconnections; mutex guards the queue. */
+struct vhost_user_reconnect_list {
+       struct vhost_user_reconnect_tailq_list head;
+       pthread_mutex_t mutex;
+};
+
+/* Pending reconnections, serviced by the reconnect thread. */
+static struct vhost_user_reconnect_list reconn_list;
+/* Thread id of the reconnect thread. */
+static pthread_t reconn_tid;
+
+/*
+ * Try to connect fd to the given address and, once connected, switch the
+ * fd back to blocking mode.
+ * Returns 0 when connected, -1 when connect() failed (EISCONN counts as
+ * connected), -2 when the fd flags could not be read or changed.
+ */
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+       int fl;
+
+       if (connect(fd, un, sz) < 0 && errno != EISCONN)
+               return -1;
+
+       fl = fcntl(fd, F_GETFL, 0);
+       if (fl < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "can't get flags for connfd %d\n", fd);
+               return -2;
+       }
+
+       /* already blocking: nothing left to do */
+       if (!(fl & O_NONBLOCK))
+               return 0;
+
+       if (fcntl(fd, F_SETFL, fl & ~O_NONBLOCK)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                               "can't disable nonblocking on fd %d\n", fd);
+               return -2;
+       }
+
+       return 0;
+}
+
+/*
+ * Body of the reconnect thread. In an endless loop it locks
+ * reconn_list.mutex, walks the list retrying the connection of every
+ * entry, unlocks, and sleeps for one second. For each entry:
+ *  - result -2: the fd is closed and the entry is removed and freed;
+ *  - result -1: the entry is kept and retried on the next pass;
+ *  - result  0: the fd is handed to vhost_user_add_connection() and the
+ *    entry is removed and freed.
+ * The loop has no exit, so the trailing return is not reached.
+ */
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+       int ret;
+       struct vhost_user_reconnect *reconn, *next;
+
+       while (1) {
+               pthread_mutex_lock(&reconn_list.mutex);
+
+               /*
+                * Open-coded equivalent of TAILQ_FOREACH_SAFE,
+                * which does not exist on all platforms.
+                */
+               for (reconn = TAILQ_FIRST(&reconn_list.head);
+                    reconn != NULL; reconn = next) {
+                       /* fetch the successor first: reconn may be freed */
+                       next = TAILQ_NEXT(reconn, next);
+
+                       ret = vhost_user_connect_nonblock(reconn->fd,
+                                               (struct sockaddr *)&reconn->un,
+                                               sizeof(reconn->un));
+                       if (ret == -2) {
+                               close(reconn->fd);
+                               RTE_LOG(ERR, VHOST_CONFIG,
+                                       "reconnection for fd %d failed\n",
+                                       reconn->fd);
+                               goto remove_fd;
+                       }
+                       /* not connected yet: keep the entry for the next pass */
+                       if (ret == -1)
+                               continue;
+
+                       RTE_LOG(INFO, VHOST_CONFIG,
+                               "%s: connected\n", reconn->vsocket->path);
+                       vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+                       /* reached both on success and on the -2 failure */
+                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
+                       free(reconn);
+               }
+
+               pthread_mutex_unlock(&reconn_list.mutex);
+               sleep(1);
+       }
+
+       return NULL;
+}
+
+/*
+ * Initialise the reconnect list and its mutex, then start the reconnect
+ * thread (vhost_user_client_reconnect).
+ *
+ * Returns 0 on success and -1 if the thread could not be created.
+ */
+static int
+vhost_user_reconnect_init(void)
+{
+       int ret;
+
+       pthread_mutex_init(&reconn_list.mutex, NULL);
+       TAILQ_INIT(&reconn_list.head);
+
+       /*
+        * pthread_create() reports failure with a positive error number,
+        * never a negative value, so compare against zero and translate
+        * the failure into the -1 that the caller checks for.
+        */
+       ret = pthread_create(&reconn_tid, NULL,
+                            vhost_user_client_reconnect, NULL);
+       if (ret != 0) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to create reconnect thread: %s\n",
+                       strerror(ret));
+               /* a later retry will initialise the mutex again */
+               pthread_mutex_destroy(&reconn_list.mutex);
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Set up a client-mode vhost-user socket: create the unix socket and try
+ * to connect it to vsocket->path.
+ *
+ * On immediate success the fd is passed to vhost_user_add_connection()
+ * and 0 is returned. If the attempt returns -1 and vsocket->reconnect is
+ * set, the fd is queued on reconn_list for later retries and 0 is still
+ * returned. Returns -1 when the socket cannot be created, when the
+ * failure is not retried (result -2, or reconnect disabled), or when the
+ * queue entry cannot be allocated; the fd is closed in the latter two
+ * cases.
+ */
+static int
+vhost_user_create_client(struct vhost_user_socket *vsocket)
+{
+       int fd;
+       int ret;
+       struct sockaddr_un un;
+       const char *path = vsocket->path;
+       struct vhost_user_reconnect *reconn;
+
+       fd = create_unix_socket(path, &un, vsocket->is_server);
+       if (fd < 0)
+               return -1;
+
+       ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
+                                         sizeof(un));
+       if (ret == 0) {
+               vhost_user_add_connection(fd, vsocket);
+               return 0;
+       }
+
+       RTE_LOG(ERR, VHOST_CONFIG,
+               "failed to connect to %s: %s\n",
+               path, strerror(errno));
+
+       /* -2 is not retried; -1 is retried only if reconnect is enabled */
+       if (ret == -2 || !vsocket->reconnect) {
+               close(fd);
+               return -1;
+       }
+
+       RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
+       reconn = malloc(sizeof(*reconn));
+       if (reconn == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "failed to allocate memory for reconnect\n");
+               close(fd);
+               return -1;
+       }
+       reconn->un = un;
+       reconn->fd = fd;
+       reconn->vsocket = vsocket;
+       /* hand the entry over to the reconnect thread's list */
+       pthread_mutex_lock(&reconn_list.mutex);
+       TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+       pthread_mutex_unlock(&reconn_list.mutex);
+
+       return 0;
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as server
+ * (the default case), or client (when the RTE_VHOST_USER_CLIENT flag
+ * is set).
+ *
+ * In client mode, reconnection is enabled unless
+ * RTE_VHOST_USER_NO_RECONNECT is given, and the reconnect thread is
+ * started the first time it is needed.
+ *
+ * Returns 0 on success, and -1 if path is NULL, the socket table is
+ * full, memory cannot be allocated, or the socket cannot be set up.
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+       int ret = -1;
+       struct vhost_user_socket *vsocket;
+
+       if (!path)
+               return -1;
+
+       pthread_mutex_lock(&vhost_user.mutex);
+
+       if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "error: the number of vhost sockets reaches maximum\n");
+               goto out;
+       }
+
+       vsocket = malloc(sizeof(struct vhost_user_socket));
+       if (!vsocket)
+               goto out;
+       memset(vsocket, 0, sizeof(struct vhost_user_socket));
+       vsocket->path = strdup(path);
+       /* a NULL path would later be dereferenced by strcmp() on unregister */
+       if (!vsocket->path)
+               goto out_free;
+       vsocket->connfd = -1;
+
+       if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+               vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+               /* start the reconnect thread lazily, on first use */
+               if (vsocket->reconnect && reconn_tid == 0) {
+                       if (vhost_user_reconnect_init() < 0)
+                               goto out_free;
+               }
+               ret = vhost_user_create_client(vsocket);
+       } else {
+               vsocket->is_server = true;
+               ret = vhost_user_create_server(vsocket);
+       }
+       if (ret < 0)
+               goto out_free;
+
+       vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+       goto out;
+
+out_free:
+       /* free(NULL) is a no-op, so this also covers a failed strdup() */
+       free(vsocket->path);
+       free(vsocket);
+out:
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return ret;
+}
+
+/*
+ * Remove the pending reconnect entry that belongs to vsocket, if there
+ * is one: unlink it from reconn_list, close its fd and free it. The list
+ * mutex is held for the duration of the search.
+ *
+ * Returns true if an entry was found and removed, false otherwise.
+ */
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+       bool found = false;
+       struct vhost_user_reconnect *reconn, *next;
+
+       pthread_mutex_lock(&reconn_list.mutex);
+
+       for (reconn = TAILQ_FIRST(&reconn_list.head);
+            reconn != NULL; reconn = next) {
+               next = TAILQ_NEXT(reconn, next);
+
+               if (reconn->vsocket == vsocket) {
+                       TAILQ_REMOVE(&reconn_list.head, reconn, next);
+                       close(reconn->fd);
+                       free(reconn);
+                       found = true;
+                       break;
+               }
+       }
+       pthread_mutex_unlock(&reconn_list.mutex);
+       return found;
+}
+
+/**
+ * Unregister the specified vhost socket.
+ *
+ * Looks up the registered socket whose path equals the given one and
+ * tears it down: a server socket has its listen fd removed from the
+ * fdset and closed and its socket file unlinked; a client socket with
+ * reconnect enabled has any pending reconnect entry dropped. An
+ * established connection, if any, is closed and its device destroyed.
+ * The freed slot in the socket table is filled with the last entry.
+ *
+ * Returns 0 if a matching socket was found and removed, -1 if path is
+ * NULL or no socket is registered under it.
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+       int i;
+       int count;
+       struct vhost_user_connection *conn;
+
+       /* same guard as rte_vhost_driver_register(); strcmp(NULL) is UB */
+       if (!path)
+               return -1;
+
+       pthread_mutex_lock(&vhost_user.mutex);
+
+       for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+               struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+               if (!strcmp(vsocket->path, path)) {
+                       if (vsocket->is_server) {
+                               fdset_del(&vhost_user.fdset, vsocket->listenfd);
+                               close(vsocket->listenfd);
+                               unlink(path);
+                       } else if (vsocket->reconnect) {
+                               vhost_user_remove_reconnect(vsocket);
+                       }
+
+                       conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
+                       if (conn) {
+                               RTE_LOG(INFO, VHOST_CONFIG,
+                                       "free connfd = %d for device '%s'\n",
+                                       vsocket->connfd, path);
+                               close(vsocket->connfd);
+                               vhost_destroy_device(conn->vid);
+                               free(conn);
+                       }
+
+                       free(vsocket->path);
+                       free(vsocket);
+
+                       /* keep the table dense: move the last entry here */
+                       count = --vhost_user.vsocket_cnt;
+                       vhost_user.vsockets[i] = vhost_user.vsockets[count];
+                       vhost_user.vsockets[count] = NULL;
+                       pthread_mutex_unlock(&vhost_user.mutex);
+
+                       return 0;
+               }
+       }
+       pthread_mutex_unlock(&vhost_user.mutex);
+
+       return -1;
+}
+
+/*
+ * Run the fd event dispatch loop for the vhost-user fdset in the calling
+ * thread. Returns 0 if fdset_event_dispatch() returns.
+ * NOTE(review): fdset_event_dispatch() in fd_man.c is an endless loop, so
+ * this call is presumably not expected to return -- confirm.
+ */
+int
+rte_vhost_driver_session_start(void)
+{
+       fdset_event_dispatch(&vhost_user.fdset);
+       return 0;
+}
diff --git a/lib/librte_vhost/vhost-net-user.h b/lib/librte_vhost/vhost-net-user.h
new file mode 100644 (file)
index 0000000..f533239
--- /dev/null
@@ -0,0 +1,113 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_virtio_net.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/*
+ * Request codes used in the request field of VhostUserMsg.
+ * VHOST_USER_MAX is not a request itself; it is one past the highest
+ * request code defined here.
+ */
+typedef enum VhostUserRequest {
+       VHOST_USER_NONE = 0,
+       VHOST_USER_GET_FEATURES = 1,
+       VHOST_USER_SET_FEATURES = 2,
+       VHOST_USER_SET_OWNER = 3,
+       VHOST_USER_RESET_OWNER = 4,
+       VHOST_USER_SET_MEM_TABLE = 5,
+       VHOST_USER_SET_LOG_BASE = 6,
+       VHOST_USER_SET_LOG_FD = 7,
+       VHOST_USER_SET_VRING_NUM = 8,
+       VHOST_USER_SET_VRING_ADDR = 9,
+       VHOST_USER_SET_VRING_BASE = 10,
+       VHOST_USER_GET_VRING_BASE = 11,
+       VHOST_USER_SET_VRING_KICK = 12,
+       VHOST_USER_SET_VRING_CALL = 13,
+       VHOST_USER_SET_VRING_ERR = 14,
+       VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+       VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+       VHOST_USER_GET_QUEUE_NUM = 17,
+       VHOST_USER_SET_VRING_ENABLE = 18,
+       VHOST_USER_SEND_RARP = 19,
+       VHOST_USER_MAX
+} VhostUserRequest;
+
+/*
+ * One entry of the regions[] array in VhostUserMemory.
+ * NOTE(review): field meanings are taken from their names (guest
+ * physical address, region size, userspace address, mmap offset);
+ * confirm against the vhost-user protocol description.
+ */
+typedef struct VhostUserMemoryRegion {
+       uint64_t guest_phys_addr;
+       uint64_t memory_size;
+       uint64_t userspace_addr;
+       uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+/*
+ * Memory table payload of VhostUserMsg: room for at most
+ * VHOST_MEMORY_MAX_NREGIONS region descriptors.
+ * NOTE(review): nregions is presumably the number of valid entries in
+ * regions[] -- confirm against the message handler.
+ */
+typedef struct VhostUserMemory {
+       uint32_t nregions;
+       uint32_t padding;
+       VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+/*
+ * Log payload of VhostUserMsg: a size and an offset for an mmap.
+ * NOTE(review): presumably describes the log area sent with
+ * VHOST_USER_SET_LOG_BASE -- confirm against the message handler.
+ */
+typedef struct VhostUserLog {
+       uint64_t mmap_size;
+       uint64_t mmap_offset;
+} VhostUserLog;
+
+/*
+ * A vhost-user message: request code, flags and payload size, followed
+ * by a payload union whose interpretation depends on the request, plus
+ * an array of up to VHOST_MEMORY_MAX_NREGIONS file descriptors. The
+ * struct is packed, so no padding is inserted between its members.
+ * NOTE(review): the flag and vring masks below are presumably applied to
+ * the flags and payload.u64 fields respectively -- confirm at the users.
+ */
+typedef struct VhostUserMsg {
+       VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+       uint32_t flags;
+       uint32_t size; /* the following payload size */
+       union {
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
+               uint64_t u64;
+               struct vhost_vring_state state;
+               struct vhost_vring_addr addr;
+               VhostUserMemory memory;
+               VhostUserLog    log;
+       } payload;
+       int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION    0x1
+
+/*****************************************************************************/
+#endif
diff --git a/lib/librte_vhost/vhost_user/fd_man.c b/lib/librte_vhost/vhost_user/fd_man.c
deleted file mode 100644 (file)
index 2d3eeb7..0000000
+++ /dev/null
@@ -1,299 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <sys/select.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-
-#include "fd_man.h"
-
-/**
- * Returns the index in the fdset for a given fd.
- * If fd is -1, it means to search for a free entry.
- * @return
- *   index for the fd, or -1 if fd isn't in the fdset.
- */
-static int
-fdset_find_fd(struct fdset *pfdset, int fd)
-{
-       int i;
-
-       if (pfdset == NULL)
-               return -1;
-
-       for (i = 0; i < MAX_FDS && pfdset->fd[i].fd != fd; i++)
-               ;
-
-       return i ==  MAX_FDS ? -1 : i;
-}
-
-static int
-fdset_find_free_slot(struct fdset *pfdset)
-{
-       return fdset_find_fd(pfdset, -1);
-}
-
-static int
-fdset_add_fd(struct fdset  *pfdset, int idx, int fd,
-       fd_cb rcb, fd_cb wcb, void *dat)
-{
-       struct fdentry *pfdentry;
-
-       if (pfdset == NULL || idx >= MAX_FDS || fd >= FD_SETSIZE)
-               return -1;
-
-       pfdentry = &pfdset->fd[idx];
-       pfdentry->fd = fd;
-       pfdentry->rcb = rcb;
-       pfdentry->wcb = wcb;
-       pfdentry->dat = dat;
-
-       return 0;
-}
-
-/**
- * Fill the read/write fd_set with the fds in the fdset.
- * @return
- *  the maximum fds filled in the read/write fd_set.
- */
-static int
-fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset)
-{
-       struct fdentry *pfdentry;
-       int i, maxfds = -1;
-       int num = MAX_FDS;
-
-       if (pfdset == NULL)
-               return -1;
-
-       for (i = 0; i < num; i++) {
-               pfdentry = &pfdset->fd[i];
-               if (pfdentry->fd != -1) {
-                       int added = 0;
-                       if (pfdentry->rcb && rfset) {
-                               FD_SET(pfdentry->fd, rfset);
-                               added = 1;
-                       }
-                       if (pfdentry->wcb && wfset) {
-                               FD_SET(pfdentry->fd, wfset);
-                               added = 1;
-                       }
-                       if (added)
-                               maxfds = pfdentry->fd < maxfds ?
-                                       maxfds : pfdentry->fd;
-               }
-       }
-       return maxfds;
-}
-
-void
-fdset_init(struct fdset *pfdset)
-{
-       int i;
-
-       if (pfdset == NULL)
-               return;
-
-       for (i = 0; i < MAX_FDS; i++) {
-               pfdset->fd[i].fd = -1;
-               pfdset->fd[i].dat = NULL;
-       }
-       pfdset->num = 0;
-}
-
-/**
- * Register the fd in the fdset with read/write handler and context.
- */
-int
-fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat)
-{
-       int i;
-
-       if (pfdset == NULL || fd == -1)
-               return -1;
-
-       pthread_mutex_lock(&pfdset->fd_mutex);
-
-       /* Find a free slot in the list. */
-       i = fdset_find_free_slot(pfdset);
-       if (i == -1 || fdset_add_fd(pfdset, i, fd, rcb, wcb, dat) < 0) {
-               pthread_mutex_unlock(&pfdset->fd_mutex);
-               return -2;
-       }
-
-       pfdset->num++;
-
-       pthread_mutex_unlock(&pfdset->fd_mutex);
-
-       return 0;
-}
-
-/**
- *  Unregister the fd from the fdset.
- *  Returns context of a given fd or NULL.
- */
-void *
-fdset_del(struct fdset *pfdset, int fd)
-{
-       int i;
-       void *dat = NULL;
-
-       if (pfdset == NULL || fd == -1)
-               return NULL;
-
-       do {
-               pthread_mutex_lock(&pfdset->fd_mutex);
-
-               i = fdset_find_fd(pfdset, fd);
-               if (i != -1 && pfdset->fd[i].busy == 0) {
-                       /* busy indicates r/wcb is executing! */
-                       dat = pfdset->fd[i].dat;
-                       pfdset->fd[i].fd = -1;
-                       pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL;
-                       pfdset->fd[i].dat = NULL;
-                       pfdset->num--;
-                       i = -1;
-               }
-               pthread_mutex_unlock(&pfdset->fd_mutex);
-       } while (i != -1);
-
-       return dat;
-}
-
-/**
- *  Unregister the fd at the specified slot from the fdset.
- */
-static void
-fdset_del_slot(struct fdset *pfdset, int index)
-{
-       if (pfdset == NULL || index < 0 || index >= MAX_FDS)
-               return;
-
-       pthread_mutex_lock(&pfdset->fd_mutex);
-
-       pfdset->fd[index].fd = -1;
-       pfdset->fd[index].rcb = pfdset->fd[index].wcb = NULL;
-       pfdset->fd[index].dat = NULL;
-       pfdset->num--;
-
-       pthread_mutex_unlock(&pfdset->fd_mutex);
-}
-
-/**
- * This functions runs in infinite blocking loop until there is no fd in
- * pfdset. It calls corresponding r/w handler if there is event on the fd.
- *
- * Before the callback is called, we set the flag to busy status; If other
- * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it
- * will wait until the flag is reset to zero(which indicates the callback is
- * finished), then it could free the context after fdset_del.
- */
-void
-fdset_event_dispatch(struct fdset *pfdset)
-{
-       fd_set rfds, wfds;
-       int i, maxfds;
-       struct fdentry *pfdentry;
-       int num = MAX_FDS;
-       fd_cb rcb, wcb;
-       void *dat;
-       int fd;
-       int remove1, remove2;
-       int ret;
-
-       if (pfdset == NULL)
-               return;
-
-       while (1) {
-               struct timeval tv;
-               tv.tv_sec = 1;
-               tv.tv_usec = 0;
-               FD_ZERO(&rfds);
-               FD_ZERO(&wfds);
-               pthread_mutex_lock(&pfdset->fd_mutex);
-
-               maxfds = fdset_fill(&rfds, &wfds, pfdset);
-
-               pthread_mutex_unlock(&pfdset->fd_mutex);
-
-               /*
-                * When select is blocked, other threads might unregister
-                * listenfds from and register new listenfds into fdset.
-                * When select returns, the entries for listenfds in the fdset
-                * might have been updated. It is ok if there is unwanted call
-                * for new listenfds.
-                */
-               ret = select(maxfds + 1, &rfds, &wfds, NULL, &tv);
-               if (ret <= 0)
-                       continue;
-
-               for (i = 0; i < num; i++) {
-                       remove1 = remove2 = 0;
-                       pthread_mutex_lock(&pfdset->fd_mutex);
-                       pfdentry = &pfdset->fd[i];
-                       fd = pfdentry->fd;
-                       rcb = pfdentry->rcb;
-                       wcb = pfdentry->wcb;
-                       dat = pfdentry->dat;
-                       pfdentry->busy = 1;
-                       pthread_mutex_unlock(&pfdset->fd_mutex);
-                       if (fd >= 0 && FD_ISSET(fd, &rfds) && rcb)
-                               rcb(fd, dat, &remove1);
-                       if (fd >= 0 && FD_ISSET(fd, &wfds) && wcb)
-                               wcb(fd, dat, &remove2);
-                       pfdentry->busy = 0;
-                       /*
-                        * fdset_del needs to check busy flag.
-                        * We don't allow fdset_del to be called in callback
-                        * directly.
-                        */
-                       /*
-                        * When we are to clean up the fd from fdset,
-                        * because the fd is closed in the cb,
-                        * the old fd val could be reused by when creates new
-                        * listen fd in another thread, we couldn't call
-                        * fd_set_del.
-                        */
-                       if (remove1 || remove2)
-                               fdset_del_slot(pfdset, i);
-               }
-       }
-}
diff --git a/lib/librte_vhost/vhost_user/fd_man.h b/lib/librte_vhost/vhost_user/fd_man.h
deleted file mode 100644 (file)
index bd66ed1..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _FD_MAN_H_
-#define _FD_MAN_H_
-#include <stdint.h>
-#include <pthread.h>
-
-#define MAX_FDS 1024
-
-typedef void (*fd_cb)(int fd, void *dat, int *remove);
-
-struct fdentry {
-       int fd;         /* -1 indicates this entry is empty */
-       fd_cb rcb;      /* callback when this fd is readable. */
-       fd_cb wcb;      /* callback when this fd is writeable.*/
-       void *dat;      /* fd context */
-       int busy;       /* whether this entry is being used in cb. */
-};
-
-struct fdset {
-       struct fdentry fd[MAX_FDS];
-       pthread_mutex_t fd_mutex;
-       int num;        /* current fd number of this fdset */
-};
-
-
-void fdset_init(struct fdset *pfdset);
-
-int fdset_add(struct fdset *pfdset, int fd,
-       fd_cb rcb, fd_cb wcb, void *dat);
-
-void *fdset_del(struct fdset *pfdset, int fd);
-
-void fdset_event_dispatch(struct fdset *pfdset);
-
-#endif
diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
deleted file mode 100644 (file)
index b35594d..0000000
+++ /dev/null
@@ -1,795 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/queue.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-
-#include <rte_log.h>
-#include <rte_virtio_net.h>
-
-#include "fd_man.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-#include "virtio-net-user.h"
-
-/*
- * Every time rte_vhost_driver_register() is invoked, an associated
- * vhost_user_socket struct will be created.
- */
-struct vhost_user_socket {
-       char *path;     /* socket path; strdup()'d copy freed on unregister */
-       int listenfd;   /* listening fd, assigned in server mode */
-       int connfd;     /* fd of the current connection, -1 when there is none */
-       bool is_server; /* true: bind and listen; false: connect as a client */
-       bool reconnect; /* client mode: retry when the connection fails or drops */
-};
-
-/*
- * Per-connection context; registered as the fdset context pointer of a
- * connected fd and handed back to vhost_user_msg_handler().
- */
-struct vhost_user_connection {
-       struct vhost_user_socket *vsocket;      /* socket the connection belongs to */
-       int vid;        /* device id returned by vhost_new_device() */
-};
-
-/* upper bound on simultaneously registered vhost-user sockets */
-#define MAX_VHOST_SOCKET 1024
-/* Registry of all registered sockets together with the fdset watching them. */
-struct vhost_user {
-       /* the first vsocket_cnt slots are in use */
-       struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
-       struct fdset fdset;     /* holds the listen and connection fds */
-       int vsocket_cnt;
-       /* held by register/unregister while vsockets[] and vsocket_cnt change */
-       pthread_mutex_t mutex;
-};
-
-/* backlog value passed to listen() for server sockets */
-#define MAX_VIRTIO_BACKLOG 128
-
-/*
- * Forward declarations: these are referenced (as fdset callbacks or from
- * the message handler) before their definitions further down.
- */
-static void vhost_user_server_new_connection(int fd, void *data, int *remove);
-static void vhost_user_msg_handler(int fd, void *dat, int *remove);
-static int vhost_user_create_client(struct vhost_user_socket *vsocket);
-
-/*
- * The single file-scope registry instance. Every fdset slot starts out
- * with fd == -1 (the "empty" marker) via the GNU range-designator
- * extension, and both mutexes are statically initialised.
- */
-static struct vhost_user vhost_user = {
-       .fdset = {
-               .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
-               .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
-               .num = 0
-       },
-       .vsocket_cnt = 0,
-       .mutex = PTHREAD_MUTEX_INITIALIZER,
-};
-
-/*
- * Printable name of each request code, indexed by VhostUserRequest.
- * Used by vhost_user_msg_handler() when logging an incoming message.
- */
-static const char *vhost_message_str[VHOST_USER_MAX] = {
-       [VHOST_USER_NONE] = "VHOST_USER_NONE",
-       [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
-       [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
-       [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
-       [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
-       [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
-       [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
-       [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
-       [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
-       [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
-       [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
-       [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
-       [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
-       [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
-       [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
-       [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
-       [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
-       [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
-       [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
-       [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
-};
-
-/*
- * Receive up to buflen bytes from sockfd into buf, together with any file
- * descriptors passed as SCM_RIGHTS ancillary data.
- *
- * On success the first entries of fds[] hold the descriptors that were
- * actually received (at most fd_num) and every remaining entry is set to
- * -1, so callers never act on stale or uninitialised descriptor values.
- *
- * Returns the number of bytes read on success, 0 or a negative value when
- * recvmsg() reports that, and -1 when the message or its control data was
- * truncated.
- */
-static int
-read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-       struct iovec iov;
-       struct msghdr msgh;
-       size_t fdsize = fd_num * sizeof(int);
-       char control[CMSG_SPACE(fdsize)];
-       struct cmsghdr *cmsg;
-       int got_fds = 0;
-       int ret;
-
-       memset(&msgh, 0, sizeof(msgh));
-       iov.iov_base = buf;
-       iov.iov_len  = buflen;
-
-       msgh.msg_iov = &iov;
-       msgh.msg_iovlen = 1;
-       msgh.msg_control = control;
-       msgh.msg_controllen = sizeof(control);
-
-       ret = recvmsg(sockfd, &msgh, 0);
-       if (ret <= 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
-               return ret;
-       }
-
-       if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
-               RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
-               return -1;
-       }
-
-       for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
-               cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
-               if ((cmsg->cmsg_level == SOL_SOCKET) &&
-                       (cmsg->cmsg_type == SCM_RIGHTS)) {
-                       /* copy only the descriptors the peer really sent */
-                       got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
-                       if (got_fds > fd_num)
-                               got_fds = fd_num;
-                       memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
-                       break;
-               }
-       }
-
-       /* mark the slots that received nothing as invalid */
-       while (got_fds < fd_num)
-               fds[got_fds++] = -1;
-
-       return ret;
-}
-
-/*
- * Read one complete vhost-user message from sockfd into msg: first the
- * fixed-size header (any passed descriptors land in msg->fds), then
- * msg->size bytes of payload.
- *
- * msg must not be NULL; the only caller passes the address of a local.
- *
- * Returns a positive byte count on success (the payload length, or the
- * header length when the message carries no payload), 0 when the
- * underlying read returned 0, and a negative value on failure.
- */
-static int
-read_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
-       int ret;
-
-       ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
-               msg->fds, VHOST_MEMORY_MAX_NREGIONS);
-       if (ret <= 0)
-               return ret;
-
-       /*
-        * A short header would leave request/flags/size partly unset in the
-        * caller's buffer; refuse to interpret it.
-        */
-       if ((size_t)ret != VHOST_USER_HDR_SIZE) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "unexpected header size read\n");
-               return -1;
-       }
-
-       /* header-only message: nothing more to read */
-       if (msg->size == 0)
-               return ret;
-
-       if (msg->size > sizeof(msg->payload)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "invalid msg size: %d\n", msg->size);
-               return -1;
-       }
-
-       ret = read(sockfd, &msg->payload, msg->size);
-       if (ret <= 0)
-               return ret;
-       if (ret != (int)msg->size) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "read control message failed\n");
-               return -1;
-       }
-
-       return ret;
-}
-
-/*
- * Send buflen bytes from buf over sockfd. When fds is non-NULL and fd_num
- * is positive, the descriptors are attached as SCM_RIGHTS ancillary data.
- * The send is retried while it is interrupted by a signal.
- *
- * Returns the sendmsg() result; a negative value means failure.
- */
-static int
-send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-       size_t fdsize = fd_num * sizeof(int);
-       char control[CMSG_SPACE(fdsize)];
-       struct msghdr msgh;
-       struct iovec iov;
-       int sent;
-
-       iov.iov_base = buf;
-       iov.iov_len = buflen;
-
-       /* zeroing also leaves msg_control/msg_controllen as NULL/0 */
-       memset(&msgh, 0, sizeof(msgh));
-       msgh.msg_iov = &iov;
-       msgh.msg_iovlen = 1;
-
-       if (fds != NULL && fd_num > 0) {
-               struct cmsghdr *hdr;
-
-               msgh.msg_control = control;
-               msgh.msg_controllen = sizeof(control);
-
-               hdr = CMSG_FIRSTHDR(&msgh);
-               hdr->cmsg_len = CMSG_LEN(fdsize);
-               hdr->cmsg_level = SOL_SOCKET;
-               hdr->cmsg_type = SCM_RIGHTS;
-               memcpy(CMSG_DATA(hdr), fds, fdsize);
-       }
-
-       for (;;) {
-               sent = sendmsg(sockfd, &msgh, 0);
-               if (sent >= 0 || errno != EINTR)
-                       break;
-       }
-
-       if (sent < 0)
-               RTE_LOG(ERR, VHOST_CONFIG,  "sendmsg error\n");
-
-       return sent;
-}
-
-/*
- * Send msg back to the peer as a reply: the version bits of msg->flags
- * are replaced with VHOST_USER_VERSION and the reply bit is set, then the
- * header plus msg->size payload bytes are written without descriptors.
- *
- * Returns 0 for a NULL msg, otherwise the result of send_fd_message().
- */
-static int
-send_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
-       if (msg == NULL)
-               return 0;
-
-       msg->flags = (msg->flags & ~VHOST_USER_VERSION_MASK) |
-                    VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
-
-       return send_fd_message(sockfd, (char *)msg,
-                              VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
-}
-
-
-
-/*
- * Set up a newly established connection on fd for vsocket: allocate the
- * per-connection context, create a vhost device named after the socket
- * path, and register fd with the fdset so vhost_user_msg_handler() runs
- * when a message arrives.
- *
- * Takes ownership of fd: on any failure fd is closed and everything
- * created so far, including the vhost device, is released again.
- */
-static void
-vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
-{
-       int vid;
-       size_t size;
-       struct vhost_user_connection *conn;
-       int ret;
-
-       conn = malloc(sizeof(*conn));
-       if (conn == NULL)
-               goto err_close;
-
-       vid = vhost_new_device();
-       if (vid == -1)
-               goto err_free;
-
-       size = strnlen(vsocket->path, PATH_MAX);
-       vhost_set_ifname(vid, vsocket->path, size);
-
-       RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
-
-       vsocket->connfd = fd;
-       conn->vsocket = vsocket;
-       conn->vid = vid;
-       ret = fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler,
-                       NULL, conn);
-       if (ret < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to add fd %d into vhost server fdset\n",
-                       fd);
-               vsocket->connfd = -1;
-               /* nobody will ever service this device: do not leak it */
-               vhost_destroy_device(vid);
-               goto err_free;
-       }
-
-       return;
-
-err_free:
-       free(conn);
-err_close:
-       close(fd);
-}
-
-/*
- * fdset read callback of a listening socket: accept one pending client
- * and hand the connected descriptor to vhost_user_add_connection().
- * dat is the vhost_user_socket the listening fd belongs to.
- */
-static void
-vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
-{
-       struct vhost_user_socket *vsocket = dat;
-       int connfd = accept(fd, NULL, NULL);
-
-       if (connfd >= 0) {
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "new vhost user connection is %d\n", connfd);
-               vhost_user_add_connection(connfd, vsocket);
-       }
-}
-
-/*
- * callback when there is message on the connfd
- *
- * dat is the vhost_user_connection registered for connfd. One message is
- * read and dispatched on its request code; requests that need an answer
- * are replied to with send_vhost_message(). When the read fails, the peer
- * has closed, or the request code is out of range, the connection is torn
- * down and *remove is set to 1.
- */
-static void
-vhost_user_msg_handler(int connfd, void *dat, int *remove)
-{
-       int vid;
-       struct vhost_user_connection *conn = dat;
-       struct VhostUserMsg msg;
-       uint64_t features;
-       int ret;
-
-       vid = conn->vid;
-       ret = read_vhost_message(connfd, &msg);
-       if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
-               /* keep the socket pointer: conn is freed below */
-               struct vhost_user_socket *vsocket = conn->vsocket;
-
-               if (ret < 0)
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost read message failed\n");
-               else if (ret == 0)
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "vhost peer closed\n");
-               else
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost read incorrect message\n");
-
-               vsocket->connfd = -1;
-               close(connfd);
-               *remove = 1;
-               free(conn);
-               vhost_destroy_device(vid);
-
-               /* client sockets with reconnect enabled start over */
-               if (vsocket->reconnect)
-                       vhost_user_create_client(vsocket);
-
-               return;
-       }
-
-       RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
-               vhost_message_str[msg.request]);
-       switch (msg.request) {
-       case VHOST_USER_GET_FEATURES:
-               /* ret is assigned here but not examined afterwards */
-               ret = vhost_get_features(vid, &features);
-               msg.payload.u64 = features;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_FEATURES:
-               features = msg.payload.u64;
-               vhost_set_features(vid, &features);
-               break;
-
-       case VHOST_USER_GET_PROTOCOL_FEATURES:
-               msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_PROTOCOL_FEATURES:
-               user_set_protocol_features(vid, msg.payload.u64);
-               break;
-
-       case VHOST_USER_SET_OWNER:
-               vhost_set_owner(vid);
-               break;
-       case VHOST_USER_RESET_OWNER:
-               vhost_reset_owner(vid);
-               break;
-
-       case VHOST_USER_SET_MEM_TABLE:
-               user_set_mem_table(vid, &msg);
-               break;
-
-       case VHOST_USER_SET_LOG_BASE:
-               user_set_log_base(vid, &msg);
-
-               /* it needs a reply */
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-       case VHOST_USER_SET_LOG_FD:
-               /* unsupported: just release the descriptor that came along */
-               close(msg.fds[0]);
-               RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
-               break;
-
-       case VHOST_USER_SET_VRING_NUM:
-               vhost_set_vring_num(vid, &msg.payload.state);
-               break;
-       case VHOST_USER_SET_VRING_ADDR:
-               vhost_set_vring_addr(vid, &msg.payload.addr);
-               break;
-       case VHOST_USER_SET_VRING_BASE:
-               vhost_set_vring_base(vid, &msg.payload.state);
-               break;
-
-       case VHOST_USER_GET_VRING_BASE:
-               /* ret is assigned here but not examined afterwards */
-               ret = user_get_vring_base(vid, &msg.payload.state);
-               msg.size = sizeof(msg.payload.state);
-               send_vhost_message(connfd, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_KICK:
-               user_set_vring_kick(vid, &msg);
-               break;
-       case VHOST_USER_SET_VRING_CALL:
-               user_set_vring_call(vid, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_ERR:
-               /* unsupported: close the descriptor unless the NOFD bit is set */
-               if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
-                       close(msg.fds[0]);
-               RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
-               break;
-
-       case VHOST_USER_GET_QUEUE_NUM:
-               msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
-               msg.size = sizeof(msg.payload.u64);
-               send_vhost_message(connfd, &msg);
-               break;
-
-       case VHOST_USER_SET_VRING_ENABLE:
-               user_set_vring_enable(vid, &msg.payload.state);
-               break;
-       case VHOST_USER_SEND_RARP:
-               user_send_rarp(vid, &msg);
-               break;
-
-       default:
-               /* e.g. VHOST_USER_NONE: nothing to do */
-               break;
-
-       }
-}
-
-/*
- * Create an AF_UNIX stream socket and fill *un with the address for path.
- * Client sockets (is_server == false) are switched to non-blocking mode;
- * server sockets are left as created. A path longer than sun_path is
- * truncated so the stored string is always NUL-terminated.
- *
- * Returns the new descriptor, or -1 on failure.
- */
-static int
-create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
-{
-       const char *role = is_server ? "server" : "client";
-       int sock = socket(AF_UNIX, SOCK_STREAM, 0);
-
-       if (sock < 0)
-               return -1;
-       RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
-               role, sock);
-
-       if (!is_server) {
-               if (fcntl(sock, F_SETFL, O_NONBLOCK) != 0) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "vhost-user: can't set nonblocking mode for socket, fd: "
-                               "%d (%s)\n", sock, strerror(errno));
-                       close(sock);
-                       return -1;
-               }
-       }
-
-       memset(un, 0, sizeof(*un));
-       un->sun_family = AF_UNIX;
-       /* the buffer is zeroed, so the untouched last byte terminates it */
-       strncpy(un->sun_path, path, sizeof(un->sun_path) - 1);
-
-       return sock;
-}
-
-/*
- * Server-mode setup for vsocket: create the socket, bind it to
- * vsocket->path, start listening and register the listening fd so that
- * vhost_user_server_new_connection() runs for incoming clients.
- *
- * Returns 0 on success; on failure the socket is closed and -1 returned.
- */
-static int
-vhost_user_create_server(struct vhost_user_socket *vsocket)
-{
-       const char *sock_path = vsocket->path;
-       struct sockaddr_un addr;
-       int listen_fd;
-
-       listen_fd = create_unix_socket(sock_path, &addr, vsocket->is_server);
-       if (listen_fd < 0)
-               return -1;
-
-       if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to bind to %s: %s; remove it and try again\n",
-                       sock_path, strerror(errno));
-               close(listen_fd);
-               return -1;
-       }
-       RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", sock_path);
-
-       if (listen(listen_fd, MAX_VIRTIO_BACKLOG) < 0) {
-               close(listen_fd);
-               return -1;
-       }
-
-       vsocket->listenfd = listen_fd;
-       if (fdset_add(&vhost_user.fdset, listen_fd,
-                     vhost_user_server_new_connection, NULL, vsocket) < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to add listen fd %d to vhost server fdset\n",
-                       listen_fd);
-               close(listen_fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-/* A client connection attempt that the reconnect thread keeps retrying. */
-struct vhost_user_reconnect {
-       struct sockaddr_un un;  /* address to connect to */
-       int fd;                 /* socket the connect is attempted on */
-       struct vhost_user_socket *vsocket;      /* socket the attempt is for */
-
-       TAILQ_ENTRY(vhost_user_reconnect) next;
-};
-
-TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
-/* List of pending attempts plus the lock protecting it. */
-struct vhost_user_reconnect_list {
-       struct vhost_user_reconnect_tailq_list head;
-       pthread_mutex_t mutex;  /* held while head is walked or modified */
-};
-
-/* set up by vhost_user_reconnect_init() */
-static struct vhost_user_reconnect_list reconn_list;
-/*
- * id of the reconnect thread; rte_vhost_driver_register() compares it
- * with 0 to decide whether the thread still has to be started
- */
-static pthread_t reconn_tid;
-
-/*
- * Attempt to connect fd to the address un (sz bytes) and, once it is
- * connected, clear O_NONBLOCK on it again.
- *
- * Returns  0 when fd is connected (connect() succeeded or reported
- *            EISCONN) and is in blocking mode,
- *         -1 when connect() failed with any other error (the callers in
- *            this file treat this as "try again later"),
- *         -2 when the descriptor flags could not be read or changed.
- */
-static int
-vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
-{
-       int flags;
-
-       if (connect(fd, un, sz) < 0 && errno != EISCONN)
-               return -1;
-
-       flags = fcntl(fd, F_GETFL, 0);
-       if (flags < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "can't get flags for connfd %d\n", fd);
-               return -2;
-       }
-
-       /* already blocking: nothing left to do */
-       if (!(flags & O_NONBLOCK))
-               return 0;
-
-       if (fcntl(fd, F_SETFL, flags & ~O_NONBLOCK) != 0) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                               "can't disable nonblocking on fd %d\n", fd);
-               return -2;
-       }
-
-       return 0;
-}
-
-/*
- * Body of the reconnect thread. Once per second it walks the pending
- * list under its lock and retries every entry: entries that connect are
- * handed to vhost_user_add_connection(), entries that fail permanently
- * (-2) have their fd closed, and both kinds are removed from the list.
- * Entries that merely failed to connect (-1) stay queued.
- */
-static void *
-vhost_user_client_reconnect(void *arg __rte_unused)
-{
-       struct vhost_user_reconnect *cur, *following;
-       int rc;
-
-       for (;;) {
-               pthread_mutex_lock(&reconn_list.mutex);
-
-               /*
-                * Entries are removed while iterating, so the successor is
-                * fetched up front (TAILQ_FOREACH_SAFE is not available on
-                * all platforms).
-                */
-               cur = TAILQ_FIRST(&reconn_list.head);
-               while (cur != NULL) {
-                       following = TAILQ_NEXT(cur, next);
-
-                       rc = vhost_user_connect_nonblock(cur->fd,
-                                               (struct sockaddr *)&cur->un,
-                                               sizeof(cur->un));
-                       if (rc == -1) {
-                               /* not connected yet: retry on the next pass */
-                               cur = following;
-                               continue;
-                       }
-
-                       if (rc == -2) {
-                               close(cur->fd);
-                               RTE_LOG(ERR, VHOST_CONFIG,
-                                       "reconnection for fd %d failed\n",
-                                       cur->fd);
-                       } else {
-                               RTE_LOG(INFO, VHOST_CONFIG,
-                                       "%s: connected\n", cur->vsocket->path);
-                               vhost_user_add_connection(cur->fd, cur->vsocket);
-                       }
-
-                       TAILQ_REMOVE(&reconn_list.head, cur, next);
-                       free(cur);
-                       cur = following;
-               }
-
-               pthread_mutex_unlock(&reconn_list.mutex);
-               sleep(1);
-       }
-
-       return NULL;
-}
-
-/*
- * Initialise the pending-reconnect list and start the reconnect thread.
- *
- * pthread_create() reports failure through a positive error number, not
- * through a negative value or errno, so its result is compared with 0.
- *
- * Returns 0 on success and -1 when the thread could not be created.
- */
-static int
-vhost_user_reconnect_init(void)
-{
-       int ret;
-
-       pthread_mutex_init(&reconn_list.mutex, NULL);
-       TAILQ_INIT(&reconn_list.head);
-
-       ret = pthread_create(&reconn_tid, NULL,
-                            vhost_user_client_reconnect, NULL);
-       if (ret != 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
- * Client-mode setup for vsocket: create a socket and try to connect to
- * vsocket->path. If the connect succeeds the connection is set up right
- * away. If it fails in a retryable way and reconnect is enabled, the
- * attempt is queued for the reconnect thread instead.
- *
- * Returns 0 when connected or queued, -1 otherwise (the socket is closed).
- */
-static int
-vhost_user_create_client(struct vhost_user_socket *vsocket)
-{
-       struct vhost_user_reconnect *pending;
-       const char *sock_path = vsocket->path;
-       struct sockaddr_un addr;
-       int sock;
-       int rc;
-
-       sock = create_unix_socket(sock_path, &addr, vsocket->is_server);
-       if (sock < 0)
-               return -1;
-
-       rc = vhost_user_connect_nonblock(sock, (struct sockaddr *)&addr,
-                                        sizeof(addr));
-       if (rc == 0) {
-               vhost_user_add_connection(sock, vsocket);
-               return 0;
-       }
-
-       RTE_LOG(ERR, VHOST_CONFIG,
-               "failed to connect to %s: %s\n",
-               sock_path, strerror(errno));
-
-       /* -2 is not retryable; otherwise retry only when asked to */
-       if (rc == -2 || !vsocket->reconnect)
-               goto fail;
-
-       RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", sock_path);
-       pending = malloc(sizeof(*pending));
-       if (pending == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "failed to allocate memory for reconnect\n");
-               goto fail;
-       }
-       pending->vsocket = vsocket;
-       pending->fd = sock;
-       pending->un = addr;
-
-       pthread_mutex_lock(&reconn_list.mutex);
-       TAILQ_INSERT_TAIL(&reconn_list.head, pending, next);
-       pthread_mutex_unlock(&reconn_list.mutex);
-
-       return 0;
-
-fail:
-       close(sock);
-       return -1;
-}
-
-/*
- * Register a new vhost-user socket; here we could act as server
- * (the default case), or client (when the RTE_VHOST_USER_CLIENT flag
- * is set).
- *
- * In client mode, reconnecting is enabled unless
- * RTE_VHOST_USER_NO_RECONNECT is also set; the reconnect thread is
- * started the first time it is needed.
- *
- * Returns 0 on success and a negative value on failure (NULL path,
- * socket table full, out of memory, or socket setup failed). Nothing is
- * left allocated on failure.
- */
-int
-rte_vhost_driver_register(const char *path, uint64_t flags)
-{
-       int ret = -1;
-       struct vhost_user_socket *vsocket;
-
-       if (!path)
-               return -1;
-
-       pthread_mutex_lock(&vhost_user.mutex);
-
-       if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "error: the number of vhost sockets reaches maximum\n");
-               goto out;
-       }
-
-       vsocket = malloc(sizeof(struct vhost_user_socket));
-       if (!vsocket)
-               goto out;
-       memset(vsocket, 0, sizeof(struct vhost_user_socket));
-       vsocket->connfd = -1;
-       vsocket->path = strdup(path);
-       /* every later step dereferences the path copy */
-       if (vsocket->path == NULL)
-               goto out_free;
-
-       if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
-               vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
-               if (vsocket->reconnect && reconn_tid == 0) {
-                       /* any non-zero result means the thread is not running */
-                       if (vhost_user_reconnect_init() != 0)
-                               goto out_free;
-               }
-               ret = vhost_user_create_client(vsocket);
-       } else {
-               vsocket->is_server = true;
-               ret = vhost_user_create_server(vsocket);
-       }
-       if (ret < 0)
-               goto out_free;
-
-       vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
-       goto out;
-
-out_free:
-       free(vsocket->path);
-       free(vsocket);
-out:
-       pthread_mutex_unlock(&vhost_user.mutex);
-
-       return ret;
-}
-
-/*
- * Drop the pending reconnect attempt that belongs to vsocket, if there is
- * one: the entry is unlinked, its fd closed and its memory released.
- *
- * Returns true when an entry was found and removed, false otherwise.
- */
-static bool
-vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
-{
-       struct vhost_user_reconnect *cur;
-       bool removed = false;
-
-       pthread_mutex_lock(&reconn_list.mutex);
-
-       /* find the first entry that refers to vsocket */
-       cur = TAILQ_FIRST(&reconn_list.head);
-       while (cur != NULL && cur->vsocket != vsocket)
-               cur = TAILQ_NEXT(cur, next);
-
-       if (cur != NULL) {
-               TAILQ_REMOVE(&reconn_list.head, cur, next);
-               close(cur->fd);
-               free(cur);
-               removed = true;
-       }
-
-       pthread_mutex_unlock(&reconn_list.mutex);
-       return removed;
-}
-
-/**
- * Unregister the specified vhost socket
- *
- * The first registered socket whose path equals path is torn down: a
- * server socket stops listening and its socket file is unlinked, a client
- * socket with reconnect enabled has its pending attempt dropped, and a
- * live connection (if any) is closed together with its vhost device.
- *
- * Returns 0 when a socket was removed, -1 when path is NULL or no
- * registered socket matches it.
- */
-int
-rte_vhost_driver_unregister(const char *path)
-{
-       int i;
-       int count;
-       int ret = -1;
-       struct vhost_user_connection *conn;
-
-       /* same guard as rte_vhost_driver_register(): strcmp() needs a string */
-       if (!path)
-               return -1;
-
-       pthread_mutex_lock(&vhost_user.mutex);
-
-       for (i = 0; i < vhost_user.vsocket_cnt; i++) {
-               struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
-
-               if (strcmp(vsocket->path, path) != 0)
-                       continue;
-
-               if (vsocket->is_server) {
-                       fdset_del(&vhost_user.fdset, vsocket->listenfd);
-                       close(vsocket->listenfd);
-                       unlink(path);
-               } else if (vsocket->reconnect) {
-                       vhost_user_remove_reconnect(vsocket);
-               }
-
-               conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
-               if (conn) {
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "free connfd = %d for device '%s'\n",
-                               vsocket->connfd, path);
-                       close(vsocket->connfd);
-                       vhost_destroy_device(conn->vid);
-                       free(conn);
-               }
-
-               free(vsocket->path);
-               free(vsocket);
-
-               /* keep the table dense: move the last entry into the gap */
-               count = --vhost_user.vsocket_cnt;
-               vhost_user.vsockets[i] = vhost_user.vsockets[count];
-               vhost_user.vsockets[count] = NULL;
-
-               ret = 0;
-               break;
-       }
-
-       pthread_mutex_unlock(&vhost_user.mutex);
-
-       return ret;
-}
-
-/*
- * Run the event dispatch loop over the fdset that holds every registered
- * listen and connection fd. Returns 0 once fdset_event_dispatch() returns.
- * NOTE(review): the dispatch loop presumably runs until the process
- * exits -- confirm in fd_man.c.
- */
-int
-rte_vhost_driver_session_start(void)
-{
-       fdset_event_dispatch(&vhost_user.fdset);
-       return 0;
-}
diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
deleted file mode 100644 (file)
index f533239..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_USER_H
-#define _VHOST_NET_USER_H
-
-#include <stdint.h>
-#include <linux/vhost.h>
-
-#include "rte_virtio_net.h"
-
-/* refer to hw/virtio/vhost-user.c */
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
-/*
- * Request codes carried in VhostUserMsg.request. The values are fixed
- * explicitly; VHOST_USER_MAX is one past the last known request and is
- * used both to reject out-of-range requests and to size the request
- * name table in vhost-net-user.c.
- */
-typedef enum VhostUserRequest {
-       VHOST_USER_NONE = 0,
-       VHOST_USER_GET_FEATURES = 1,
-       VHOST_USER_SET_FEATURES = 2,
-       VHOST_USER_SET_OWNER = 3,
-       VHOST_USER_RESET_OWNER = 4,
-       VHOST_USER_SET_MEM_TABLE = 5,
-       VHOST_USER_SET_LOG_BASE = 6,
-       VHOST_USER_SET_LOG_FD = 7,
-       VHOST_USER_SET_VRING_NUM = 8,
-       VHOST_USER_SET_VRING_ADDR = 9,
-       VHOST_USER_SET_VRING_BASE = 10,
-       VHOST_USER_GET_VRING_BASE = 11,
-       VHOST_USER_SET_VRING_KICK = 12,
-       VHOST_USER_SET_VRING_CALL = 13,
-       VHOST_USER_SET_VRING_ERR = 14,
-       VHOST_USER_GET_PROTOCOL_FEATURES = 15,
-       VHOST_USER_SET_PROTOCOL_FEATURES = 16,
-       VHOST_USER_GET_QUEUE_NUM = 17,
-       VHOST_USER_SET_VRING_ENABLE = 18,
-       VHOST_USER_SEND_RARP = 19,
-       VHOST_USER_MAX
-} VhostUserRequest;
-
-/*
- * Description of one memory region inside a VhostUserMemory payload.
- * NOTE(review): field meanings follow the vhost-user protocol (see
- * hw/virtio/vhost-user.c in QEMU) -- confirm there before relying on them.
- */
-typedef struct VhostUserMemoryRegion {
-       uint64_t guest_phys_addr;
-       uint64_t memory_size;
-       uint64_t userspace_addr;
-       uint64_t mmap_offset;
-} VhostUserMemoryRegion;
-
-/*
- * Memory table payload: a region count followed by a fixed array of
- * VHOST_MEMORY_MAX_NREGIONS region slots.
- * NOTE(review): presumably only the first nregions slots are meaningful.
- */
-typedef struct VhostUserMemory {
-       uint32_t nregions;
-       uint32_t padding;
-       VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-} VhostUserMemory;
-
-/*
- * Log payload member of VhostUserMsg.
- * NOTE(review): presumably the size and offset used to mmap the log area
- * for VHOST_USER_SET_LOG_BASE -- confirm in user_set_log_base().
- */
-typedef struct VhostUserLog {
-       uint64_t mmap_size;
-       uint64_t mmap_offset;
-} VhostUserLog;
-
-/*
- * One vhost-user message as handled by vhost-net-user.c. The struct is
- * packed; request, flags and size form the header (VHOST_USER_HDR_SIZE is
- * the offset of the payload), followed by a payload whose member in use
- * depends on the request.
- */
-typedef struct VhostUserMsg {
-       VhostUserRequest request;
-
-#define VHOST_USER_VERSION_MASK     0x3
-#define VHOST_USER_REPLY_MASK       (0x1 << 2)
-       /* version in the low bits (VERSION_MASK), plus the reply bit */
-       uint32_t flags;
-       uint32_t size; /* the following payload size */
-       union {
-#define VHOST_USER_VRING_IDX_MASK   0xff
-#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
-               uint64_t u64;
-               struct vhost_vring_state state;
-               struct vhost_vring_addr addr;
-               VhostUserMemory memory;
-               VhostUserLog    log;
-       } payload;
-       /*
-        * Descriptors received as SCM_RIGHTS ancillary data alongside the
-        * header; filled in by the receive path, not read from the byte
-        * stream.
-        */
-       int fds[VHOST_MEMORY_MAX_NREGIONS];
-} __attribute((packed)) VhostUserMsg;
-
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
-/* The version of the protocol we support */
-#define VHOST_USER_VERSION    0x1
-
-/*****************************************************************************/
-#endif
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
deleted file mode 100644 (file)
index e7c4347..0000000
+++ /dev/null
@@ -1,470 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-
-#include "virtio-net-user.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-
-struct orig_region_map {
-       int fd;
-       uint64_t mapped_address;
-       uint64_t mapped_size;
-       uint64_t blksz;
-};
-
-#define orig_region(ptr, nregions) \
-       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
-               sizeof(struct virtio_memory) + \
-               sizeof(struct virtio_memory_regions) * (nregions)))
-
-static uint64_t
-get_blk_size(int fd)
-{
-       struct stat stat;
-       int ret;
-
-       ret = fstat(fd, &stat);
-       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
-}
-
-static void
-free_mem_region(struct virtio_net *dev)
-{
-       struct orig_region_map *region;
-       unsigned int idx;
-
-       if (!dev || !dev->mem)
-               return;
-
-       region = orig_region(dev->mem, dev->mem->nregions);
-       for (idx = 0; idx < dev->mem->nregions; idx++) {
-               if (region[idx].mapped_address) {
-                       munmap((void *)(uintptr_t)region[idx].mapped_address,
-                                       region[idx].mapped_size);
-                       close(region[idx].fd);
-               }
-       }
-}
-
-void
-vhost_backend_cleanup(struct virtio_net *dev)
-{
-       if (dev->mem) {
-               free_mem_region(dev);
-               free(dev->mem);
-               dev->mem = NULL;
-       }
-       if (dev->log_addr) {
-               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
-               dev->log_addr = 0;
-       }
-}
-
-int
-user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
-{
-       struct VhostUserMemory memory = pmsg->payload.memory;
-       struct virtio_memory_regions *pregion;
-       uint64_t mapped_address, mapped_size;
-       struct virtio_net *dev;
-       unsigned int idx = 0;
-       struct orig_region_map *pregion_orig;
-       uint64_t alignment;
-
-       /* unmap old memory regions one by one*/
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       /* Remove from the data plane. */
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       if (dev->mem) {
-               free_mem_region(dev);
-               free(dev->mem);
-               dev->mem = NULL;
-       }
-
-       dev->mem = calloc(1,
-               sizeof(struct virtio_memory) +
-               sizeof(struct virtio_memory_regions) * memory.nregions +
-               sizeof(struct orig_region_map) * memory.nregions);
-       if (dev->mem == NULL) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "(%d) failed to allocate memory for dev->mem\n",
-                       dev->vid);
-               return -1;
-       }
-       dev->mem->nregions = memory.nregions;
-
-       pregion_orig = orig_region(dev->mem, memory.nregions);
-       for (idx = 0; idx < memory.nregions; idx++) {
-               pregion = &dev->mem->regions[idx];
-               pregion->guest_phys_address =
-                       memory.regions[idx].guest_phys_addr;
-               pregion->guest_phys_address_end =
-                       memory.regions[idx].guest_phys_addr +
-                       memory.regions[idx].memory_size;
-               pregion->memory_size =
-                       memory.regions[idx].memory_size;
-               pregion->userspace_address =
-                       memory.regions[idx].userspace_addr;
-
-               /* This is ugly */
-               mapped_size = memory.regions[idx].memory_size +
-                       memory.regions[idx].mmap_offset;
-
-               /* mmap() without flag of MAP_ANONYMOUS, should be called
-                * with length argument aligned with hugepagesz at older
-                * longterm version Linux, like 2.6.32 and 3.2.72, or
-                * mmap() will fail with EINVAL.
-                *
-                * to avoid failure, make sure in caller to keep length
-                * aligned.
-                */
-               alignment = get_blk_size(pmsg->fds[idx]);
-               if (alignment == (uint64_t)-1) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "couldn't get hugepage size through fstat\n");
-                       goto err_mmap;
-               }
-               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
-
-               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
-                       mapped_size,
-                       PROT_READ | PROT_WRITE, MAP_SHARED,
-                       pmsg->fds[idx],
-                       0);
-
-               RTE_LOG(INFO, VHOST_CONFIG,
-                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
-                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
-                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
-                       mapped_size, memory.regions[idx].mmap_offset,
-                       alignment);
-
-               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
-                       RTE_LOG(ERR, VHOST_CONFIG,
-                               "mmap qemu guest failed.\n");
-                       goto err_mmap;
-               }
-
-               pregion_orig[idx].mapped_address = mapped_address;
-               pregion_orig[idx].mapped_size = mapped_size;
-               pregion_orig[idx].blksz = alignment;
-               pregion_orig[idx].fd = pmsg->fds[idx];
-
-               mapped_address +=  memory.regions[idx].mmap_offset;
-
-               pregion->address_offset = mapped_address -
-                       pregion->guest_phys_address;
-
-               if (memory.regions[idx].guest_phys_addr == 0) {
-                       dev->mem->base_address =
-                               memory.regions[idx].userspace_addr;
-                       dev->mem->mapped_address =
-                               pregion->address_offset;
-               }
-
-               LOG_DEBUG(VHOST_CONFIG,
-                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
-                       idx,
-                       (void *)(uintptr_t)pregion->guest_phys_address,
-                       (void *)(uintptr_t)pregion->userspace_address,
-                        pregion->memory_size);
-       }
-
-       return 0;
-
-err_mmap:
-       while (idx--) {
-               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
-                               pregion_orig[idx].mapped_size);
-               close(pregion_orig[idx].fd);
-       }
-       free(dev->mem);
-       dev->mem = NULL;
-       return -1;
-}
-
-static int
-vq_is_ready(struct vhost_virtqueue *vq)
-{
-       return vq && vq->desc   &&
-              vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
-              vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
-}
-
-static int
-virtio_is_ready(struct virtio_net *dev)
-{
-       struct vhost_virtqueue *rvq, *tvq;
-       uint32_t i;
-
-       for (i = 0; i < dev->virt_qp_nb; i++) {
-               rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
-               tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
-
-               if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
-                       RTE_LOG(INFO, VHOST_CONFIG,
-                               "virtio is not ready for processing.\n");
-                       return 0;
-               }
-       }
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "virtio is now ready for processing.\n");
-       return 1;
-}
-
-void
-user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
-{
-       struct vhost_vring_file file;
-
-       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
-       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
-               file.fd = VIRTIO_INVALID_EVENTFD;
-       else
-               file.fd = pmsg->fds[0];
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring call idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_call(vid, &file);
-}
-
-
-/*
- *  In vhost-user, when we receive kick message, will test whether virtio
- *  device is ready for packet processing.
- */
-void
-user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
-{
-       struct vhost_vring_file file;
-       struct virtio_net *dev = get_device(vid);
-
-       if (!dev)
-               return;
-
-       file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
-       if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
-               file.fd = VIRTIO_INVALID_EVENTFD;
-       else
-               file.fd = pmsg->fds[0];
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring kick idx:%d file:%d\n", file.index, file.fd);
-       vhost_set_vring_kick(vid, &file);
-
-       if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
-               if (notify_ops->new_device(vid) == 0)
-                       dev->flags |= VIRTIO_DEV_RUNNING;
-       }
-}
-
-/*
- * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
- */
-int
-user_get_vring_base(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev = get_device(vid);
-
-       if (dev == NULL)
-               return -1;
-       /* We have to stop the queue (virtio) if it is running. */
-       if (dev->flags & VIRTIO_DEV_RUNNING) {
-               dev->flags &= ~VIRTIO_DEV_RUNNING;
-               notify_ops->destroy_device(vid);
-       }
-
-       /* Here we are safe to get the last used index */
-       vhost_get_vring_base(vid, state->index, state);
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "vring base idx:%d file:%d\n", state->index, state->num);
-       /*
-        * Based on current qemu vhost-user implementation, this message is
-        * sent and only sent in vhost_vring_stop.
-        * TODO: cleanup the vring, it isn't usable since here.
-        */
-       if (dev->virtqueue[state->index]->kickfd >= 0)
-               close(dev->virtqueue[state->index]->kickfd);
-
-       dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
-       return 0;
-}
-
-/*
- * when virtio queues are ready to work, qemu will send us to
- * enable the virtio queue pair.
- */
-int
-user_set_vring_enable(int vid, struct vhost_vring_state *state)
-{
-       struct virtio_net *dev;
-       int enable = (int)state->num;
-
-       dev = get_device(vid);
-       if (dev == NULL)
-               return -1;
-
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "set queue enable: %d to qp idx: %d\n",
-               enable, state->index);
-
-       if (notify_ops->vring_state_changed)
-               notify_ops->vring_state_changed(vid, state->index, enable);
-
-       dev->virtqueue[state->index]->enabled = enable;
-
-       return 0;
-}
-
-void
-user_set_protocol_features(int vid, uint64_t protocol_features)
-{
-       struct virtio_net *dev;
-
-       dev = get_device(vid);
-       if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
-               return;
-
-       dev->protocol_features = protocol_features;
-}
-
-int
-user_set_log_base(int vid, struct VhostUserMsg *msg)
-{
-       struct virtio_net *dev;
-       int fd = msg->fds[0];
-       uint64_t size, off;
-       void *addr;
-
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
-       if (fd < 0) {
-               RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
-               return -1;
-       }
-
-       if (msg->size != sizeof(VhostUserLog)) {
-               RTE_LOG(ERR, VHOST_CONFIG,
-                       "invalid log base msg size: %"PRId32" != %d\n",
-                       msg->size, (int)sizeof(VhostUserLog));
-               return -1;
-       }
-
-       size = msg->payload.log.mmap_size;
-       off  = msg->payload.log.mmap_offset;
-       RTE_LOG(INFO, VHOST_CONFIG,
-               "log mmap size: %"PRId64", offset: %"PRId64"\n",
-               size, off);
-
-       /*
-        * mmap from 0 to workaround a hugepage mmap bug: mmap will
-        * fail when offset is not page size aligned.
-        */
-       addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-       close(fd);
-       if (addr == MAP_FAILED) {
-               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
-               return -1;
-       }
-
-       /*
-        * Free previously mapped log memory on occasionally
-        * multiple VHOST_USER_SET_LOG_BASE.
-        */
-       if (dev->log_addr) {
-               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
-       }
-       dev->log_addr = (uint64_t)(uintptr_t)addr;
-       dev->log_base = dev->log_addr + off;
-       dev->log_size = size;
-
-       return 0;
-}
-
-/*
- * An rarp packet is constructed and broadcasted to notify switches about
- * the new location of the migrated VM, so that packets from outside will
- * not be lost after migration.
- *
- * However, we don't actually "send" a rarp packet here, instead, we set
- * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
- */
-int
-user_send_rarp(int vid, struct VhostUserMsg *msg)
-{
-       struct virtio_net *dev;
-       uint8_t *mac = (uint8_t *)&msg->payload.u64;
-
-       dev = get_device(vid);
-       if (!dev)
-               return -1;
-
-       RTE_LOG(DEBUG, VHOST_CONFIG,
-               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
-               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
-       memcpy(dev->mac.addr_bytes, mac, 6);
-
-       /*
-        * Set the flag to inject a RARP broadcast packet at
-        * rte_vhost_dequeue_burst().
-        *
-        * rte_smp_wmb() is for making sure the mac is copied
-        * before the flag is set.
-        */
-       rte_smp_wmb();
-       rte_atomic16_set(&dev->broadcast_rarp, 1);
-
-       return 0;
-}
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
deleted file mode 100644 (file)
index e1b967b..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VIRTIO_NET_USER_H
-#define _VIRTIO_NET_USER_H
-
-#include "vhost-net.h"
-#include "vhost-net-user.h"
-
-#define VHOST_USER_PROTOCOL_F_MQ       0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD        1
-#define VHOST_USER_PROTOCOL_F_RARP     2
-
-#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
-                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP))
-
-int user_set_mem_table(int, struct VhostUserMsg *);
-
-void user_set_vring_call(int, struct VhostUserMsg *);
-
-void user_set_vring_kick(int, struct VhostUserMsg *);
-
-void user_set_protocol_features(int vid, uint64_t protocol_features);
-int user_set_log_base(int vid, struct VhostUserMsg *);
-int user_send_rarp(int vid, struct VhostUserMsg *);
-
-int user_get_vring_base(int, struct vhost_vring_state *);
-
-int user_set_vring_enable(int vid, struct vhost_vring_state *state);
-
-#endif
diff --git a/lib/librte_vhost/virtio-net-user.c b/lib/librte_vhost/virtio-net-user.c
new file mode 100644 (file)
index 0000000..e7c4347
--- /dev/null
@@ -0,0 +1,470 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "virtio-net-user.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+
+struct orig_region_map { /* per-region record of the raw mmap() made by user_set_mem_table() */
+       int fd; /* descriptor the region was mapped from; closed when the region is released */
+       uint64_t mapped_address; /* address returned by mmap(), before mmap_offset is added */
+       uint64_t mapped_size; /* length handed to mmap(), rounded up to blksz */
+       uint64_t blksz; /* st_blksize that fstat() reported for fd */
+};
+
+#define orig_region(ptr, nregions) \
+       ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
+               sizeof(struct virtio_memory) + \
+               sizeof(struct virtio_memory_regions) * (nregions))) /* -> orig_region_map[] stored after the nregions region entries */
+
+/*
+ * Return the st_blksize that fstat() reports for 'fd', or (uint64_t)-1
+ * when fstat() fails. user_set_mem_table() uses the value as the mmap()
+ * length alignment.
+ */
+static uint64_t
+get_blk_size(int fd)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) == -1)
+               return (uint64_t)-1;
+
+       return (uint64_t)st.st_blksize;
+}
+
+/*
+ * Undo every region mapping recorded behind dev->mem: munmap() the area
+ * and close the descriptor it was mapped from. Entries whose
+ * mapped_address is still zero were never mapped and are skipped.
+ * The dev->mem allocation itself is left for the caller to free.
+ */
+static void
+free_mem_region(struct virtio_net *dev)
+{
+       struct orig_region_map *map;
+       unsigned int i;
+
+       if (dev == NULL || dev->mem == NULL)
+               return;
+
+       map = orig_region(dev->mem, dev->mem->nregions);
+       for (i = 0; i < dev->mem->nregions; i++, map++) {
+               if (map->mapped_address == 0)
+                       continue;
+
+               munmap((void *)(uintptr_t)map->mapped_address,
+                               map->mapped_size);
+               close(map->fd);
+       }
+}
+
+/*
+ * Release what the backend attached to 'dev': the guest memory region
+ * mappings together with the dev->mem table, and the mapping recorded in
+ * dev->log_addr when one is installed. Both fields are cleared so the
+ * function can safely run again.
+ */
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+       if (dev->mem != NULL) {
+               free_mem_region(dev);
+               free(dev->mem);
+               dev->mem = NULL;
+       }
+
+       if (dev->log_addr != 0) {
+               void *log_map = (void *)(uintptr_t)dev->log_addr;
+
+               munmap(log_map, dev->log_size);
+               dev->log_addr = 0;
+       }
+}
+
+/*
+ * Replace the guest memory table of device 'vid' with the regions
+ * described in pmsg->payload.memory, mmap()ing region i from pmsg->fds[i].
+ *
+ * A running device is first taken off the data plane through
+ * notify_ops->destroy_device(), and any existing table is unmapped and
+ * freed. For every new region the raw mapping (address, length, block
+ * size, descriptor) is recorded in the orig_region_map array that trails
+ * the region entries, so free_mem_region() can undo it later.
+ *
+ * Returns 0 on success. Returns -1 when the device is unknown, the region
+ * count exceeds VHOST_MEMORY_MAX_NREGIONS, the table cannot be allocated,
+ * or a region cannot be mapped; in the last case the regions mapped so
+ * far are unmapped, their descriptors closed and dev->mem reset to NULL.
+ */
+int
+user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
+{
+       struct VhostUserMemory memory = pmsg->payload.memory;
+       struct virtio_memory_regions *pregion;
+       uint64_t mapped_address, mapped_size;
+       struct virtio_net *dev;
+       unsigned int idx = 0;
+       struct orig_region_map *pregion_orig;
+       uint64_t alignment;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /*
+        * nregions is taken from the message as-is. memory.regions[] and
+        * pmsg->fds[] hold VHOST_MEMORY_MAX_NREGIONS entries each, so a
+        * larger count must be refused before it is used as a loop bound,
+        * and before the current table is torn down.
+        */
+       if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) too many memory regions: %"PRIu32"\n",
+                       dev->vid, memory.nregions);
+               return -1;
+       }
+
+       /* Remove from the data plane. */
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       /* Unmap and free the previous table, if there is one. */
+       if (dev->mem) {
+               free_mem_region(dev);
+               free(dev->mem);
+               dev->mem = NULL;
+       }
+
+       /*
+        * One allocation holds the table header, nregions region entries
+        * and, after them, nregions orig_region_map records.
+        */
+       dev->mem = calloc(1,
+               sizeof(struct virtio_memory) +
+               sizeof(struct virtio_memory_regions) * memory.nregions +
+               sizeof(struct orig_region_map) * memory.nregions);
+       if (dev->mem == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "(%d) failed to allocate memory for dev->mem\n",
+                       dev->vid);
+               return -1;
+       }
+       dev->mem->nregions = memory.nregions;
+
+       pregion_orig = orig_region(dev->mem, memory.nregions);
+       for (idx = 0; idx < memory.nregions; idx++) {
+               pregion = &dev->mem->regions[idx];
+               pregion->guest_phys_address =
+                       memory.regions[idx].guest_phys_addr;
+               pregion->guest_phys_address_end =
+                       memory.regions[idx].guest_phys_addr +
+                       memory.regions[idx].memory_size;
+               pregion->memory_size =
+                       memory.regions[idx].memory_size;
+               pregion->userspace_address =
+                       memory.regions[idx].userspace_addr;
+
+               /*
+                * The file is mapped from offset 0 (see the mmap() call
+                * below), so the length has to cover mmap_offset plus the
+                * region itself.
+                */
+               mapped_size = memory.regions[idx].memory_size +
+                       memory.regions[idx].mmap_offset;
+               if (mapped_size < memory.regions[idx].memory_size) {
+                       /* 64-bit wrap-around: size + offset is bogus. */
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "region %u: size plus offset overflows\n",
+                               idx);
+                       goto err_mmap;
+               }
+
+               /* mmap() without flag of MAP_ANONYMOUS, should be called
+                * with length argument aligned with hugepagesz at older
+                * longterm version Linux, like 2.6.32 and 3.2.72, or
+                * mmap() will fail with EINVAL.
+                *
+                * to avoid failure, make sure in caller to keep length
+                * aligned.
+                */
+               alignment = get_blk_size(pmsg->fds[idx]);
+               if (alignment == (uint64_t)-1) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "couldn't get hugepage size through fstat\n");
+                       goto err_mmap;
+               }
+               mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
+
+               mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
+                       mapped_size,
+                       PROT_READ | PROT_WRITE, MAP_SHARED,
+                       pmsg->fds[idx],
+                       0);
+
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
+                       "off:0x%"PRIx64" align:0x%"PRIx64"\n",
+                       idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
+                       mapped_size, memory.regions[idx].mmap_offset,
+                       alignment);
+
+               if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+                       RTE_LOG(ERR, VHOST_CONFIG,
+                               "mmap qemu guest failed.\n");
+                       goto err_mmap;
+               }
+
+               /* Remember the raw mapping so it can be undone later. */
+               pregion_orig[idx].mapped_address = mapped_address;
+               pregion_orig[idx].mapped_size = mapped_size;
+               pregion_orig[idx].blksz = alignment;
+               pregion_orig[idx].fd = pmsg->fds[idx];
+
+               /* From here on mapped_address points at the region data. */
+               mapped_address +=  memory.regions[idx].mmap_offset;
+
+               /* Adding address_offset to a guest physical address of
+                * this region yields the matching address in our mapping.
+                */
+               pregion->address_offset = mapped_address -
+                       pregion->guest_phys_address;
+
+               /* The region starting at guest physical address 0 also
+                * supplies the table-wide base/mapped addresses.
+                */
+               if (memory.regions[idx].guest_phys_addr == 0) {
+                       dev->mem->base_address =
+                               memory.regions[idx].userspace_addr;
+                       dev->mem->mapped_address =
+                               pregion->address_offset;
+               }
+
+               LOG_DEBUG(VHOST_CONFIG,
+                       "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
+                       idx,
+                       (void *)(uintptr_t)pregion->guest_phys_address,
+                       (void *)(uintptr_t)pregion->userspace_address,
+                        pregion->memory_size);
+       }
+
+       return 0;
+
+err_mmap:
+       /*
+        * Roll back the regions that were mapped before the failing one.
+        * NOTE(review): the descriptors of the failing and the remaining
+        * regions are not closed here; confirm whether the caller owns
+        * them on failure.
+        */
+       while (idx--) {
+               munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
+                               pregion_orig[idx].mapped_size);
+               close(pregion_orig[idx].fd);
+       }
+       free(dev->mem);
+       dev->mem = NULL;
+       return -1;
+}
+
+/*
+ * A virtqueue counts as ready once it exists, has a descriptor table
+ * and both its kick and call descriptors have left the
+ * VIRTIO_UNINITIALIZED_EVENTFD state. Returns 1 when ready, 0 otherwise.
+ */
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+       if (!vq || !vq->desc)
+               return 0;
+
+       if (vq->kickfd == VIRTIO_UNINITIALIZED_EVENTFD)
+               return 0;
+
+       return vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
+}
+
+/*
+ * Check every queue pair counted by dev->virt_qp_nb: the device is ready
+ * only when both the RX and the TX virtqueue of each pair are ready.
+ * Logs the outcome and returns 1 (ready) or 0 (not ready).
+ */
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+       uint32_t qp;
+
+       for (qp = 0; qp < dev->virt_qp_nb; qp++) {
+               uint32_t base = qp * VIRTIO_QNUM;
+
+               if (vq_is_ready(dev->virtqueue[base + VIRTIO_RXQ]) &&
+                   vq_is_ready(dev->virtqueue[base + VIRTIO_TXQ]))
+                       continue;
+
+               RTE_LOG(INFO, VHOST_CONFIG,
+                       "virtio is not ready for processing.\n");
+               return 0;
+       }
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "virtio is now ready for processing.\n");
+       return 1;
+}
+
+/*
+ * Decode a vring "call" request and hand it to vhost_set_vring_call().
+ * The low bits of payload.u64 select the vring; when the NOFD bit is set
+ * no descriptor accompanies the message and VIRTIO_INVALID_EVENTFD is
+ * passed on instead of pmsg->fds[0].
+ */
+void
+user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
+{
+       uint64_t arg = pmsg->payload.u64;
+       struct vhost_vring_file file;
+
+       file.index = arg & VHOST_USER_VRING_IDX_MASK;
+       file.fd = (arg & VHOST_USER_VRING_NOFD_MASK) ?
+               VIRTIO_INVALID_EVENTFD : pmsg->fds[0];
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring call idx:%d file:%d\n", file.index, file.fd);
+       vhost_set_vring_call(vid, &file);
+}
+
+
+/*
+ * Decode a vring "kick" request and hand it to vhost_set_vring_kick().
+ *
+ * The kick message doubles as the trigger for start-up: once it has been
+ * applied, the device is checked for readiness and, if it is ready and
+ * not yet running, announced through notify_ops->new_device(). The
+ * RUNNING flag is only set when that callback returns 0.
+ */
+void
+user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
+{
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_vring_file file;
+       uint64_t arg;
+
+       if (dev == NULL)
+               return;
+
+       arg = pmsg->payload.u64;
+       file.index = arg & VHOST_USER_VRING_IDX_MASK;
+       file.fd = (arg & VHOST_USER_VRING_NOFD_MASK) ?
+               VIRTIO_INVALID_EVENTFD : pmsg->fds[0];
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring kick idx:%d file:%d\n", file.index, file.fd);
+       vhost_set_vring_kick(vid, &file);
+
+       /* Readiness is evaluated (and logged) before the flag test. */
+       if (!virtio_is_ready(dev))
+               return;
+       if (dev->flags & VIRTIO_DEV_RUNNING)
+               return;
+
+       if (notify_ops->new_device(vid) == 0)
+               dev->flags |= VIRTIO_DEV_RUNNING;
+}
+
+/*
+ * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
+ *
+ * Stops the device if it is running, fetches the vring base for
+ * state->index through vhost_get_vring_base(), then closes the vring's
+ * kick descriptor and marks it uninitialized.
+ *
+ * Returns 0 on success, -1 when the device is unknown or the addressed
+ * virtqueue has not been set up.
+ */
+int
+user_get_vring_base(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev = get_device(vid);
+       struct vhost_virtqueue *vq;
+
+       if (dev == NULL)
+               return -1;
+
+       /*
+        * Slots of dev->virtqueue[] may be NULL (vq_is_ready() tests for
+        * it), so refuse a vring that was never set up instead of
+        * dereferencing it below.
+        * NOTE(review): the upper bound of state->index is not checked
+        * here; confirm that the caller validates it against the size of
+        * dev->virtqueue[].
+        */
+       if (dev->virtqueue[state->index] == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "vring base requested for unallocated vring idx:%d\n",
+                       state->index);
+               return -1;
+       }
+
+       /* We have to stop the queue (virtio) if it is running. */
+       if (dev->flags & VIRTIO_DEV_RUNNING) {
+               dev->flags &= ~VIRTIO_DEV_RUNNING;
+               notify_ops->destroy_device(vid);
+       }
+
+       /* Here we are safe to get the last used index */
+       vhost_get_vring_base(vid, state->index, state);
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "vring base idx:%d file:%d\n", state->index, state->num);
+       /*
+        * Based on current qemu vhost-user implementation, this message is
+        * sent and only sent in vhost_vring_stop.
+        * TODO: cleanup the vring, it isn't usable since here.
+        */
+       vq = dev->virtqueue[state->index];
+       if (vq->kickfd >= 0)
+               close(vq->kickfd);
+
+       vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+       return 0;
+}
+
+/*
+ * when virtio queues are ready to work, qemu will send us to
+ * enable the virtio queue pair.
+ *
+ * state->index selects the vring and state->num carries the new enable
+ * value. The optional notify_ops->vring_state_changed() callback is
+ * invoked before the value is stored in the virtqueue.
+ *
+ * Returns 0 on success, -1 when the device is unknown or the addressed
+ * virtqueue has not been set up.
+ */
+int
+user_set_vring_enable(int vid, struct vhost_vring_state *state)
+{
+       struct virtio_net *dev;
+       int enable = (int)state->num;
+
+       dev = get_device(vid);
+       if (dev == NULL)
+               return -1;
+
+       /*
+        * Slots of dev->virtqueue[] may be NULL (vq_is_ready() tests for
+        * it); bail out before notifying the application about a vring
+        * that does not exist.
+        * NOTE(review): the upper bound of state->index is not checked
+        * here; confirm that the caller validates it against the size of
+        * dev->virtqueue[].
+        */
+       if (dev->virtqueue[state->index] == NULL) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "cannot set enable state of unallocated vring idx:%d\n",
+                       state->index);
+               return -1;
+       }
+
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "set queue enable: %d to qp idx: %d\n",
+               enable, state->index);
+
+       if (notify_ops->vring_state_changed)
+               notify_ops->vring_state_changed(vid, state->index, enable);
+
+       dev->virtqueue[state->index]->enabled = enable;
+
+       return 0;
+}
+
+/*
+ * Store the negotiated protocol feature bits on device 'vid'. The request
+ * is silently ignored when the device is unknown or when any bit outside
+ * VHOST_USER_PROTOCOL_FEATURES is set.
+ */
+void
+user_set_protocol_features(int vid, uint64_t protocol_features)
+{
+       struct virtio_net *dev = get_device(vid);
+
+       if (dev == NULL)
+               return;
+
+       if ((protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) != 0)
+               return;
+
+       dev->protocol_features = protocol_features;
+}
+
+/*
+ * Map the log area described by 'msg' and install it on device 'vid'.
+ *
+ * msg->fds[0] is mmap()ed from file offset 0 for payload.log.mmap_size
+ * bytes; dev->log_base is placed payload.log.mmap_offset bytes into that
+ * mapping. A log mapping installed earlier is released once the new one
+ * is in place.
+ *
+ * Returns 0 on success. Returns -1 when the device is unknown, the
+ * descriptor is invalid, the payload size is not sizeof(VhostUserLog),
+ * the offset does not fall inside the mapped size, or mmap() fails.
+ * A valid descriptor is closed on every path, success or failure.
+ */
+int
+user_set_log_base(int vid, struct VhostUserMsg *msg)
+{
+       struct virtio_net *dev;
+       int fd = msg->fds[0];
+       uint64_t size, off;
+       void *addr;
+
+       dev = get_device(vid);
+       if (!dev) {
+               /* Nobody else will close the descriptor we were handed. */
+               if (fd >= 0)
+                       close(fd);
+               return -1;
+       }
+
+       if (fd < 0) {
+               RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+               return -1;
+       }
+
+       if (msg->size != sizeof(VhostUserLog)) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "invalid log base msg size: %"PRIu32" != %d\n",
+                       msg->size, (int)sizeof(VhostUserLog));
+               goto err_close;
+       }
+
+       size = msg->payload.log.mmap_size;
+       off  = msg->payload.log.mmap_offset;
+       RTE_LOG(INFO, VHOST_CONFIG,
+               "log mmap size: %"PRIu64", offset: %"PRIu64"\n",
+               size, off);
+
+       /*
+        * Only 'size' bytes are mapped below and log_base is set 'off'
+        * bytes into them; an offset at or past the end would leave
+        * log_base pointing outside the mapping.
+        */
+       if (off >= size) {
+               RTE_LOG(ERR, VHOST_CONFIG,
+                       "log mmap offset is outside the mapped area\n");
+               goto err_close;
+       }
+
+       /*
+        * mmap from 0 to workaround a hugepage mmap bug: mmap will
+        * fail when offset is not page size aligned.
+        */
+       addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       /* The mapping, if any, stays valid after the descriptor is closed. */
+       close(fd);
+       if (addr == MAP_FAILED) {
+               RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+               return -1;
+       }
+
+       /*
+        * Free previously mapped log memory on occasionally
+        * multiple VHOST_USER_SET_LOG_BASE.
+        */
+       if (dev->log_addr) {
+               munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+       }
+       dev->log_addr = (uint64_t)(uintptr_t)addr;
+       dev->log_base = dev->log_addr + off;
+       dev->log_size = size;
+
+       return 0;
+
+err_close:
+       close(fd);
+       return -1;
+}
+
+/*
+ * After a migration a RARP packet is broadcast so that switches learn
+ * the new location of the VM and outside traffic is not lost.
+ *
+ * Nothing is transmitted from here: the MAC address carried in the
+ * message payload is copied into the device and the 'broadcast_rarp'
+ * flag is raised, leaving it to rte_vhost_dequeue_burst() to inject the
+ * packet.
+ *
+ * Returns 0 on success, -1 when the device is unknown.
+ */
+int
+user_send_rarp(int vid, struct VhostUserMsg *msg)
+{
+       struct virtio_net *dev = get_device(vid);
+       uint8_t *mac;
+
+       if (dev == NULL)
+               return -1;
+
+       /* The first six payload bytes are the MAC address. */
+       mac = (uint8_t *)&msg->payload.u64;
+
+       RTE_LOG(DEBUG, VHOST_CONFIG,
+               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+       memcpy(dev->mac.addr_bytes, mac, 6);
+
+       /*
+        * The write barrier orders the MAC copy ahead of the flag store,
+        * so whoever observes the flag set also finds the address in
+        * place.
+        */
+       rte_smp_wmb();
+       rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+       return 0;
+}
diff --git a/lib/librte_vhost/virtio-net-user.h b/lib/librte_vhost/virtio-net-user.h
new file mode 100644 (file)
index 0000000..e1b967b
--- /dev/null
@@ -0,0 +1,62 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_NET_USER_H
+#define _VIRTIO_NET_USER_H
+
+#include "vhost-net.h"
+#include "vhost-net-user.h"
+
+/* Bit positions of the individual vhost-user protocol features. */
+#define VHOST_USER_PROTOCOL_F_MQ       0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD        1
+#define VHOST_USER_PROTOCOL_F_RARP     2
+
+/* Bitmask with every protocol feature bit defined above set. */
+#define VHOST_USER_PROTOCOL_FEATURES \
+       ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
+        (1ULL << VHOST_USER_PROTOCOL_F_RARP))
+
+/*
+ * vhost-user message handlers. 'vid' is the id of the vhost device the
+ * message is addressed to.
+ *
+ * NOTE(review): the handlers without a comment of their own are
+ * implemented outside the reviewed section; presumably each serves the
+ * vhost-user request it is named after -- confirm in virtio-net-user.c.
+ */
+int user_set_mem_table(int vid, struct VhostUserMsg *msg);
+
+void user_set_vring_call(int vid, struct VhostUserMsg *msg);
+
+void user_set_vring_kick(int vid, struct VhostUserMsg *msg);
+
+void user_set_protocol_features(int vid, uint64_t protocol_features);
+
+/*
+ * Map the dirty-log area passed with VHOST_USER_SET_LOG_BASE, replacing
+ * any earlier mapping. Returns 0 on success and -1 on failure (e.g. when
+ * the mmap() of the log area fails).
+ */
+int user_set_log_base(int vid, struct VhostUserMsg *msg);
+
+/*
+ * Save the MAC address carried by the message and raise the device's
+ * 'broadcast_rarp' flag. Returns 0 on success, -1 if 'vid' does not map
+ * to a device.
+ */
+int user_send_rarp(int vid, struct VhostUserMsg *msg);
+
+int user_get_vring_base(int vid, struct vhost_vring_state *state);
+
+int user_set_vring_enable(int vid, struct vhost_vring_state *state);
+
+#endif /* _VIRTIO_NET_USER_H */