endif
# all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net-user.c
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += fd_man.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c socket.c vhost.c vhost_user.c \
+ virtio_net.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+ char *path;
+ int listenfd;
+ int connfd;
+ bool is_server;
+ bool reconnect;
+};
+
+struct vhost_user_connection {
+ struct vhost_user_socket *vsocket;
+ int vid;
+};
+
+#define MAX_VHOST_SOCKET 1024
+struct vhost_user {
+ struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+ struct fdset fdset;
+ int vsocket_cnt;
+ pthread_mutex_t mutex;
+};
+
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int vhost_user_create_client(struct vhost_user_socket *vsocket);
+
+static struct vhost_user vhost_user = {
+ .fdset = {
+ .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+ .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .num = 0
+ },
+ .vsocket_cnt = 0,
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/* return bytes# of read on success or negative val on failure. */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(sockfd, &msgh, 0);
+ if (ret <= 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+ return ret;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
+ return -1;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+
+ if (fds && fd_num > 0) {
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fdsize);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fdsize);
+ } else {
+ msgh.msg_control = NULL;
+ msgh.msg_controllen = 0;
+ }
+
+ do {
+ ret = sendmsg(sockfd, &msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
+ return ret;
+ }
+
+ return ret;
+}
+
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+ int vid;
+ size_t size;
+ struct vhost_user_connection *conn;
+ int ret;
+
+ conn = malloc(sizeof(*conn));
+ if (conn == NULL) {
+ close(fd);
+ return;
+ }
+
+ vid = vhost_new_device();
+ if (vid == -1) {
+ close(fd);
+ free(conn);
+ return;
+ }
+
+ size = strnlen(vsocket->path, PATH_MAX);
+ vhost_set_ifname(vid, vsocket->path, size);
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+ vsocket->connfd = fd;
+ conn->vsocket = vsocket;
+ conn->vid = vid;
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+ NULL, conn);
+ if (ret < 0) {
+ vsocket->connfd = -1;
+ free(conn);
+ close(fd);
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add fd %d into vhost server fdset\n",
+ fd);
+ }
+}
+
+/* call back when there is new vhost-user connection from client */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+ struct vhost_user_socket *vsocket = dat;
+
+ fd = accept(fd, NULL, NULL);
+ if (fd < 0)
+ return;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
+ vhost_user_add_connection(fd, vsocket);
+}
+
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+ struct vhost_user_connection *conn = dat;
+ struct vhost_user_socket *vsocket = conn->vsocket;
+ int ret;
+
+ ret = vhost_user_msg_handler(conn->vid, connfd);
+ if (ret < 0) {
+ vsocket->connfd = -1;
+ close(connfd);
+ *remove = 1;
+ free(conn);
+ vhost_destroy_device(conn->vid);
+
+ if (vsocket->reconnect)
+ vhost_user_create_client(vsocket);
+ }
+}
+
+static int
+create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
+{
+ int fd;
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -1;
+ RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+ is_server ? "server" : "client", fd);
+
+ if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost-user: can't set nonblocking mode for socket, fd: "
+ "%d (%s)\n", fd, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ memset(un, 0, sizeof(*un));
+ un->sun_family = AF_UNIX;
+ strncpy(un->sun_path, path, sizeof(un->sun_path));
+ un->sun_path[sizeof(un->sun_path) - 1] = '\0';
+
+ return fd;
+}
+
+static int
+vhost_user_create_server(struct vhost_user_socket *vsocket)
+{
+ int fd;
+ int ret;
+ struct sockaddr_un un;
+ const char *path = vsocket->path;
+
+ fd = create_unix_socket(path, &un, vsocket->is_server);
+ if (fd < 0)
+ return -1;
+
+ ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to bind to %s: %s; remove it and try again\n",
+ path, strerror(errno));
+ goto err;
+ }
+ RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+ ret = listen(fd, MAX_VIRTIO_BACKLOG);
+ if (ret < 0)
+ goto err;
+
+ vsocket->listenfd = fd;
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
+ NULL, vsocket);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add listen fd %d to vhost server fdset\n",
+ fd);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ close(fd);
+ return -1;
+}
+
+struct vhost_user_reconnect {
+ struct sockaddr_un un;
+ int fd;
+ struct vhost_user_socket *vsocket;
+
+ TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+ struct vhost_user_reconnect_tailq_list head;
+ pthread_mutex_t mutex;
+};
+
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+ int ret, flags;
+
+ ret = connect(fd, un, sz);
+ if (ret < 0 && errno != EISCONN)
+ return -1;
+
+ flags = fcntl(fd, F_GETFL, 0);
+ if (flags < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't get flags for connfd %d\n", fd);
+ return -2;
+ }
+ if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't disable nonblocking on fd %d\n", fd);
+ return -2;
+ }
+ return 0;
+}
+
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+ int ret;
+ struct vhost_user_reconnect *reconn, *next;
+
+ while (1) {
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ /*
+ * An equal implementation of TAILQ_FOREACH_SAFE,
+ * which does not exist on all platforms.
+ */
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ ret = vhost_user_connect_nonblock(reconn->fd,
+ (struct sockaddr *)&reconn->un,
+ sizeof(reconn->un));
+ if (ret == -2) {
+ close(reconn->fd);
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "reconnection for fd %d failed\n",
+ reconn->fd);
+ goto remove_fd;
+ }
+ if (ret == -1)
+ continue;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "%s: connected\n", reconn->vsocket->path);
+ vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ free(reconn);
+ }
+
+ pthread_mutex_unlock(&reconn_list.mutex);
+ sleep(1);
+ }
+
+ return NULL;
+}
+
+static int
+vhost_user_reconnect_init(void)
+{
+ int ret;
+
+ pthread_mutex_init(&reconn_list.mutex, NULL);
+ TAILQ_INIT(&reconn_list.head);
+
+ ret = pthread_create(&reconn_tid, NULL,
+ vhost_user_client_reconnect, NULL);
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
+
+ return ret;
+}
+
+static int
+vhost_user_create_client(struct vhost_user_socket *vsocket)
+{
+ int fd;
+ int ret;
+ struct sockaddr_un un;
+ const char *path = vsocket->path;
+ struct vhost_user_reconnect *reconn;
+
+ fd = create_unix_socket(path, &un, vsocket->is_server);
+ if (fd < 0)
+ return -1;
+
+ ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
+ sizeof(un));
+ if (ret == 0) {
+ vhost_user_add_connection(fd, vsocket);
+ return 0;
+ }
+
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to connect to %s: %s\n",
+ path, strerror(errno));
+
+ if (ret == -2 || !vsocket->reconnect) {
+ close(fd);
+ return -1;
+ }
+
+ RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
+ reconn = malloc(sizeof(*reconn));
+ if (reconn == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for reconnect\n");
+ close(fd);
+ return -1;
+ }
+ reconn->un = un;
+ reconn->fd = fd;
+ reconn->vsocket = vsocket;
+ pthread_mutex_lock(&reconn_list.mutex);
+ TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+ pthread_mutex_unlock(&reconn_list.mutex);
+
+ return 0;
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as server
+ * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
+ * is set.
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+ int ret = -1;
+ struct vhost_user_socket *vsocket;
+
+ if (!path)
+ return -1;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: the number of vhost sockets reaches maximum\n");
+ goto out;
+ }
+
+ vsocket = malloc(sizeof(struct vhost_user_socket));
+ if (!vsocket)
+ goto out;
+ memset(vsocket, 0, sizeof(struct vhost_user_socket));
+ vsocket->path = strdup(path);
+ vsocket->connfd = -1;
+
+ if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+ vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+ if (vsocket->reconnect && reconn_tid == 0) {
+ if (vhost_user_reconnect_init() < 0) {
+ free(vsocket->path);
+ free(vsocket);
+ goto out;
+ }
+ }
+ ret = vhost_user_create_client(vsocket);
+ } else {
+ vsocket->is_server = true;
+ ret = vhost_user_create_server(vsocket);
+ }
+ if (ret < 0) {
+ free(vsocket->path);
+ free(vsocket);
+ goto out;
+ }
+
+ vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+
+out:
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return ret;
+}
+
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+ int found = false;
+ struct vhost_user_reconnect *reconn, *next;
+
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ if (reconn->vsocket == vsocket) {
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ close(reconn->fd);
+ free(reconn);
+ found = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&reconn_list.mutex);
+ return found;
+}
+
+/**
+ * Unregister the specified vhost socket
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+ int i;
+ int count;
+ struct vhost_user_connection *conn;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path)) {
+ if (vsocket->is_server) {
+ fdset_del(&vhost_user.fdset, vsocket->listenfd);
+ close(vsocket->listenfd);
+ unlink(path);
+ } else if (vsocket->reconnect) {
+ vhost_user_remove_reconnect(vsocket);
+ }
+
+ conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
+ if (conn) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "free connfd = %d for device '%s'\n",
+ vsocket->connfd, path);
+ close(vsocket->connfd);
+ vhost_destroy_device(conn->vid);
+ free(conn);
+ }
+
+ free(vsocket->path);
+ free(vsocket);
+
+ count = --vhost_user.vsocket_cnt;
+ vhost_user.vsockets[i] = vhost_user.vsockets[count];
+ vhost_user.vsockets[count] = NULL;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return 0;
+ }
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return -1;
+}
+
+int
+rte_vhost_driver_session_start(void)
+{
+ fdset_event_dispatch(&vhost_user.fdset);
+ return 0;
+}
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/queue.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-
-#include <rte_log.h>
-#include <rte_virtio_net.h>
-
-#include "fd_man.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-#include "virtio-net-user.h"
-
-/*
- * Every time rte_vhost_driver_register() is invoked, an associated
- * vhost_user_socket struct will be created.
- */
-struct vhost_user_socket {
- char *path;
- int listenfd;
- int connfd;
- bool is_server;
- bool reconnect;
-};
-
-struct vhost_user_connection {
- struct vhost_user_socket *vsocket;
- int vid;
-};
-
-#define MAX_VHOST_SOCKET 1024
-struct vhost_user {
- struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
- struct fdset fdset;
- int vsocket_cnt;
- pthread_mutex_t mutex;
-};
-
-#define MAX_VIRTIO_BACKLOG 128
-
-static void vhost_user_server_new_connection(int fd, void *data, int *remove);
-static void vhost_user_msg_handler(int fd, void *dat, int *remove);
-static int vhost_user_create_client(struct vhost_user_socket *vsocket);
-
-static struct vhost_user vhost_user = {
- .fdset = {
- .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
- .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
- .num = 0
- },
- .vsocket_cnt = 0,
- .mutex = PTHREAD_MUTEX_INITIALIZER,
-};
-
-static const char *vhost_message_str[VHOST_USER_MAX] = {
- [VHOST_USER_NONE] = "VHOST_USER_NONE",
- [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
- [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
- [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
- [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
- [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
- [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
- [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
- [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
- [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
- [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
- [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
- [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
- [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
- [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
- [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
- [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
- [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
- [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
- [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
-};
-
-/* return bytes# of read on success or negative val on failure. */
-static int
-read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
- struct iovec iov;
- struct msghdr msgh;
- size_t fdsize = fd_num * sizeof(int);
- char control[CMSG_SPACE(fdsize)];
- struct cmsghdr *cmsg;
- int ret;
-
- memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = buf;
- iov.iov_len = buflen;
-
- msgh.msg_iov = &iov;
- msgh.msg_iovlen = 1;
- msgh.msg_control = control;
- msgh.msg_controllen = sizeof(control);
-
- ret = recvmsg(sockfd, &msgh, 0);
- if (ret <= 0) {
- RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
- return ret;
- }
-
- if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
- RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
- return -1;
- }
-
- for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
- cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
- if ((cmsg->cmsg_level == SOL_SOCKET) &&
- (cmsg->cmsg_type == SCM_RIGHTS)) {
- memcpy(fds, CMSG_DATA(cmsg), fdsize);
- break;
- }
- }
-
- return ret;
-}
-
-/* return bytes# of read on success or negative val on failure. */
-static int
-read_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
- int ret;
-
- ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
- msg->fds, VHOST_MEMORY_MAX_NREGIONS);
- if (ret <= 0)
- return ret;
-
- if (msg && msg->size) {
- if (msg->size > sizeof(msg->payload)) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "invalid msg size: %d\n", msg->size);
- return -1;
- }
- ret = read(sockfd, &msg->payload, msg->size);
- if (ret <= 0)
- return ret;
- if (ret != (int)msg->size) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "read control message failed\n");
- return -1;
- }
- }
-
- return ret;
-}
-
-static int
-send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
-{
-
- struct iovec iov;
- struct msghdr msgh;
- size_t fdsize = fd_num * sizeof(int);
- char control[CMSG_SPACE(fdsize)];
- struct cmsghdr *cmsg;
- int ret;
-
- memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = buf;
- iov.iov_len = buflen;
-
- msgh.msg_iov = &iov;
- msgh.msg_iovlen = 1;
-
- if (fds && fd_num > 0) {
- msgh.msg_control = control;
- msgh.msg_controllen = sizeof(control);
- cmsg = CMSG_FIRSTHDR(&msgh);
- cmsg->cmsg_len = CMSG_LEN(fdsize);
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_RIGHTS;
- memcpy(CMSG_DATA(cmsg), fds, fdsize);
- } else {
- msgh.msg_control = NULL;
- msgh.msg_controllen = 0;
- }
-
- do {
- ret = sendmsg(sockfd, &msgh, 0);
- } while (ret < 0 && errno == EINTR);
-
- if (ret < 0) {
- RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
- return ret;
- }
-
- return ret;
-}
-
-static int
-send_vhost_message(int sockfd, struct VhostUserMsg *msg)
-{
- int ret;
-
- if (!msg)
- return 0;
-
- msg->flags &= ~VHOST_USER_VERSION_MASK;
- msg->flags |= VHOST_USER_VERSION;
- msg->flags |= VHOST_USER_REPLY_MASK;
-
- ret = send_fd_message(sockfd, (char *)msg,
- VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
-
- return ret;
-}
-
-
-static void
-vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
-{
- int vid;
- size_t size;
- struct vhost_user_connection *conn;
- int ret;
-
- conn = malloc(sizeof(*conn));
- if (conn == NULL) {
- close(fd);
- return;
- }
-
- vid = vhost_new_device();
- if (vid == -1) {
- close(fd);
- free(conn);
- return;
- }
-
- size = strnlen(vsocket->path, PATH_MAX);
- vhost_set_ifname(vid, vsocket->path, size);
-
- RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
-
- vsocket->connfd = fd;
- conn->vsocket = vsocket;
- conn->vid = vid;
- ret = fdset_add(&vhost_user.fdset, fd, vhost_user_msg_handler,
- NULL, conn);
- if (ret < 0) {
- vsocket->connfd = -1;
- free(conn);
- close(fd);
- RTE_LOG(ERR, VHOST_CONFIG,
- "failed to add fd %d into vhost server fdset\n",
- fd);
- }
-}
-
-/* call back when there is new vhost-user connection from client */
-static void
-vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
-{
- struct vhost_user_socket *vsocket = dat;
-
- fd = accept(fd, NULL, NULL);
- if (fd < 0)
- return;
-
- RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
- vhost_user_add_connection(fd, vsocket);
-}
-
-/* callback when there is message on the connfd */
-static void
-vhost_user_msg_handler(int connfd, void *dat, int *remove)
-{
- int vid;
- struct vhost_user_connection *conn = dat;
- struct VhostUserMsg msg;
- uint64_t features;
- int ret;
-
- vid = conn->vid;
- ret = read_vhost_message(connfd, &msg);
- if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
- struct vhost_user_socket *vsocket = conn->vsocket;
-
- if (ret < 0)
- RTE_LOG(ERR, VHOST_CONFIG,
- "vhost read message failed\n");
- else if (ret == 0)
- RTE_LOG(INFO, VHOST_CONFIG,
- "vhost peer closed\n");
- else
- RTE_LOG(ERR, VHOST_CONFIG,
- "vhost read incorrect message\n");
-
- vsocket->connfd = -1;
- close(connfd);
- *remove = 1;
- free(conn);
- vhost_destroy_device(vid);
-
- if (vsocket->reconnect)
- vhost_user_create_client(vsocket);
-
- return;
- }
-
- RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
- vhost_message_str[msg.request]);
- switch (msg.request) {
- case VHOST_USER_GET_FEATURES:
- ret = vhost_get_features(vid, &features);
- msg.payload.u64 = features;
- msg.size = sizeof(msg.payload.u64);
- send_vhost_message(connfd, &msg);
- break;
- case VHOST_USER_SET_FEATURES:
- features = msg.payload.u64;
- vhost_set_features(vid, &features);
- break;
-
- case VHOST_USER_GET_PROTOCOL_FEATURES:
- msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
- msg.size = sizeof(msg.payload.u64);
- send_vhost_message(connfd, &msg);
- break;
- case VHOST_USER_SET_PROTOCOL_FEATURES:
- user_set_protocol_features(vid, msg.payload.u64);
- break;
-
- case VHOST_USER_SET_OWNER:
- vhost_set_owner(vid);
- break;
- case VHOST_USER_RESET_OWNER:
- vhost_reset_owner(vid);
- break;
-
- case VHOST_USER_SET_MEM_TABLE:
- user_set_mem_table(vid, &msg);
- break;
-
- case VHOST_USER_SET_LOG_BASE:
- user_set_log_base(vid, &msg);
-
- /* it needs a reply */
- msg.size = sizeof(msg.payload.u64);
- send_vhost_message(connfd, &msg);
- break;
- case VHOST_USER_SET_LOG_FD:
- close(msg.fds[0]);
- RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
- break;
-
- case VHOST_USER_SET_VRING_NUM:
- vhost_set_vring_num(vid, &msg.payload.state);
- break;
- case VHOST_USER_SET_VRING_ADDR:
- vhost_set_vring_addr(vid, &msg.payload.addr);
- break;
- case VHOST_USER_SET_VRING_BASE:
- vhost_set_vring_base(vid, &msg.payload.state);
- break;
-
- case VHOST_USER_GET_VRING_BASE:
- ret = user_get_vring_base(vid, &msg.payload.state);
- msg.size = sizeof(msg.payload.state);
- send_vhost_message(connfd, &msg);
- break;
-
- case VHOST_USER_SET_VRING_KICK:
- user_set_vring_kick(vid, &msg);
- break;
- case VHOST_USER_SET_VRING_CALL:
- user_set_vring_call(vid, &msg);
- break;
-
- case VHOST_USER_SET_VRING_ERR:
- if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
- close(msg.fds[0]);
- RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
- break;
-
- case VHOST_USER_GET_QUEUE_NUM:
- msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
- msg.size = sizeof(msg.payload.u64);
- send_vhost_message(connfd, &msg);
- break;
-
- case VHOST_USER_SET_VRING_ENABLE:
- user_set_vring_enable(vid, &msg.payload.state);
- break;
- case VHOST_USER_SEND_RARP:
- user_send_rarp(vid, &msg);
- break;
-
- default:
- break;
-
- }
-}
-
-static int
-create_unix_socket(const char *path, struct sockaddr_un *un, bool is_server)
-{
- int fd;
-
- fd = socket(AF_UNIX, SOCK_STREAM, 0);
- if (fd < 0)
- return -1;
- RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
- is_server ? "server" : "client", fd);
-
- if (!is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "vhost-user: can't set nonblocking mode for socket, fd: "
- "%d (%s)\n", fd, strerror(errno));
- close(fd);
- return -1;
- }
-
- memset(un, 0, sizeof(*un));
- un->sun_family = AF_UNIX;
- strncpy(un->sun_path, path, sizeof(un->sun_path));
- un->sun_path[sizeof(un->sun_path) - 1] = '\0';
-
- return fd;
-}
-
-static int
-vhost_user_create_server(struct vhost_user_socket *vsocket)
-{
- int fd;
- int ret;
- struct sockaddr_un un;
- const char *path = vsocket->path;
-
- fd = create_unix_socket(path, &un, vsocket->is_server);
- if (fd < 0)
- return -1;
-
- ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
- if (ret < 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "failed to bind to %s: %s; remove it and try again\n",
- path, strerror(errno));
- goto err;
- }
- RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
-
- ret = listen(fd, MAX_VIRTIO_BACKLOG);
- if (ret < 0)
- goto err;
-
- vsocket->listenfd = fd;
- ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
- NULL, vsocket);
- if (ret < 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "failed to add listen fd %d to vhost server fdset\n",
- fd);
- goto err;
- }
-
- return 0;
-
-err:
- close(fd);
- return -1;
-}
-
-struct vhost_user_reconnect {
- struct sockaddr_un un;
- int fd;
- struct vhost_user_socket *vsocket;
-
- TAILQ_ENTRY(vhost_user_reconnect) next;
-};
-
-TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
-struct vhost_user_reconnect_list {
- struct vhost_user_reconnect_tailq_list head;
- pthread_mutex_t mutex;
-};
-
-static struct vhost_user_reconnect_list reconn_list;
-static pthread_t reconn_tid;
-
-static int
-vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
-{
- int ret, flags;
-
- ret = connect(fd, un, sz);
- if (ret < 0 && errno != EISCONN)
- return -1;
-
- flags = fcntl(fd, F_GETFL, 0);
- if (flags < 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "can't get flags for connfd %d\n", fd);
- return -2;
- }
- if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "can't disable nonblocking on fd %d\n", fd);
- return -2;
- }
- return 0;
-}
-
-static void *
-vhost_user_client_reconnect(void *arg __rte_unused)
-{
- int ret;
- struct vhost_user_reconnect *reconn, *next;
-
- while (1) {
- pthread_mutex_lock(&reconn_list.mutex);
-
- /*
- * An equal implementation of TAILQ_FOREACH_SAFE,
- * which does not exist on all platforms.
- */
- for (reconn = TAILQ_FIRST(&reconn_list.head);
- reconn != NULL; reconn = next) {
- next = TAILQ_NEXT(reconn, next);
-
- ret = vhost_user_connect_nonblock(reconn->fd,
- (struct sockaddr *)&reconn->un,
- sizeof(reconn->un));
- if (ret == -2) {
- close(reconn->fd);
- RTE_LOG(ERR, VHOST_CONFIG,
- "reconnection for fd %d failed\n",
- reconn->fd);
- goto remove_fd;
- }
- if (ret == -1)
- continue;
-
- RTE_LOG(INFO, VHOST_CONFIG,
- "%s: connected\n", reconn->vsocket->path);
- vhost_user_add_connection(reconn->fd, reconn->vsocket);
-remove_fd:
- TAILQ_REMOVE(&reconn_list.head, reconn, next);
- free(reconn);
- }
-
- pthread_mutex_unlock(&reconn_list.mutex);
- sleep(1);
- }
-
- return NULL;
-}
-
-static int
-vhost_user_reconnect_init(void)
-{
- int ret;
-
- pthread_mutex_init(&reconn_list.mutex, NULL);
- TAILQ_INIT(&reconn_list.head);
-
- ret = pthread_create(&reconn_tid, NULL,
- vhost_user_client_reconnect, NULL);
- if (ret < 0)
- RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
-
- return ret;
-}
-
-static int
-vhost_user_create_client(struct vhost_user_socket *vsocket)
-{
- int fd;
- int ret;
- struct sockaddr_un un;
- const char *path = vsocket->path;
- struct vhost_user_reconnect *reconn;
-
- fd = create_unix_socket(path, &un, vsocket->is_server);
- if (fd < 0)
- return -1;
-
- ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&un,
- sizeof(un));
- if (ret == 0) {
- vhost_user_add_connection(fd, vsocket);
- return 0;
- }
-
- RTE_LOG(ERR, VHOST_CONFIG,
- "failed to connect to %s: %s\n",
- path, strerror(errno));
-
- if (ret == -2 || !vsocket->reconnect) {
- close(fd);
- return -1;
- }
-
- RTE_LOG(ERR, VHOST_CONFIG, "%s: reconnecting...\n", path);
- reconn = malloc(sizeof(*reconn));
- if (reconn == NULL) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "failed to allocate memory for reconnect\n");
- close(fd);
- return -1;
- }
- reconn->un = un;
- reconn->fd = fd;
- reconn->vsocket = vsocket;
- pthread_mutex_lock(&reconn_list.mutex);
- TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
- pthread_mutex_unlock(&reconn_list.mutex);
-
- return 0;
-}
-
-/*
- * Register a new vhost-user socket; here we could act as server
- * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
- * is set.
- */
-int
-rte_vhost_driver_register(const char *path, uint64_t flags)
-{
- int ret = -1;
- struct vhost_user_socket *vsocket;
-
- if (!path)
- return -1;
-
- pthread_mutex_lock(&vhost_user.mutex);
-
- if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "error: the number of vhost sockets reaches maximum\n");
- goto out;
- }
-
- vsocket = malloc(sizeof(struct vhost_user_socket));
- if (!vsocket)
- goto out;
- memset(vsocket, 0, sizeof(struct vhost_user_socket));
- vsocket->path = strdup(path);
- vsocket->connfd = -1;
-
- if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
- vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
- if (vsocket->reconnect && reconn_tid == 0) {
- if (vhost_user_reconnect_init() < 0) {
- free(vsocket->path);
- free(vsocket);
- goto out;
- }
- }
- ret = vhost_user_create_client(vsocket);
- } else {
- vsocket->is_server = true;
- ret = vhost_user_create_server(vsocket);
- }
- if (ret < 0) {
- free(vsocket->path);
- free(vsocket);
- goto out;
- }
-
- vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
-
-out:
- pthread_mutex_unlock(&vhost_user.mutex);
-
- return ret;
-}
-
-static bool
-vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
-{
- int found = false;
- struct vhost_user_reconnect *reconn, *next;
-
- pthread_mutex_lock(&reconn_list.mutex);
-
- for (reconn = TAILQ_FIRST(&reconn_list.head);
- reconn != NULL; reconn = next) {
- next = TAILQ_NEXT(reconn, next);
-
- if (reconn->vsocket == vsocket) {
- TAILQ_REMOVE(&reconn_list.head, reconn, next);
- close(reconn->fd);
- free(reconn);
- found = true;
- break;
- }
- }
- pthread_mutex_unlock(&reconn_list.mutex);
- return found;
-}
-
-/**
- * Unregister the specified vhost socket
- */
-int
-rte_vhost_driver_unregister(const char *path)
-{
- int i;
- int count;
- struct vhost_user_connection *conn;
-
- pthread_mutex_lock(&vhost_user.mutex);
-
- for (i = 0; i < vhost_user.vsocket_cnt; i++) {
- struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
-
- if (!strcmp(vsocket->path, path)) {
- if (vsocket->is_server) {
- fdset_del(&vhost_user.fdset, vsocket->listenfd);
- close(vsocket->listenfd);
- unlink(path);
- } else if (vsocket->reconnect) {
- vhost_user_remove_reconnect(vsocket);
- }
-
- conn = fdset_del(&vhost_user.fdset, vsocket->connfd);
- if (conn) {
- RTE_LOG(INFO, VHOST_CONFIG,
- "free connfd = %d for device '%s'\n",
- vsocket->connfd, path);
- close(vsocket->connfd);
- vhost_destroy_device(conn->vid);
- free(conn);
- }
-
- free(vsocket->path);
- free(vsocket);
-
- count = --vhost_user.vsocket_cnt;
- vhost_user.vsockets[i] = vhost_user.vsockets[count];
- vhost_user.vsockets[count] = NULL;
- pthread_mutex_unlock(&vhost_user.mutex);
-
- return 0;
- }
- }
- pthread_mutex_unlock(&vhost_user.mutex);
-
- return -1;
-}
-
-int
-rte_vhost_driver_session_start(void)
-{
- fdset_event_dispatch(&vhost_user.fdset);
- return 0;
-}
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_USER_H
-#define _VHOST_NET_USER_H
-
-#include <stdint.h>
-#include <linux/vhost.h>
-
-#include "rte_virtio_net.h"
-
-/* refer to hw/virtio/vhost-user.c */
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
-typedef enum VhostUserRequest {
- VHOST_USER_NONE = 0,
- VHOST_USER_GET_FEATURES = 1,
- VHOST_USER_SET_FEATURES = 2,
- VHOST_USER_SET_OWNER = 3,
- VHOST_USER_RESET_OWNER = 4,
- VHOST_USER_SET_MEM_TABLE = 5,
- VHOST_USER_SET_LOG_BASE = 6,
- VHOST_USER_SET_LOG_FD = 7,
- VHOST_USER_SET_VRING_NUM = 8,
- VHOST_USER_SET_VRING_ADDR = 9,
- VHOST_USER_SET_VRING_BASE = 10,
- VHOST_USER_GET_VRING_BASE = 11,
- VHOST_USER_SET_VRING_KICK = 12,
- VHOST_USER_SET_VRING_CALL = 13,
- VHOST_USER_SET_VRING_ERR = 14,
- VHOST_USER_GET_PROTOCOL_FEATURES = 15,
- VHOST_USER_SET_PROTOCOL_FEATURES = 16,
- VHOST_USER_GET_QUEUE_NUM = 17,
- VHOST_USER_SET_VRING_ENABLE = 18,
- VHOST_USER_SEND_RARP = 19,
- VHOST_USER_MAX
-} VhostUserRequest;
-
-typedef struct VhostUserMemoryRegion {
- uint64_t guest_phys_addr;
- uint64_t memory_size;
- uint64_t userspace_addr;
- uint64_t mmap_offset;
-} VhostUserMemoryRegion;
-
-typedef struct VhostUserMemory {
- uint32_t nregions;
- uint32_t padding;
- VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-} VhostUserMemory;
-
-typedef struct VhostUserLog {
- uint64_t mmap_size;
- uint64_t mmap_offset;
-} VhostUserLog;
-
-typedef struct VhostUserMsg {
- VhostUserRequest request;
-
-#define VHOST_USER_VERSION_MASK 0x3
-#define VHOST_USER_REPLY_MASK (0x1 << 2)
- uint32_t flags;
- uint32_t size; /* the following payload size */
- union {
-#define VHOST_USER_VRING_IDX_MASK 0xff
-#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
- uint64_t u64;
- struct vhost_vring_state state;
- struct vhost_vring_addr addr;
- VhostUserMemory memory;
- VhostUserLog log;
- } payload;
- int fds[VHOST_MEMORY_MAX_NREGIONS];
-} __attribute((packed)) VhostUserMsg;
-
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
-/* The version of the protocol we support */
-#define VHOST_USER_VERSION 0x1
-
-/*****************************************************************************/
-#endif
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VHOST_NET_CDEV_H_
-#define _VHOST_NET_CDEV_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <linux/vhost.h>
-
-#include <rte_log.h>
-
-#include "rte_virtio_net.h"
-
-/* Used to indicate that the device is running on a data core */
-#define VIRTIO_DEV_RUNNING 1
-
-/* Backend value set by guest. */
-#define VIRTIO_DEV_STOPPED -1
-
-#define BUF_VECTOR_MAX 256
-
-/**
- * Structure contains buffer address, length and descriptor index
- * from vring to do scatter RX.
- */
-struct buf_vector {
- uint64_t buf_addr;
- uint32_t buf_len;
- uint32_t desc_idx;
-};
-
-/**
- * Structure contains variables relevant to RX/TX virtqueues.
- */
-struct vhost_virtqueue {
- struct vring_desc *desc;
- struct vring_avail *avail;
- struct vring_used *used;
- uint32_t size;
-
- /* Last index used on the available ring */
- volatile uint16_t last_used_idx;
-#define VIRTIO_INVALID_EVENTFD (-1)
-#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
-
- /* Backend value to determine if device should started/stopped */
- int backend;
- /* Used to notify the guest (trigger interrupt) */
- int callfd;
- /* Currently unused as polling mode is enabled */
- int kickfd;
- int enabled;
-
- /* Physical address of used ring, for logging */
- uint64_t log_guest_addr;
-} __rte_cache_aligned;
-
-/* Old kernels have no such macro defined */
-#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
- #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
-#endif
-
-
-/*
- * Make an extra wrapper for VIRTIO_NET_F_MQ and
- * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
- * introduced since kernel v3.8. This makes our
- * code buildable for older kernel.
- */
-#ifdef VIRTIO_NET_F_MQ
- #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
- #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ)
-#else
- #define VHOST_MAX_QUEUE_PAIRS 1
- #define VHOST_SUPPORTS_MQ 0
-#endif
-
-/*
- * Define virtio 1.0 for older kernels
- */
-#ifndef VIRTIO_F_VERSION_1
- #define VIRTIO_F_VERSION_1 32
-#endif
-
-/**
- * Device structure contains all configuration information relating
- * to the device.
- */
-struct virtio_net {
- /* Frontend (QEMU) memory and memory region information */
- struct virtio_memory *mem;
- uint64_t features;
- uint64_t protocol_features;
- int vid;
- uint32_t flags;
- uint16_t vhost_hlen;
- /* to tell if we need broadcast rarp packet */
- rte_atomic16_t broadcast_rarp;
- uint32_t virt_qp_nb;
- struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
-#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
- char ifname[IF_NAME_SZ];
- uint64_t log_size;
- uint64_t log_base;
- uint64_t log_addr;
- struct ether_addr mac;
-
-} __rte_cache_aligned;
-
-/**
- * Information relating to memory regions including offsets to
- * addresses in QEMUs memory file.
- */
-struct virtio_memory_regions {
- uint64_t guest_phys_address;
- uint64_t guest_phys_address_end;
- uint64_t memory_size;
- uint64_t userspace_address;
- uint64_t address_offset;
-};
-
-
-/**
- * Memory structure includes region and mapping information.
- */
-struct virtio_memory {
- /* Base QEMU userspace address of the memory file. */
- uint64_t base_address;
- uint64_t mapped_address;
- uint64_t mapped_size;
- uint32_t nregions;
- struct virtio_memory_regions regions[0];
-};
-
-
-/* Macros for printing using RTE_LOG */
-#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
-#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
-
-#ifdef RTE_LIBRTE_VHOST_DEBUG
-#define VHOST_MAX_PRINT_BUFF 6072
-#define LOG_LEVEL RTE_LOG_DEBUG
-#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
-#define PRINT_PACKET(device, addr, size, header) do { \
- char *pkt_addr = (char *)(addr); \
- unsigned int index; \
- char packet[VHOST_MAX_PRINT_BUFF]; \
- \
- if ((header)) \
- snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
- else \
- snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
- for (index = 0; index < (size); index++) { \
- snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
- "%02hhx ", pkt_addr[index]); \
- } \
- snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
- \
- LOG_DEBUG(VHOST_DATA, "%s", packet); \
-} while (0)
-#else
-#define LOG_LEVEL RTE_LOG_INFO
-#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
-#define PRINT_PACKET(device, addr, size, header) do {} while (0)
-#endif
-
-/**
- * Function to convert guest physical addresses to vhost virtual addresses.
- * This is used to convert guest virtio buffer addresses.
- */
-static inline uint64_t __attribute__((always_inline))
-gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
-{
- struct virtio_memory_regions *region;
- uint32_t regionidx;
- uint64_t vhost_va = 0;
-
- for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
- region = &dev->mem->regions[regionidx];
- if ((guest_pa >= region->guest_phys_address) &&
- (guest_pa <= region->guest_phys_address_end)) {
- vhost_va = region->address_offset + guest_pa;
- break;
- }
- }
- return vhost_va;
-}
-
-struct virtio_net_device_ops const *notify_ops;
-struct virtio_net *get_device(int vid);
-
-int vhost_new_device(void);
-void vhost_destroy_device(int);
-
-void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
-
-int vhost_get_features(int, uint64_t *);
-int vhost_set_features(int, uint64_t *);
-
-int vhost_set_vring_num(int, struct vhost_vring_state *);
-int vhost_set_vring_addr(int, struct vhost_vring_addr *);
-int vhost_set_vring_base(int, struct vhost_vring_state *);
-int vhost_get_vring_base(int, uint32_t, struct vhost_vring_state *);
-
-int vhost_set_vring_kick(int, struct vhost_vring_file *);
-int vhost_set_vring_call(int, struct vhost_vring_file *);
-
-int vhost_set_backend(int, struct vhost_vring_file *);
-
-int vhost_set_owner(int);
-int vhost_reset_owner(int);
-
-/*
- * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
- */
-void vhost_backend_cleanup(struct virtio_net *dev);
-
-#endif /* _VHOST_NET_CDEV_H_ */
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_virtio_net.h>
+
+#include "vhost.h"
+
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+/* Features supported by this lib. */
+#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_CTRL_RX) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+ (VHOST_SUPPORTS_MQ) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6))
+
+uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* device ops to add/remove device to/from data core. */
+struct virtio_net_device_ops const *notify_ops;
+
+struct virtio_net *
+get_device(int vid)
+{
+ struct virtio_net *dev = vhost_devices[vid];
+
+ if (unlikely(!dev)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) device not found.\n", vid);
+ }
+
+ return dev;
+}
+
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+ if ((vq->callfd >= 0) && (destroy != 0))
+ close(vq->callfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory, close any file descriptors and
+ * free any memory owned by a device.
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+ uint32_t i;
+
+ vhost_backend_cleanup(dev);
+
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
+ cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
+ }
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ for (i = 0; i < dev->virt_qp_nb; i++)
+ rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+
+ rte_free(dev);
+}
+
+static void
+init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+ memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ /* Backends are set to -1 indicating an inactive device. */
+ vq->backend = -1;
+
+ /* always set the default vq pair to enabled */
+ if (qp_idx == 0)
+ vq->enabled = 1;
+}
+
+static void
+init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+ uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+
+ init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
+ init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
+}
+
+static void
+reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
+{
+ int callfd;
+
+ callfd = vq->callfd;
+ init_vring_queue(vq, qp_idx);
+ vq->callfd = callfd;
+}
+
+static void
+reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+ uint32_t base_idx = qp_idx * VIRTIO_QNUM;
+
+ reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
+ reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
+}
+
+int
+alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
+{
+ struct vhost_virtqueue *virtqueue = NULL;
+ uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
+ uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
+
+ virtqueue = rte_malloc(NULL,
+ sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
+ if (virtqueue == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for virt qp:%d.\n", qp_idx);
+ return -1;
+ }
+
+ dev->virtqueue[virt_rx_q_idx] = virtqueue;
+ dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
+
+ init_vring_queue_pair(dev, qp_idx);
+
+ dev->virt_qp_nb += 1;
+
+ return 0;
+}
+
+/*
+ * Reset some variables in device structure, while keeping few
+ * others untouched, such as vid, ifname, virt_qp_nb: they
+ * should be same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ dev->features = 0;
+ dev->protocol_features = 0;
+ dev->flags = 0;
+
+ for (i = 0; i < dev->virt_qp_nb; i++)
+ reset_vring_queue_pair(dev, i);
+}
+
+/*
+ * Function is called from the CUSE open function. The device structure is
+ * initialised and a new entry is added to the device configuration linked
+ * list.
+ */
+int
+vhost_new_device(void)
+{
+ struct virtio_net *dev;
+ int i;
+
+ dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+ if (dev == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for new dev.\n");
+ return -1;
+ }
+
+ for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+ if (vhost_devices[i] == NULL)
+ break;
+ }
+ if (i == MAX_VHOST_DEVICE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to find a free slot for new device.\n");
+ return -1;
+ }
+
+ vhost_devices[i] = dev;
+ dev->vid = i;
+
+ return i;
+}
+
+/*
+ * Function is called from the CUSE release function. This function will
+ * cleanup the device and remove it from device configuration linked list.
+ */
+void
+vhost_destroy_device(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ notify_ops->destroy_device(vid);
+ }
+
+ cleanup_device(dev, 1);
+ free_device(dev);
+
+ vhost_devices[vid] = NULL;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+ struct virtio_net *dev;
+ unsigned int len;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ len = if_len > sizeof(dev->ifname) ?
+ sizeof(dev->ifname) : if_len;
+
+ strncpy(dev->ifname, if_name, len);
+ dev->ifname[sizeof(dev->ifname) - 1] = '\0';
+}
+
+
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+ struct virtio_net *dev = get_device(vid);
+ int numa_node;
+ int ret;
+
+ if (dev == NULL)
+ return -1;
+
+ ret = get_mempolicy(&numa_node, NULL, 0, dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to query numa node: %d\n", vid, ret);
+ return -1;
+ }
+
+ return numa_node;
+#else
+ RTE_SET_USED(vid);
+ return -1;
+#endif
+}
+
+uint32_t
+rte_vhost_get_queue_num(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return 0;
+
+ return dev->virt_qp_nb;
+}
+
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ len = RTE_MIN(len, sizeof(dev->ifname));
+
+ strncpy(buf, dev->ifname, len);
+ buf[len - 1] = '\0';
+
+ return 0;
+}
+
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ vq = dev->virtqueue[queue_id];
+ if (!vq->enabled)
+ return 0;
+
+ return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ if (enable) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "guest notification isn't supported.\n");
+ return -1;
+ }
+
+ dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
+ return 0;
+}
+
+uint64_t rte_vhost_feature_get(void)
+{
+ return VHOST_FEATURES;
+}
+
+int rte_vhost_feature_disable(uint64_t feature_mask)
+{
+ VHOST_FEATURES = VHOST_FEATURES & ~feature_mask;
+ return 0;
+}
+
+int rte_vhost_feature_enable(uint64_t feature_mask)
+{
+ if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
+ VHOST_FEATURES = VHOST_FEATURES | feature_mask;
+ return 0;
+ }
+ return -1;
+}
+
+/*
+ * Register ops so that we can add/remove device to data core.
+ */
+int
+rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
+{
+ notify_ops = ops;
+
+ return 0;
+}
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+
+#include <rte_log.h>
+
+#include "rte_virtio_net.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+ uint64_t buf_addr;
+ uint32_t buf_len;
+ uint32_t desc_idx;
+};
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+ uint32_t size;
+
+ /* Last index used on the available ring */
+ volatile uint16_t last_used_idx;
+#define VIRTIO_INVALID_EVENTFD (-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
+
+ /* Backend value to determine if device should started/stopped */
+ int backend;
+ /* Used to notify the guest (trigger interrupt) */
+ int callfd;
+ /* Currently unused as polling mode is enabled */
+ int kickfd;
+ int enabled;
+
+ /* Physical address of used ring, for logging */
+ uint64_t log_guest_addr;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macro defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+
+/*
+ * Make an extra wrapper for VIRTIO_NET_F_MQ and
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX as they are
+ * introduced since kernel v3.8. This makes our
+ * code buildable for older kernel.
+ */
+#ifdef VIRTIO_NET_F_MQ
+ #define VHOST_MAX_QUEUE_PAIRS VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX
+ #define VHOST_SUPPORTS_MQ (1ULL << VIRTIO_NET_F_MQ)
+#else
+ #define VHOST_MAX_QUEUE_PAIRS 1
+ #define VHOST_SUPPORTS_MQ 0
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+ /* Frontend (QEMU) memory and memory region information */
+ struct virtio_memory *mem;
+ uint64_t features;
+ uint64_t protocol_features;
+ int vid;
+ uint32_t flags;
+ uint16_t vhost_hlen;
+ /* to tell if we need broadcast rarp packet */
+ rte_atomic16_t broadcast_rarp;
+ uint32_t virt_qp_nb;
+ struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+ char ifname[IF_NAME_SZ];
+ uint64_t log_size;
+ uint64_t log_base;
+ uint64_t log_addr;
+ struct ether_addr mac;
+
+} __rte_cache_aligned;
+
+/**
+ * Information relating to memory regions including offsets to
+ * addresses in QEMUs memory file.
+ */
+struct virtio_memory_regions {
+ uint64_t guest_phys_address;
+ uint64_t guest_phys_address_end;
+ uint64_t memory_size;
+ uint64_t userspace_address;
+ uint64_t address_offset;
+};
+
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct virtio_memory {
+ /* Base QEMU userspace address of the memory file. */
+ uint64_t base_address;
+ uint64_t mapped_address;
+ uint64_t mapped_size;
+ uint32_t nregions;
+ struct virtio_memory_regions regions[0];
+};
+
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define LOG_LEVEL RTE_LOG_DEBUG
+#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define PRINT_PACKET(device, addr, size, header) do { \
+ char *pkt_addr = (char *)(addr); \
+ unsigned int index; \
+ char packet[VHOST_MAX_PRINT_BUFF]; \
+ \
+ if ((header)) \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+ else \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+ for (index = 0; index < (size); index++) { \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+ "%02hhx ", pkt_addr[index]); \
+ } \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+ \
+ LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define LOG_LEVEL RTE_LOG_INFO
+#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE 1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/**
+ * Function to convert guest physical addresses to vhost virtual addresses.
+ * This is used to convert guest virtio buffer addresses.
+ */
+static inline uint64_t __attribute__((always_inline))
+gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
+{
+ struct virtio_memory_regions *region;
+ uint32_t regionidx;
+ uint64_t vhost_va = 0;
+
+ for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+ region = &dev->mem->regions[regionidx];
+ if ((guest_pa >= region->guest_phys_address) &&
+ (guest_pa <= region->guest_phys_address_end)) {
+ vhost_va = region->address_offset + guest_pa;
+ break;
+ }
+ }
+ return vhost_va;
+}
+
+struct virtio_net_device_ops const *notify_ops;
+struct virtio_net *get_device(int vid);
+
+int vhost_new_device(void);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+
+int alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+
+/*
+ * Backend-specific cleanup. Defined by vhost-cuse and vhost-user.
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdbool.h>
-#include <linux/virtio_net.h>
-
-#include <rte_mbuf.h>
-#include <rte_memcpy.h>
-#include <rte_ether.h>
-#include <rte_ip.h>
-#include <rte_virtio_net.h>
-#include <rte_tcp.h>
-#include <rte_udp.h>
-#include <rte_sctp.h>
-#include <rte_arp.h>
-
-#include "vhost-net.h"
-
-#define MAX_PKT_BURST 32
-#define VHOST_LOG_PAGE 4096
-
-static inline void __attribute__((always_inline))
-vhost_log_page(uint8_t *log_base, uint64_t page)
-{
- log_base[page / 8] |= 1 << (page % 8);
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
-{
- uint64_t page;
-
- if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
- !dev->log_base || !len))
- return;
-
- if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
- return;
-
- /* To make sure guest memory updates are committed before logging */
- rte_smp_wmb();
-
- page = addr / VHOST_LOG_PAGE;
- while (page * VHOST_LOG_PAGE < addr + len) {
- vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
- page += 1;
- }
-}
-
-static inline void __attribute__((always_inline))
-vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint64_t offset, uint64_t len)
-{
- vhost_log_write(dev, vq->log_guest_addr + offset, len);
-}
-
-static bool
-is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
-{
- return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
-}
-
-static void
-virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
-{
- if (m_buf->ol_flags & PKT_TX_L4_MASK) {
- net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
-
- switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
- case PKT_TX_TCP_CKSUM:
- net_hdr->csum_offset = (offsetof(struct tcp_hdr,
- cksum));
- break;
- case PKT_TX_UDP_CKSUM:
- net_hdr->csum_offset = (offsetof(struct udp_hdr,
- dgram_cksum));
- break;
- case PKT_TX_SCTP_CKSUM:
- net_hdr->csum_offset = (offsetof(struct sctp_hdr,
- cksum));
- break;
- }
- }
-
- if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
- if (m_buf->ol_flags & PKT_TX_IPV4)
- net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
- else
- net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
- net_hdr->gso_size = m_buf->tso_segsz;
- net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
- + m_buf->l4_len;
- }
-}
-
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
- struct virtio_net_hdr_mrg_rxbuf hdr)
-{
- if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
- *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
- else
- *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
-}
-
-static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
-{
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
- struct vring_desc *desc;
- uint64_t desc_addr;
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-
- desc = &vq->desc[desc_idx];
- desc_addr = gpa_to_vva(dev, desc->addr);
- /*
- * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
- * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
- * otherwise stores offset on the stack instead of in a register.
- */
- if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, desc->addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_offset = dev->vhost_hlen;
- desc_avail = desc->len - dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current mbuf, fetch next */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- /* done with current desc buf, fetch next */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
- /* Room in vring buffer is not enough */
- return -1;
- }
- if (unlikely(desc->next >= vq->size))
- return -1;
-
- desc = &vq->desc[desc->next];
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- desc_offset = 0;
- desc_avail = desc->len;
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint16_t avail_idx, free_entries, start_idx;
- uint16_t desc_indexes[MAX_PKT_BURST];
- uint16_t used_idx;
- uint32_t i;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- start_idx = vq->last_used_idx;
- free_entries = avail_idx - start_idx;
- count = RTE_MIN(count, free_entries);
- count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
- if (count == 0)
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
- dev->vid, start_idx, start_idx + count);
-
- /* Retrieve all of the desc indexes first to avoid caching issues. */
- rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
- for (i = 0; i < count; i++) {
- used_idx = (start_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = pkts[i]->pkt_len +
- dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- uint16_t desc_idx = desc_indexes[i];
- int err;
-
- err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
- if (unlikely(err)) {
- used_idx = (start_idx + i) & (vq->size - 1);
- vq->used->ring[used_idx].len = dev->vhost_hlen;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- if (i + 1 < count)
- rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
- }
-
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += count;
- vq->last_used_idx += count;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- return count;
-}
-
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
- uint32_t *allocated, uint32_t *vec_idx,
- struct buf_vector *buf_vec)
-{
- uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
- uint32_t vec_id = *vec_idx;
- uint32_t len = *allocated;
-
- while (1) {
- if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
- return -1;
-
- len += vq->desc[idx].len;
- buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
- buf_vec[vec_id].buf_len = vq->desc[idx].len;
- buf_vec[vec_id].desc_idx = idx;
- vec_id++;
-
- if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- idx = vq->desc[idx].next;
- }
-
- *allocated = len;
- *vec_idx = vec_id;
-
- return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
- uint16_t *end, struct buf_vector *buf_vec)
-{
- uint16_t cur_idx;
- uint16_t avail_idx;
- uint32_t allocated = 0;
- uint32_t vec_idx = 0;
- uint16_t tries = 0;
-
- cur_idx = vq->last_used_idx;
-
- while (1) {
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- if (unlikely(cur_idx == avail_idx))
- return -1;
-
- if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
- &vec_idx, buf_vec) < 0))
- return -1;
-
- cur_idx++;
- tries++;
-
- if (allocated >= size)
- break;
-
- /*
- * if we tried all available ring items, and still
- * can't get enough buf, it means something abnormal
- * happened.
- */
- if (unlikely(tries >= vq->size))
- return -1;
- }
-
- *end = cur_idx;
- return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
- uint16_t end_idx, struct rte_mbuf *m,
- struct buf_vector *buf_vec)
-{
- struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
- uint32_t vec_idx = 0;
- uint16_t start_idx = vq->last_used_idx;
- uint16_t cur_idx = start_idx;
- uint64_t desc_addr;
- uint32_t mbuf_offset, mbuf_avail;
- uint32_t desc_offset, desc_avail;
- uint32_t cpy_len;
- uint16_t desc_idx, used_idx;
-
- if (unlikely(m == NULL))
- return 0;
-
- LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
- dev->vid, cur_idx, end_idx);
-
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
- return 0;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- virtio_hdr.num_buffers = end_idx - start_idx;
- LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
- dev->vid, virtio_hdr.num_buffers);
-
- virtio_enqueue_offload(m, &virtio_hdr.hdr);
- copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
- PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
- desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
-
- mbuf_avail = rte_pktmbuf_data_len(m);
- mbuf_offset = 0;
- while (mbuf_avail != 0 || m->next != NULL) {
- /* done with current desc buf, get the next one */
- if (desc_avail == 0) {
- desc_idx = buf_vec[vec_idx].desc_idx;
-
- if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
- /* Update used ring with desc information */
- used_idx = cur_idx++ & (vq->size - 1);
- vq->used->ring[used_idx].id = desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used,
- ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- vec_idx++;
- desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
- if (unlikely(!desc_addr))
- return 0;
-
- /* Prefetch buffer address. */
- rte_prefetch0((void *)(uintptr_t)desc_addr);
- desc_offset = 0;
- desc_avail = buf_vec[vec_idx].buf_len;
- }
-
- /* done with current mbuf, get the next one */
- if (mbuf_avail == 0) {
- m = m->next;
-
- mbuf_offset = 0;
- mbuf_avail = rte_pktmbuf_data_len(m);
- }
-
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
- rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
- cpy_len);
- vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
- cpy_len);
- PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
- cpy_len, 0);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
- }
-
- used_idx = cur_idx & (vq->size - 1);
- vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
- vq->used->ring[used_idx].len = desc_offset;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
-
- return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
- struct rte_mbuf **pkts, uint32_t count)
-{
- struct vhost_virtqueue *vq;
- uint32_t pkt_idx = 0, nr_used = 0;
- uint16_t end;
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
- if (count == 0)
- return 0;
-
- for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
- uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
-
- if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
- &end, buf_vec) < 0)) {
- LOG_DEBUG(VHOST_DATA,
- "(%d) failed to get enough desc from vring\n",
- dev->vid);
- break;
- }
-
- nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
- pkts[pkt_idx], buf_vec);
- rte_smp_wmb();
-
- *(volatile uint16_t *)&vq->used->idx += nr_used;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
- vq->last_used_idx += nr_used;
- }
-
- if (likely(pkt_idx)) {
- /* flush used->idx update before we read avail->flags. */
- rte_mb();
-
- /* Kick the guest if necessary. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
- }
-
- return pkt_idx;
-}
-
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return 0;
-
- if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
- return virtio_dev_merge_rx(dev, queue_id, pkts, count);
- else
- return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
-static void
-parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
-{
- struct ipv4_hdr *ipv4_hdr;
- struct ipv6_hdr *ipv6_hdr;
- void *l3_hdr = NULL;
- struct ether_hdr *eth_hdr;
- uint16_t ethertype;
-
- eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
-
- m->l2_len = sizeof(struct ether_hdr);
- ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
-
- if (ethertype == ETHER_TYPE_VLAN) {
- struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
-
- m->l2_len += sizeof(struct vlan_hdr);
- ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
- }
-
- l3_hdr = (char *)eth_hdr + m->l2_len;
-
- switch (ethertype) {
- case ETHER_TYPE_IPv4:
- ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
- *l4_proto = ipv4_hdr->next_proto_id;
- m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
- *l4_hdr = (char *)l3_hdr + m->l3_len;
- m->ol_flags |= PKT_TX_IPV4;
- break;
- case ETHER_TYPE_IPv6:
- ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
- *l4_proto = ipv6_hdr->proto;
- m->l3_len = sizeof(struct ipv6_hdr);
- *l4_hdr = (char *)l3_hdr + m->l3_len;
- m->ol_flags |= PKT_TX_IPV6;
- break;
- default:
- m->l3_len = 0;
- *l4_proto = 0;
- break;
- }
-}
-
-static inline void __attribute__((always_inline))
-vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
-{
- uint16_t l4_proto = 0;
- void *l4_hdr = NULL;
- struct tcp_hdr *tcp_hdr = NULL;
-
- parse_ethernet(m, &l4_proto, &l4_hdr);
- if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- if (hdr->csum_start == (m->l2_len + m->l3_len)) {
- switch (hdr->csum_offset) {
- case (offsetof(struct tcp_hdr, cksum)):
- if (l4_proto == IPPROTO_TCP)
- m->ol_flags |= PKT_TX_TCP_CKSUM;
- break;
- case (offsetof(struct udp_hdr, dgram_cksum)):
- if (l4_proto == IPPROTO_UDP)
- m->ol_flags |= PKT_TX_UDP_CKSUM;
- break;
- case (offsetof(struct sctp_hdr, cksum)):
- if (l4_proto == IPPROTO_SCTP)
- m->ol_flags |= PKT_TX_SCTP_CKSUM;
- break;
- default:
- break;
- }
- }
- }
-
- if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
- switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
- case VIRTIO_NET_HDR_GSO_TCPV4:
- case VIRTIO_NET_HDR_GSO_TCPV6:
- tcp_hdr = (struct tcp_hdr *)l4_hdr;
- m->ol_flags |= PKT_TX_TCP_SEG;
- m->tso_segsz = hdr->gso_size;
- m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
- break;
- default:
- RTE_LOG(WARNING, VHOST_DATA,
- "unsupported gso type %u.\n", hdr->gso_type);
- break;
- }
- }
-}
-
-#define RARP_PKT_SIZE 64
-
-static int
-make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
-{
- struct ether_hdr *eth_hdr;
- struct arp_hdr *rarp;
-
- if (rarp_mbuf->buf_len < 64) {
- RTE_LOG(WARNING, VHOST_DATA,
- "failed to make RARP; mbuf size too small %u (< %d)\n",
- rarp_mbuf->buf_len, RARP_PKT_SIZE);
- return -1;
- }
-
- /* Ethernet header. */
- eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
- memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
- ether_addr_copy(mac, ð_hdr->s_addr);
- eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
-
- /* RARP header. */
- rarp = (struct arp_hdr *)(eth_hdr + 1);
- rarp->arp_hrd = htons(ARP_HRD_ETHER);
- rarp->arp_pro = htons(ETHER_TYPE_IPv4);
- rarp->arp_hln = ETHER_ADDR_LEN;
- rarp->arp_pln = 4;
- rarp->arp_op = htons(ARP_OP_REVREQUEST);
-
- ether_addr_copy(mac, &rarp->arp_data.arp_sha);
- ether_addr_copy(mac, &rarp->arp_data.arp_tha);
- memset(&rarp->arp_data.arp_sip, 0x00, 4);
- memset(&rarp->arp_data.arp_tip, 0x00, 4);
-
- rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;
-
- return 0;
-}
-
-static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx,
- struct rte_mempool *mbuf_pool)
-{
- struct vring_desc *desc;
- uint64_t desc_addr;
- uint32_t desc_avail, desc_offset;
- uint32_t mbuf_avail, mbuf_offset;
- uint32_t cpy_len;
- struct rte_mbuf *cur = m, *prev = m;
- struct virtio_net_hdr *hdr;
- /* A counter to avoid desc dead loop chain */
- uint32_t nr_desc = 1;
-
- desc = &vq->desc[desc_idx];
- if (unlikely(desc->len < dev->vhost_hlen))
- return -1;
-
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
- rte_prefetch0(hdr);
-
- /*
- * A virtio driver normally uses at least 2 desc buffers
- * for Tx: the first for storing the header, and others
- * for storing the data.
- */
- if (likely((desc->len == dev->vhost_hlen) &&
- (desc->flags & VRING_DESC_F_NEXT) != 0)) {
- desc = &vq->desc[desc->next];
-
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- desc_offset = 0;
- desc_avail = desc->len;
- nr_desc += 1;
-
- PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
- } else {
- desc_avail = desc->len - dev->vhost_hlen;
- desc_offset = dev->vhost_hlen;
- }
-
- mbuf_offset = 0;
- mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
- while (1) {
- cpy_len = RTE_MIN(desc_avail, mbuf_avail);
- rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
- (void *)((uintptr_t)(desc_addr + desc_offset)),
- cpy_len);
-
- mbuf_avail -= cpy_len;
- mbuf_offset += cpy_len;
- desc_avail -= cpy_len;
- desc_offset += cpy_len;
-
- /* This desc reaches to its end, get the next one */
- if (desc_avail == 0) {
- if ((desc->flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- if (unlikely(desc->next >= vq->size ||
- ++nr_desc > vq->size))
- return -1;
- desc = &vq->desc[desc->next];
-
- desc_addr = gpa_to_vva(dev, desc->addr);
- if (unlikely(!desc_addr))
- return -1;
-
- rte_prefetch0((void *)(uintptr_t)desc_addr);
-
- desc_offset = 0;
- desc_avail = desc->len;
-
- PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
- }
-
- /*
- * This mbuf reaches to its end, get a new one
- * to hold more data.
- */
- if (mbuf_avail == 0) {
- cur = rte_pktmbuf_alloc(mbuf_pool);
- if (unlikely(cur == NULL)) {
- RTE_LOG(ERR, VHOST_DATA, "Failed to "
- "allocate memory for mbuf.\n");
- return -1;
- }
-
- prev->next = cur;
- prev->data_len = mbuf_offset;
- m->nb_segs += 1;
- m->pkt_len += mbuf_offset;
- prev = cur;
-
- mbuf_offset = 0;
- mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
- }
- }
-
- prev->data_len = mbuf_offset;
- m->pkt_len += mbuf_offset;
-
- if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
- vhost_dequeue_offload(hdr, m);
-
- return 0;
-}
-
-uint16_t
-rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
- struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
-{
- struct virtio_net *dev;
- struct rte_mbuf *rarp_mbuf = NULL;
- struct vhost_virtqueue *vq;
- uint32_t desc_indexes[MAX_PKT_BURST];
- uint32_t used_idx;
- uint32_t i = 0;
- uint16_t free_entries;
- uint16_t avail_idx;
-
- dev = get_device(vid);
- if (!dev)
- return 0;
-
- if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
- RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
- dev->vid, __func__, queue_id);
- return 0;
- }
-
- vq = dev->virtqueue[queue_id];
- if (unlikely(vq->enabled == 0))
- return 0;
-
- /*
- * Construct a RARP broadcast packet, and inject it to the "pkts"
- * array, to looks like that guest actually send such packet.
- *
- * Check user_send_rarp() for more information.
- */
- if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
- &dev->broadcast_rarp.cnt, 1, 0))) {
- rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
- if (rarp_mbuf == NULL) {
- RTE_LOG(ERR, VHOST_DATA,
- "Failed to allocate memory for mbuf.\n");
- return 0;
- }
-
- if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
- rte_pktmbuf_free(rarp_mbuf);
- rarp_mbuf = NULL;
- } else {
- count -= 1;
- }
- }
-
- avail_idx = *((volatile uint16_t *)&vq->avail->idx);
- free_entries = avail_idx - vq->last_used_idx;
- if (free_entries == 0)
- goto out;
-
- LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-
- /* Prefetch available ring to retrieve head indexes. */
- used_idx = vq->last_used_idx & (vq->size - 1);
- rte_prefetch0(&vq->avail->ring[used_idx]);
- rte_prefetch0(&vq->used->ring[used_idx]);
-
- count = RTE_MIN(count, MAX_PKT_BURST);
- count = RTE_MIN(count, free_entries);
- LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
- dev->vid, count);
-
- /* Retrieve all of the head indexes first to avoid caching issues. */
- for (i = 0; i < count; i++) {
- used_idx = (vq->last_used_idx + i) & (vq->size - 1);
- desc_indexes[i] = vq->avail->ring[used_idx];
-
- vq->used->ring[used_idx].id = desc_indexes[i];
- vq->used->ring[used_idx].len = 0;
- vhost_log_used_vring(dev, vq,
- offsetof(struct vring_used, ring[used_idx]),
- sizeof(vq->used->ring[used_idx]));
- }
-
- /* Prefetch descriptor index. */
- rte_prefetch0(&vq->desc[desc_indexes[0]]);
- for (i = 0; i < count; i++) {
- int err;
-
- if (likely(i + 1 < count))
- rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
-
- pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
- if (unlikely(pkts[i] == NULL)) {
- RTE_LOG(ERR, VHOST_DATA,
- "Failed to allocate memory for mbuf.\n");
- break;
- }
- err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
- mbuf_pool);
- if (unlikely(err)) {
- rte_pktmbuf_free(pkts[i]);
- break;
- }
- }
-
- rte_smp_wmb();
- rte_smp_rmb();
- vq->used->idx += i;
- vq->last_used_idx += i;
- vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
- sizeof(vq->used->idx));
-
- /* Kick guest if required. */
- if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
- && (vq->callfd >= 0))
- eventfd_write(vq->callfd, (eventfd_t)1);
-
-out:
- if (unlikely(rarp_mbuf != NULL)) {
- /*
- * Inject it to the head of "pkts" array, so that switch's mac
- * learning table will get updated first.
- */
- memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
- pkts[0] = rarp_mbuf;
- i += 1;
- }
-
- return i;
-}
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "vhost.h"
+#include "vhost_user.h"
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = "VHOST_USER_NONE",
+ [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+ [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+ [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+ [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+ [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+ [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+ [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
+ [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
+ [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
+};
+
+struct orig_region_map {
+ int fd;
+ uint64_t mapped_address;
+ uint64_t mapped_size;
+ uint64_t blksz;
+};
+
+#define orig_region(ptr, nregions) \
+ ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
+ sizeof(struct virtio_memory) + \
+ sizeof(struct virtio_memory_regions) * (nregions)))
+
+static uint64_t
+get_blk_size(int fd)
+{
+ struct stat stat;
+ int ret;
+
+ ret = fstat(fd, &stat);
+ return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+ struct orig_region_map *region;
+ unsigned int idx;
+
+ if (!dev || !dev->mem)
+ return;
+
+ region = orig_region(dev->mem, dev->mem->nregions);
+ for (idx = 0; idx < dev->mem->nregions; idx++) {
+ if (region[idx].mapped_address) {
+ munmap((void *)(uintptr_t)region[idx].mapped_address,
+ region[idx].mapped_size);
+ close(region[idx].fd);
+ }
+ }
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+ if (dev->mem) {
+ free_mem_region(dev);
+ free(dev->mem);
+ dev->mem = NULL;
+ }
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ dev->log_addr = 0;
+ }
+}
+
+/*
+ * This function just returns success at the moment unless
+ * the device hasn't been initialised.
+ */
+static int
+vhost_set_owner(int vid)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ return 0;
+}
+
+static int
+vhost_reset_owner(int vid)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ notify_ops->destroy_device(vid);
+ }
+
+ cleanup_device(dev, 0);
+ reset_device(dev);
+ return 0;
+}
+
+/*
+ * The features that we support are requested.
+ */
+static int
+vhost_get_features(int vid, uint64_t *pu)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ /* Send our supported features. */
+ *pu = VHOST_FEATURES;
+ return 0;
+}
+
+/*
+ * We receive the negotiated features supported by us and the virtio device.
+ */
+static int
+vhost_set_features(int vid, uint64_t *pu)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+ if (*pu & ~VHOST_FEATURES)
+ return -1;
+
+ dev->features = *pu;
+ if (dev->features &
+ ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+ }
+ LOG_DEBUG(VHOST_CONFIG,
+ "(%d) mergeable RX buffers %s, virtio 1 %s\n",
+ dev->vid,
+ (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
+ (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
+
+ return 0;
+}
+
+/*
+ * The virtio device sends us the size of the descriptor ring.
+ */
+static int
+vhost_set_vring_num(int vid, struct vhost_vring_state *state)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ /* State->index refers to the queue index. The txq is 1, rxq is 0. */
+ dev->virtqueue[state->index]->size = state->num;
+
+ return 0;
+}
+
+/*
+ * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
+ * same numa node as the memory of vring descriptor.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+ int oldnode, newnode;
+ struct virtio_net *old_dev;
+ struct vhost_virtqueue *old_vq, *vq;
+ int ret;
+
+ /*
+ * vq is allocated on pairs, we should try to do realloc
+ * on first queue of one queue pair only.
+ */
+ if (index % VIRTIO_QNUM != 0)
+ return dev;
+
+ old_dev = dev;
+ vq = old_vq = dev->virtqueue[index];
+
+ ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+ MPOL_F_NODE | MPOL_F_ADDR);
+
+ /* check if we need to reallocate vq */
+ ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get vq numa information.\n");
+ return dev;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate vq from %d to %d node\n", oldnode, newnode);
+ vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
+ newnode);
+ if (!vq)
+ return dev;
+
+ memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
+ rte_free(old_vq);
+ }
+
+ /* check if we need to reallocate dev */
+ ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get dev numa information.\n");
+ goto out;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate dev from %d to %d node\n",
+ oldnode, newnode);
+ dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+ if (!dev) {
+ dev = old_dev;
+ goto out;
+ }
+
+ memcpy(dev, old_dev, sizeof(*dev));
+ rte_free(old_dev);
+ }
+
+out:
+ dev->virtqueue[index] = vq;
+ dev->virtqueue[index + 1] = vq + 1;
+ vhost_devices[dev->vid] = dev;
+
+ return dev;
+}
+#else
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+ return dev;
+}
+#endif
+
+/*
+ * Converts QEMU virtual address to Vhost virtual address. This function is
+ * used to convert the ring addresses to our address space.
+ */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
+{
+ struct virtio_memory_regions *region;
+ uint64_t vhost_va = 0;
+ uint32_t regionidx = 0;
+
+ /* Find the region where the address lives. */
+ for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+ region = &dev->mem->regions[regionidx];
+ if ((qemu_va >= region->userspace_address) &&
+ (qemu_va <= region->userspace_address +
+ region->memory_size)) {
+ vhost_va = qemu_va + region->guest_phys_address +
+ region->address_offset -
+ region->userspace_address;
+ break;
+ }
+ }
+ return vhost_va;
+}
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ */
+static int
+vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if ((dev == NULL) || (dev->mem == NULL))
+ return -1;
+
+ /* addr->index refers to the queue index. The txq 1, rxq is 0. */
+ vq = dev->virtqueue[addr->index];
+
+ /* The addresses are converted from QEMU virtual to Vhost virtual. */
+ vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
+ addr->desc_user_addr);
+ if (vq->desc == 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find desc ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ dev = numa_realloc(dev, addr->index);
+ vq = dev->virtqueue[addr->index];
+
+ vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
+ addr->avail_user_addr);
+ if (vq->avail == 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find avail ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
+ addr->used_user_addr);
+ if (vq->used == 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find used ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ if (vq->last_used_idx != vq->used->idx) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
+ "some packets maybe resent for Tx and dropped for Rx\n",
+ vq->last_used_idx, vq->used->idx);
+ vq->last_used_idx = vq->used->idx;
+ }
+
+ vq->log_guest_addr = addr->log_guest_addr;
+
+ LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+ dev->vid, vq->desc);
+ LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+ dev->vid, vq->avail);
+ LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+ dev->vid, vq->used);
+ LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+ dev->vid, vq->log_guest_addr);
+
+ return 0;
+}
+
+/*
+ * The virtio device sends us the available ring last used index.
+ */
+static int
+vhost_set_vring_base(int vid, struct vhost_vring_state *state)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ /* State->index refers to the queue index. The txq is 1, rxq is 0. */
+ dev->virtqueue[state->index]->last_used_idx = state->num;
+
+ return 0;
+}
+
+/*
+ * We send the virtio device our available ring last used index.
+ */
+static int
+vhost_get_vring_base(int vid, uint32_t index,
+ struct vhost_vring_state *state)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ state->index = index;
+ /* State->index refers to the queue index. The txq is 1, rxq is 0. */
+ state->num = dev->virtqueue[state->index]->last_used_idx;
+
+ return 0;
+}
+
+/*
+ * The virtio device sends an eventfd to interrupt the guest. This fd gets
+ * copied into our process space.
+ */
+static int
+vhost_set_vring_call(int vid, struct vhost_vring_file *file)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+ uint32_t cur_qp_idx = file->index / VIRTIO_QNUM;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ /*
+ * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
+ * we get, so we do vring queue pair allocation here.
+ */
+ if (cur_qp_idx + 1 > dev->virt_qp_nb) {
+ if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
+ return -1;
+ }
+
+ /* file->index refers to the queue index. The txq is 1, rxq is 0. */
+ vq = dev->virtqueue[file->index];
+ assert(vq != NULL);
+
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = file->fd;
+
+ return 0;
+}
+
+/*
+ * The virtio device sends an eventfd that it can use to notify us.
+ * This fd gets copied into our process space.
+ */
+static int
+vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ /* file->index refers to the queue index. The txq is 1, rxq is 0. */
+ vq = dev->virtqueue[file->index];
+
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+
+ vq->kickfd = file->fd;
+
+ return 0;
+}
+
+static int
+user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
+{
+ struct VhostUserMemory memory = pmsg->payload.memory;
+ struct virtio_memory_regions *pregion;
+ uint64_t mapped_address, mapped_size;
+ struct virtio_net *dev;
+ unsigned int idx = 0;
+ struct orig_region_map *pregion_orig;
+ uint64_t alignment;
+
+ /* unmap old memory regions one by one*/
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ notify_ops->destroy_device(vid);
+ }
+
+ if (dev->mem) {
+ free_mem_region(dev);
+ free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ dev->mem = calloc(1,
+ sizeof(struct virtio_memory) +
+ sizeof(struct virtio_memory_regions) * memory.nregions +
+ sizeof(struct orig_region_map) * memory.nregions);
+ if (dev->mem == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to allocate memory for dev->mem\n",
+ dev->vid);
+ return -1;
+ }
+ dev->mem->nregions = memory.nregions;
+
+ pregion_orig = orig_region(dev->mem, memory.nregions);
+ for (idx = 0; idx < memory.nregions; idx++) {
+ pregion = &dev->mem->regions[idx];
+ pregion->guest_phys_address =
+ memory.regions[idx].guest_phys_addr;
+ pregion->guest_phys_address_end =
+ memory.regions[idx].guest_phys_addr +
+ memory.regions[idx].memory_size;
+ pregion->memory_size =
+ memory.regions[idx].memory_size;
+ pregion->userspace_address =
+ memory.regions[idx].userspace_addr;
+
+ /* This is ugly */
+ mapped_size = memory.regions[idx].memory_size +
+ memory.regions[idx].mmap_offset;
+
+ /* mmap() without flag of MAP_ANONYMOUS, should be called
+ * with length argument aligned with hugepagesz at older
+ * longterm version Linux, like 2.6.32 and 3.2.72, or
+ * mmap() will fail with EINVAL.
+ *
+ * to avoid failure, make sure in caller to keep length
+ * aligned.
+ */
+ alignment = get_blk_size(pmsg->fds[idx]);
+ if (alignment == (uint64_t)-1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "couldn't get hugepage size through fstat\n");
+ goto err_mmap;
+ }
+ mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
+
+ mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
+ mapped_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ pmsg->fds[idx],
+ 0);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
+ "off:0x%"PRIx64" align:0x%"PRIx64"\n",
+ idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
+ mapped_size, memory.regions[idx].mmap_offset,
+ alignment);
+
+ if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap qemu guest failed.\n");
+ goto err_mmap;
+ }
+
+ pregion_orig[idx].mapped_address = mapped_address;
+ pregion_orig[idx].mapped_size = mapped_size;
+ pregion_orig[idx].blksz = alignment;
+ pregion_orig[idx].fd = pmsg->fds[idx];
+
+ mapped_address += memory.regions[idx].mmap_offset;
+
+ pregion->address_offset = mapped_address -
+ pregion->guest_phys_address;
+
+ if (memory.regions[idx].guest_phys_addr == 0) {
+ dev->mem->base_address =
+ memory.regions[idx].userspace_addr;
+ dev->mem->mapped_address =
+ pregion->address_offset;
+ }
+
+ LOG_DEBUG(VHOST_CONFIG,
+ "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
+ idx,
+ (void *)(uintptr_t)pregion->guest_phys_address,
+ (void *)(uintptr_t)pregion->userspace_address,
+ pregion->memory_size);
+ }
+
+ return 0;
+
+err_mmap:
+ while (idx--) {
+ munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
+ pregion_orig[idx].mapped_size);
+ close(pregion_orig[idx].fd);
+ }
+ free(dev->mem);
+ dev->mem = NULL;
+ return -1;
+}
+
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+ return vq && vq->desc &&
+ vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+ vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
+}
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+ struct vhost_virtqueue *rvq, *tvq;
+ uint32_t i;
+
+ for (i = 0; i < dev->virt_qp_nb; i++) {
+ rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
+ tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
+
+ if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "virtio is not ready for processing.\n");
+ return 0;
+ }
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "virtio is now ready for processing.\n");
+ return 1;
+}
+
+static void
+user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
+{
+ struct vhost_vring_file file;
+
+ file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = pmsg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring call idx:%d file:%d\n", file.index, file.fd);
+ vhost_set_vring_call(vid, &file);
+}
+
+/*
+ * In vhost-user, when we receive kick message, will test whether virtio
+ * device is ready for packet processing.
+ */
+static void
+user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
+{
+ struct vhost_vring_file file;
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return;
+
+ file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = pmsg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring kick idx:%d file:%d\n", file.index, file.fd);
+ vhost_set_vring_kick(vid, &file);
+
+ if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
+ if (notify_ops->new_device(vid) == 0)
+ dev->flags |= VIRTIO_DEV_RUNNING;
+ }
+}
+
+/*
+ * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
+ */
+static int
+user_get_vring_base(int vid, struct vhost_vring_state *state)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+ /* We have to stop the queue (virtio) if it is running. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ notify_ops->destroy_device(vid);
+ }
+
+ /* Here we are safe to get the last used index */
+ vhost_get_vring_base(vid, state->index, state);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring base idx:%d file:%d\n", state->index, state->num);
+ /*
+ * Based on current qemu vhost-user implementation, this message is
+ * sent and only sent in vhost_vring_stop.
+ * TODO: cleanup the vring, it isn't usable since here.
+ */
+ if (dev->virtqueue[state->index]->kickfd >= 0)
+ close(dev->virtqueue[state->index]->kickfd);
+
+ dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ return 0;
+}
+
+/*
+ * when virtio queues are ready to work, qemu will send us to
+ * enable the virtio queue pair.
+ */
+static int
+user_set_vring_enable(int vid, struct vhost_vring_state *state)
+{
+ struct virtio_net *dev;
+ int enable = (int)state->num;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set queue enable: %d to qp idx: %d\n",
+ enable, state->index);
+
+ if (notify_ops->vring_state_changed)
+ notify_ops->vring_state_changed(vid, state->index, enable);
+
+ dev->virtqueue[state->index]->enabled = enable;
+
+ return 0;
+}
+
+static void
+user_set_protocol_features(int vid, uint64_t protocol_features)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+ return;
+
+ dev->protocol_features = protocol_features;
+}
+
+static int
+user_set_log_base(int vid, struct VhostUserMsg *msg)
+{
+ struct virtio_net *dev;
+ int fd = msg->fds[0];
+ uint64_t size, off;
+ void *addr;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+ return -1;
+ }
+
+ if (msg->size != sizeof(VhostUserLog)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid log base msg size: %"PRId32" != %d\n",
+ msg->size, (int)sizeof(VhostUserLog));
+ return -1;
+ }
+
+ size = msg->payload.log.mmap_size;
+ off = msg->payload.log.mmap_offset;
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "log mmap size: %"PRId64", offset: %"PRId64"\n",
+ size, off);
+
+ /*
+ * mmap from 0 to workaround a hugepage mmap bug: mmap will
+ * fail when offset is not page size aligned.
+ */
+ addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+ return -1;
+ }
+
+ /*
+ * Free previously mapped log memory on occasionally
+ * multiple VHOST_USER_SET_LOG_BASE.
+ */
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ }
+ dev->log_addr = (uint64_t)(uintptr_t)addr;
+ dev->log_base = dev->log_addr + off;
+ dev->log_size = size;
+
+ return 0;
+}
+
+/*
+ * An rarp packet is constructed and broadcasted to notify switches about
+ * the new location of the migrated VM, so that packets from outside will
+ * not be lost after migration.
+ *
+ * However, we don't actually "send" a rarp packet here, instead, we set
+ * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
+ */
+static int
+user_send_rarp(int vid, struct VhostUserMsg *msg)
+{
+ struct virtio_net *dev;
+ uint8_t *mac = (uint8_t *)&msg->payload.u64;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+ mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+ memcpy(dev->mac.addr_bytes, mac, 6);
+
+ /*
+ * Set the flag to inject a RARP broadcast packet at
+ * rte_vhost_dequeue_burst().
+ *
+ * rte_smp_wmb() is for making sure the mac is copied
+ * before the flag is set.
+ */
+ rte_smp_wmb();
+ rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+ return 0;
+}
+
+/* return bytes# of read on success or negative val on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+ msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+ if (ret <= 0)
+ return ret;
+
+ if (msg && msg->size) {
+ if (msg->size > sizeof(msg->payload)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid msg size: %d\n", msg->size);
+ return -1;
+ }
+ ret = read(sockfd, &msg->payload, msg->size);
+ if (ret <= 0)
+ return ret;
+ if (ret != (int)msg->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "read control message failed\n");
+ return -1;
+ }
+ }
+
+ return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ if (!msg)
+ return 0;
+
+ msg->flags &= ~VHOST_USER_VERSION_MASK;
+ msg->flags |= VHOST_USER_VERSION;
+ msg->flags |= VHOST_USER_REPLY_MASK;
+
+ ret = send_fd_message(sockfd, (char *)msg,
+ VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+ return ret;
+}
+
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+ struct VhostUserMsg msg;
+ uint64_t features = 0;
+ int ret;
+
+ ret = read_vhost_message(fd, &msg);
+ if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read message failed\n");
+ else if (ret == 0)
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vhost peer closed\n");
+ else
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read incorrect message\n");
+
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+ vhost_message_str[msg.request]);
+ switch (msg.request) {
+ case VHOST_USER_GET_FEATURES:
+ ret = vhost_get_features(vid, &features);
+ msg.payload.u64 = features;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_FEATURES:
+ features = msg.payload.u64;
+ vhost_set_features(vid, &features);
+ break;
+
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ user_set_protocol_features(vid, msg.payload.u64);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ vhost_set_owner(vid);
+ break;
+ case VHOST_USER_RESET_OWNER:
+ vhost_reset_owner(vid);
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ user_set_mem_table(vid, &msg);
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ user_set_log_base(vid, &msg);
+
+ /* it needs a reply */
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_LOG_FD:
+ close(msg.fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ vhost_set_vring_num(vid, &msg.payload.state);
+ break;
+ case VHOST_USER_SET_VRING_ADDR:
+ vhost_set_vring_addr(vid, &msg.payload.addr);
+ break;
+ case VHOST_USER_SET_VRING_BASE:
+ vhost_set_vring_base(vid, &msg.payload.state);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ ret = user_get_vring_base(vid, &msg.payload.state);
+ msg.size = sizeof(msg.payload.state);
+ send_vhost_message(fd, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ user_set_vring_kick(vid, &msg);
+ break;
+ case VHOST_USER_SET_VRING_CALL:
+ user_set_vring_call(vid, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+ close(msg.fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+ break;
+
+ case VHOST_USER_GET_QUEUE_NUM:
+ msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_ENABLE:
+ user_set_vring_enable(vid, &msg.payload.state);
+ break;
+ case VHOST_USER_SEND_RARP:
+ user_send_rarp(vid, &msg);
+ break;
+
+ default:
+ break;
+
+ }
+
+ return 0;
+}
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_virtio_net.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+#define VHOST_USER_PROTOCOL_F_MQ 0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
+#define VHOST_USER_PROTOCOL_F_RARP 2
+
+#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+ (1ULL << VHOST_USER_PROTOCOL_F_RARP))
+
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
+ VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserMsg {
+ VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK 0x3
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+ union {
+#define VHOST_USER_VRING_IDX_MASK 0xff
+#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ VhostUserLog log;
+ } payload;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 0x1
+
+
+/* vhost_user.c */
+int vhost_user_msg_handler(int vid, int fd);
+
+/* socket.c */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-
-#include "virtio-net-user.h"
-#include "vhost-net-user.h"
-#include "vhost-net.h"
-
-struct orig_region_map {
- int fd;
- uint64_t mapped_address;
- uint64_t mapped_size;
- uint64_t blksz;
-};
-
-#define orig_region(ptr, nregions) \
- ((struct orig_region_map *)RTE_PTR_ADD((ptr), \
- sizeof(struct virtio_memory) + \
- sizeof(struct virtio_memory_regions) * (nregions)))
-
-static uint64_t
-get_blk_size(int fd)
-{
- struct stat stat;
- int ret;
-
- ret = fstat(fd, &stat);
- return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
-}
-
-static void
-free_mem_region(struct virtio_net *dev)
-{
- struct orig_region_map *region;
- unsigned int idx;
-
- if (!dev || !dev->mem)
- return;
-
- region = orig_region(dev->mem, dev->mem->nregions);
- for (idx = 0; idx < dev->mem->nregions; idx++) {
- if (region[idx].mapped_address) {
- munmap((void *)(uintptr_t)region[idx].mapped_address,
- region[idx].mapped_size);
- close(region[idx].fd);
- }
- }
-}
-
-void
-vhost_backend_cleanup(struct virtio_net *dev)
-{
- if (dev->mem) {
- free_mem_region(dev);
- free(dev->mem);
- dev->mem = NULL;
- }
- if (dev->log_addr) {
- munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
- dev->log_addr = 0;
- }
-}
-
-int
-user_set_mem_table(int vid, struct VhostUserMsg *pmsg)
-{
- struct VhostUserMemory memory = pmsg->payload.memory;
- struct virtio_memory_regions *pregion;
- uint64_t mapped_address, mapped_size;
- struct virtio_net *dev;
- unsigned int idx = 0;
- struct orig_region_map *pregion_orig;
- uint64_t alignment;
-
- /* unmap old memory regions one by one*/
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /* Remove from the data plane. */
- if (dev->flags & VIRTIO_DEV_RUNNING) {
- dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(vid);
- }
-
- if (dev->mem) {
- free_mem_region(dev);
- free(dev->mem);
- dev->mem = NULL;
- }
-
- dev->mem = calloc(1,
- sizeof(struct virtio_memory) +
- sizeof(struct virtio_memory_regions) * memory.nregions +
- sizeof(struct orig_region_map) * memory.nregions);
- if (dev->mem == NULL) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%d) failed to allocate memory for dev->mem\n",
- dev->vid);
- return -1;
- }
- dev->mem->nregions = memory.nregions;
-
- pregion_orig = orig_region(dev->mem, memory.nregions);
- for (idx = 0; idx < memory.nregions; idx++) {
- pregion = &dev->mem->regions[idx];
- pregion->guest_phys_address =
- memory.regions[idx].guest_phys_addr;
- pregion->guest_phys_address_end =
- memory.regions[idx].guest_phys_addr +
- memory.regions[idx].memory_size;
- pregion->memory_size =
- memory.regions[idx].memory_size;
- pregion->userspace_address =
- memory.regions[idx].userspace_addr;
-
- /* This is ugly */
- mapped_size = memory.regions[idx].memory_size +
- memory.regions[idx].mmap_offset;
-
- /* mmap() without flag of MAP_ANONYMOUS, should be called
- * with length argument aligned with hugepagesz at older
- * longterm version Linux, like 2.6.32 and 3.2.72, or
- * mmap() will fail with EINVAL.
- *
- * to avoid failure, make sure in caller to keep length
- * aligned.
- */
- alignment = get_blk_size(pmsg->fds[idx]);
- if (alignment == (uint64_t)-1) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "couldn't get hugepage size through fstat\n");
- goto err_mmap;
- }
- mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);
-
- mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
- mapped_size,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- pmsg->fds[idx],
- 0);
-
- RTE_LOG(INFO, VHOST_CONFIG,
- "mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
- "off:0x%"PRIx64" align:0x%"PRIx64"\n",
- idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
- mapped_size, memory.regions[idx].mmap_offset,
- alignment);
-
- if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "mmap qemu guest failed.\n");
- goto err_mmap;
- }
-
- pregion_orig[idx].mapped_address = mapped_address;
- pregion_orig[idx].mapped_size = mapped_size;
- pregion_orig[idx].blksz = alignment;
- pregion_orig[idx].fd = pmsg->fds[idx];
-
- mapped_address += memory.regions[idx].mmap_offset;
-
- pregion->address_offset = mapped_address -
- pregion->guest_phys_address;
-
- if (memory.regions[idx].guest_phys_addr == 0) {
- dev->mem->base_address =
- memory.regions[idx].userspace_addr;
- dev->mem->mapped_address =
- pregion->address_offset;
- }
-
- LOG_DEBUG(VHOST_CONFIG,
- "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
- idx,
- (void *)(uintptr_t)pregion->guest_phys_address,
- (void *)(uintptr_t)pregion->userspace_address,
- pregion->memory_size);
- }
-
- return 0;
-
-err_mmap:
- while (idx--) {
- munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
- pregion_orig[idx].mapped_size);
- close(pregion_orig[idx].fd);
- }
- free(dev->mem);
- dev->mem = NULL;
- return -1;
-}
-
-static int
-vq_is_ready(struct vhost_virtqueue *vq)
-{
- return vq && vq->desc &&
- vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
- vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
-}
-
-static int
-virtio_is_ready(struct virtio_net *dev)
-{
- struct vhost_virtqueue *rvq, *tvq;
- uint32_t i;
-
- for (i = 0; i < dev->virt_qp_nb; i++) {
- rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
- tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];
-
- if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
- RTE_LOG(INFO, VHOST_CONFIG,
- "virtio is not ready for processing.\n");
- return 0;
- }
- }
-
- RTE_LOG(INFO, VHOST_CONFIG,
- "virtio is now ready for processing.\n");
- return 1;
-}
-
-void
-user_set_vring_call(int vid, struct VhostUserMsg *pmsg)
-{
- struct vhost_vring_file file;
-
- file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
- file.fd = VIRTIO_INVALID_EVENTFD;
- else
- file.fd = pmsg->fds[0];
- RTE_LOG(INFO, VHOST_CONFIG,
- "vring call idx:%d file:%d\n", file.index, file.fd);
- vhost_set_vring_call(vid, &file);
-}
-
-
-/*
- * In vhost-user, when we receive kick message, will test whether virtio
- * device is ready for packet processing.
- */
-void
-user_set_vring_kick(int vid, struct VhostUserMsg *pmsg)
-{
- struct vhost_vring_file file;
- struct virtio_net *dev = get_device(vid);
-
- if (!dev)
- return;
-
- file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
- if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
- file.fd = VIRTIO_INVALID_EVENTFD;
- else
- file.fd = pmsg->fds[0];
- RTE_LOG(INFO, VHOST_CONFIG,
- "vring kick idx:%d file:%d\n", file.index, file.fd);
- vhost_set_vring_kick(vid, &file);
-
- if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
- if (notify_ops->new_device(vid) == 0)
- dev->flags |= VIRTIO_DEV_RUNNING;
- }
-}
-
-/*
- * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
- */
-int
-user_get_vring_base(int vid, struct vhost_vring_state *state)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (dev == NULL)
- return -1;
- /* We have to stop the queue (virtio) if it is running. */
- if (dev->flags & VIRTIO_DEV_RUNNING) {
- dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(vid);
- }
-
- /* Here we are safe to get the last used index */
- vhost_get_vring_base(vid, state->index, state);
-
- RTE_LOG(INFO, VHOST_CONFIG,
- "vring base idx:%d file:%d\n", state->index, state->num);
- /*
- * Based on current qemu vhost-user implementation, this message is
- * sent and only sent in vhost_vring_stop.
- * TODO: cleanup the vring, it isn't usable since here.
- */
- if (dev->virtqueue[state->index]->kickfd >= 0)
- close(dev->virtqueue[state->index]->kickfd);
-
- dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
- return 0;
-}
-
-/*
- * when virtio queues are ready to work, qemu will send us to
- * enable the virtio queue pair.
- */
-int
-user_set_vring_enable(int vid, struct vhost_vring_state *state)
-{
- struct virtio_net *dev;
- int enable = (int)state->num;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- RTE_LOG(INFO, VHOST_CONFIG,
- "set queue enable: %d to qp idx: %d\n",
- enable, state->index);
-
- if (notify_ops->vring_state_changed)
- notify_ops->vring_state_changed(vid, state->index, enable);
-
- dev->virtqueue[state->index]->enabled = enable;
-
- return 0;
-}
-
-void
-user_set_protocol_features(int vid, uint64_t protocol_features)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL || protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
- return;
-
- dev->protocol_features = protocol_features;
-}
-
-int
-user_set_log_base(int vid, struct VhostUserMsg *msg)
-{
- struct virtio_net *dev;
- int fd = msg->fds[0];
- uint64_t size, off;
- void *addr;
-
- dev = get_device(vid);
- if (!dev)
- return -1;
-
- if (fd < 0) {
- RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
- return -1;
- }
-
- if (msg->size != sizeof(VhostUserLog)) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "invalid log base msg size: %"PRId32" != %d\n",
- msg->size, (int)sizeof(VhostUserLog));
- return -1;
- }
-
- size = msg->payload.log.mmap_size;
- off = msg->payload.log.mmap_offset;
- RTE_LOG(INFO, VHOST_CONFIG,
- "log mmap size: %"PRId64", offset: %"PRId64"\n",
- size, off);
-
- /*
- * mmap from 0 to workaround a hugepage mmap bug: mmap will
- * fail when offset is not page size aligned.
- */
- addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- close(fd);
- if (addr == MAP_FAILED) {
- RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
- return -1;
- }
-
- /*
- * Free previously mapped log memory on occasionally
- * multiple VHOST_USER_SET_LOG_BASE.
- */
- if (dev->log_addr) {
- munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
- }
- dev->log_addr = (uint64_t)(uintptr_t)addr;
- dev->log_base = dev->log_addr + off;
- dev->log_size = size;
-
- return 0;
-}
-
-/*
- * An rarp packet is constructed and broadcasted to notify switches about
- * the new location of the migrated VM, so that packets from outside will
- * not be lost after migration.
- *
- * However, we don't actually "send" a rarp packet here, instead, we set
- * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
- */
-int
-user_send_rarp(int vid, struct VhostUserMsg *msg)
-{
- struct virtio_net *dev;
- uint8_t *mac = (uint8_t *)&msg->payload.u64;
-
- dev = get_device(vid);
- if (!dev)
- return -1;
-
- RTE_LOG(DEBUG, VHOST_CONFIG,
- ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
- mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
- memcpy(dev->mac.addr_bytes, mac, 6);
-
- /*
- * Set the flag to inject a RARP broadcast packet at
- * rte_vhost_dequeue_burst().
- *
- * rte_smp_wmb() is for making sure the mac is copied
- * before the flag is set.
- */
- rte_smp_wmb();
- rte_atomic16_set(&dev->broadcast_rarp, 1);
-
- return 0;
-}
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _VIRTIO_NET_USER_H
-#define _VIRTIO_NET_USER_H
-
-#include "vhost-net.h"
-#include "vhost-net-user.h"
-
-#define VHOST_USER_PROTOCOL_F_MQ 0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
-#define VHOST_USER_PROTOCOL_F_RARP 2
-
-#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
- (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
- (1ULL << VHOST_USER_PROTOCOL_F_RARP))
-
-int user_set_mem_table(int, struct VhostUserMsg *);
-
-void user_set_vring_call(int, struct VhostUserMsg *);
-
-void user_set_vring_kick(int, struct VhostUserMsg *);
-
-void user_set_protocol_features(int vid, uint64_t protocol_features);
-int user_set_log_base(int vid, struct VhostUserMsg *);
-int user_send_rarp(int vid, struct VhostUserMsg *);
-
-int user_get_vring_base(int, struct vhost_vring_state *);
-
-int user_set_vring_enable(int vid, struct vhost_vring_state *state);
-
-#endif
+++ /dev/null
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/vhost.h>
-#include <linux/virtio_net.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#ifdef RTE_LIBRTE_VHOST_NUMA
-#include <numaif.h>
-#endif
-
-#include <sys/socket.h>
-
-#include <rte_ethdev.h>
-#include <rte_log.h>
-#include <rte_string_fns.h>
-#include <rte_memory.h>
-#include <rte_malloc.h>
-#include <rte_virtio_net.h>
-
-#include "vhost-net.h"
-
-#define MAX_VHOST_DEVICE 1024
-static struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
-
-/* device ops to add/remove device to/from data core. */
-struct virtio_net_device_ops const *notify_ops;
-
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
-/* Features supported by this lib. */
-#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
- (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
- (1ULL << VIRTIO_NET_F_CTRL_RX) | \
- (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
- (VHOST_SUPPORTS_MQ) | \
- (1ULL << VIRTIO_F_VERSION_1) | \
- (1ULL << VHOST_F_LOG_ALL) | \
- (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
- (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
- (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
- (1ULL << VIRTIO_NET_F_CSUM) | \
- (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
- (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
- (1ULL << VIRTIO_NET_F_GUEST_TSO6))
-
-static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
-
-
-/*
- * Converts QEMU virtual address to Vhost virtual address. This function is
- * used to convert the ring addresses to our address space.
- */
-static uint64_t
-qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
-{
- struct virtio_memory_regions *region;
- uint64_t vhost_va = 0;
- uint32_t regionidx = 0;
-
- /* Find the region where the address lives. */
- for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
- region = &dev->mem->regions[regionidx];
- if ((qemu_va >= region->userspace_address) &&
- (qemu_va <= region->userspace_address +
- region->memory_size)) {
- vhost_va = qemu_va + region->guest_phys_address +
- region->address_offset -
- region->userspace_address;
- break;
- }
- }
- return vhost_va;
-}
-
-struct virtio_net *
-get_device(int vid)
-{
- struct virtio_net *dev = vhost_devices[vid];
-
- if (unlikely(!dev)) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%d) device not found.\n", vid);
- }
-
- return dev;
-}
-
-static void
-cleanup_vq(struct vhost_virtqueue *vq, int destroy)
-{
- if ((vq->callfd >= 0) && (destroy != 0))
- close(vq->callfd);
- if (vq->kickfd >= 0)
- close(vq->kickfd);
-}
-
-/*
- * Unmap any memory, close any file descriptors and
- * free any memory owned by a device.
- */
-static void
-cleanup_device(struct virtio_net *dev, int destroy)
-{
- uint32_t i;
-
- vhost_backend_cleanup(dev);
-
- for (i = 0; i < dev->virt_qp_nb; i++) {
- cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ], destroy);
- cleanup_vq(dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ], destroy);
- }
-}
-
-/*
- * Release virtqueues and device memory.
- */
-static void
-free_device(struct virtio_net *dev)
-{
- uint32_t i;
-
- for (i = 0; i < dev->virt_qp_nb; i++)
- rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
-
- rte_free(dev);
-}
-
-static void
-init_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
-{
- memset(vq, 0, sizeof(struct vhost_virtqueue));
-
- vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
- vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
-
- /* Backends are set to -1 indicating an inactive device. */
- vq->backend = -1;
-
- /* always set the default vq pair to enabled */
- if (qp_idx == 0)
- vq->enabled = 1;
-}
-
-static void
-init_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
- uint32_t base_idx = qp_idx * VIRTIO_QNUM;
-
- init_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
- init_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
-}
-
-static void
-reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
-{
- int callfd;
-
- callfd = vq->callfd;
- init_vring_queue(vq, qp_idx);
- vq->callfd = callfd;
-}
-
-static void
-reset_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
- uint32_t base_idx = qp_idx * VIRTIO_QNUM;
-
- reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_RXQ], qp_idx);
- reset_vring_queue(dev->virtqueue[base_idx + VIRTIO_TXQ], qp_idx);
-}
-
-static int
-alloc_vring_queue_pair(struct virtio_net *dev, uint32_t qp_idx)
-{
- struct vhost_virtqueue *virtqueue = NULL;
- uint32_t virt_rx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_RXQ;
- uint32_t virt_tx_q_idx = qp_idx * VIRTIO_QNUM + VIRTIO_TXQ;
-
- virtqueue = rte_malloc(NULL,
- sizeof(struct vhost_virtqueue) * VIRTIO_QNUM, 0);
- if (virtqueue == NULL) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "Failed to allocate memory for virt qp:%d.\n", qp_idx);
- return -1;
- }
-
- dev->virtqueue[virt_rx_q_idx] = virtqueue;
- dev->virtqueue[virt_tx_q_idx] = virtqueue + VIRTIO_TXQ;
-
- init_vring_queue_pair(dev, qp_idx);
-
- dev->virt_qp_nb += 1;
-
- return 0;
-}
-
-/*
- * Reset some variables in device structure, while keeping few
- * others untouched, such as vid, ifname, virt_qp_nb: they
- * should be same unless the device is removed.
- */
-static void
-reset_device(struct virtio_net *dev)
-{
- uint32_t i;
-
- dev->features = 0;
- dev->protocol_features = 0;
- dev->flags = 0;
-
- for (i = 0; i < dev->virt_qp_nb; i++)
- reset_vring_queue_pair(dev, i);
-}
-
-/*
- * Function is called from the CUSE open function. The device structure is
- * initialised and a new entry is added to the device configuration linked
- * list.
- */
-int
-vhost_new_device(void)
-{
- struct virtio_net *dev;
- int i;
-
- dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
- if (dev == NULL) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "Failed to allocate memory for new dev.\n");
- return -1;
- }
-
- for (i = 0; i < MAX_VHOST_DEVICE; i++) {
- if (vhost_devices[i] == NULL)
- break;
- }
- if (i == MAX_VHOST_DEVICE) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "Failed to find a free slot for new device.\n");
- return -1;
- }
-
- vhost_devices[i] = dev;
- dev->vid = i;
-
- return i;
-}
-
-/*
- * Function is called from the CUSE release function. This function will
- * cleanup the device and remove it from device configuration linked list.
- */
-void
-vhost_destroy_device(int vid)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (dev == NULL)
- return;
-
- if (dev->flags & VIRTIO_DEV_RUNNING) {
- dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(vid);
- }
-
- cleanup_device(dev, 1);
- free_device(dev);
-
- vhost_devices[vid] = NULL;
-}
-
-void
-vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
-{
- struct virtio_net *dev;
- unsigned int len;
-
- dev = get_device(vid);
- if (dev == NULL)
- return;
-
- len = if_len > sizeof(dev->ifname) ?
- sizeof(dev->ifname) : if_len;
-
- strncpy(dev->ifname, if_name, len);
- dev->ifname[sizeof(dev->ifname) - 1] = '\0';
-}
-
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_OWNER
- * This function just returns success at the moment unless
- * the device hasn't been initialised.
- */
-int
-vhost_set_owner(int vid)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_RESET_OWNER
- */
-int
-vhost_reset_owner(int vid)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- if (dev->flags & VIRTIO_DEV_RUNNING) {
- dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(vid);
- }
-
- cleanup_device(dev, 0);
- reset_device(dev);
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_GET_FEATURES
- * The features that we support are requested.
- */
-int
-vhost_get_features(int vid, uint64_t *pu)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /* Send our supported features. */
- *pu = VHOST_FEATURES;
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_FEATURES
- * We receive the negotiated features supported by us and the virtio device.
- */
-int
-vhost_set_features(int vid, uint64_t *pu)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
- if (*pu & ~VHOST_FEATURES)
- return -1;
-
- dev->features = *pu;
- if (dev->features &
- ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
- dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
- } else {
- dev->vhost_hlen = sizeof(struct virtio_net_hdr);
- }
- LOG_DEBUG(VHOST_CONFIG,
- "(%d) mergeable RX buffers %s, virtio 1 %s\n",
- dev->vid,
- (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
- (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
-
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_NUM
- * The virtio device sends us the size of the descriptor ring.
- */
-int
-vhost_set_vring_num(int vid, struct vhost_vring_state *state)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->size = state->num;
-
- return 0;
-}
-
-/*
- * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the
- * same numa node as the memory of vring descriptor.
- */
-#ifdef RTE_LIBRTE_VHOST_NUMA
-static struct virtio_net*
-numa_realloc(struct virtio_net *dev, int index)
-{
- int oldnode, newnode;
- struct virtio_net *old_dev;
- struct vhost_virtqueue *old_vq, *vq;
- int ret;
-
- /*
- * vq is allocated on pairs, we should try to do realloc
- * on first queue of one queue pair only.
- */
- if (index % VIRTIO_QNUM != 0)
- return dev;
-
- old_dev = dev;
- vq = old_vq = dev->virtqueue[index];
-
- ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
- MPOL_F_NODE | MPOL_F_ADDR);
-
- /* check if we need to reallocate vq */
- ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
- MPOL_F_NODE | MPOL_F_ADDR);
- if (ret) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "Unable to get vq numa information.\n");
- return dev;
- }
- if (oldnode != newnode) {
- RTE_LOG(INFO, VHOST_CONFIG,
- "reallocate vq from %d to %d node\n", oldnode, newnode);
- vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
- newnode);
- if (!vq)
- return dev;
-
- memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
- rte_free(old_vq);
- }
-
- /* check if we need to reallocate dev */
- ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
- MPOL_F_NODE | MPOL_F_ADDR);
- if (ret) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "Unable to get dev numa information.\n");
- goto out;
- }
- if (oldnode != newnode) {
- RTE_LOG(INFO, VHOST_CONFIG,
- "reallocate dev from %d to %d node\n",
- oldnode, newnode);
- dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
- if (!dev) {
- dev = old_dev;
- goto out;
- }
-
- memcpy(dev, old_dev, sizeof(*dev));
- rte_free(old_dev);
- }
-
-out:
- dev->virtqueue[index] = vq;
- dev->virtqueue[index + 1] = vq + 1;
- vhost_devices[dev->vid] = dev;
-
- return dev;
-}
-#else
-static struct virtio_net*
-numa_realloc(struct virtio_net *dev, int index __rte_unused)
-{
- return dev;
-}
-#endif
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_ADDR
- * The virtio device sends us the desc, used and avail ring addresses.
- * This function then converts these to our address space.
- */
-int
-vhost_set_vring_addr(int vid, struct vhost_vring_addr *addr)
-{
- struct virtio_net *dev;
- struct vhost_virtqueue *vq;
-
- dev = get_device(vid);
- if ((dev == NULL) || (dev->mem == NULL))
- return -1;
-
- /* addr->index refers to the queue index. The txq 1, rxq is 0. */
- vq = dev->virtqueue[addr->index];
-
- /* The addresses are converted from QEMU virtual to Vhost virtual. */
- vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
- addr->desc_user_addr);
- if (vq->desc == 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%d) failed to find desc ring address.\n",
- dev->vid);
- return -1;
- }
-
- dev = numa_realloc(dev, addr->index);
- vq = dev->virtqueue[addr->index];
-
- vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
- addr->avail_user_addr);
- if (vq->avail == 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%d) failed to find avail ring address.\n",
- dev->vid);
- return -1;
- }
-
- vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
- addr->used_user_addr);
- if (vq->used == 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%d) failed to find used ring address.\n",
- dev->vid);
- return -1;
- }
-
- if (vq->last_used_idx != vq->used->idx) {
- RTE_LOG(WARNING, VHOST_CONFIG,
- "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
- "some packets maybe resent for Tx and dropped for Rx\n",
- vq->last_used_idx, vq->used->idx);
- vq->last_used_idx = vq->used->idx;
- }
-
- vq->log_guest_addr = addr->log_guest_addr;
-
- LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
- dev->vid, vq->desc);
- LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
- dev->vid, vq->avail);
- LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
- dev->vid, vq->used);
- LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
- dev->vid, vq->log_guest_addr);
-
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_BASE
- * The virtio device sends us the available ring last used index.
- */
-int
-vhost_set_vring_base(int vid, struct vhost_vring_state *state)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /* State->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[state->index]->last_used_idx = state->num;
-
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_GET_VRING_BASE
- * We send the virtio device our available ring last used index.
- */
-int
-vhost_get_vring_base(int vid, uint32_t index,
- struct vhost_vring_state *state)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- state->index = index;
- /* State->index refers to the queue index. The txq is 1, rxq is 0. */
- state->num = dev->virtqueue[state->index]->last_used_idx;
-
- return 0;
-}
-
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_CALL
- * The virtio device sends an eventfd to interrupt the guest. This fd gets
- * copied into our process space.
- */
-int
-vhost_set_vring_call(int vid, struct vhost_vring_file *file)
-{
- struct virtio_net *dev;
- struct vhost_virtqueue *vq;
- uint32_t cur_qp_idx = file->index / VIRTIO_QNUM;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /*
- * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
- * we get, so we do vring queue pair allocation here.
- */
- if (cur_qp_idx + 1 > dev->virt_qp_nb) {
- if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
- return -1;
- }
-
- /* file->index refers to the queue index. The txq is 1, rxq is 0. */
- vq = dev->virtqueue[file->index];
- assert(vq != NULL);
-
- if (vq->callfd >= 0)
- close(vq->callfd);
-
- vq->callfd = file->fd;
-
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_SET_VRING_KICK
- * The virtio device sends an eventfd that it can use to notify us.
- * This fd gets copied into our process space.
- */
-int
-vhost_set_vring_kick(int vid, struct vhost_vring_file *file)
-{
- struct virtio_net *dev;
- struct vhost_virtqueue *vq;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /* file->index refers to the queue index. The txq is 1, rxq is 0. */
- vq = dev->virtqueue[file->index];
-
- if (vq->kickfd >= 0)
- close(vq->kickfd);
-
- vq->kickfd = file->fd;
-
- return 0;
-}
-
-/*
- * Called from CUSE IOCTL: VHOST_NET_SET_BACKEND
- * To complete device initialisation when the virtio driver is loaded,
- * we are provided with a valid fd for a tap device (not used by us).
- * If this happens then we can add the device to a data core.
- * When the virtio driver is removed we get fd=-1.
- * At that point we remove the device from the data core.
- * The device will still exist in the device configuration linked list.
- */
-int
-vhost_set_backend(int vid, struct vhost_vring_file *file)
-{
- struct virtio_net *dev;
-
- dev = get_device(vid);
- if (dev == NULL)
- return -1;
-
- /* file->index refers to the queue index. The txq is 1, rxq is 0. */
- dev->virtqueue[file->index]->backend = file->fd;
-
- /*
- * If the device isn't already running and both backend fds are set,
- * we add the device.
- */
- if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
- if (dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED &&
- dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED) {
- if (notify_ops->new_device(vid) < 0)
- return -1;
- dev->flags |= VIRTIO_DEV_RUNNING;
- }
- } else if (file->fd == VIRTIO_DEV_STOPPED) {
- dev->flags &= ~VIRTIO_DEV_RUNNING;
- notify_ops->destroy_device(vid);
- }
-
- return 0;
-}
-
-int
-rte_vhost_get_numa_node(int vid)
-{
-#ifdef RTE_LIBRTE_VHOST_NUMA
- struct virtio_net *dev = get_device(vid);
- int numa_node;
- int ret;
-
- if (dev == NULL)
- return -1;
-
- ret = get_mempolicy(&numa_node, NULL, 0, dev,
- MPOL_F_NODE | MPOL_F_ADDR);
- if (ret < 0) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%d) failed to query numa node: %d\n", vid, ret);
- return -1;
- }
-
- return numa_node;
-#else
- RTE_SET_USED(vid);
- return -1;
-#endif
-}
-
-uint32_t
-rte_vhost_get_queue_num(int vid)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (dev == NULL)
- return 0;
-
- return dev->virt_qp_nb;
-}
-
-int
-rte_vhost_get_ifname(int vid, char *buf, size_t len)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (dev == NULL)
- return -1;
-
- len = RTE_MIN(len, sizeof(dev->ifname));
-
- strncpy(buf, dev->ifname, len);
- buf[len - 1] = '\0';
-
- return 0;
-}
-
-uint16_t
-rte_vhost_avail_entries(int vid, uint16_t queue_id)
-{
- struct virtio_net *dev;
- struct vhost_virtqueue *vq;
-
- dev = get_device(vid);
- if (!dev)
- return 0;
-
- vq = dev->virtqueue[queue_id];
- if (!vq->enabled)
- return 0;
-
- return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
-}
-
-int
-rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
-{
- struct virtio_net *dev = get_device(vid);
-
- if (dev == NULL)
- return -1;
-
- if (enable) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "guest notification isn't supported.\n");
- return -1;
- }
-
- dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
- return 0;
-}
-
-uint64_t rte_vhost_feature_get(void)
-{
- return VHOST_FEATURES;
-}
-
-int rte_vhost_feature_disable(uint64_t feature_mask)
-{
- VHOST_FEATURES = VHOST_FEATURES & ~feature_mask;
- return 0;
-}
-
-int rte_vhost_feature_enable(uint64_t feature_mask)
-{
- if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
- VHOST_FEATURES = VHOST_FEATURES | feature_mask;
- return 0;
- }
- return -1;
-}
-
-/*
- * Register ops so that we can add/remove device to data core.
- */
-int
-rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
-{
- notify_ops = ops;
-
- return 0;
-}
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
+
+#include <rte_mbuf.h>
+#include <rte_memcpy.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_virtio_net.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
+#include <rte_arp.h>
+
+#include "vhost.h"
+
+#define MAX_PKT_BURST 32
+#define VHOST_LOG_PAGE 4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+ log_base[page / 8] |= 1 << (page % 8);
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+ uint64_t page;
+
+ if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base || !len))
+ return;
+
+ if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+ return;
+
+ /* To make sure guest memory updates are committed before logging */
+ rte_smp_wmb();
+
+ page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len) {
+ vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+ page += 1;
+ }
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t offset, uint64_t len)
+{
+ vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
+
+static bool
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
+{
+ return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
+}
+
+static void
+virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
+{
+ if (m_buf->ol_flags & PKT_TX_L4_MASK) {
+ net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
+
+ switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
+ case PKT_TX_TCP_CKSUM:
+ net_hdr->csum_offset = (offsetof(struct tcp_hdr,
+ cksum));
+ break;
+ case PKT_TX_UDP_CKSUM:
+ net_hdr->csum_offset = (offsetof(struct udp_hdr,
+ dgram_cksum));
+ break;
+ case PKT_TX_SCTP_CKSUM:
+ net_hdr->csum_offset = (offsetof(struct sctp_hdr,
+ cksum));
+ break;
+ }
+ }
+
+ if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
+ if (m_buf->ol_flags & PKT_TX_IPV4)
+ net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else
+ net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ net_hdr->gso_size = m_buf->tso_segsz;
+ net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ + m_buf->l4_len;
+ }
+}
+
+static inline void
+copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
+ struct virtio_net_hdr_mrg_rxbuf hdr)
+{
+ if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
+ *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
+ else
+ *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+}
+
+static inline int __attribute__((always_inline))
+copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mbuf *m, uint16_t desc_idx)
+{
+ uint32_t desc_avail, desc_offset;
+ uint32_t mbuf_avail, mbuf_offset;
+ uint32_t cpy_len;
+ struct vring_desc *desc;
+ uint64_t desc_addr;
+ struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+
+ desc = &vq->desc[desc_idx];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ /*
+ * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
+ * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
+ * otherwise stores offset on the stack instead of in a register.
+ */
+ if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
+ return -1;
+
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ virtio_enqueue_offload(m, &virtio_hdr.hdr);
+ copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+ vhost_log_write(dev, desc->addr, dev->vhost_hlen);
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+ desc_offset = dev->vhost_hlen;
+ desc_avail = desc->len - dev->vhost_hlen;
+
+ mbuf_avail = rte_pktmbuf_data_len(m);
+ mbuf_offset = 0;
+ while (mbuf_avail != 0 || m->next != NULL) {
+ /* done with current mbuf, fetch next */
+ if (mbuf_avail == 0) {
+ m = m->next;
+
+ mbuf_offset = 0;
+ mbuf_avail = rte_pktmbuf_data_len(m);
+ }
+
+ /* done with current desc buf, fetch next */
+ if (desc_avail == 0) {
+ if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
+ /* Room in vring buffer is not enough */
+ return -1;
+ }
+ if (unlikely(desc->next >= vq->size))
+ return -1;
+
+ desc = &vq->desc[desc->next];
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ desc_offset = 0;
+ desc_avail = desc->len;
+ }
+
+ cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+ rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+ rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+ cpy_len);
+ vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
+ PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+ cpy_len, 0);
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ desc_avail -= cpy_len;
+ desc_offset += cpy_len;
+ }
+
+ return 0;
+}
+
+/**
+ * This function adds buffers to the virtio devices RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that are succesfully
+ * added to the RX queue. This function works when the mbuf is scattered, but
+ * it doesn't support the mergeable feature.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint32_t count)
+{
+ struct vhost_virtqueue *vq;
+ uint16_t avail_idx, free_entries, start_idx;
+ uint16_t desc_indexes[MAX_PKT_BURST];
+ uint16_t used_idx;
+ uint32_t i;
+
+ LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+ if (unlikely(vq->enabled == 0))
+ return 0;
+
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ start_idx = vq->last_used_idx;
+ free_entries = avail_idx - start_idx;
+ count = RTE_MIN(count, free_entries);
+ count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
+ if (count == 0)
+ return 0;
+
+ LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
+ dev->vid, start_idx, start_idx + count);
+
+ /* Retrieve all of the desc indexes first to avoid caching issues. */
+ rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
+ for (i = 0; i < count; i++) {
+ used_idx = (start_idx + i) & (vq->size - 1);
+ desc_indexes[i] = vq->avail->ring[used_idx];
+ vq->used->ring[used_idx].id = desc_indexes[i];
+ vq->used->ring[used_idx].len = pkts[i]->pkt_len +
+ dev->vhost_hlen;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
+ }
+
+ rte_prefetch0(&vq->desc[desc_indexes[0]]);
+ for (i = 0; i < count; i++) {
+ uint16_t desc_idx = desc_indexes[i];
+ int err;
+
+ err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
+ if (unlikely(err)) {
+ used_idx = (start_idx + i) & (vq->size - 1);
+ vq->used->ring[used_idx].len = dev->vhost_hlen;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
+ }
+
+ if (i + 1 < count)
+ rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
+ }
+
+ rte_smp_wmb();
+
+ *(volatile uint16_t *)&vq->used->idx += count;
+ vq->last_used_idx += count;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+
+ /* flush used->idx update before we read avail->flags. */
+ rte_mb();
+
+ /* Kick the guest if necessary. */
+ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && (vq->callfd >= 0))
+ eventfd_write(vq->callfd, (eventfd_t)1);
+ return count;
+}
+
+static inline int
+fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
+ uint32_t *allocated, uint32_t *vec_idx,
+ struct buf_vector *buf_vec)
+{
+ uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
+ uint32_t vec_id = *vec_idx;
+ uint32_t len = *allocated;
+
+ while (1) {
+ if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
+ return -1;
+
+ len += vq->desc[idx].len;
+ buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
+ buf_vec[vec_id].buf_len = vq->desc[idx].len;
+ buf_vec[vec_id].desc_idx = idx;
+ vec_id++;
+
+ if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
+ break;
+
+ idx = vq->desc[idx].next;
+ }
+
+ *allocated = len;
+ *vec_idx = vec_id;
+
+ return 0;
+}
+
+/*
+ * Returns -1 on fail, 0 on success
+ */
+static inline int
+reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
+ uint16_t *end, struct buf_vector *buf_vec)
+{
+ uint16_t cur_idx;
+ uint16_t avail_idx;
+ uint32_t allocated = 0;
+ uint32_t vec_idx = 0;
+ uint16_t tries = 0;
+
+ cur_idx = vq->last_used_idx;
+
+ while (1) {
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ if (unlikely(cur_idx == avail_idx))
+ return -1;
+
+ if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
+ &vec_idx, buf_vec) < 0))
+ return -1;
+
+ cur_idx++;
+ tries++;
+
+ if (allocated >= size)
+ break;
+
+ /*
+ * if we tried all available ring items, and still
+ * can't get enough buf, it means something abnormal
+ * happened.
+ */
+ if (unlikely(tries >= vq->size))
+ return -1;
+ }
+
+ *end = cur_idx;
+ return 0;
+}
+
+static inline uint32_t __attribute__((always_inline))
+copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint16_t end_idx, struct rte_mbuf *m,
+ struct buf_vector *buf_vec)
+{
+ struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+ uint32_t vec_idx = 0;
+ uint16_t start_idx = vq->last_used_idx;
+ uint16_t cur_idx = start_idx;
+ uint64_t desc_addr;
+ uint32_t mbuf_offset, mbuf_avail;
+ uint32_t desc_offset, desc_avail;
+ uint32_t cpy_len;
+ uint16_t desc_idx, used_idx;
+
+ if (unlikely(m == NULL))
+ return 0;
+
+ LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
+ dev->vid, cur_idx, end_idx);
+
+ desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+ if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+ return 0;
+
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ virtio_hdr.num_buffers = end_idx - start_idx;
+ LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
+ dev->vid, virtio_hdr.num_buffers);
+
+ virtio_enqueue_offload(m, &virtio_hdr.hdr);
+ copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
+ vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
+
+ desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+ desc_offset = dev->vhost_hlen;
+
+ mbuf_avail = rte_pktmbuf_data_len(m);
+ mbuf_offset = 0;
+ while (mbuf_avail != 0 || m->next != NULL) {
+ /* done with current desc buf, get the next one */
+ if (desc_avail == 0) {
+ desc_idx = buf_vec[vec_idx].desc_idx;
+
+ if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
+ /* Update used ring with desc information */
+ used_idx = cur_idx++ & (vq->size - 1);
+ vq->used->ring[used_idx].id = desc_idx;
+ vq->used->ring[used_idx].len = desc_offset;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used,
+ ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
+ }
+
+ vec_idx++;
+ desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
+ if (unlikely(!desc_addr))
+ return 0;
+
+ /* Prefetch buffer address. */
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+ desc_offset = 0;
+ desc_avail = buf_vec[vec_idx].buf_len;
+ }
+
+ /* done with current mbuf, get the next one */
+ if (mbuf_avail == 0) {
+ m = m->next;
+
+ mbuf_offset = 0;
+ mbuf_avail = rte_pktmbuf_data_len(m);
+ }
+
+ cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+ rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+ rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
+ cpy_len);
+ vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
+ cpy_len);
+ PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+ cpy_len, 0);
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ desc_avail -= cpy_len;
+ desc_offset += cpy_len;
+ }
+
+ used_idx = cur_idx & (vq->size - 1);
+ vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
+ vq->used->ring[used_idx].len = desc_offset;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
+
+ return end_idx - start_idx;
+}
+
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint32_t count)
+{
+ struct vhost_virtqueue *vq;
+ uint32_t pkt_idx = 0, nr_used = 0;
+ uint16_t end;
+ struct buf_vector buf_vec[BUF_VECTOR_MAX];
+
+ LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+ if (unlikely(vq->enabled == 0))
+ return 0;
+
+ count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+ if (count == 0)
+ return 0;
+
+ for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+ uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+
+ if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
+ &end, buf_vec) < 0)) {
+ LOG_DEBUG(VHOST_DATA,
+ "(%d) failed to get enough desc from vring\n",
+ dev->vid);
+ break;
+ }
+
+ nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
+ pkts[pkt_idx], buf_vec);
+ rte_smp_wmb();
+
+ *(volatile uint16_t *)&vq->used->idx += nr_used;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+ vq->last_used_idx += nr_used;
+ }
+
+ if (likely(pkt_idx)) {
+ /* flush used->idx update before we read avail->flags. */
+ rte_mb();
+
+ /* Kick the guest if necessary. */
+ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && (vq->callfd >= 0))
+ eventfd_write(vq->callfd, (eventfd_t)1);
+ }
+
+ return pkt_idx;
+}
+
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return 0;
+
+ if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+ return virtio_dev_merge_rx(dev, queue_id, pkts, count);
+ else
+ return virtio_dev_rx(dev, queue_id, pkts, count);
+}
+
+static void
+parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
+{
+ struct ipv4_hdr *ipv4_hdr;
+ struct ipv6_hdr *ipv6_hdr;
+ void *l3_hdr = NULL;
+ struct ether_hdr *eth_hdr;
+ uint16_t ethertype;
+
+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+ m->l2_len = sizeof(struct ether_hdr);
+ ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+
+ if (ethertype == ETHER_TYPE_VLAN) {
+ struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+
+ m->l2_len += sizeof(struct vlan_hdr);
+ ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+ }
+
+ l3_hdr = (char *)eth_hdr + m->l2_len;
+
+ switch (ethertype) {
+ case ETHER_TYPE_IPv4:
+ ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
+ *l4_proto = ipv4_hdr->next_proto_id;
+ m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
+ *l4_hdr = (char *)l3_hdr + m->l3_len;
+ m->ol_flags |= PKT_TX_IPV4;
+ break;
+ case ETHER_TYPE_IPv6:
+ ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
+ *l4_proto = ipv6_hdr->proto;
+ m->l3_len = sizeof(struct ipv6_hdr);
+ *l4_hdr = (char *)l3_hdr + m->l3_len;
+ m->ol_flags |= PKT_TX_IPV6;
+ break;
+ default:
+ m->l3_len = 0;
+ *l4_proto = 0;
+ break;
+ }
+}
+
+static inline void __attribute__((always_inline))
+vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
+{
+ uint16_t l4_proto = 0;
+ void *l4_hdr = NULL;
+ struct tcp_hdr *tcp_hdr = NULL;
+
+ parse_ethernet(m, &l4_proto, &l4_hdr);
+ if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (hdr->csum_start == (m->l2_len + m->l3_len)) {
+ switch (hdr->csum_offset) {
+ case (offsetof(struct tcp_hdr, cksum)):
+ if (l4_proto == IPPROTO_TCP)
+ m->ol_flags |= PKT_TX_TCP_CKSUM;
+ break;
+ case (offsetof(struct udp_hdr, dgram_cksum)):
+ if (l4_proto == IPPROTO_UDP)
+ m->ol_flags |= PKT_TX_UDP_CKSUM;
+ break;
+ case (offsetof(struct sctp_hdr, cksum)):
+ if (l4_proto == IPPROTO_SCTP)
+ m->ol_flags |= PKT_TX_SCTP_CKSUM;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ tcp_hdr = (struct tcp_hdr *)l4_hdr;
+ m->ol_flags |= PKT_TX_TCP_SEG;
+ m->tso_segsz = hdr->gso_size;
+ m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+ break;
+ default:
+ RTE_LOG(WARNING, VHOST_DATA,
+ "unsupported gso type %u.\n", hdr->gso_type);
+ break;
+ }
+ }
+}
+
+#define RARP_PKT_SIZE 64
+
+static int
+make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
+{
+ struct ether_hdr *eth_hdr;
+ struct arp_hdr *rarp;
+
+ if (rarp_mbuf->buf_len < 64) {
+ RTE_LOG(WARNING, VHOST_DATA,
+ "failed to make RARP; mbuf size too small %u (< %d)\n",
+ rarp_mbuf->buf_len, RARP_PKT_SIZE);
+ return -1;
+ }
+
+ /* Ethernet header. */
+ eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
+ memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
+ ether_addr_copy(mac, ð_hdr->s_addr);
+ eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
+
+ /* RARP header. */
+ rarp = (struct arp_hdr *)(eth_hdr + 1);
+ rarp->arp_hrd = htons(ARP_HRD_ETHER);
+ rarp->arp_pro = htons(ETHER_TYPE_IPv4);
+ rarp->arp_hln = ETHER_ADDR_LEN;
+ rarp->arp_pln = 4;
+ rarp->arp_op = htons(ARP_OP_REVREQUEST);
+
+ ether_addr_copy(mac, &rarp->arp_data.arp_sha);
+ ether_addr_copy(mac, &rarp->arp_data.arp_tha);
+ memset(&rarp->arp_data.arp_sip, 0x00, 4);
+ memset(&rarp->arp_data.arp_tip, 0x00, 4);
+
+ rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;
+
+ return 0;
+}
+
+static inline int __attribute__((always_inline))
+copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct rte_mbuf *m, uint16_t desc_idx,
+ struct rte_mempool *mbuf_pool)
+{
+ struct vring_desc *desc;
+ uint64_t desc_addr;
+ uint32_t desc_avail, desc_offset;
+ uint32_t mbuf_avail, mbuf_offset;
+ uint32_t cpy_len;
+ struct rte_mbuf *cur = m, *prev = m;
+ struct virtio_net_hdr *hdr;
+ /* A counter to avoid desc dead loop chain */
+ uint32_t nr_desc = 1;
+
+ desc = &vq->desc[desc_idx];
+ if (unlikely(desc->len < dev->vhost_hlen))
+ return -1;
+
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+ rte_prefetch0(hdr);
+
+ /*
+ * A virtio driver normally uses at least 2 desc buffers
+ * for Tx: the first for storing the header, and others
+ * for storing the data.
+ */
+ if (likely((desc->len == dev->vhost_hlen) &&
+ (desc->flags & VRING_DESC_F_NEXT) != 0)) {
+ desc = &vq->desc[desc->next];
+
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ desc_offset = 0;
+ desc_avail = desc->len;
+ nr_desc += 1;
+
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+ } else {
+ desc_avail = desc->len - dev->vhost_hlen;
+ desc_offset = dev->vhost_hlen;
+ }
+
+ mbuf_offset = 0;
+ mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
+ while (1) {
+ cpy_len = RTE_MIN(desc_avail, mbuf_avail);
+ rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, mbuf_offset),
+ (void *)((uintptr_t)(desc_addr + desc_offset)),
+ cpy_len);
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ desc_avail -= cpy_len;
+ desc_offset += cpy_len;
+
+ /* This desc reaches to its end, get the next one */
+ if (desc_avail == 0) {
+ if ((desc->flags & VRING_DESC_F_NEXT) == 0)
+ break;
+
+ if (unlikely(desc->next >= vq->size ||
+ ++nr_desc > vq->size))
+ return -1;
+ desc = &vq->desc[desc->next];
+
+ desc_addr = gpa_to_vva(dev, desc->addr);
+ if (unlikely(!desc_addr))
+ return -1;
+
+ rte_prefetch0((void *)(uintptr_t)desc_addr);
+
+ desc_offset = 0;
+ desc_avail = desc->len;
+
+ PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
+ }
+
+ /*
+ * This mbuf reaches to its end, get a new one
+ * to hold more data.
+ */
+ if (mbuf_avail == 0) {
+ cur = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(cur == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA, "Failed to "
+ "allocate memory for mbuf.\n");
+ return -1;
+ }
+
+ prev->next = cur;
+ prev->data_len = mbuf_offset;
+ m->nb_segs += 1;
+ m->pkt_len += mbuf_offset;
+ prev = cur;
+
+ mbuf_offset = 0;
+ mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+ }
+ }
+
+ prev->data_len = mbuf_offset;
+ m->pkt_len += mbuf_offset;
+
+ if (hdr->flags != 0 || hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
+ vhost_dequeue_offload(hdr, m);
+
+ return 0;
+}
+
+uint16_t
+rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+{
+ struct virtio_net *dev;
+ struct rte_mbuf *rarp_mbuf = NULL;
+ struct vhost_virtqueue *vq;
+ uint32_t desc_indexes[MAX_PKT_BURST];
+ uint32_t used_idx;
+ uint32_t i = 0;
+ uint16_t free_entries;
+ uint16_t avail_idx;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
+ RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+ if (unlikely(vq->enabled == 0))
+ return 0;
+
+ /*
+ * Construct a RARP broadcast packet, and inject it to the "pkts"
+ * array, to looks like that guest actually send such packet.
+ *
+ * Check user_send_rarp() for more information.
+ */
+ if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
+ &dev->broadcast_rarp.cnt, 1, 0))) {
+ rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+ if (rarp_mbuf == NULL) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to allocate memory for mbuf.\n");
+ return 0;
+ }
+
+ if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
+ rte_pktmbuf_free(rarp_mbuf);
+ rarp_mbuf = NULL;
+ } else {
+ count -= 1;
+ }
+ }
+
+ avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+ free_entries = avail_idx - vq->last_used_idx;
+ if (free_entries == 0)
+ goto out;
+
+ LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
+
+ /* Prefetch available ring to retrieve head indexes. */
+ used_idx = vq->last_used_idx & (vq->size - 1);
+ rte_prefetch0(&vq->avail->ring[used_idx]);
+ rte_prefetch0(&vq->used->ring[used_idx]);
+
+ count = RTE_MIN(count, MAX_PKT_BURST);
+ count = RTE_MIN(count, free_entries);
+ LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
+ dev->vid, count);
+
+ /* Retrieve all of the head indexes first to avoid caching issues. */
+ for (i = 0; i < count; i++) {
+ used_idx = (vq->last_used_idx + i) & (vq->size - 1);
+ desc_indexes[i] = vq->avail->ring[used_idx];
+
+ vq->used->ring[used_idx].id = desc_indexes[i];
+ vq->used->ring[used_idx].len = 0;
+ vhost_log_used_vring(dev, vq,
+ offsetof(struct vring_used, ring[used_idx]),
+ sizeof(vq->used->ring[used_idx]));
+ }
+
+ /* Prefetch descriptor index. */
+ rte_prefetch0(&vq->desc[desc_indexes[0]]);
+ for (i = 0; i < count; i++) {
+ int err;
+
+ if (likely(i + 1 < count))
+ rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
+
+ pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(pkts[i] == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA,
+ "Failed to allocate memory for mbuf.\n");
+ break;
+ }
+ err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
+ mbuf_pool);
+ if (unlikely(err)) {
+ rte_pktmbuf_free(pkts[i]);
+ break;
+ }
+ }
+
+ rte_smp_wmb();
+ rte_smp_rmb();
+ vq->used->idx += i;
+ vq->last_used_idx += i;
+ vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+ sizeof(vq->used->idx));
+
+ /* Kick guest if required. */
+ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ && (vq->callfd >= 0))
+ eventfd_write(vq->callfd, (eventfd_t)1);
+
+out:
+ if (unlikely(rarp_mbuf != NULL)) {
+ /*
+ * Inject it to the head of "pkts" array, so that switch's mac
+ * learning table will get updated first.
+ */
+ memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
+ pkts[0] = rarp_mbuf;
+ i += 1;
+ }
+
+ return i;
+}