0a66ef9767c67533162eab31c9012bbfbe2c1a8b
[dpdk.git] / lib / librte_vhost / socket.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <limits.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <sys/types.h>
12 #include <sys/socket.h>
13 #include <sys/un.h>
14 #include <sys/queue.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <pthread.h>
18
19 #include <rte_log.h>
20
21 #include "fd_man.h"
22 #include "vhost.h"
23 #include "vhost_user.h"
24
25
26 TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); /* head type for a socket's list of connections */
27
28 /*
29  * Every time rte_vhost_driver_register() is invoked, an associated
30  * vhost_user_socket struct will be created.
31  */
32 struct vhost_user_socket {
33         struct vhost_user_connection_list conn_list; /* live connections; modified under conn_mutex */
34         pthread_mutex_t conn_mutex; /* protects conn_list */
35         char *path; /* heap copy (strdup) of the path given at registration */
36         int socket_fd; /* set by create_unix_socket() */
37         struct sockaddr_un un; /* AF_UNIX address built from path by create_unix_socket() */
38         bool is_server; /* server sockets are left blocking; client sockets are made non-blocking */
39         bool reconnect; /* client mode: retry the connection after a failure or disconnect */
40         bool dequeue_zero_copy; /* applied to every new device of this socket */
41         bool iommu_support;
42         bool use_builtin_virtio_net; /* cleared by rte_vhost_driver_set_features() */
43         bool extbuf; /* applied to every new device of this socket */
44         bool linearbuf; /* applied to every new device of this socket */
45
46         /*
47          * The "supported_features" indicates the feature bits the
48          * vhost driver supports. The "features" indicates the feature
49          * bits after the rte_vhost_driver_features_disable/enable().
50          * It is also the final feature bits used for vhost-user
51          * features negotiation.
52          */
53         uint64_t supported_features;
54         uint64_t features;
55
56         uint64_t protocol_features;
57
58         /*
59          * Device id to identify a specific backend device.
60          * It's set to -1 for the default software implementation.
61          * If valid, one socket can have 1 connection only.
62          */
63         int vdpa_dev_id;
64
65         struct vhost_device_ops const *notify_ops; /* callbacks, e.g. new_connection / destroy_connection */
66 };
67
68 struct vhost_user_connection { /* one established vhost-user connection on a socket */
69         struct vhost_user_socket *vsocket; /* socket this connection belongs to */
70         int connfd; /* connected descriptor registered in the fdset */
71         int vid; /* device handle returned by vhost_new_device() */
72
73         TAILQ_ENTRY(vhost_user_connection) next; /* linkage in vsocket->conn_list */
74 };
75
76 #define MAX_VHOST_SOCKET 1024 /* capacity of vhost_user.vsockets[] */
77 struct vhost_user { /* file-wide registry of registered sockets */
78         struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; /* first vsocket_cnt entries are in use */
79         struct fdset fdset; /* listen and connection descriptors with their callbacks */
80         int vsocket_cnt; /* number of used entries in vsockets[] */
81         pthread_mutex_t mutex; /* held by the rte_vhost_driver_*() accessors while they look up a socket */
82 };
83
84 #define MAX_VIRTIO_BACKLOG 128 /* backlog passed to listen() in vhost_user_start_server() */
85
86 static void vhost_user_server_new_connection(int fd, void *data, int *remove); /* fdset callback for the listen fd */
87 static void vhost_user_read_cb(int fd, void *dat, int *remove); /* fdset callback for a connection fd */
88 static int create_unix_socket(struct vhost_user_socket *vsocket); /* forward declaration: used by vhost_user_read_cb() */
89 static int vhost_user_start_client(struct vhost_user_socket *vsocket); /* forward declaration: used by vhost_user_read_cb() */
90
91 static struct vhost_user vhost_user = { /* the single registry instance used by this file */
92         .fdset = {
93                 .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, /* range initializer: every slot starts with fd -1 */
94                 .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
95                 .fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
96                 .num = 0
97         },
98         .vsocket_cnt = 0, /* no socket registered yet */
99         .mutex = PTHREAD_MUTEX_INITIALIZER,
100 };
101
102 /*
103  * return bytes# of read on success or negative val on failure. Update fdnum
104  * with number of fds read.
105  */
106 int
107 read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
108                 int *fd_num)
109 {
110         struct iovec iov;
111         struct msghdr msgh;
112         char control[CMSG_SPACE(max_fds * sizeof(int))];
113         struct cmsghdr *cmsg;
114         int got_fds = 0;
115         int ret;
116
117         *fd_num = 0;
118
119         memset(&msgh, 0, sizeof(msgh));
120         iov.iov_base = buf;
121         iov.iov_len  = buflen;
122
123         msgh.msg_iov = &iov;
124         msgh.msg_iovlen = 1;
125         msgh.msg_control = control;
126         msgh.msg_controllen = sizeof(control);
127
128         ret = recvmsg(sockfd, &msgh, 0);
129         if (ret <= 0) {
130                 if (ret)
131                         VHOST_LOG_CONFIG(ERR, "recvmsg failed\n");
132                 return ret;
133         }
134
135         if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
136                 VHOST_LOG_CONFIG(ERR, "truncated msg\n");
137                 return -1;
138         }
139
140         for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
141                 cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
142                 if ((cmsg->cmsg_level == SOL_SOCKET) &&
143                         (cmsg->cmsg_type == SCM_RIGHTS)) {
144                         got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
145                         *fd_num = got_fds;
146                         memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
147                         break;
148                 }
149         }
150
151         /* Clear out unused file descriptors */
152         while (got_fds < max_fds)
153                 fds[got_fds++] = -1;
154
155         return ret;
156 }
157
158 int
159 send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
160 {
161
162         struct iovec iov;
163         struct msghdr msgh;
164         size_t fdsize = fd_num * sizeof(int);
165         char control[CMSG_SPACE(fdsize)];
166         struct cmsghdr *cmsg;
167         int ret;
168
169         memset(&msgh, 0, sizeof(msgh));
170         iov.iov_base = buf;
171         iov.iov_len = buflen;
172
173         msgh.msg_iov = &iov;
174         msgh.msg_iovlen = 1;
175
176         if (fds && fd_num > 0) {
177                 msgh.msg_control = control;
178                 msgh.msg_controllen = sizeof(control);
179                 cmsg = CMSG_FIRSTHDR(&msgh);
180                 if (cmsg == NULL) {
181                         VHOST_LOG_CONFIG(ERR, "cmsg == NULL\n");
182                         errno = EINVAL;
183                         return -1;
184                 }
185                 cmsg->cmsg_len = CMSG_LEN(fdsize);
186                 cmsg->cmsg_level = SOL_SOCKET;
187                 cmsg->cmsg_type = SCM_RIGHTS;
188                 memcpy(CMSG_DATA(cmsg), fds, fdsize);
189         } else {
190                 msgh.msg_control = NULL;
191                 msgh.msg_controllen = 0;
192         }
193
194         do {
195                 ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
196         } while (ret < 0 && errno == EINTR);
197
198         if (ret < 0) {
199                 VHOST_LOG_CONFIG(ERR,  "sendmsg error\n");
200                 return ret;
201         }
202
203         return ret;
204 }
205
206 static void
207 vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
208 {
209         int vid;
210         size_t size;
211         struct vhost_user_connection *conn;
212         int ret;
213
214         if (vsocket == NULL)
215                 return;
216
217         conn = malloc(sizeof(*conn));
218         if (conn == NULL) {
219                 close(fd);
220                 return;
221         }
222
223         vid = vhost_new_device();
224         if (vid == -1) {
225                 goto err;
226         }
227
228         size = strnlen(vsocket->path, PATH_MAX);
229         vhost_set_ifname(vid, vsocket->path, size);
230
231         vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);
232
233         vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id);
234
235         if (vsocket->dequeue_zero_copy)
236                 vhost_enable_dequeue_zero_copy(vid);
237
238         if (vsocket->extbuf)
239                 vhost_enable_extbuf(vid);
240
241         if (vsocket->linearbuf)
242                 vhost_enable_linearbuf(vid);
243
244         VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
245
246         if (vsocket->notify_ops->new_connection) {
247                 ret = vsocket->notify_ops->new_connection(vid);
248                 if (ret < 0) {
249                         VHOST_LOG_CONFIG(ERR,
250                                 "failed to add vhost user connection with fd %d\n",
251                                 fd);
252                         goto err_cleanup;
253                 }
254         }
255
256         conn->connfd = fd;
257         conn->vsocket = vsocket;
258         conn->vid = vid;
259         ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
260                         NULL, conn);
261         if (ret < 0) {
262                 VHOST_LOG_CONFIG(ERR,
263                         "failed to add fd %d into vhost server fdset\n",
264                         fd);
265
266                 if (vsocket->notify_ops->destroy_connection)
267                         vsocket->notify_ops->destroy_connection(conn->vid);
268
269                 goto err_cleanup;
270         }
271
272         pthread_mutex_lock(&vsocket->conn_mutex);
273         TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
274         pthread_mutex_unlock(&vsocket->conn_mutex);
275
276         fdset_pipe_notify(&vhost_user.fdset);
277         return;
278
279 err_cleanup:
280         vhost_destroy_device(vid);
281 err:
282         free(conn);
283         close(fd);
284 }
285
286 /* call back when there is new vhost-user connection from client  */
287 static void
288 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
289 {
290         struct vhost_user_socket *vsocket = dat;
291
292         fd = accept(fd, NULL, NULL);
293         if (fd < 0)
294                 return;
295
296         VHOST_LOG_CONFIG(INFO, "new vhost user connection is %d\n", fd);
297         vhost_user_add_connection(fd, vsocket);
298 }
299
300 static void
301 vhost_user_read_cb(int connfd, void *dat, int *remove)
302 {
303         struct vhost_user_connection *conn = dat;
304         struct vhost_user_socket *vsocket = conn->vsocket;
305         int ret;
306
307         ret = vhost_user_msg_handler(conn->vid, connfd);
308         if (ret < 0) {
309                 struct virtio_net *dev = get_device(conn->vid);
310
311                 close(connfd);
312                 *remove = 1;
313
314                 if (dev)
315                         vhost_destroy_device_notify(dev);
316
317                 if (vsocket->notify_ops->destroy_connection)
318                         vsocket->notify_ops->destroy_connection(conn->vid);
319
320                 vhost_destroy_device(conn->vid);
321
322                 if (vsocket->reconnect) {
323                         create_unix_socket(vsocket);
324                         vhost_user_start_client(vsocket);
325                 }
326
327                 pthread_mutex_lock(&vsocket->conn_mutex);
328                 TAILQ_REMOVE(&vsocket->conn_list, conn, next);
329                 pthread_mutex_unlock(&vsocket->conn_mutex);
330
331                 free(conn);
332         }
333 }
334
335 static int
336 create_unix_socket(struct vhost_user_socket *vsocket)
337 {
338         int fd;
339         struct sockaddr_un *un = &vsocket->un;
340
341         fd = socket(AF_UNIX, SOCK_STREAM, 0);
342         if (fd < 0)
343                 return -1;
344         VHOST_LOG_CONFIG(INFO, "vhost-user %s: socket created, fd: %d\n",
345                 vsocket->is_server ? "server" : "client", fd);
346
347         if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
348                 VHOST_LOG_CONFIG(ERR,
349                         "vhost-user: can't set nonblocking mode for socket, fd: "
350                         "%d (%s)\n", fd, strerror(errno));
351                 close(fd);
352                 return -1;
353         }
354
355         memset(un, 0, sizeof(*un));
356         un->sun_family = AF_UNIX;
357         strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
358         un->sun_path[sizeof(un->sun_path) - 1] = '\0';
359
360         vsocket->socket_fd = fd;
361         return 0;
362 }
363
364 static int
365 vhost_user_start_server(struct vhost_user_socket *vsocket)
366 {
367         int ret;
368         int fd = vsocket->socket_fd;
369         const char *path = vsocket->path;
370
371         /*
372          * bind () may fail if the socket file with the same name already
373          * exists. But the library obviously should not delete the file
374          * provided by the user, since we can not be sure that it is not
375          * being used by other applications. Moreover, many applications form
376          * socket names based on user input, which is prone to errors.
377          *
378          * The user must ensure that the socket does not exist before
379          * registering the vhost driver in server mode.
380          */
381         ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
382         if (ret < 0) {
383                 VHOST_LOG_CONFIG(ERR,
384                         "failed to bind to %s: %s; remove it and try again\n",
385                         path, strerror(errno));
386                 goto err;
387         }
388         VHOST_LOG_CONFIG(INFO, "bind to %s\n", path);
389
390         ret = listen(fd, MAX_VIRTIO_BACKLOG);
391         if (ret < 0)
392                 goto err;
393
394         ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
395                   NULL, vsocket);
396         if (ret < 0) {
397                 VHOST_LOG_CONFIG(ERR,
398                         "failed to add listen fd %d to vhost server fdset\n",
399                         fd);
400                 goto err;
401         }
402
403         return 0;
404
405 err:
406         close(fd);
407         return -1;
408 }
409
410 struct vhost_user_reconnect { /* one pending client connection attempt */
411         struct sockaddr_un un; /* copy of the socket's address to connect to */
412         int fd; /* descriptor the connect is retried on */
413         struct vhost_user_socket *vsocket; /* socket the attempt belongs to */
414
415         TAILQ_ENTRY(vhost_user_reconnect) next; /* linkage in reconn_list.head */
416 };
417
418 TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
419 struct vhost_user_reconnect_list {
420         struct vhost_user_reconnect_tailq_list head; /* pending attempts */
421         pthread_mutex_t mutex; /* protects head */
422 };
423
424 static struct vhost_user_reconnect_list reconn_list; /* set up by vhost_user_reconnect_init() */
425 static pthread_t reconn_tid; /* thread running vhost_user_client_reconnect() */
426
427 static int
428 vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
429 {
430         int ret, flags;
431
432         ret = connect(fd, un, sz);
433         if (ret < 0 && errno != EISCONN)
434                 return -1;
435
436         flags = fcntl(fd, F_GETFL, 0);
437         if (flags < 0) {
438                 VHOST_LOG_CONFIG(ERR,
439                         "can't get flags for connfd %d\n", fd);
440                 return -2;
441         }
442         if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
443                 VHOST_LOG_CONFIG(ERR,
444                                 "can't disable nonblocking on fd %d\n", fd);
445                 return -2;
446         }
447         return 0;
448 }
449
450 static void *
451 vhost_user_client_reconnect(void *arg __rte_unused)
452 {
453         int ret;
454         struct vhost_user_reconnect *reconn, *next;
455
456         while (1) {
457                 pthread_mutex_lock(&reconn_list.mutex);
458
459                 /*
460                  * An equal implementation of TAILQ_FOREACH_SAFE,
461                  * which does not exist on all platforms.
462                  */
463                 for (reconn = TAILQ_FIRST(&reconn_list.head);
464                      reconn != NULL; reconn = next) {
465                         next = TAILQ_NEXT(reconn, next);
466
467                         ret = vhost_user_connect_nonblock(reconn->fd,
468                                                 (struct sockaddr *)&reconn->un,
469                                                 sizeof(reconn->un));
470                         if (ret == -2) {
471                                 close(reconn->fd);
472                                 VHOST_LOG_CONFIG(ERR,
473                                         "reconnection for fd %d failed\n",
474                                         reconn->fd);
475                                 goto remove_fd;
476                         }
477                         if (ret == -1)
478                                 continue;
479
480                         VHOST_LOG_CONFIG(INFO,
481                                 "%s: connected\n", reconn->vsocket->path);
482                         vhost_user_add_connection(reconn->fd, reconn->vsocket);
483 remove_fd:
484                         TAILQ_REMOVE(&reconn_list.head, reconn, next);
485                         free(reconn);
486                 }
487
488                 pthread_mutex_unlock(&reconn_list.mutex);
489                 sleep(1);
490         }
491
492         return NULL;
493 }
494
495 static int
496 vhost_user_reconnect_init(void)
497 {
498         int ret;
499
500         ret = pthread_mutex_init(&reconn_list.mutex, NULL);
501         if (ret < 0) {
502                 VHOST_LOG_CONFIG(ERR, "failed to initialize mutex");
503                 return ret;
504         }
505         TAILQ_INIT(&reconn_list.head);
506
507         ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
508                              vhost_user_client_reconnect, NULL);
509         if (ret != 0) {
510                 VHOST_LOG_CONFIG(ERR, "failed to create reconnect thread");
511                 if (pthread_mutex_destroy(&reconn_list.mutex)) {
512                         VHOST_LOG_CONFIG(ERR,
513                                 "failed to destroy reconnect mutex");
514                 }
515         }
516
517         return ret;
518 }
519
520 static int
521 vhost_user_start_client(struct vhost_user_socket *vsocket)
522 {
523         int ret;
524         int fd = vsocket->socket_fd;
525         const char *path = vsocket->path;
526         struct vhost_user_reconnect *reconn;
527
528         ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
529                                           sizeof(vsocket->un));
530         if (ret == 0) {
531                 vhost_user_add_connection(fd, vsocket);
532                 return 0;
533         }
534
535         VHOST_LOG_CONFIG(WARNING,
536                 "failed to connect to %s: %s\n",
537                 path, strerror(errno));
538
539         if (ret == -2 || !vsocket->reconnect) {
540                 close(fd);
541                 return -1;
542         }
543
544         VHOST_LOG_CONFIG(INFO, "%s: reconnecting...\n", path);
545         reconn = malloc(sizeof(*reconn));
546         if (reconn == NULL) {
547                 VHOST_LOG_CONFIG(ERR,
548                         "failed to allocate memory for reconnect\n");
549                 close(fd);
550                 return -1;
551         }
552         reconn->un = vsocket->un;
553         reconn->fd = fd;
554         reconn->vsocket = vsocket;
555         pthread_mutex_lock(&reconn_list.mutex);
556         TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
557         pthread_mutex_unlock(&reconn_list.mutex);
558
559         return 0;
560 }
561
562 static struct vhost_user_socket *
563 find_vhost_user_socket(const char *path)
564 {
565         int i;
566
567         if (path == NULL)
568                 return NULL;
569
570         for (i = 0; i < vhost_user.vsocket_cnt; i++) {
571                 struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
572
573                 if (!strcmp(vsocket->path, path))
574                         return vsocket;
575         }
576
577         return NULL;
578 }
579
580 int
581 rte_vhost_driver_attach_vdpa_device(const char *path, int did)
582 {
583         struct vhost_user_socket *vsocket;
584
585         if (rte_vdpa_get_device(did) == NULL || path == NULL)
586                 return -1;
587
588         pthread_mutex_lock(&vhost_user.mutex);
589         vsocket = find_vhost_user_socket(path);
590         if (vsocket)
591                 vsocket->vdpa_dev_id = did;
592         pthread_mutex_unlock(&vhost_user.mutex);
593
594         return vsocket ? 0 : -1;
595 }
596
597 int
598 rte_vhost_driver_detach_vdpa_device(const char *path)
599 {
600         struct vhost_user_socket *vsocket;
601
602         pthread_mutex_lock(&vhost_user.mutex);
603         vsocket = find_vhost_user_socket(path);
604         if (vsocket)
605                 vsocket->vdpa_dev_id = -1;
606         pthread_mutex_unlock(&vhost_user.mutex);
607
608         return vsocket ? 0 : -1;
609 }
610
611 int
612 rte_vhost_driver_get_vdpa_device_id(const char *path)
613 {
614         struct vhost_user_socket *vsocket;
615         int did = -1;
616
617         pthread_mutex_lock(&vhost_user.mutex);
618         vsocket = find_vhost_user_socket(path);
619         if (vsocket)
620                 did = vsocket->vdpa_dev_id;
621         pthread_mutex_unlock(&vhost_user.mutex);
622
623         return did;
624 }
625
626 int
627 rte_vhost_driver_disable_features(const char *path, uint64_t features)
628 {
629         struct vhost_user_socket *vsocket;
630
631         pthread_mutex_lock(&vhost_user.mutex);
632         vsocket = find_vhost_user_socket(path);
633
634         /* Note that use_builtin_virtio_net is not affected by this function
635          * since callers may want to selectively disable features of the
636          * built-in vhost net device backend.
637          */
638
639         if (vsocket)
640                 vsocket->features &= ~features;
641         pthread_mutex_unlock(&vhost_user.mutex);
642
643         return vsocket ? 0 : -1;
644 }
645
646 int
647 rte_vhost_driver_enable_features(const char *path, uint64_t features)
648 {
649         struct vhost_user_socket *vsocket;
650
651         pthread_mutex_lock(&vhost_user.mutex);
652         vsocket = find_vhost_user_socket(path);
653         if (vsocket) {
654                 if ((vsocket->supported_features & features) != features) {
655                         /*
656                          * trying to enable features the driver doesn't
657                          * support.
658                          */
659                         pthread_mutex_unlock(&vhost_user.mutex);
660                         return -1;
661                 }
662                 vsocket->features |= features;
663         }
664         pthread_mutex_unlock(&vhost_user.mutex);
665
666         return vsocket ? 0 : -1;
667 }
668
669 int
670 rte_vhost_driver_set_features(const char *path, uint64_t features)
671 {
672         struct vhost_user_socket *vsocket;
673
674         pthread_mutex_lock(&vhost_user.mutex);
675         vsocket = find_vhost_user_socket(path);
676         if (vsocket) {
677                 vsocket->supported_features = features;
678                 vsocket->features = features;
679
680                 /* Anyone setting feature bits is implementing their own vhost
681                  * device backend.
682                  */
683                 vsocket->use_builtin_virtio_net = false;
684         }
685         pthread_mutex_unlock(&vhost_user.mutex);
686
687         return vsocket ? 0 : -1;
688 }
689
690 int
691 rte_vhost_driver_get_features(const char *path, uint64_t *features)
692 {
693         struct vhost_user_socket *vsocket;
694         uint64_t vdpa_features;
695         struct rte_vdpa_device *vdpa_dev;
696         int did = -1;
697         int ret = 0;
698
699         pthread_mutex_lock(&vhost_user.mutex);
700         vsocket = find_vhost_user_socket(path);
701         if (!vsocket) {
702                 VHOST_LOG_CONFIG(ERR,
703                         "socket file %s is not registered yet.\n", path);
704                 ret = -1;
705                 goto unlock_exit;
706         }
707
708         did = vsocket->vdpa_dev_id;
709         vdpa_dev = rte_vdpa_get_device(did);
710         if (!vdpa_dev || !vdpa_dev->ops->get_features) {
711                 *features = vsocket->features;
712                 goto unlock_exit;
713         }
714
715         if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) {
716                 VHOST_LOG_CONFIG(ERR,
717                                 "failed to get vdpa features "
718                                 "for socket file %s.\n", path);
719                 ret = -1;
720                 goto unlock_exit;
721         }
722
723         *features = vsocket->features & vdpa_features;
724
725 unlock_exit:
726         pthread_mutex_unlock(&vhost_user.mutex);
727         return ret;
728 }
729
730 int
731 rte_vhost_driver_set_protocol_features(const char *path,
732                 uint64_t protocol_features)
733 {
734         struct vhost_user_socket *vsocket;
735
736         pthread_mutex_lock(&vhost_user.mutex);
737         vsocket = find_vhost_user_socket(path);
738         if (vsocket)
739                 vsocket->protocol_features = protocol_features;
740         pthread_mutex_unlock(&vhost_user.mutex);
741         return vsocket ? 0 : -1;
742 }
743
744 int
745 rte_vhost_driver_get_protocol_features(const char *path,
746                 uint64_t *protocol_features)
747 {
748         struct vhost_user_socket *vsocket;
749         uint64_t vdpa_protocol_features;
750         struct rte_vdpa_device *vdpa_dev;
751         int did = -1;
752         int ret = 0;
753
754         pthread_mutex_lock(&vhost_user.mutex);
755         vsocket = find_vhost_user_socket(path);
756         if (!vsocket) {
757                 VHOST_LOG_CONFIG(ERR,
758                         "socket file %s is not registered yet.\n", path);
759                 ret = -1;
760                 goto unlock_exit;
761         }
762
763         did = vsocket->vdpa_dev_id;
764         vdpa_dev = rte_vdpa_get_device(did);
765         if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) {
766                 *protocol_features = vsocket->protocol_features;
767                 goto unlock_exit;
768         }
769
770         if (vdpa_dev->ops->get_protocol_features(did,
771                                 &vdpa_protocol_features) < 0) {
772                 VHOST_LOG_CONFIG(ERR,
773                                 "failed to get vdpa protocol features "
774                                 "for socket file %s.\n", path);
775                 ret = -1;
776                 goto unlock_exit;
777         }
778
779         *protocol_features = vsocket->protocol_features
780                 & vdpa_protocol_features;
781
782 unlock_exit:
783         pthread_mutex_unlock(&vhost_user.mutex);
784         return ret;
785 }
786
787 int
788 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
789 {
790         struct vhost_user_socket *vsocket;
791         uint32_t vdpa_queue_num;
792         struct rte_vdpa_device *vdpa_dev;
793         int did = -1;
794         int ret = 0;
795
796         pthread_mutex_lock(&vhost_user.mutex);
797         vsocket = find_vhost_user_socket(path);
798         if (!vsocket) {
799                 VHOST_LOG_CONFIG(ERR,
800                         "socket file %s is not registered yet.\n", path);
801                 ret = -1;
802                 goto unlock_exit;
803         }
804
805         did = vsocket->vdpa_dev_id;
806         vdpa_dev = rte_vdpa_get_device(did);
807         if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) {
808                 *queue_num = VHOST_MAX_QUEUE_PAIRS;
809                 goto unlock_exit;
810         }
811
812         if (vdpa_dev->ops->get_queue_num(did, &vdpa_queue_num) < 0) {
813                 VHOST_LOG_CONFIG(ERR,
814                                 "failed to get vdpa queue number "
815                                 "for socket file %s.\n", path);
816                 ret = -1;
817                 goto unlock_exit;
818         }
819
820         *queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);
821
822 unlock_exit:
823         pthread_mutex_unlock(&vhost_user.mutex);
824         return ret;
825 }
826
827 static void
828 vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
829 {
830         if (vsocket && vsocket->path) {
831                 free(vsocket->path);
832                 vsocket->path = NULL;
833         }
834
835         if (vsocket) {
836                 free(vsocket);
837                 vsocket = NULL;
838         }
839 }
840
841 /*
842  * Register a new vhost-user socket; here we could act as server
843  * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
844  * is set.
845  */
846 int
847 rte_vhost_driver_register(const char *path, uint64_t flags)
848 {
849         int ret = -1;
850         struct vhost_user_socket *vsocket;
851
852         if (!path)
853                 return -1;
854
855         pthread_mutex_lock(&vhost_user.mutex);
856
857         if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
858                 VHOST_LOG_CONFIG(ERR,
859                         "error: the number of vhost sockets reaches maximum\n");
860                 goto out;
861         }
862
863         vsocket = malloc(sizeof(struct vhost_user_socket));
864         if (!vsocket)
865                 goto out;
866         memset(vsocket, 0, sizeof(struct vhost_user_socket));
867         vsocket->path = strdup(path);
868         if (vsocket->path == NULL) {
869                 VHOST_LOG_CONFIG(ERR,
870                         "error: failed to copy socket path string\n");
871                 vhost_user_socket_mem_free(vsocket);
872                 goto out;
873         }
874         TAILQ_INIT(&vsocket->conn_list);
875         ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
876         if (ret) {
877                 VHOST_LOG_CONFIG(ERR,
878                         "error: failed to init connection mutex\n");
879                 goto out_free;
880         }
881         vsocket->vdpa_dev_id = -1;
882         vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
883         vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
884         vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
885
886         if (vsocket->dequeue_zero_copy &&
887             (flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
888                 VHOST_LOG_CONFIG(ERR,
889                         "error: enabling dequeue zero copy and IOMMU features "
890                         "simultaneously is not supported\n");
891                 goto out_mutex;
892         }
893
894         /*
895          * Set the supported features correctly for the builtin vhost-user
896          * net driver.
897          *
898          * Applications know nothing about features the builtin virtio net
899          * driver (virtio_net.c) supports, thus it's not possible for them
900          * to invoke rte_vhost_driver_set_features(). To workaround it, here
901          * we set it unconditionally. If the application want to implement
902          * another vhost-user driver (say SCSI), it should call the
903          * rte_vhost_driver_set_features(), which will overwrite following
904          * two values.
905          */
906         vsocket->use_builtin_virtio_net = true;
907         vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
908         vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
909         vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
910
911         /*
912          * Dequeue zero copy can't assure descriptors returned in order.
913          * Also, it requires that the guest memory is populated, which is
914          * not compatible with postcopy.
915          */
916         if (vsocket->dequeue_zero_copy) {
917                 if (vsocket->extbuf) {
918                         VHOST_LOG_CONFIG(ERR,
919                         "error: zero copy is incompatible with external buffers\n");
920                         ret = -1;
921                         goto out_mutex;
922                 }
923                 if (vsocket->linearbuf) {
924                         VHOST_LOG_CONFIG(ERR,
925                         "error: zero copy is incompatible with linear buffers\n");
926                         ret = -1;
927                         goto out_mutex;
928                 }
929                 if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
930                         VHOST_LOG_CONFIG(ERR,
931                         "error: zero copy is incompatible with vhost client mode\n");
932                         ret = -1;
933                         goto out_mutex;
934                 }
935                 vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
936                 vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
937
938                 VHOST_LOG_CONFIG(INFO,
939                         "Dequeue zero copy requested, disabling postcopy support\n");
940                 vsocket->protocol_features &=
941                         ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
942         }
943
944         /*
945          * We'll not be able to receive a buffer from guest in linear mode
946          * without external buffer if it will not fit in a single mbuf, which is
947          * likely if segmentation offloading enabled.
948          */
949         if (vsocket->linearbuf && !vsocket->extbuf) {
950                 uint64_t seg_offload_features =
951                                 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
952                                 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
953                                 (1ULL << VIRTIO_NET_F_HOST_UFO);
954
955                 VHOST_LOG_CONFIG(INFO,
956                         "Linear buffers requested without external buffers, "
957                         "disabling host segmentation offloading support\n");
958                 vsocket->supported_features &= ~seg_offload_features;
959                 vsocket->features &= ~seg_offload_features;
960         }
961
962         if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
963                 vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
964                 vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
965         }
966
967         if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
968                 vsocket->protocol_features &=
969                         ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
970         } else {
971 #ifndef RTE_LIBRTE_VHOST_POSTCOPY
972                 VHOST_LOG_CONFIG(ERR,
973                         "Postcopy requested but not compiled\n");
974                 ret = -1;
975                 goto out_mutex;
976 #endif
977         }
978
979         if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
980                 vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
981                 if (vsocket->reconnect && reconn_tid == 0) {
982                         if (vhost_user_reconnect_init() != 0)
983                                 goto out_mutex;
984                 }
985         } else {
986                 vsocket->is_server = true;
987         }
988         ret = create_unix_socket(vsocket);
989         if (ret < 0) {
990                 goto out_mutex;
991         }
992
993         vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
994
995         pthread_mutex_unlock(&vhost_user.mutex);
996         return ret;
997
998 out_mutex:
999         if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
1000                 VHOST_LOG_CONFIG(ERR,
1001                         "error: failed to destroy connection mutex\n");
1002         }
1003 out_free:
1004         vhost_user_socket_mem_free(vsocket);
1005 out:
1006         pthread_mutex_unlock(&vhost_user.mutex);
1007
1008         return ret;
1009 }
1010
1011 static bool
1012 vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
1013 {
1014         int found = false;
1015         struct vhost_user_reconnect *reconn, *next;
1016
1017         pthread_mutex_lock(&reconn_list.mutex);
1018
1019         for (reconn = TAILQ_FIRST(&reconn_list.head);
1020              reconn != NULL; reconn = next) {
1021                 next = TAILQ_NEXT(reconn, next);
1022
1023                 if (reconn->vsocket == vsocket) {
1024                         TAILQ_REMOVE(&reconn_list.head, reconn, next);
1025                         close(reconn->fd);
1026                         free(reconn);
1027                         found = true;
1028                         break;
1029                 }
1030         }
1031         pthread_mutex_unlock(&reconn_list.mutex);
1032         return found;
1033 }
1034
1035 /**
1036  * Unregister the specified vhost socket
1037  */
1038 int
1039 rte_vhost_driver_unregister(const char *path)
1040 {
1041         int i;
1042         int count;
1043         struct vhost_user_connection *conn, *next;
1044
1045         if (path == NULL)
1046                 return -1;
1047
1048 again:
1049         pthread_mutex_lock(&vhost_user.mutex);
1050
1051         for (i = 0; i < vhost_user.vsocket_cnt; i++) {
1052                 struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
1053
1054                 if (!strcmp(vsocket->path, path)) {
1055                         pthread_mutex_lock(&vsocket->conn_mutex);
1056                         for (conn = TAILQ_FIRST(&vsocket->conn_list);
1057                              conn != NULL;
1058                              conn = next) {
1059                                 next = TAILQ_NEXT(conn, next);
1060
1061                                 /*
1062                                  * If r/wcb is executing, release vsocket's
1063                                  * conn_mutex and vhost_user's mutex locks, and
1064                                  * try again since the r/wcb may use the
1065                                  * conn_mutex and mutex locks.
1066                                  */
1067                                 if (fdset_try_del(&vhost_user.fdset,
1068                                                   conn->connfd) == -1) {
1069                                         pthread_mutex_unlock(
1070                                                         &vsocket->conn_mutex);
1071                                         pthread_mutex_unlock(&vhost_user.mutex);
1072                                         goto again;
1073                                 }
1074
1075                                 VHOST_LOG_CONFIG(INFO,
1076                                         "free connfd = %d for device '%s'\n",
1077                                         conn->connfd, path);
1078                                 close(conn->connfd);
1079                                 vhost_destroy_device(conn->vid);
1080                                 TAILQ_REMOVE(&vsocket->conn_list, conn, next);
1081                                 free(conn);
1082                         }
1083                         pthread_mutex_unlock(&vsocket->conn_mutex);
1084
1085                         if (vsocket->is_server) {
1086                                 /*
1087                                  * If r/wcb is executing, release vhost_user's
1088                                  * mutex lock, and try again since the r/wcb
1089                                  * may use the mutex lock.
1090                                  */
1091                                 if (fdset_try_del(&vhost_user.fdset,
1092                                                 vsocket->socket_fd) == -1) {
1093                                         pthread_mutex_unlock(&vhost_user.mutex);
1094                                         goto again;
1095                                 }
1096
1097                                 close(vsocket->socket_fd);
1098                                 unlink(path);
1099                         } else if (vsocket->reconnect) {
1100                                 vhost_user_remove_reconnect(vsocket);
1101                         }
1102
1103                         pthread_mutex_destroy(&vsocket->conn_mutex);
1104                         vhost_user_socket_mem_free(vsocket);
1105
1106                         count = --vhost_user.vsocket_cnt;
1107                         vhost_user.vsockets[i] = vhost_user.vsockets[count];
1108                         vhost_user.vsockets[count] = NULL;
1109                         pthread_mutex_unlock(&vhost_user.mutex);
1110
1111                         return 0;
1112                 }
1113         }
1114         pthread_mutex_unlock(&vhost_user.mutex);
1115
1116         return -1;
1117 }
1118
1119 /*
1120  * Register ops so that we can add/remove device to data core.
1121  */
1122 int
1123 rte_vhost_driver_callback_register(const char *path,
1124         struct vhost_device_ops const * const ops)
1125 {
1126         struct vhost_user_socket *vsocket;
1127
1128         pthread_mutex_lock(&vhost_user.mutex);
1129         vsocket = find_vhost_user_socket(path);
1130         if (vsocket)
1131                 vsocket->notify_ops = ops;
1132         pthread_mutex_unlock(&vhost_user.mutex);
1133
1134         return vsocket ? 0 : -1;
1135 }
1136
1137 struct vhost_device_ops const *
1138 vhost_driver_callback_get(const char *path)
1139 {
1140         struct vhost_user_socket *vsocket;
1141
1142         pthread_mutex_lock(&vhost_user.mutex);
1143         vsocket = find_vhost_user_socket(path);
1144         pthread_mutex_unlock(&vhost_user.mutex);
1145
1146         return vsocket ? vsocket->notify_ops : NULL;
1147 }
1148
1149 int
1150 rte_vhost_driver_start(const char *path)
1151 {
1152         struct vhost_user_socket *vsocket;
1153         static pthread_t fdset_tid;
1154
1155         pthread_mutex_lock(&vhost_user.mutex);
1156         vsocket = find_vhost_user_socket(path);
1157         pthread_mutex_unlock(&vhost_user.mutex);
1158
1159         if (!vsocket)
1160                 return -1;
1161
1162         if (fdset_tid == 0) {
1163                 /**
1164                  * create a pipe which will be waited by poll and notified to
1165                  * rebuild the wait list of poll.
1166                  */
1167                 if (fdset_pipe_init(&vhost_user.fdset) < 0) {
1168                         VHOST_LOG_CONFIG(ERR,
1169                                 "failed to create pipe for vhost fdset\n");
1170                         return -1;
1171                 }
1172
1173                 int ret = rte_ctrl_thread_create(&fdset_tid,
1174                         "vhost-events", NULL, fdset_event_dispatch,
1175                         &vhost_user.fdset);
1176                 if (ret != 0) {
1177                         VHOST_LOG_CONFIG(ERR,
1178                                 "failed to create fdset handling thread");
1179
1180                         fdset_pipe_uninit(&vhost_user.fdset);
1181                         return -1;
1182                 }
1183         }
1184
1185         if (vsocket->is_server)
1186                 return vhost_user_start_server(vsocket);
1187         else
1188                 return vhost_user_start_client(vsocket);
1189 }