vhost: replace vDPA device ID in Vhost
[dpdk.git] / lib / librte_vhost / socket.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <limits.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <sys/types.h>
12 #include <sys/socket.h>
13 #include <sys/un.h>
14 #include <sys/queue.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <pthread.h>
18
19 #include <rte_log.h>
20
21 #include "fd_man.h"
22 #include "vhost.h"
23 #include "vhost_user.h"
24
25
/* List head type for the per-socket list of live vhost-user connections. */
TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);

/*
 * Every time rte_vhost_driver_register() is invoked, an associated
 * vhost_user_socket struct will be created.
 */
struct vhost_user_socket {
	struct vhost_user_connection_list conn_list; /* guarded by conn_mutex */
	pthread_mutex_t conn_mutex;	/* protects conn_list */
	char *path;			/* heap-owned copy of the unix socket path */
	int socket_fd;			/* listen fd (server) or connect fd (client) */
	struct sockaddr_un un;		/* address built from path in create_unix_socket() */
	bool is_server;			/* server vs client mode */
	bool reconnect;			/* client only: retry connect in background */
	bool dequeue_zero_copy;		/* RTE_VHOST_USER_DEQUEUE_ZERO_COPY */
	bool iommu_support;
	bool use_builtin_virtio_net;	/* cleared once the app sets its own features */
	bool extbuf;			/* RTE_VHOST_USER_EXTBUF_SUPPORT */
	bool linearbuf;			/* RTE_VHOST_USER_LINEARBUF_SUPPORT */

	/*
	 * The "supported_features" indicates the feature bits the
	 * vhost driver supports. The "features" indicates the feature
	 * bits after the rte_vhost_driver_features_disable/enable().
	 * It is also the final feature bits used for vhost-user
	 * features negotiation.
	 */
	uint64_t supported_features;
	uint64_t features;

	uint64_t protocol_features;	/* vhost-user protocol feature bits */

	/* Bound via rte_vhost_driver_attach_vdpa_device(); NULL if none. */
	struct rte_vdpa_device *vdpa_dev;

	struct vhost_device_ops const *notify_ops; /* application callbacks */
};
62
/* One live vhost-user connection: a connected peer on a registered socket. */
struct vhost_user_connection {
	struct vhost_user_socket *vsocket;	/* owning socket */
	int connfd;				/* connected fd */
	int vid;				/* vhost device id for this peer */

	TAILQ_ENTRY(vhost_user_connection) next; /* link in vsocket->conn_list */
};
70
#define MAX_VHOST_SOCKET 1024
/* Process-wide registry of all registered sockets plus the shared fdset. */
struct vhost_user {
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	struct fdset fdset;	/* fds polled by the fdset event loop */
	int vsocket_cnt;
	pthread_mutex_t mutex;	/* protects vsockets[] and vsocket_cnt */
};
78
/* Maximum pending connections on the listening socket (listen() backlog). */
#define MAX_VIRTIO_BACKLOG 128

/* Forward declarations: these helpers and callbacks reference one another. */
static void vhost_user_server_new_connection(int fd, void *data, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);

/* Singleton instance; all fdset entries start out empty (fd == -1). */
static struct vhost_user vhost_user = {
	.fdset = {
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};
96
97 /*
98  * return bytes# of read on success or negative val on failure. Update fdnum
99  * with number of fds read.
100  */
101 int
102 read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
103                 int *fd_num)
104 {
105         struct iovec iov;
106         struct msghdr msgh;
107         char control[CMSG_SPACE(max_fds * sizeof(int))];
108         struct cmsghdr *cmsg;
109         int got_fds = 0;
110         int ret;
111
112         *fd_num = 0;
113
114         memset(&msgh, 0, sizeof(msgh));
115         iov.iov_base = buf;
116         iov.iov_len  = buflen;
117
118         msgh.msg_iov = &iov;
119         msgh.msg_iovlen = 1;
120         msgh.msg_control = control;
121         msgh.msg_controllen = sizeof(control);
122
123         ret = recvmsg(sockfd, &msgh, 0);
124         if (ret <= 0) {
125                 if (ret)
126                         VHOST_LOG_CONFIG(ERR, "recvmsg failed\n");
127                 return ret;
128         }
129
130         if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
131                 VHOST_LOG_CONFIG(ERR, "truncated msg\n");
132                 return -1;
133         }
134
135         for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
136                 cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
137                 if ((cmsg->cmsg_level == SOL_SOCKET) &&
138                         (cmsg->cmsg_type == SCM_RIGHTS)) {
139                         got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
140                         *fd_num = got_fds;
141                         memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
142                         break;
143                 }
144         }
145
146         /* Clear out unused file descriptors */
147         while (got_fds < max_fds)
148                 fds[got_fds++] = -1;
149
150         return ret;
151 }
152
153 int
154 send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
155 {
156
157         struct iovec iov;
158         struct msghdr msgh;
159         size_t fdsize = fd_num * sizeof(int);
160         char control[CMSG_SPACE(fdsize)];
161         struct cmsghdr *cmsg;
162         int ret;
163
164         memset(&msgh, 0, sizeof(msgh));
165         iov.iov_base = buf;
166         iov.iov_len = buflen;
167
168         msgh.msg_iov = &iov;
169         msgh.msg_iovlen = 1;
170
171         if (fds && fd_num > 0) {
172                 msgh.msg_control = control;
173                 msgh.msg_controllen = sizeof(control);
174                 cmsg = CMSG_FIRSTHDR(&msgh);
175                 if (cmsg == NULL) {
176                         VHOST_LOG_CONFIG(ERR, "cmsg == NULL\n");
177                         errno = EINVAL;
178                         return -1;
179                 }
180                 cmsg->cmsg_len = CMSG_LEN(fdsize);
181                 cmsg->cmsg_level = SOL_SOCKET;
182                 cmsg->cmsg_type = SCM_RIGHTS;
183                 memcpy(CMSG_DATA(cmsg), fds, fdsize);
184         } else {
185                 msgh.msg_control = NULL;
186                 msgh.msg_controllen = 0;
187         }
188
189         do {
190                 ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
191         } while (ret < 0 && errno == EINTR);
192
193         if (ret < 0) {
194                 VHOST_LOG_CONFIG(ERR,  "sendmsg error\n");
195                 return ret;
196         }
197
198         return ret;
199 }
200
/*
 * Take ownership of an established connection fd: create a vhost device
 * for it, notify the application, and register the fd with the shared
 * fdset so vhost-user messages get processed.  On any failure the fd is
 * closed and all partially-created state is torn down.
 */
static void
vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
{
	int vid;
	size_t size;
	struct vhost_user_connection *conn;
	int ret;

	if (vsocket == NULL)
		return;

	conn = malloc(sizeof(*conn));
	if (conn == NULL) {
		close(fd);
		return;
	}

	vid = vhost_new_device();
	if (vid == -1) {
		goto err;
	}

	/* Record the socket path as the device's interface name. */
	size = strnlen(vsocket->path, PATH_MAX);
	vhost_set_ifname(vid, vsocket->path, size);

	vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);

	/* Propagate the socket's vDPA binding (may be NULL) to the device. */
	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev);

	if (vsocket->dequeue_zero_copy)
		vhost_enable_dequeue_zero_copy(vid);

	if (vsocket->extbuf)
		vhost_enable_extbuf(vid);

	if (vsocket->linearbuf)
		vhost_enable_linearbuf(vid);

	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);

	/* Give the application a chance to veto the connection. */
	if (vsocket->notify_ops->new_connection) {
		ret = vsocket->notify_ops->new_connection(vid);
		if (ret < 0) {
			VHOST_LOG_CONFIG(ERR,
				"failed to add vhost user connection with fd %d\n",
				fd);
			goto err_cleanup;
		}
	}

	conn->connfd = fd;
	conn->vsocket = vsocket;
	conn->vid = vid;
	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
			NULL, conn);
	if (ret < 0) {
		VHOST_LOG_CONFIG(ERR,
			"failed to add fd %d into vhost server fdset\n",
			fd);

		/* new_connection succeeded above, so balance it here. */
		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		goto err_cleanup;
	}

	pthread_mutex_lock(&vsocket->conn_mutex);
	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
	pthread_mutex_unlock(&vsocket->conn_mutex);

	/* Wake the fdset polling thread so it picks up the new fd. */
	fdset_pipe_notify(&vhost_user.fdset);
	return;

err_cleanup:
	vhost_destroy_device(vid);
err:
	free(conn);
	close(fd);
}
280
281 /* call back when there is new vhost-user connection from client  */
282 static void
283 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
284 {
285         struct vhost_user_socket *vsocket = dat;
286
287         fd = accept(fd, NULL, NULL);
288         if (fd < 0)
289                 return;
290
291         VHOST_LOG_CONFIG(INFO, "new vhost user connection is %d\n", fd);
292         vhost_user_add_connection(fd, vsocket);
293 }
294
/*
 * fdset callback invoked when data is readable on a connection fd.
 * Dispatches one vhost-user message; on a fatal error the connection
 * and its vhost device are torn down, and for reconnect-enabled client
 * sockets a fresh connection attempt is started.
 */
static void
vhost_user_read_cb(int connfd, void *dat, int *remove)
{
	struct vhost_user_connection *conn = dat;
	struct vhost_user_socket *vsocket = conn->vsocket;
	int ret;

	ret = vhost_user_msg_handler(conn->vid, connfd);
	if (ret < 0) {
		struct virtio_net *dev = get_device(conn->vid);

		close(connfd);
		*remove = 1;	/* tell the fdset to drop this fd */

		if (dev)
			vhost_destroy_device_notify(dev);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		vhost_destroy_device(conn->vid);

		/* Client mode with reconnect enabled: retry immediately. */
		if (vsocket->reconnect) {
			create_unix_socket(vsocket);
			vhost_user_start_client(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
		pthread_mutex_unlock(&vsocket->conn_mutex);

		free(conn);
	}
}
329
330 static int
331 create_unix_socket(struct vhost_user_socket *vsocket)
332 {
333         int fd;
334         struct sockaddr_un *un = &vsocket->un;
335
336         fd = socket(AF_UNIX, SOCK_STREAM, 0);
337         if (fd < 0)
338                 return -1;
339         VHOST_LOG_CONFIG(INFO, "vhost-user %s: socket created, fd: %d\n",
340                 vsocket->is_server ? "server" : "client", fd);
341
342         if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
343                 VHOST_LOG_CONFIG(ERR,
344                         "vhost-user: can't set nonblocking mode for socket, fd: "
345                         "%d (%s)\n", fd, strerror(errno));
346                 close(fd);
347                 return -1;
348         }
349
350         memset(un, 0, sizeof(*un));
351         un->sun_family = AF_UNIX;
352         strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
353         un->sun_path[sizeof(un->sun_path) - 1] = '\0';
354
355         vsocket->socket_fd = fd;
356         return 0;
357 }
358
359 static int
360 vhost_user_start_server(struct vhost_user_socket *vsocket)
361 {
362         int ret;
363         int fd = vsocket->socket_fd;
364         const char *path = vsocket->path;
365
366         /*
367          * bind () may fail if the socket file with the same name already
368          * exists. But the library obviously should not delete the file
369          * provided by the user, since we can not be sure that it is not
370          * being used by other applications. Moreover, many applications form
371          * socket names based on user input, which is prone to errors.
372          *
373          * The user must ensure that the socket does not exist before
374          * registering the vhost driver in server mode.
375          */
376         ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
377         if (ret < 0) {
378                 VHOST_LOG_CONFIG(ERR,
379                         "failed to bind to %s: %s; remove it and try again\n",
380                         path, strerror(errno));
381                 goto err;
382         }
383         VHOST_LOG_CONFIG(INFO, "bind to %s\n", path);
384
385         ret = listen(fd, MAX_VIRTIO_BACKLOG);
386         if (ret < 0)
387                 goto err;
388
389         ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
390                   NULL, vsocket);
391         if (ret < 0) {
392                 VHOST_LOG_CONFIG(ERR,
393                         "failed to add listen fd %d to vhost server fdset\n",
394                         fd);
395                 goto err;
396         }
397
398         return 0;
399
400 err:
401         close(fd);
402         return -1;
403 }
404
/* One pending client reconnection, queued for the reconnect thread. */
struct vhost_user_reconnect {
	struct sockaddr_un un;	/* target address, copied at queue time */
	int fd;			/* socket fd kept open across retries */
	struct vhost_user_socket *vsocket;

	TAILQ_ENTRY(vhost_user_reconnect) next;
};

TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
/* Pending reconnections, shared between API callers and the thread. */
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;	/* protects head */
};

static struct vhost_user_reconnect_list reconn_list;
/* Background thread running vhost_user_client_reconnect(). */
static pthread_t reconn_tid;
421
422 static int
423 vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
424 {
425         int ret, flags;
426
427         ret = connect(fd, un, sz);
428         if (ret < 0 && errno != EISCONN)
429                 return -1;
430
431         flags = fcntl(fd, F_GETFL, 0);
432         if (flags < 0) {
433                 VHOST_LOG_CONFIG(ERR,
434                         "can't get flags for connfd %d\n", fd);
435                 return -2;
436         }
437         if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
438                 VHOST_LOG_CONFIG(ERR,
439                                 "can't disable nonblocking on fd %d\n", fd);
440                 return -2;
441         }
442         return 0;
443 }
444
/*
 * Reconnect thread body: once a second, walk the pending list and retry
 * each connection.  Entries that connect, or that fail unrecoverably,
 * are removed; transient failures (-1) stay queued for the next pass.
 */
static void *
vhost_user_client_reconnect(void *arg __rte_unused)
{
	int ret;
	struct vhost_user_reconnect *reconn, *next;

	while (1) {
		pthread_mutex_lock(&reconn_list.mutex);

		/*
		 * An equal implementation of TAILQ_FOREACH_SAFE,
		 * which does not exist on all platforms.
		 */
		for (reconn = TAILQ_FIRST(&reconn_list.head);
		     reconn != NULL; reconn = next) {
			next = TAILQ_NEXT(reconn, next);

			ret = vhost_user_connect_nonblock(reconn->fd,
						(struct sockaddr *)&reconn->un,
						sizeof(reconn->un));
			if (ret == -2) {
				/* Unrecoverable: drop the entry entirely. */
				close(reconn->fd);
				VHOST_LOG_CONFIG(ERR,
					"reconnection for fd %d failed\n",
					reconn->fd);
				goto remove_fd;
			}
			if (ret == -1)
				continue;	/* not yet; retry next pass */

			VHOST_LOG_CONFIG(INFO,
				"%s: connected\n", reconn->vsocket->path);
			vhost_user_add_connection(reconn->fd, reconn->vsocket);
remove_fd:
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			free(reconn);
		}

		pthread_mutex_unlock(&reconn_list.mutex);
		sleep(1);
	}

	return NULL;
}
489
490 static int
491 vhost_user_reconnect_init(void)
492 {
493         int ret;
494
495         ret = pthread_mutex_init(&reconn_list.mutex, NULL);
496         if (ret < 0) {
497                 VHOST_LOG_CONFIG(ERR, "failed to initialize mutex");
498                 return ret;
499         }
500         TAILQ_INIT(&reconn_list.head);
501
502         ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
503                              vhost_user_client_reconnect, NULL);
504         if (ret != 0) {
505                 VHOST_LOG_CONFIG(ERR, "failed to create reconnect thread");
506                 if (pthread_mutex_destroy(&reconn_list.mutex)) {
507                         VHOST_LOG_CONFIG(ERR,
508                                 "failed to destroy reconnect mutex");
509                 }
510         }
511
512         return ret;
513 }
514
/*
 * Try to connect a client-mode socket.  On immediate success the fd is
 * handed to vhost_user_add_connection(); on a transient failure with
 * reconnect enabled, the fd is queued for the background reconnect
 * thread.  Returns 0 on success or when queued, -1 on unrecoverable
 * failure.
 */
static int
vhost_user_start_client(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;
	struct vhost_user_reconnect *reconn;

	ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
					  sizeof(vsocket->un));
	if (ret == 0) {
		vhost_user_add_connection(fd, vsocket);
		return 0;
	}

	/* NOTE(review): errno may have been clobbered by the fcntl calls
	 * inside vhost_user_connect_nonblock() on the -2 path, so this
	 * strerror() may not describe the connect failure -- confirm. */
	VHOST_LOG_CONFIG(WARNING,
		"failed to connect to %s: %s\n",
		path, strerror(errno));

	/* -2 is unrecoverable; also give up if reconnect is disabled. */
	if (ret == -2 || !vsocket->reconnect) {
		close(fd);
		return -1;
	}

	VHOST_LOG_CONFIG(INFO, "%s: reconnecting...\n", path);
	reconn = malloc(sizeof(*reconn));
	if (reconn == NULL) {
		VHOST_LOG_CONFIG(ERR,
			"failed to allocate memory for reconnect\n");
		close(fd);
		return -1;
	}
	reconn->un = vsocket->un;
	reconn->fd = fd;
	reconn->vsocket = vsocket;
	pthread_mutex_lock(&reconn_list.mutex);
	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
	pthread_mutex_unlock(&reconn_list.mutex);

	return 0;
}
556
557 static struct vhost_user_socket *
558 find_vhost_user_socket(const char *path)
559 {
560         int i;
561
562         if (path == NULL)
563                 return NULL;
564
565         for (i = 0; i < vhost_user.vsocket_cnt; i++) {
566                 struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
567
568                 if (!strcmp(vsocket->path, path))
569                         return vsocket;
570         }
571
572         return NULL;
573 }
574
575 int
576 rte_vhost_driver_attach_vdpa_device(const char *path,
577                 struct rte_vdpa_device *dev)
578 {
579         struct vhost_user_socket *vsocket;
580
581         if (dev == NULL || path == NULL)
582                 return -1;
583
584         pthread_mutex_lock(&vhost_user.mutex);
585         vsocket = find_vhost_user_socket(path);
586         if (vsocket)
587                 vsocket->vdpa_dev = dev;
588         pthread_mutex_unlock(&vhost_user.mutex);
589
590         return vsocket ? 0 : -1;
591 }
592
593 int
594 rte_vhost_driver_detach_vdpa_device(const char *path)
595 {
596         struct vhost_user_socket *vsocket;
597
598         pthread_mutex_lock(&vhost_user.mutex);
599         vsocket = find_vhost_user_socket(path);
600         if (vsocket)
601                 vsocket->vdpa_dev = NULL;
602         pthread_mutex_unlock(&vhost_user.mutex);
603
604         return vsocket ? 0 : -1;
605 }
606
607 struct rte_vdpa_device *
608 rte_vhost_driver_get_vdpa_device(const char *path)
609 {
610         struct vhost_user_socket *vsocket;
611         struct rte_vdpa_device *dev = NULL;
612
613         pthread_mutex_lock(&vhost_user.mutex);
614         vsocket = find_vhost_user_socket(path);
615         if (vsocket)
616                 dev = vsocket->vdpa_dev;
617         pthread_mutex_unlock(&vhost_user.mutex);
618
619         return dev;
620 }
621
622 int
623 rte_vhost_driver_disable_features(const char *path, uint64_t features)
624 {
625         struct vhost_user_socket *vsocket;
626
627         pthread_mutex_lock(&vhost_user.mutex);
628         vsocket = find_vhost_user_socket(path);
629
630         /* Note that use_builtin_virtio_net is not affected by this function
631          * since callers may want to selectively disable features of the
632          * built-in vhost net device backend.
633          */
634
635         if (vsocket)
636                 vsocket->features &= ~features;
637         pthread_mutex_unlock(&vhost_user.mutex);
638
639         return vsocket ? 0 : -1;
640 }
641
642 int
643 rte_vhost_driver_enable_features(const char *path, uint64_t features)
644 {
645         struct vhost_user_socket *vsocket;
646
647         pthread_mutex_lock(&vhost_user.mutex);
648         vsocket = find_vhost_user_socket(path);
649         if (vsocket) {
650                 if ((vsocket->supported_features & features) != features) {
651                         /*
652                          * trying to enable features the driver doesn't
653                          * support.
654                          */
655                         pthread_mutex_unlock(&vhost_user.mutex);
656                         return -1;
657                 }
658                 vsocket->features |= features;
659         }
660         pthread_mutex_unlock(&vhost_user.mutex);
661
662         return vsocket ? 0 : -1;
663 }
664
665 int
666 rte_vhost_driver_set_features(const char *path, uint64_t features)
667 {
668         struct vhost_user_socket *vsocket;
669
670         pthread_mutex_lock(&vhost_user.mutex);
671         vsocket = find_vhost_user_socket(path);
672         if (vsocket) {
673                 vsocket->supported_features = features;
674                 vsocket->features = features;
675
676                 /* Anyone setting feature bits is implementing their own vhost
677                  * device backend.
678                  */
679                 vsocket->use_builtin_virtio_net = false;
680         }
681         pthread_mutex_unlock(&vhost_user.mutex);
682
683         return vsocket ? 0 : -1;
684 }
685
/*
 * Report the feature bits negotiable on @path.  When a vDPA device is
 * attached and implements get_features, the result is the intersection
 * of the socket's and the vDPA device's feature sets; otherwise the
 * socket's features are returned as-is.  Returns 0 on success, -1 if
 * the path is unknown or the vDPA query fails.
 */
int
rte_vhost_driver_get_features(const char *path, uint64_t *features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(ERR,
			"socket file %s is not registered yet.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev || !vdpa_dev->ops->get_features) {
		/* No vDPA device to consult: socket features are final. */
		*features = vsocket->features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_features(vdpa_dev, &vdpa_features) < 0) {
		VHOST_LOG_CONFIG(ERR,
				"failed to get vdpa features "
				"for socket file %s.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	/* Only advertise what both socket and vDPA device support. */
	*features = vsocket->features & vdpa_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}
723
724 int
725 rte_vhost_driver_set_protocol_features(const char *path,
726                 uint64_t protocol_features)
727 {
728         struct vhost_user_socket *vsocket;
729
730         pthread_mutex_lock(&vhost_user.mutex);
731         vsocket = find_vhost_user_socket(path);
732         if (vsocket)
733                 vsocket->protocol_features = protocol_features;
734         pthread_mutex_unlock(&vhost_user.mutex);
735         return vsocket ? 0 : -1;
736 }
737
/*
 * Report the vhost-user protocol feature bits for @path.  When a vDPA
 * device is attached and implements get_protocol_features, the result
 * is the intersection of the socket's and the device's protocol
 * features; otherwise the socket's bits are returned as-is.  Returns 0
 * on success, -1 if the path is unknown or the vDPA query fails.
 */
int
rte_vhost_driver_get_protocol_features(const char *path,
		uint64_t *protocol_features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_protocol_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(ERR,
			"socket file %s is not registered yet.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev || !vdpa_dev->ops->get_protocol_features) {
		/* No vDPA device to consult: socket bits are final. */
		*protocol_features = vsocket->protocol_features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_protocol_features(vdpa_dev,
				&vdpa_protocol_features) < 0) {
		VHOST_LOG_CONFIG(ERR,
				"failed to get vdpa protocol features "
				"for socket file %s.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	/* Only advertise what both socket and vDPA device support. */
	*protocol_features = vsocket->protocol_features
		& vdpa_protocol_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}
778
/*
 * Report the number of queue pairs supported on @path.  Without a vDPA
 * device (or one lacking get_queue_num), the compile-time maximum
 * VHOST_MAX_QUEUE_PAIRS is reported; otherwise the minimum of that cap
 * and the vDPA device's count.  Returns 0 on success, -1 if the path
 * is unknown or the vDPA query fails.
 */
int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
{
	struct vhost_user_socket *vsocket;
	uint32_t vdpa_queue_num;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(ERR,
			"socket file %s is not registered yet.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev || !vdpa_dev->ops->get_queue_num) {
		/* No vDPA device to consult: report the software maximum. */
		*queue_num = VHOST_MAX_QUEUE_PAIRS;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_queue_num(vdpa_dev, &vdpa_queue_num) < 0) {
		VHOST_LOG_CONFIG(ERR,
				"failed to get vdpa queue number "
				"for socket file %s.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	*queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}
816
817 static void
818 vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
819 {
820         if (vsocket && vsocket->path) {
821                 free(vsocket->path);
822                 vsocket->path = NULL;
823         }
824
825         if (vsocket) {
826                 free(vsocket);
827                 vsocket = NULL;
828         }
829 }
830
831 /*
832  * Register a new vhost-user socket; here we could act as server
833  * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
834  * is set.
835  */
836 int
837 rte_vhost_driver_register(const char *path, uint64_t flags)
838 {
839         int ret = -1;
840         struct vhost_user_socket *vsocket;
841
842         if (!path)
843                 return -1;
844
845         pthread_mutex_lock(&vhost_user.mutex);
846
847         if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
848                 VHOST_LOG_CONFIG(ERR,
849                         "error: the number of vhost sockets reaches maximum\n");
850                 goto out;
851         }
852
853         vsocket = malloc(sizeof(struct vhost_user_socket));
854         if (!vsocket)
855                 goto out;
856         memset(vsocket, 0, sizeof(struct vhost_user_socket));
857         vsocket->path = strdup(path);
858         if (vsocket->path == NULL) {
859                 VHOST_LOG_CONFIG(ERR,
860                         "error: failed to copy socket path string\n");
861                 vhost_user_socket_mem_free(vsocket);
862                 goto out;
863         }
864         TAILQ_INIT(&vsocket->conn_list);
865         ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
866         if (ret) {
867                 VHOST_LOG_CONFIG(ERR,
868                         "error: failed to init connection mutex\n");
869                 goto out_free;
870         }
871         vsocket->vdpa_dev = NULL;
872         vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
873         vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
874         vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
875
876         if (vsocket->dequeue_zero_copy &&
877             (flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
878                 VHOST_LOG_CONFIG(ERR,
879                         "error: enabling dequeue zero copy and IOMMU features "
880                         "simultaneously is not supported\n");
881                 goto out_mutex;
882         }
883
884         /*
885          * Set the supported features correctly for the builtin vhost-user
886          * net driver.
887          *
888          * Applications know nothing about features the builtin virtio net
889          * driver (virtio_net.c) supports, thus it's not possible for them
890          * to invoke rte_vhost_driver_set_features(). To workaround it, here
891          * we set it unconditionally. If the application want to implement
892          * another vhost-user driver (say SCSI), it should call the
893          * rte_vhost_driver_set_features(), which will overwrite following
894          * two values.
895          */
896         vsocket->use_builtin_virtio_net = true;
897         vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
898         vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
899         vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
900
901         /*
902          * Dequeue zero copy can't assure descriptors returned in order.
903          * Also, it requires that the guest memory is populated, which is
904          * not compatible with postcopy.
905          */
906         if (vsocket->dequeue_zero_copy) {
907                 if (vsocket->extbuf) {
908                         VHOST_LOG_CONFIG(ERR,
909                         "error: zero copy is incompatible with external buffers\n");
910                         ret = -1;
911                         goto out_mutex;
912                 }
913                 if (vsocket->linearbuf) {
914                         VHOST_LOG_CONFIG(ERR,
915                         "error: zero copy is incompatible with linear buffers\n");
916                         ret = -1;
917                         goto out_mutex;
918                 }
919                 if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
920                         VHOST_LOG_CONFIG(ERR,
921                         "error: zero copy is incompatible with vhost client mode\n");
922                         ret = -1;
923                         goto out_mutex;
924                 }
925                 vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER);
926                 vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER);
927
928                 VHOST_LOG_CONFIG(INFO,
929                         "Dequeue zero copy requested, disabling postcopy support\n");
930                 vsocket->protocol_features &=
931                         ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
932         }
933
934         /*
935          * We'll not be able to receive a buffer from guest in linear mode
936          * without external buffer if it will not fit in a single mbuf, which is
937          * likely if segmentation offloading enabled.
938          */
939         if (vsocket->linearbuf && !vsocket->extbuf) {
940                 uint64_t seg_offload_features =
941                                 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
942                                 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
943                                 (1ULL << VIRTIO_NET_F_HOST_UFO);
944
945                 VHOST_LOG_CONFIG(INFO,
946                         "Linear buffers requested without external buffers, "
947                         "disabling host segmentation offloading support\n");
948                 vsocket->supported_features &= ~seg_offload_features;
949                 vsocket->features &= ~seg_offload_features;
950         }
951
952         if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
953                 vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
954                 vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
955         }
956
957         if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
958                 vsocket->protocol_features &=
959                         ~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
960         } else {
961 #ifndef RTE_LIBRTE_VHOST_POSTCOPY
962                 VHOST_LOG_CONFIG(ERR,
963                         "Postcopy requested but not compiled\n");
964                 ret = -1;
965                 goto out_mutex;
966 #endif
967         }
968
969         if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
970                 vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
971                 if (vsocket->reconnect && reconn_tid == 0) {
972                         if (vhost_user_reconnect_init() != 0)
973                                 goto out_mutex;
974                 }
975         } else {
976                 vsocket->is_server = true;
977         }
978         ret = create_unix_socket(vsocket);
979         if (ret < 0) {
980                 goto out_mutex;
981         }
982
983         vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
984
985         pthread_mutex_unlock(&vhost_user.mutex);
986         return ret;
987
988 out_mutex:
989         if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
990                 VHOST_LOG_CONFIG(ERR,
991                         "error: failed to destroy connection mutex\n");
992         }
993 out_free:
994         vhost_user_socket_mem_free(vsocket);
995 out:
996         pthread_mutex_unlock(&vhost_user.mutex);
997
998         return ret;
999 }
1000
1001 static bool
1002 vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
1003 {
1004         int found = false;
1005         struct vhost_user_reconnect *reconn, *next;
1006
1007         pthread_mutex_lock(&reconn_list.mutex);
1008
1009         for (reconn = TAILQ_FIRST(&reconn_list.head);
1010              reconn != NULL; reconn = next) {
1011                 next = TAILQ_NEXT(reconn, next);
1012
1013                 if (reconn->vsocket == vsocket) {
1014                         TAILQ_REMOVE(&reconn_list.head, reconn, next);
1015                         close(reconn->fd);
1016                         free(reconn);
1017                         found = true;
1018                         break;
1019                 }
1020         }
1021         pthread_mutex_unlock(&reconn_list.mutex);
1022         return found;
1023 }
1024
1025 /**
1026  * Unregister the specified vhost socket
1027  */
1028 int
1029 rte_vhost_driver_unregister(const char *path)
1030 {
1031         int i;
1032         int count;
1033         struct vhost_user_connection *conn, *next;
1034
1035         if (path == NULL)
1036                 return -1;
1037
1038 again:
1039         pthread_mutex_lock(&vhost_user.mutex);
1040
1041         for (i = 0; i < vhost_user.vsocket_cnt; i++) {
1042                 struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
1043
1044                 if (!strcmp(vsocket->path, path)) {
1045                         pthread_mutex_lock(&vsocket->conn_mutex);
1046                         for (conn = TAILQ_FIRST(&vsocket->conn_list);
1047                              conn != NULL;
1048                              conn = next) {
1049                                 next = TAILQ_NEXT(conn, next);
1050
1051                                 /*
1052                                  * If r/wcb is executing, release vsocket's
1053                                  * conn_mutex and vhost_user's mutex locks, and
1054                                  * try again since the r/wcb may use the
1055                                  * conn_mutex and mutex locks.
1056                                  */
1057                                 if (fdset_try_del(&vhost_user.fdset,
1058                                                   conn->connfd) == -1) {
1059                                         pthread_mutex_unlock(
1060                                                         &vsocket->conn_mutex);
1061                                         pthread_mutex_unlock(&vhost_user.mutex);
1062                                         goto again;
1063                                 }
1064
1065                                 VHOST_LOG_CONFIG(INFO,
1066                                         "free connfd = %d for device '%s'\n",
1067                                         conn->connfd, path);
1068                                 close(conn->connfd);
1069                                 vhost_destroy_device(conn->vid);
1070                                 TAILQ_REMOVE(&vsocket->conn_list, conn, next);
1071                                 free(conn);
1072                         }
1073                         pthread_mutex_unlock(&vsocket->conn_mutex);
1074
1075                         if (vsocket->is_server) {
1076                                 /*
1077                                  * If r/wcb is executing, release vhost_user's
1078                                  * mutex lock, and try again since the r/wcb
1079                                  * may use the mutex lock.
1080                                  */
1081                                 if (fdset_try_del(&vhost_user.fdset,
1082                                                 vsocket->socket_fd) == -1) {
1083                                         pthread_mutex_unlock(&vhost_user.mutex);
1084                                         goto again;
1085                                 }
1086
1087                                 close(vsocket->socket_fd);
1088                                 unlink(path);
1089                         } else if (vsocket->reconnect) {
1090                                 vhost_user_remove_reconnect(vsocket);
1091                         }
1092
1093                         pthread_mutex_destroy(&vsocket->conn_mutex);
1094                         vhost_user_socket_mem_free(vsocket);
1095
1096                         count = --vhost_user.vsocket_cnt;
1097                         vhost_user.vsockets[i] = vhost_user.vsockets[count];
1098                         vhost_user.vsockets[count] = NULL;
1099                         pthread_mutex_unlock(&vhost_user.mutex);
1100
1101                         return 0;
1102                 }
1103         }
1104         pthread_mutex_unlock(&vhost_user.mutex);
1105
1106         return -1;
1107 }
1108
1109 /*
1110  * Register ops so that we can add/remove device to data core.
1111  */
1112 int
1113 rte_vhost_driver_callback_register(const char *path,
1114         struct vhost_device_ops const * const ops)
1115 {
1116         struct vhost_user_socket *vsocket;
1117
1118         pthread_mutex_lock(&vhost_user.mutex);
1119         vsocket = find_vhost_user_socket(path);
1120         if (vsocket)
1121                 vsocket->notify_ops = ops;
1122         pthread_mutex_unlock(&vhost_user.mutex);
1123
1124         return vsocket ? 0 : -1;
1125 }
1126
1127 struct vhost_device_ops const *
1128 vhost_driver_callback_get(const char *path)
1129 {
1130         struct vhost_user_socket *vsocket;
1131
1132         pthread_mutex_lock(&vhost_user.mutex);
1133         vsocket = find_vhost_user_socket(path);
1134         pthread_mutex_unlock(&vhost_user.mutex);
1135
1136         return vsocket ? vsocket->notify_ops : NULL;
1137 }
1138
1139 int
1140 rte_vhost_driver_start(const char *path)
1141 {
1142         struct vhost_user_socket *vsocket;
1143         static pthread_t fdset_tid;
1144
1145         pthread_mutex_lock(&vhost_user.mutex);
1146         vsocket = find_vhost_user_socket(path);
1147         pthread_mutex_unlock(&vhost_user.mutex);
1148
1149         if (!vsocket)
1150                 return -1;
1151
1152         if (fdset_tid == 0) {
1153                 /**
1154                  * create a pipe which will be waited by poll and notified to
1155                  * rebuild the wait list of poll.
1156                  */
1157                 if (fdset_pipe_init(&vhost_user.fdset) < 0) {
1158                         VHOST_LOG_CONFIG(ERR,
1159                                 "failed to create pipe for vhost fdset\n");
1160                         return -1;
1161                 }
1162
1163                 int ret = rte_ctrl_thread_create(&fdset_tid,
1164                         "vhost-events", NULL, fdset_event_dispatch,
1165                         &vhost_user.fdset);
1166                 if (ret != 0) {
1167                         VHOST_LOG_CONFIG(ERR,
1168                                 "failed to create fdset handling thread");
1169
1170                         fdset_pipe_uninit(&vhost_user.fdset);
1171                         return -1;
1172                 }
1173         }
1174
1175         if (vsocket->is_server)
1176                 return vhost_user_start_server(vsocket);
1177         else
1178                 return vhost_user_start_client(vsocket);
1179 }