vhost: send userfault range addresses back to Qemu
[dpdk.git] / lib / librte_vhost / vhost_user.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2018 Intel Corporation
3  */
4
5 /* Security model
6  * --------------
7  * The vhost-user protocol connection is an external interface, so it must be
8  * robust against invalid inputs.
9  *
10  * This is important because the vhost-user master is only one step removed
11  * from the guest.  A malicious guest that has escaped can then launch
12  * further attacks from the vhost-user master.
13  *
14  * Even in deployments where guests are trusted, a bug in the vhost-user master
15  * can still cause invalid messages to be sent.  Such messages must not
16  * compromise the stability of the DPDK application by causing crashes, memory
17  * corruption, or other problematic behavior.
18  *
19  * Do not assume received VhostUserMsg fields contain sensible values!
20  */
21
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <fcntl.h>
28 #include <sys/ioctl.h>
29 #include <sys/mman.h>
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <sys/syscall.h>
33 #include <assert.h>
34 #ifdef RTE_LIBRTE_VHOST_NUMA
35 #include <numaif.h>
36 #endif
37 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
38 #include <linux/userfaultfd.h>
39 #endif
40
41 #include <rte_common.h>
42 #include <rte_malloc.h>
43 #include <rte_log.h>
44
45 #include "iotlb.h"
46 #include "vhost.h"
47 #include "vhost_user.h"
48
49 #define VIRTIO_MIN_MTU 68
50 #define VIRTIO_MAX_MTU 65535
51
52 static const char *vhost_message_str[VHOST_USER_MAX] = {
53         [VHOST_USER_NONE] = "VHOST_USER_NONE",
54         [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
55         [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
56         [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
57         [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
58         [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
59         [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
60         [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
61         [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
62         [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
63         [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
64         [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
65         [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
66         [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
67         [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
68         [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
69         [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
70         [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
71         [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
72         [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
73         [VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
74         [VHOST_USER_SET_SLAVE_REQ_FD]  = "VHOST_USER_SET_SLAVE_REQ_FD",
75         [VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
76         [VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
77         [VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
78         [VHOST_USER_POSTCOPY_ADVISE]  = "VHOST_USER_POSTCOPY_ADVISE",
79         [VHOST_USER_POSTCOPY_LISTEN]  = "VHOST_USER_POSTCOPY_LISTEN",
80 };
81
82 static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
83 static int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
84
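/*
 * Return the block size of the file backing 'fd' (for a hugetlbfs-backed
 * region this is the hugepage size), or (uint64_t)-1 if fstat() fails.
 */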
85 static uint64_t
86 get_blk_size(int fd)
87 {
88         struct stat stat;
89         int ret;
90
91         ret = fstat(fd, &stat);
92         return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
93 }
94
95 static void
96 free_mem_region(struct virtio_net *dev)
97 {
98         uint32_t i;
99         struct rte_vhost_mem_region *reg;
100
101         if (!dev || !dev->mem)
102                 return;
103
104         for (i = 0; i < dev->mem->nregions; i++) {
105                 reg = &dev->mem->regions[i];
106                 if (reg->host_user_addr) {
107                         munmap(reg->mmap_addr, reg->mmap_size);
108                         close(reg->fd);
109                 }
110         }
111 }
112
113 void
114 vhost_backend_cleanup(struct virtio_net *dev)
115 {
116         if (dev->mem) {
117                 free_mem_region(dev);
118                 rte_free(dev->mem);
119                 dev->mem = NULL;
120         }
121
122         free(dev->guest_pages);
123         dev->guest_pages = NULL;
124
125         if (dev->log_addr) {
126                 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
127                 dev->log_addr = 0;
128         }
129
130         if (dev->slave_req_fd >= 0) {
131                 close(dev->slave_req_fd);
132                 dev->slave_req_fd = -1;
133         }
134
135         if (dev->postcopy_ufd >= 0) {
136                 close(dev->postcopy_ufd);
137                 dev->postcopy_ufd = -1;
138         }
139
140         dev->postcopy_listening = 0;
141 }
142
143 /*
144  * This function just returns success at the moment unless
145  * the device hasn't been initialised.
146  */
147 static int
148 vhost_user_set_owner(struct virtio_net **pdev __rte_unused,
149                         struct VhostUserMsg *msg __rte_unused,
150                         int main_fd __rte_unused)
151 {
152         return VH_RESULT_OK;
153 }
154
155 static int
156 vhost_user_reset_owner(struct virtio_net **pdev,
157                         struct VhostUserMsg *msg __rte_unused,
158                         int main_fd __rte_unused)
159 {
160         struct virtio_net *dev = *pdev;
161         vhost_destroy_device_notify(dev);
162
163         cleanup_device(dev, 0);
164         reset_device(dev);
165         return VH_RESULT_OK;
166 }
167
168 /*
169  * The features that we support are requested.
170  */
171 static int
172 vhost_user_get_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
173                         int main_fd __rte_unused)
174 {
175         struct virtio_net *dev = *pdev;
176         uint64_t features = 0;
177
178         rte_vhost_driver_get_features(dev->ifname, &features);
179
180         msg->payload.u64 = features;
181         msg->size = sizeof(msg->payload.u64);
182         msg->fd_num = 0;
183
184         return VH_RESULT_REPLY;
185 }
186
187 /*
188  * The number of queues that we support is requested.
189  */
190 static int
191 vhost_user_get_queue_num(struct virtio_net **pdev, struct VhostUserMsg *msg,
192                         int main_fd __rte_unused)
193 {
194         struct virtio_net *dev = *pdev;
195         uint32_t queue_num = 0;
196
197         rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);
198
199         msg->payload.u64 = (uint64_t)queue_num;
200         msg->size = sizeof(msg->payload.u64);
201         msg->fd_num = 0;
202
203         return VH_RESULT_REPLY;
204 }
205
206 /*
207  * We receive the features negotiated between us and the virtio device.
208  */
209 static int
210 vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
211                         int main_fd __rte_unused)
212 {
213         struct virtio_net *dev = *pdev;
214         uint64_t features = msg->payload.u64;
215         uint64_t vhost_features = 0;
216         struct rte_vdpa_device *vdpa_dev;
217         int did = -1;
218
219         rte_vhost_driver_get_features(dev->ifname, &vhost_features);
220         if (features & ~vhost_features) {
221                 RTE_LOG(ERR, VHOST_CONFIG,
222                         "(%d) received invalid negotiated features.\n",
223                         dev->vid);
224                 return VH_RESULT_ERR;
225         }
226
227         if (dev->flags & VIRTIO_DEV_RUNNING) {
228                 if (dev->features == features)
229                         return VH_RESULT_OK;
230
231                 /*
232                  * Error out if master tries to change features while device is
233                  * in running state. The exception being VHOST_F_LOG_ALL, which
234                  * is enabled when the live-migration starts.
235                  */
236                 if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
237                         RTE_LOG(ERR, VHOST_CONFIG,
238                                 "(%d) features changed while device is running.\n",
239                                 dev->vid);
240                         return VH_RESULT_ERR;
241                 }
242
243                 if (dev->notify_ops->features_changed)
244                         dev->notify_ops->features_changed(dev->vid, features);
245         }
246
247         dev->features = features;
248         if (dev->features &
249                 ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
250                 dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
251         } else {
252                 dev->vhost_hlen = sizeof(struct virtio_net_hdr);
253         }
254         VHOST_LOG_DEBUG(VHOST_CONFIG,
255                 "(%d) mergeable RX buffers %s, virtio 1 %s\n",
256                 dev->vid,
257                 (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
258                 (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
259
260         if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
261             !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
262                 /*
263                  * Remove all but first queue pair if MQ hasn't been
264                  * negotiated. This is safe because the device is not
265                  * running at this stage.
266                  */
267                 while (dev->nr_vring > 2) {
268                         struct vhost_virtqueue *vq;
269
270                         vq = dev->virtqueue[--dev->nr_vring];
271                         if (!vq)
272                                 continue;
273
274                         dev->virtqueue[dev->nr_vring] = NULL;
275                         cleanup_vq(vq, 1);
276                         free_vq(dev, vq);
277                 }
278         }
279
280         did = dev->vdpa_dev_id;
281         vdpa_dev = rte_vdpa_get_device(did);
282         if (vdpa_dev && vdpa_dev->ops->set_features)
283                 vdpa_dev->ops->set_features(dev->vid);
284
285         return VH_RESULT_OK;
286 }
287
288 /*
289  * The virtio device sends us the size of the descriptor ring.
290  */
291 static int
292 vhost_user_set_vring_num(struct virtio_net **pdev,
293                         struct VhostUserMsg *msg,
294                         int main_fd __rte_unused)
295 {
296         struct virtio_net *dev = *pdev;
297         struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
298
299         vq->size = msg->payload.state.num;
300
301         /* VIRTIO 1.0, 2.4 Virtqueues says:
302          *
303          *   Queue Size value is always a power of 2. The maximum Queue Size
304          *   value is 32768.
305          */
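        /*
         * size & (size - 1) is non-zero exactly when size is not a power of
         * two, e.g. 6 & 5 == 4, whereas 8 & 7 == 0.
         */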
306         if ((vq->size & (vq->size - 1)) || vq->size > 32768) {
307                 RTE_LOG(ERR, VHOST_CONFIG,
308                         "invalid virtqueue size %u\n", vq->size);
309                 return VH_RESULT_ERR;
310         }
311
312         if (dev->dequeue_zero_copy) {
313                 vq->nr_zmbuf = 0;
314                 vq->last_zmbuf_idx = 0;
315                 vq->zmbuf_size = vq->size;
316                 vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
317                                          sizeof(struct zcopy_mbuf), 0);
318                 if (vq->zmbufs == NULL) {
319                         RTE_LOG(WARNING, VHOST_CONFIG,
320                                 "failed to allocate mem for zero copy; "
321                                 "zero copy is force disabled\n");
322                         dev->dequeue_zero_copy = 0;
323                 }
324                 TAILQ_INIT(&vq->zmbuf_list);
325         }
326
327         if (vq_is_packed(dev)) {
328                 vq->shadow_used_packed = rte_malloc(NULL,
329                                 vq->size *
330                                 sizeof(struct vring_used_elem_packed),
331                                 RTE_CACHE_LINE_SIZE);
332                 if (!vq->shadow_used_packed) {
333                         RTE_LOG(ERR, VHOST_CONFIG,
334                                         "failed to allocate memory for shadow used ring.\n");
335                         return VH_RESULT_ERR;
336                 }
337
338         } else {
339                 vq->shadow_used_split = rte_malloc(NULL,
340                                 vq->size * sizeof(struct vring_used_elem),
341                                 RTE_CACHE_LINE_SIZE);
342                 if (!vq->shadow_used_split) {
343                         RTE_LOG(ERR, VHOST_CONFIG,
344                                         "failed to allocate memory for shadow used ring.\n");
345                         return VH_RESULT_ERR;
346                 }
347         }
348
349         vq->batch_copy_elems = rte_malloc(NULL,
350                                 vq->size * sizeof(struct batch_copy_elem),
351                                 RTE_CACHE_LINE_SIZE);
352         if (!vq->batch_copy_elems) {
353                 RTE_LOG(ERR, VHOST_CONFIG,
354                         "failed to allocate memory for batching copy.\n");
355                 return VH_RESULT_ERR;
356         }
357
358         return VH_RESULT_OK;
359 }
360
361 /*
362  * Reallocate the virtio_net and vhost_virtqueue data structures so that they
363  * reside on the same NUMA node as the memory backing the vring descriptors.
364  */
365 #ifdef RTE_LIBRTE_VHOST_NUMA
366 static struct virtio_net*
367 numa_realloc(struct virtio_net *dev, int index)
368 {
369         int oldnode, newnode;
370         struct virtio_net *old_dev;
371         struct vhost_virtqueue *old_vq, *vq;
372         struct zcopy_mbuf *new_zmbuf;
373         struct vring_used_elem *new_shadow_used_split;
374         struct vring_used_elem_packed *new_shadow_used_packed;
375         struct batch_copy_elem *new_batch_copy_elems;
376         int ret;
377
378         old_dev = dev;
379         vq = old_vq = dev->virtqueue[index];
380
381         ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
382                             MPOL_F_NODE | MPOL_F_ADDR);
383
384         /* check if we need to reallocate vq */
385         ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
386                              MPOL_F_NODE | MPOL_F_ADDR);
387         if (ret) {
388                 RTE_LOG(ERR, VHOST_CONFIG,
389                         "Unable to get vq numa information.\n");
390                 return dev;
391         }
392         if (oldnode != newnode) {
393                 RTE_LOG(INFO, VHOST_CONFIG,
394                         "reallocate vq from %d to %d node\n", oldnode, newnode);
395                 vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
396                 if (!vq)
397                         return dev;
398
399                 memcpy(vq, old_vq, sizeof(*vq));
400                 TAILQ_INIT(&vq->zmbuf_list);
401
402                 if (dev->dequeue_zero_copy) {
403                         new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size *
404                                         sizeof(struct zcopy_mbuf), 0, newnode);
405                         if (new_zmbuf) {
406                                 rte_free(vq->zmbufs);
407                                 vq->zmbufs = new_zmbuf;
408                         }
409                 }
410
411                 if (vq_is_packed(dev)) {
412                         new_shadow_used_packed = rte_malloc_socket(NULL,
413                                         vq->size *
414                                         sizeof(struct vring_used_elem_packed),
415                                         RTE_CACHE_LINE_SIZE,
416                                         newnode);
417                         if (new_shadow_used_packed) {
418                                 rte_free(vq->shadow_used_packed);
419                                 vq->shadow_used_packed = new_shadow_used_packed;
420                         }
421                 } else {
422                         new_shadow_used_split = rte_malloc_socket(NULL,
423                                         vq->size *
424                                         sizeof(struct vring_used_elem),
425                                         RTE_CACHE_LINE_SIZE,
426                                         newnode);
427                         if (new_shadow_used_split) {
428                                 rte_free(vq->shadow_used_split);
429                                 vq->shadow_used_split = new_shadow_used_split;
430                         }
431                 }
432
433                 new_batch_copy_elems = rte_malloc_socket(NULL,
434                         vq->size * sizeof(struct batch_copy_elem),
435                         RTE_CACHE_LINE_SIZE,
436                         newnode);
437                 if (new_batch_copy_elems) {
438                         rte_free(vq->batch_copy_elems);
439                         vq->batch_copy_elems = new_batch_copy_elems;
440                 }
441
442                 rte_free(old_vq);
443         }
444
445         /* check if we need to reallocate dev */
446         ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
447                             MPOL_F_NODE | MPOL_F_ADDR);
448         if (ret) {
449                 RTE_LOG(ERR, VHOST_CONFIG,
450                         "Unable to get dev numa information.\n");
451                 goto out;
452         }
453         if (oldnode != newnode) {
454                 RTE_LOG(INFO, VHOST_CONFIG,
455                         "reallocate dev from %d to %d node\n",
456                         oldnode, newnode);
457                 dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
458                 if (!dev) {
459                         dev = old_dev;
460                         goto out;
461                 }
462
463                 memcpy(dev, old_dev, sizeof(*dev));
464                 rte_free(old_dev);
465         }
466
467 out:
468         dev->virtqueue[index] = vq;
469         vhost_devices[dev->vid] = dev;
470
471         if (old_vq != vq)
472                 vhost_user_iotlb_init(dev, index);
473
474         return dev;
475 }
476 #else
477 static struct virtio_net*
478 numa_realloc(struct virtio_net *dev, int index __rte_unused)
479 {
480         return dev;
481 }
482 #endif
483
484 /* Converts QEMU virtual address to Vhost virtual address. */
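/*
 * On success, *len is clamped so that [qva, qva + *len) does not cross the
 * end of the matching region; if no region contains qva, *len is set to 0
 * and 0 is returned.
 */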
485 static uint64_t
486 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
487 {
488         struct rte_vhost_mem_region *r;
489         uint32_t i;
490
491         /* Find the region where the address lives. */
492         for (i = 0; i < dev->mem->nregions; i++) {
493                 r = &dev->mem->regions[i];
494
495                 if (qva >= r->guest_user_addr &&
496                     qva <  r->guest_user_addr + r->size) {
497
498                         if (unlikely(*len > r->guest_user_addr + r->size - qva))
499                                 *len = r->guest_user_addr + r->size - qva;
500
501                         return qva - r->guest_user_addr +
502                                r->host_user_addr;
503                 }
504         }
505         *len = 0;
506
507         return 0;
508 }
509
510
511 /*
512  * Converts ring address to Vhost virtual address.
513  * If IOMMU is enabled, the ring address is a guest IO virtual address,
514  * else it is a QEMU virtual address.
515  */
516 static uint64_t
517 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
518                 uint64_t ra, uint64_t *size)
519 {
520         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
521                 uint64_t vva;
522
523                 vva = vhost_user_iotlb_cache_find(vq, ra,
524                                         size, VHOST_ACCESS_RW);
525                 if (!vva)
526                         vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW);
527
528                 return vva;
529         }
530
531         return qva_to_vva(dev, ra, size);
532 }
533
534 static struct virtio_net *
535 translate_ring_addresses(struct virtio_net *dev, int vq_index)
536 {
537         struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
538         struct vhost_vring_addr *addr = &vq->ring_addrs;
539         uint64_t len;
540
541         if (vq_is_packed(dev)) {
542                 len = sizeof(struct vring_packed_desc) * vq->size;
543                 vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
544                         ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len);
545                 vq->log_guest_addr = 0;
546                 if (vq->desc_packed == NULL ||
547                                 len != sizeof(struct vring_packed_desc) *
548                                 vq->size) {
549                         RTE_LOG(DEBUG, VHOST_CONFIG,
550                                 "(%d) failed to map desc_packed ring.\n",
551                                 dev->vid);
552                         return dev;
553                 }
554
555                 dev = numa_realloc(dev, vq_index);
556                 vq = dev->virtqueue[vq_index];
557                 addr = &vq->ring_addrs;
558
559                 len = sizeof(struct vring_packed_desc_event);
560                 vq->driver_event = (struct vring_packed_desc_event *)
561                                         (uintptr_t)ring_addr_to_vva(dev,
562                                         vq, addr->avail_user_addr, &len);
563                 if (vq->driver_event == NULL ||
564                                 len != sizeof(struct vring_packed_desc_event)) {
565                         RTE_LOG(DEBUG, VHOST_CONFIG,
566                                 "(%d) failed to find driver area address.\n",
567                                 dev->vid);
568                         return dev;
569                 }
570
571                 len = sizeof(struct vring_packed_desc_event);
572                 vq->device_event = (struct vring_packed_desc_event *)
573                                         (uintptr_t)ring_addr_to_vva(dev,
574                                         vq, addr->used_user_addr, &len);
575                 if (vq->device_event == NULL ||
576                                 len != sizeof(struct vring_packed_desc_event)) {
577                         RTE_LOG(DEBUG, VHOST_CONFIG,
578                                 "(%d) failed to find device area address.\n",
579                                 dev->vid);
580                         return dev;
581                 }
582
583                 return dev;
584         }
585
586         /* The addresses are converted from QEMU virtual to Vhost virtual. */
587         if (vq->desc && vq->avail && vq->used)
588                 return dev;
589
590         len = sizeof(struct vring_desc) * vq->size;
591         vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
592                         vq, addr->desc_user_addr, &len);
593         if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
594                 RTE_LOG(DEBUG, VHOST_CONFIG,
595                         "(%d) failed to map desc ring.\n",
596                         dev->vid);
597                 return dev;
598         }
599
600         dev = numa_realloc(dev, vq_index);
601         vq = dev->virtqueue[vq_index];
602         addr = &vq->ring_addrs;
603
604         len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
605         vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
606                         vq, addr->avail_user_addr, &len);
607         if (vq->avail == 0 ||
608                         len != sizeof(struct vring_avail) +
609                         sizeof(uint16_t) * vq->size) {
610                 RTE_LOG(DEBUG, VHOST_CONFIG,
611                         "(%d) failed to map avail ring.\n",
612                         dev->vid);
613                 return dev;
614         }
615
616         len = sizeof(struct vring_used) +
617                 sizeof(struct vring_used_elem) * vq->size;
618         vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
619                         vq, addr->used_user_addr, &len);
620         if (vq->used == 0 || len != sizeof(struct vring_used) +
621                         sizeof(struct vring_used_elem) * vq->size) {
622                 RTE_LOG(DEBUG, VHOST_CONFIG,
623                         "(%d) failed to map used ring.\n",
624                         dev->vid);
625                 return dev;
626         }
627
628         if (vq->last_used_idx != vq->used->idx) {
629                 RTE_LOG(WARNING, VHOST_CONFIG,
630                         "last_used_idx (%u) and vq->used->idx (%u) mismatch; "
631                         "some packets may be resent for Tx and dropped for Rx\n",
632                         vq->last_used_idx, vq->used->idx);
633                 vq->last_used_idx  = vq->used->idx;
634                 vq->last_avail_idx = vq->used->idx;
635         }
636
637         vq->log_guest_addr = addr->log_guest_addr;
638
639         VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
640                         dev->vid, vq->desc);
641         VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
642                         dev->vid, vq->avail);
643         VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
644                         dev->vid, vq->used);
645         VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
646                         dev->vid, vq->log_guest_addr);
647
648         return dev;
649 }
650
651 /*
652  * The virtio device sends us the desc, used and avail ring addresses.
653  * This function then converts these to our address space.
654  */
655 static int
656 vhost_user_set_vring_addr(struct virtio_net **pdev, struct VhostUserMsg *msg,
657                         int main_fd __rte_unused)
658 {
659         struct virtio_net *dev = *pdev;
660         struct vhost_virtqueue *vq;
661         struct vhost_vring_addr *addr = &msg->payload.addr;
662
663         if (dev->mem == NULL)
664                 return VH_RESULT_ERR;
665
666         /* addr->index refers to the queue index. The txq is 1, the rxq is 0. */
667         vq = dev->virtqueue[msg->payload.addr.index];
668
669         /*
670          * Ring addresses should not be interpreted as long as the ring has not
671          * been started and enabled.
672          */
673         memcpy(&vq->ring_addrs, addr, sizeof(*addr));
674
675         vring_invalidate(dev, vq);
676
677         if (vq->enabled && (dev->features &
678                                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
679                 dev = translate_ring_addresses(dev, msg->payload.addr.index);
680                 if (!dev)
681                         return VH_RESULT_ERR;
682
683                 *pdev = dev;
684         }
685
686         return VH_RESULT_OK;
687 }
688
689 /*
690  * The virtio device sends us the last used index of the available ring.
691  */
692 static int
693 vhost_user_set_vring_base(struct virtio_net **pdev,
694                         struct VhostUserMsg *msg,
695                         int main_fd __rte_unused)
696 {
697         struct virtio_net *dev = *pdev;
698         dev->virtqueue[msg->payload.state.index]->last_used_idx  =
699                         msg->payload.state.num;
700         dev->virtqueue[msg->payload.state.index]->last_avail_idx =
701                         msg->payload.state.num;
702
703         return VH_RESULT_OK;
704 }
705
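/*
 * dev->guest_pages[] records the guest-physical to host-physical mapping of
 * each guest memory page so that dequeue zero-copy can translate descriptor
 * addresses; entries that are physically contiguous on the host are merged.
 */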
706 static int
707 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
708                    uint64_t host_phys_addr, uint64_t size)
709 {
710         struct guest_page *page, *last_page;
711
712         if (dev->nr_guest_pages == dev->max_guest_pages) {
713                 dev->max_guest_pages *= 2;
714                 dev->guest_pages = realloc(dev->guest_pages,
715                                         dev->max_guest_pages * sizeof(*page));
716                 if (!dev->guest_pages) {
717                         RTE_LOG(ERR, VHOST_CONFIG, "cannot realloc guest_pages\n");
718                         return -1;
719                 }
720         }
721
722         if (dev->nr_guest_pages > 0) {
723                 last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
724                 /* merge if the two pages are physically contiguous */
725                 if (host_phys_addr == last_page->host_phys_addr +
726                                       last_page->size) {
727                         last_page->size += size;
728                         return 0;
729                 }
730         }
731
732         page = &dev->guest_pages[dev->nr_guest_pages++];
733         page->guest_phys_addr = guest_phys_addr;
734         page->host_phys_addr  = host_phys_addr;
735         page->size = size;
736
737         return 0;
738 }
739
740 static int
741 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
742                 uint64_t page_size)
743 {
744         uint64_t reg_size = reg->size;
745         uint64_t host_user_addr  = reg->host_user_addr;
746         uint64_t guest_phys_addr = reg->guest_phys_addr;
747         uint64_t host_phys_addr;
748         uint64_t size;
749
750         host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
751         size = page_size - (guest_phys_addr & (page_size - 1));
752         size = RTE_MIN(size, reg_size);
753
754         if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
755                 return -1;
756
757         host_user_addr  += size;
758         guest_phys_addr += size;
759         reg_size -= size;
760
761         while (reg_size > 0) {
762                 size = RTE_MIN(reg_size, page_size);
763                 host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
764                                                   host_user_addr);
765                 if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
766                                 size) < 0)
767                         return -1;
768
769                 host_user_addr  += size;
770                 guest_phys_addr += size;
771                 reg_size -= size;
772         }
773
774         return 0;
775 }
776
777 #ifdef RTE_LIBRTE_VHOST_DEBUG
778 /* TODO: enable it only in debug mode? */
779 static void
780 dump_guest_pages(struct virtio_net *dev)
781 {
782         uint32_t i;
783         struct guest_page *page;
784
785         for (i = 0; i < dev->nr_guest_pages; i++) {
786                 page = &dev->guest_pages[i];
787
788                 RTE_LOG(INFO, VHOST_CONFIG,
789                         "guest physical page region %u\n"
790                         "\t guest_phys_addr: %" PRIx64 "\n"
791                         "\t host_phys_addr : %" PRIx64 "\n"
792                         "\t size           : %" PRIx64 "\n",
793                         i,
794                         page->guest_phys_addr,
795                         page->host_phys_addr,
796                         page->size);
797         }
798 }
799 #else
800 #define dump_guest_pages(dev)
801 #endif
802
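/*
 * Compare the memory table received from the master with the one currently
 * mapped: any difference in region count, guest physical address, size or
 * guest virtual address means the table has changed and must be remapped.
 */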
803 static bool
804 vhost_memory_changed(struct VhostUserMemory *new,
805                      struct rte_vhost_memory *old)
806 {
807         uint32_t i;
808
809         if (new->nregions != old->nregions)
810                 return true;
811
812         for (i = 0; i < new->nregions; ++i) {
813                 VhostUserMemoryRegion *new_r = &new->regions[i];
814                 struct rte_vhost_mem_region *old_r = &old->regions[i];
815
816                 if (new_r->guest_phys_addr != old_r->guest_phys_addr)
817                         return true;
818                 if (new_r->memory_size != old_r->size)
819                         return true;
820                 if (new_r->userspace_addr != old_r->guest_user_addr)
821                         return true;
822         }
823
824         return false;
825 }
826
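/*
 * VHOST_USER_SET_MEM_TABLE: map every guest memory region announced by the
 * master, using the file descriptors passed in ancillary data. When postcopy
 * is listening, the mapped vhost virtual addresses are sent back to QEMU and
 * each region is registered with userfaultfd before the memory is used.
 */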
827 static int
828 vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
829                         int main_fd)
830 {
831         struct virtio_net *dev = *pdev;
832         struct VhostUserMemory *memory = &msg->payload.memory;
833         struct rte_vhost_mem_region *reg;
834         void *mmap_addr;
835         uint64_t mmap_size;
836         uint64_t mmap_offset;
837         uint64_t alignment;
838         uint32_t i;
839         int populate;
840         int fd;
841
842         if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
843                 RTE_LOG(ERR, VHOST_CONFIG,
844                         "too many memory regions (%u)\n", memory->nregions);
845                 return VH_RESULT_ERR;
846         }
847
848         if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
849                 RTE_LOG(INFO, VHOST_CONFIG,
850                         "(%d) memory regions not changed\n", dev->vid);
851
852                 for (i = 0; i < memory->nregions; i++)
853                         close(msg->fds[i]);
854
855                 return VH_RESULT_OK;
856         }
857
858         if (dev->mem) {
859                 free_mem_region(dev);
860                 rte_free(dev->mem);
861                 dev->mem = NULL;
862         }
863
864         /* Flush IOTLB cache as previous HVAs are now invalid */
865         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
866                 for (i = 0; i < dev->nr_vring; i++)
867                         vhost_user_iotlb_flush_all(dev->virtqueue[i]);
868
869         dev->nr_guest_pages = 0;
870         if (!dev->guest_pages) {
871                 dev->max_guest_pages = 8;
872                 dev->guest_pages = malloc(dev->max_guest_pages *
873                                                 sizeof(struct guest_page));
874                 if (dev->guest_pages == NULL) {
875                         RTE_LOG(ERR, VHOST_CONFIG,
876                                 "(%d) failed to allocate memory "
877                                 "for dev->guest_pages\n",
878                                 dev->vid);
879                         return VH_RESULT_ERR;
880                 }
881         }
882
883         dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
884                 sizeof(struct rte_vhost_mem_region) * memory->nregions, 0);
885         if (dev->mem == NULL) {
886                 RTE_LOG(ERR, VHOST_CONFIG,
887                         "(%d) failed to allocate memory for dev->mem\n",
888                         dev->vid);
889                 return VH_RESULT_ERR;
890         }
891         dev->mem->nregions = memory->nregions;
892
893         for (i = 0; i < memory->nregions; i++) {
894                 fd  = msg->fds[i];
895                 reg = &dev->mem->regions[i];
896
897                 reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
898                 reg->guest_user_addr = memory->regions[i].userspace_addr;
899                 reg->size            = memory->regions[i].memory_size;
900                 reg->fd              = fd;
901
902                 mmap_offset = memory->regions[i].mmap_offset;
903
904                 /* Check for memory_size + mmap_offset overflow */
905                 if (mmap_offset >= -reg->size) {
906                         RTE_LOG(ERR, VHOST_CONFIG,
907                                 "mmap_offset (%#"PRIx64") and memory_size "
908                                 "(%#"PRIx64") overflow\n",
909                                 mmap_offset, reg->size);
910                         goto err_mmap;
911                 }
912
913                 mmap_size = reg->size + mmap_offset;
914
915                 /* Without the MAP_ANONYMOUS flag, mmap() must be called with a
916                  * length argument aligned to the hugepage size on older
917                  * long-term Linux kernels, such as 2.6.32 and 3.2.72, otherwise
918                  * mmap() will fail with EINVAL.
919                  *
920                  * To avoid that failure, make sure the length passed to mmap()
921                  * is kept aligned here.
922                  */
923                 alignment = get_blk_size(fd);
924                 if (alignment == (uint64_t)-1) {
925                         RTE_LOG(ERR, VHOST_CONFIG,
926                                 "couldn't get hugepage size through fstat\n");
927                         goto err_mmap;
928                 }
929                 mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
930
931                 populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
932                 mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
933                                  MAP_SHARED | populate, fd, 0);
934
935                 if (mmap_addr == MAP_FAILED) {
936                         RTE_LOG(ERR, VHOST_CONFIG,
937                                 "mmap region %u failed.\n", i);
938                         goto err_mmap;
939                 }
940
941                 reg->mmap_addr = mmap_addr;
942                 reg->mmap_size = mmap_size;
943                 reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
944                                       mmap_offset;
945
946                 if (dev->dequeue_zero_copy)
947                         if (add_guest_pages(dev, reg, alignment) < 0) {
948                                 RTE_LOG(ERR, VHOST_CONFIG,
949                                         "adding guest pages to region %u failed.\n",
950                                         i);
951                                 goto err_mmap;
952                         }
953
954                 RTE_LOG(INFO, VHOST_CONFIG,
955                         "guest memory region %u, size: 0x%" PRIx64 "\n"
956                         "\t guest physical addr: 0x%" PRIx64 "\n"
957                         "\t guest virtual  addr: 0x%" PRIx64 "\n"
958                         "\t host  virtual  addr: 0x%" PRIx64 "\n"
959                         "\t mmap addr : 0x%" PRIx64 "\n"
960                         "\t mmap size : 0x%" PRIx64 "\n"
961                         "\t mmap align: 0x%" PRIx64 "\n"
962                         "\t mmap off  : 0x%" PRIx64 "\n",
963                         i, reg->size,
964                         reg->guest_phys_addr,
965                         reg->guest_user_addr,
966                         reg->host_user_addr,
967                         (uint64_t)(uintptr_t)mmap_addr,
968                         mmap_size,
969                         alignment,
970                         mmap_offset);
971
972                 if (dev->postcopy_listening) {
973                         /*
974                          * We don't have a better way right now than sharing
975                          * DPDK's virtual address with Qemu, so that Qemu can
976                          * retrieve the region offset when handling userfaults.
977                          */
978                         memory->regions[i].userspace_addr =
979                                 reg->host_user_addr;
980                 }
981         }
982         if (dev->postcopy_listening) {
983                 /* Send the addresses back to qemu */
984                 msg->fd_num = 0;
985                 send_vhost_reply(main_fd, msg);
986
987                 /* Wait for QEMU to acknowledge that it has got the addresses;
988                  * we have to wait before we are allowed to generate faults.
989                  */
990                 VhostUserMsg ack_msg;
991                 if (read_vhost_message(main_fd, &ack_msg) <= 0) {
992                         RTE_LOG(ERR, VHOST_CONFIG,
993                                 "Failed to read qemu ack on postcopy set-mem-table\n");
994                         goto err_mmap;
995                 }
996                 if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
997                         RTE_LOG(ERR, VHOST_CONFIG,
998                                 "Bad qemu ack on postcopy set-mem-table (%d)\n",
999                                 ack_msg.request.master);
1000                         goto err_mmap;
1001                 }
1002
1003                 /* Now register with userfaultfd, after which we can use the memory */
1004                 for (i = 0; i < memory->nregions; i++) {
1005 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
1006                         reg = &dev->mem->regions[i];
1007                         struct uffdio_register reg_struct;
1008
1009                         /*
1010                          * Register the whole mmap'ed area to ensure the
1011                          * range is aligned on a page boundary.
1012                          */
1013                         reg_struct.range.start =
1014                                 (uint64_t)(uintptr_t)reg->mmap_addr;
1015                         reg_struct.range.len = reg->mmap_size;
1016                         reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
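                        /*
                         * MODE_MISSING makes the kernel report a userfault
                         * event for every access to a not-yet-populated page
                         * in the range, so QEMU can fetch the missing page
                         * from the migration source before execution resumes.
                         */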
1017
1018                         if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
1019                                                 &reg_struct)) {
1020                                 RTE_LOG(ERR, VHOST_CONFIG,
1021                                         "Failed to register ufd for region %d: (ufd = %d) %s\n",
1022                                         i, dev->postcopy_ufd,
1023                                         strerror(errno));
1024                                 goto err_mmap;
1025                         }
1026                         RTE_LOG(INFO, VHOST_CONFIG,
1027                                 "\t userfaultfd registered for range : %llx - %llx\n",
1028                                 reg_struct.range.start,
1029                                 reg_struct.range.start +
1030                                 reg_struct.range.len - 1);
1031 #else
1032                         goto err_mmap;
1033 #endif
1034                 }
1035         }
1036
1037         for (i = 0; i < dev->nr_vring; i++) {
1038                 struct vhost_virtqueue *vq = dev->virtqueue[i];
1039
1040                 if (vq->desc || vq->avail || vq->used) {
1041                         /*
1042                          * If the memory table got updated, the ring addresses
1043                          * need to be translated again as virtual addresses have
1044                          * changed.
1045                          */
1046                         vring_invalidate(dev, vq);
1047
1048                         dev = translate_ring_addresses(dev, i);
1049                         if (!dev) {
1050                                 dev = *pdev;
1051                                 goto err_mmap;
1052                         }
1053
1054                         *pdev = dev;
1055                 }
1056         }
1057
1058         dump_guest_pages(dev);
1059
1060         return VH_RESULT_OK;
1061
1062 err_mmap:
1063         free_mem_region(dev);
1064         rte_free(dev->mem);
1065         dev->mem = NULL;
1066         return VH_RESULT_ERR;
1067 }
1068
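/*
 * A virtqueue is ready once its rings have been mapped and both the kick and
 * call eventfds have been received from the master.
 */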
1069 static bool
1070 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
1071 {
1072         bool rings_ok;
1073
1074         if (!vq)
1075                 return false;
1076
1077         if (vq_is_packed(dev))
1078                 rings_ok = !!vq->desc_packed;
1079         else
1080                 rings_ok = vq->desc && vq->avail && vq->used;
1081
1082         return rings_ok &&
1083                vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1084                vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
1085 }
1086
1087 static int
1088 virtio_is_ready(struct virtio_net *dev)
1089 {
1090         struct vhost_virtqueue *vq;
1091         uint32_t i;
1092
1093         if (dev->nr_vring == 0)
1094                 return 0;
1095
1096         for (i = 0; i < dev->nr_vring; i++) {
1097                 vq = dev->virtqueue[i];
1098
1099                 if (!vq_is_ready(dev, vq))
1100                         return 0;
1101         }
1102
1103         RTE_LOG(INFO, VHOST_CONFIG,
1104                 "virtio is now ready for processing.\n");
1105         return 1;
1106 }
1107
1108 static int
1109 vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
1110                         int main_fd __rte_unused)
1111 {
1112         struct virtio_net *dev = *pdev;
1113         struct vhost_vring_file file;
1114         struct vhost_virtqueue *vq;
1115
1116         file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
1117         if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
1118                 file.fd = VIRTIO_INVALID_EVENTFD;
1119         else
1120                 file.fd = msg->fds[0];
1121         RTE_LOG(INFO, VHOST_CONFIG,
1122                 "vring call idx:%d file:%d\n", file.index, file.fd);
1123
1124         vq = dev->virtqueue[file.index];
1125         if (vq->callfd >= 0)
1126                 close(vq->callfd);
1127
1128         vq->callfd = file.fd;
1129
1130         return VH_RESULT_OK;
1131 }
1132
1133 static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
1134                         struct VhostUserMsg *msg,
1135                         int main_fd __rte_unused)
1136 {
1137         if (!(msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK))
1138                 close(msg->fds[0]);
1139         RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
1140
1141         return VH_RESULT_OK;
1142 }
1143
1144 static int
1145 vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
1146                         int main_fd __rte_unused)
1147 {
1148         struct virtio_net *dev = *pdev;
1149         struct vhost_vring_file file;
1150         struct vhost_virtqueue *vq;
1151
1152         file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
1153         if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
1154                 file.fd = VIRTIO_INVALID_EVENTFD;
1155         else
1156                 file.fd = msg->fds[0];
1157         RTE_LOG(INFO, VHOST_CONFIG,
1158                 "vring kick idx:%d file:%d\n", file.index, file.fd);
1159
1160         /* Interpret ring addresses only when ring is started. */
1161         dev = translate_ring_addresses(dev, file.index);
1162         if (!dev)
1163                 return VH_RESULT_ERR;
1164
1165         *pdev = dev;
1166
1167         vq = dev->virtqueue[file.index];
1168
1169         /*
1170          * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
1171          * the ring starts already enabled. Otherwise, it is enabled via
1172          * the SET_VRING_ENABLE message.
1173          */
1174         if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)))
1175                 vq->enabled = 1;
1176
1177         if (vq->kickfd >= 0)
1178                 close(vq->kickfd);
1179         vq->kickfd = file.fd;
1180
1181         return VH_RESULT_OK;
1182 }
1183
1184 static void
1185 free_zmbufs(struct vhost_virtqueue *vq)
1186 {
1187         struct zcopy_mbuf *zmbuf, *next;
1188
1189         for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1190              zmbuf != NULL; zmbuf = next) {
1191                 next = TAILQ_NEXT(zmbuf, next);
1192
1193                 rte_pktmbuf_free(zmbuf->mbuf);
1194                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1195         }
1196
1197         rte_free(vq->zmbufs);
1198 }
1199
1200 /*
1201  * When virtio is stopped, QEMU will send us the GET_VRING_BASE message.
1202  */
1203 static int
1204 vhost_user_get_vring_base(struct virtio_net **pdev,
1205                         struct VhostUserMsg *msg,
1206                         int main_fd __rte_unused)
1207 {
1208         struct virtio_net *dev = *pdev;
1209         struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
1210
1211         /* We have to stop the queue (virtio) if it is running. */
1212         vhost_destroy_device_notify(dev);
1213
1214         dev->flags &= ~VIRTIO_DEV_READY;
1215         dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
1216
1217         /* Here it is safe to get the last avail index */
1218         msg->payload.state.num = vq->last_avail_idx;
1219
1220         RTE_LOG(INFO, VHOST_CONFIG,
1221                 "vring base idx:%d file:%d\n", msg->payload.state.index,
1222                 msg->payload.state.num);
1223         /*
1224          * Based on the current QEMU vhost-user implementation, this message
1225          * is sent, and only sent, in vhost_vring_stop.
1226          * TODO: clean up the vring; it isn't usable from this point on.
1227          */
1228         if (vq->kickfd >= 0)
1229                 close(vq->kickfd);
1230
1231         vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
1232
1233         if (vq->callfd >= 0)
1234                 close(vq->callfd);
1235
1236         vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
1237
1238         if (dev->dequeue_zero_copy)
1239                 free_zmbufs(vq);
1240         if (vq_is_packed(dev)) {
1241                 rte_free(vq->shadow_used_packed);
1242                 vq->shadow_used_packed = NULL;
1243         } else {
1244                 rte_free(vq->shadow_used_split);
1245                 vq->shadow_used_split = NULL;
1246         }
1247
1248         rte_free(vq->batch_copy_elems);
1249         vq->batch_copy_elems = NULL;
1250
1251         msg->size = sizeof(msg->payload.state);
1252         msg->fd_num = 0;
1253
1254         return VH_RESULT_REPLY;
1255 }
1256
1257 /*
1258  * When the virtio queues are ready to work, QEMU will send us a message
1259  * to enable the virtio queue pair.
1260  */
1261 static int
1262 vhost_user_set_vring_enable(struct virtio_net **pdev,
1263                         struct VhostUserMsg *msg,
1264                         int main_fd __rte_unused)
1265 {
1266         struct virtio_net *dev = *pdev;
1267         int enable = (int)msg->payload.state.num;
1268         int index = (int)msg->payload.state.index;
1269         struct rte_vdpa_device *vdpa_dev;
1270         int did = -1;
1271
1272         RTE_LOG(INFO, VHOST_CONFIG,
1273                 "set queue enable: %d to qp idx: %d\n",
1274                 enable, index);
1275
1276         did = dev->vdpa_dev_id;
1277         vdpa_dev = rte_vdpa_get_device(did);
1278         if (vdpa_dev && vdpa_dev->ops->set_vring_state)
1279                 vdpa_dev->ops->set_vring_state(dev->vid, index, enable);
1280
1281         if (dev->notify_ops->vring_state_changed)
1282                 dev->notify_ops->vring_state_changed(dev->vid,
1283                                 index, enable);
1284
1285         dev->virtqueue[index]->enabled = enable;
1286
1287         return VH_RESULT_OK;
1288 }
1289
1290 static int
1291 vhost_user_get_protocol_features(struct virtio_net **pdev,
1292                         struct VhostUserMsg *msg,
1293                         int main_fd __rte_unused)
1294 {
1295         struct virtio_net *dev = *pdev;
1296         uint64_t features, protocol_features;
1297
1298         rte_vhost_driver_get_features(dev->ifname, &features);
1299         rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features);
1300
1301         /*
1302          * For now, the REPLY_ACK protocol feature is only mandatory for
1303          * the IOMMU feature. If IOMMU is explicitly disabled by the
1304          * application, also disable the REPLY_ACK feature to cope with
1305          * older buggy QEMU versions (from v2.7.0 to v2.9.0).
1306          */
1307         if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
1308                 protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);
1309
1310         msg->payload.u64 = protocol_features;
1311         msg->size = sizeof(msg->payload.u64);
1312         msg->fd_num = 0;
1313
1314         return VH_RESULT_REPLY;
1315 }
1316
1317 static int
1318 vhost_user_set_protocol_features(struct virtio_net **pdev,
1319                         struct VhostUserMsg *msg,
1320                         int main_fd __rte_unused)
1321 {
1322         struct virtio_net *dev = *pdev;
1323         uint64_t protocol_features = msg->payload.u64;
1324         if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) {
1325                 RTE_LOG(ERR, VHOST_CONFIG,
1326                         "(%d) received invalid protocol features.\n",
1327                         dev->vid);
1328                 return VH_RESULT_ERR;
1329         }
1330
1331         dev->protocol_features = protocol_features;
1332
1333         return VH_RESULT_OK;
1334 }
1335
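/*
 * VHOST_USER_SET_LOG_BASE: map the dirty-page log area shared with QEMU for
 * live migration; the fd arrives in ancillary data and the payload carries
 * the mmap size and offset.
 */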
1336 static int
1337 vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
1338                         int main_fd __rte_unused)
1339 {
1340         struct virtio_net *dev = *pdev;
1341         int fd = msg->fds[0];
1342         uint64_t size, off;
1343         void *addr;
1344
1345         if (fd < 0) {
1346                 RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
1347                 return VH_RESULT_ERR;
1348         }
1349
1350         if (msg->size != sizeof(VhostUserLog)) {
1351                 RTE_LOG(ERR, VHOST_CONFIG,
1352                         "invalid log base msg size: %"PRId32" != %d\n",
1353                         msg->size, (int)sizeof(VhostUserLog));
1354                 return VH_RESULT_ERR;
1355         }
1356
1357         size = msg->payload.log.mmap_size;
1358         off  = msg->payload.log.mmap_offset;
1359
1360         /* Don't allow mmap_offset to point outside the mmap region */
1361         if (off > size) {
1362                 RTE_LOG(ERR, VHOST_CONFIG,
1363                         "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n",
1364                         off, size);
1365                 return VH_RESULT_ERR;
1366         }
1367
1368         RTE_LOG(INFO, VHOST_CONFIG,
1369                 "log mmap size: %"PRId64", offset: %"PRId64"\n",
1370                 size, off);
1371
1372         /*
1373          * mmap from offset 0 to work around a hugepage mmap bug: mmap will
1374          * fail when the offset is not page-size aligned.
1375          */
1376         addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
1377         close(fd);
1378         if (addr == MAP_FAILED) {
1379                 RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
1380                 return VH_RESULT_ERR;
1381         }
1382
1383         /*
1384          * Free previously mapped log memory in case multiple
1385          * VHOST_USER_SET_LOG_BASE messages are received.
1386          */
1387         if (dev->log_addr) {
1388                 munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
1389         }
1390         dev->log_addr = (uint64_t)(uintptr_t)addr;
1391         dev->log_base = dev->log_addr + off;
1392         dev->log_size = size;
1393
1394         /*
1395          * The spec is not clear about it (yet), but QEMU doesn't expect
1396          * any payload in the reply.
1397          */
1398         msg->size = 0;
1399         msg->fd_num = 0;
1400
1401         return VH_RESULT_REPLY;
1402 }
1403
1404 static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused,
1405                         struct VhostUserMsg *msg,
1406                         int main_fd __rte_unused)
1407 {
1408         close(msg->fds[0]);
1409         RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
1410
1411         return VH_RESULT_OK;
1412 }
1413
1414 /*
1415  * A RARP packet is constructed and broadcast to notify switches about
1416  * the new location of the migrated VM, so that packets from outside will
1417  * not be lost after migration.
1418  *
1419  * However, we don't actually "send" a RARP packet here; instead, we set
1420  * the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject it.
1421  */
1422 static int
1423 vhost_user_send_rarp(struct virtio_net **pdev, struct VhostUserMsg *msg,
1424                         int main_fd __rte_unused)
1425 {
1426         struct virtio_net *dev = *pdev;
1427         uint8_t *mac = (uint8_t *)&msg->payload.u64;
1428         struct rte_vdpa_device *vdpa_dev;
1429         int did = -1;
1430
1431         RTE_LOG(DEBUG, VHOST_CONFIG,
1432                 ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
1433                 mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
1434         memcpy(dev->mac.addr_bytes, mac, 6);
1435
1436         /*
1437          * Set the flag to inject a RARP broadcast packet at
1438          * rte_vhost_dequeue_burst().
1439          *
1440          * rte_smp_wmb() is for making sure the mac is copied
1441          * before the flag is set.
1442          */
1443         rte_smp_wmb();
1444         rte_atomic16_set(&dev->broadcast_rarp, 1);
1445         did = dev->vdpa_dev_id;
1446         vdpa_dev = rte_vdpa_get_device(did);
1447         if (vdpa_dev && vdpa_dev->ops->migration_done)
1448                 vdpa_dev->ops->migration_done(dev->vid);
1449
1450         return VH_RESULT_OK;
1451 }
1452
1453 static int
1454 vhost_user_net_set_mtu(struct virtio_net **pdev, struct VhostUserMsg *msg,
1455                         int main_fd __rte_unused)
1456 {
1457         struct virtio_net *dev = *pdev;
1458         if (msg->payload.u64 < VIRTIO_MIN_MTU ||
1459                         msg->payload.u64 > VIRTIO_MAX_MTU) {
1460                 RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
1461                                 msg->payload.u64);
1462
1463                 return VH_RESULT_ERR;
1464         }
1465
1466         dev->mtu = msg->payload.u64;
1467
1468         return VH_RESULT_OK;
1469 }
1470
1471 static int
1472 vhost_user_set_req_fd(struct virtio_net **pdev, struct VhostUserMsg *msg,
1473                         int main_fd __rte_unused)
1474 {
1475         struct virtio_net *dev = *pdev;
1476         int fd = msg->fds[0];
1477
1478         if (fd < 0) {
1479                 RTE_LOG(ERR, VHOST_CONFIG,
1480                                 "Invalid file descriptor for slave channel (%d)\n",
1481                                 fd);
1482                 return VH_RESULT_ERR;
1483         }
1484
1485         dev->slave_req_fd = fd;
1486
1487         return VH_RESULT_OK;
1488 }
1489
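/*
 * Return 1 if the IOTLB update range [iova, iova + size) covers the guest
 * address of this virtqueue's descriptor, available or used ring, in which
 * case the caller re-translates the ring addresses.
 */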
1490 static int
1491 is_vring_iotlb_update(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
1492 {
1493         struct vhost_vring_addr *ra;
1494         uint64_t start, end;
1495
1496         start = imsg->iova;
1497         end = start + imsg->size;
1498
1499         ra = &vq->ring_addrs;
1500         if (ra->desc_user_addr >= start && ra->desc_user_addr < end)
1501                 return 1;
1502         if (ra->avail_user_addr >= start && ra->avail_user_addr < end)
1503                 return 1;
1504         if (ra->used_user_addr >= start && ra->used_user_addr < end)
1505                 return 1;
1506
1507         return 0;
1508 }
1509
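/*
 * Return 1 if the invalidated range [istart, iend] overlaps the addresses
 * currently cached for this virtqueue's descriptor, available or used ring,
 * in which case the ring mapping must be invalidated.
 */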
1510 static int
1511 is_vring_iotlb_invalidate(struct vhost_virtqueue *vq,
1512                                 struct vhost_iotlb_msg *imsg)
1513 {
1514         uint64_t istart, iend, vstart, vend;
1515
1516         istart = imsg->iova;
1517         iend = istart + imsg->size - 1;
1518
1519         vstart = (uintptr_t)vq->desc;
1520         vend = vstart + sizeof(struct vring_desc) * vq->size - 1;
1521         if (vstart <= iend && istart <= vend)
1522                 return 1;
1523
1524         vstart = (uintptr_t)vq->avail;
1525         vend = vstart + sizeof(struct vring_avail);
1526         vend += sizeof(uint16_t) * vq->size - 1;
1527         if (vstart <= iend && istart <= vend)
1528                 return 1;
1529
1530         vstart = (uintptr_t)vq->used;
1531         vend = vstart + sizeof(struct vring_used);
1532         vend += sizeof(struct vring_used_elem) * vq->size - 1;
1533         if (vstart <= iend && istart <= vend)
1534                 return 1;
1535
1536         return 0;
1537 }
1538
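/*
 * Handle an IOTLB message from the master: on UPDATE, translate the address,
 * insert it into each virtqueue's IOTLB cache and re-translate ring addresses
 * covered by the update; on INVALIDATE, remove the cached entries and
 * invalidate rings that overlap the range.
 */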
1539 static int
1540 vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg,
1541                         int main_fd __rte_unused)
1542 {
1543         struct virtio_net *dev = *pdev;
1544         struct vhost_iotlb_msg *imsg = &msg->payload.iotlb;
1545         uint16_t i;
1546         uint64_t vva, len;
1547
1548         switch (imsg->type) {
1549         case VHOST_IOTLB_UPDATE:
1550                 len = imsg->size;
1551                 vva = qva_to_vva(dev, imsg->uaddr, &len);
1552                 if (!vva)
1553                         return VH_RESULT_ERR;
1554
1555                 for (i = 0; i < dev->nr_vring; i++) {
1556                         struct vhost_virtqueue *vq = dev->virtqueue[i];
1557
1558                         vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
1559                                         len, imsg->perm);
1560
1561                         if (is_vring_iotlb_update(vq, imsg))
1562                                 *pdev = dev = translate_ring_addresses(dev, i);
1563                 }
1564                 break;
1565         case VHOST_IOTLB_INVALIDATE:
1566                 for (i = 0; i < dev->nr_vring; i++) {
1567                         struct vhost_virtqueue *vq = dev->virtqueue[i];
1568
1569                         vhost_user_iotlb_cache_remove(vq, imsg->iova,
1570                                         imsg->size);
1571
1572                         if (is_vring_iotlb_invalidate(vq, imsg))
1573                                 vring_invalidate(dev, vq);
1574                 }
1575                 break;
1576         default:
1577                 RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n",
1578                                 imsg->type);
1579                 return VH_RESULT_ERR;
1580         }
1581
1582         return VH_RESULT_OK;
1583 }
1584
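/*
 * Open a userfaultfd, negotiate the UFFD API with the kernel and send the
 * file descriptor back to the master for use during postcopy live migration.
 * Fails when postcopy support is not compiled in.
 */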
1585 static int
1586 vhost_user_set_postcopy_advise(struct virtio_net **pdev,
1587                         struct VhostUserMsg *msg,
1588                         int main_fd __rte_unused)
1589 {
1590         struct virtio_net *dev = *pdev;
1591 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
1592         struct uffdio_api api_struct;
1593
1594         dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1595
1596         if (dev->postcopy_ufd == -1) {
1597                 RTE_LOG(ERR, VHOST_CONFIG, "Userfaultfd not available: %s\n",
1598                         strerror(errno));
1599                 return VH_RESULT_ERR;
1600         }
1601         api_struct.api = UFFD_API;
1602         api_struct.features = 0;
1603         if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
1604                 RTE_LOG(ERR, VHOST_CONFIG, "UFFDIO_API ioctl failure: %s\n",
1605                         strerror(errno));
1606                 close(dev->postcopy_ufd);
1607                 dev->postcopy_ufd = -1;
1608                 return VH_RESULT_ERR;
1609         }
1610         msg->fds[0] = dev->postcopy_ufd;
1611         msg->fd_num = 1;
1612
1613         return VH_RESULT_REPLY;
1614 #else
1615         dev->postcopy_ufd = -1;
1616         msg->fd_num = 0;
1617
1618         return VH_RESULT_ERR;
1619 #endif
1620 }
1621
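/*
 * The master announces it is entering postcopy-listen mode. This must arrive
 * before any memory regions have been registered, so just record that the
 * device is in the postcopy-listening state.
 */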
1622 static int
1623 vhost_user_set_postcopy_listen(struct virtio_net **pdev,
1624                         struct VhostUserMsg *msg __rte_unused,
1625                         int main_fd __rte_unused)
1626 {
1627         struct virtio_net *dev = *pdev;
1628
1629         if (dev->mem && dev->mem->nregions) {
1630                 RTE_LOG(ERR, VHOST_CONFIG,
1631                         "Regions already registered at postcopy-listen\n");
1632                 return VH_RESULT_ERR;
1633         }
1634         dev->postcopy_listening = 1;
1635
1636         return VH_RESULT_OK;
1637 }
1638
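/*
 * Dispatch table for master requests: each handler takes the device, the
 * received message and the connection fd, and returns VH_RESULT_OK,
 * VH_RESULT_ERR or VH_RESULT_REPLY (with the reply payload prepared in msg).
 */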
1639 typedef int (*vhost_message_handler_t)(struct virtio_net **pdev,
1640                                         struct VhostUserMsg *msg,
1641                                         int main_fd);
1642 static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
1643         [VHOST_USER_NONE] = NULL,
1644         [VHOST_USER_GET_FEATURES] = vhost_user_get_features,
1645         [VHOST_USER_SET_FEATURES] = vhost_user_set_features,
1646         [VHOST_USER_SET_OWNER] = vhost_user_set_owner,
1647         [VHOST_USER_RESET_OWNER] = vhost_user_reset_owner,
1648         [VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table,
1649         [VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base,
1650         [VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd,
1651         [VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num,
1652         [VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr,
1653         [VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base,
1654         [VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base,
1655         [VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick,
1656         [VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call,
1657         [VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err,
1658         [VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features,
1659         [VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features,
1660         [VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num,
1661         [VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable,
1662         [VHOST_USER_SEND_RARP] = vhost_user_send_rarp,
1663         [VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
1664         [VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
1665         [VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
1666         [VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
1667         [VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
1668 };
1669
1670
1671 /* Return the number of bytes read on success, 0 on peer close, or a negative value on failure. */
1672 static int
1673 read_vhost_message(int sockfd, struct VhostUserMsg *msg)
1674 {
1675         int ret;
1676
1677         ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
1678                 msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num);
1679         if (ret <= 0)
1680                 return ret;
1681
1682         if (msg && msg->size) {
1683                 if (msg->size > sizeof(msg->payload)) {
1684                         RTE_LOG(ERR, VHOST_CONFIG,
1685                                 "invalid msg size: %u\n", msg->size);
1686                         return -1;
1687                 }
1688                 ret = read(sockfd, &msg->payload, msg->size);
1689                 if (ret <= 0)
1690                         return ret;
1691                 if (ret != (int)msg->size) {
1692                         RTE_LOG(ERR, VHOST_CONFIG,
1693                                 "read control message failed\n");
1694                         return -1;
1695                 }
1696         }
1697
1698         return ret;
1699 }
1700
1701 static int
1702 send_vhost_message(int sockfd, struct VhostUserMsg *msg)
1703 {
1704         if (!msg)
1705                 return 0;
1706
1707         return send_fd_message(sockfd, (char *)msg,
1708                 VHOST_USER_HDR_SIZE + msg->size, msg->fds, msg->fd_num);
1709 }
1710
1711 static int
1712 send_vhost_reply(int sockfd, struct VhostUserMsg *msg)
1713 {
1714         if (!msg)
1715                 return 0;
1716
1717         msg->flags &= ~VHOST_USER_VERSION_MASK;
1718         msg->flags &= ~VHOST_USER_NEED_REPLY;
1719         msg->flags |= VHOST_USER_VERSION;
1720         msg->flags |= VHOST_USER_REPLY_MASK;
1721
1722         return send_vhost_message(sockfd, msg);
1723 }
1724
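/*
 * Send a request on the slave channel. When a reply is expected, the
 * slave_req_lock is taken here and released once the reply has been read
 * in process_slave_message_reply().
 */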
1725 static int
1726 send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg)
1727 {
1728         int ret;
1729
1730         if (msg->flags & VHOST_USER_NEED_REPLY)
1731                 rte_spinlock_lock(&dev->slave_req_lock);
1732
1733         ret = send_vhost_message(dev->slave_req_fd, msg);
1734         if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY))
1735                 rte_spinlock_unlock(&dev->slave_req_lock);
1736
1737         return ret;
1738 }
1739
1740 /*
1741  * Allocate the virtqueue referenced by the message if not yet allocated.
1742  */
1743 static int
1744 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
1745                         struct VhostUserMsg *msg)
1746 {
1747         uint16_t vring_idx;
1748
1749         switch (msg->request.master) {
1750         case VHOST_USER_SET_VRING_KICK:
1751         case VHOST_USER_SET_VRING_CALL:
1752         case VHOST_USER_SET_VRING_ERR:
1753                 vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
1754                 break;
1755         case VHOST_USER_SET_VRING_NUM:
1756         case VHOST_USER_SET_VRING_BASE:
1757         case VHOST_USER_SET_VRING_ENABLE:
1758                 vring_idx = msg->payload.state.index;
1759                 break;
1760         case VHOST_USER_SET_VRING_ADDR:
1761                 vring_idx = msg->payload.addr.index;
1762                 break;
1763         default:
1764                 return 0;
1765         }
1766
1767         if (vring_idx >= VHOST_MAX_VRING) {
1768                 RTE_LOG(ERR, VHOST_CONFIG,
1769                         "invalid vring index: %u\n", vring_idx);
1770                 return -1;
1771         }
1772
1773         if (dev->virtqueue[vring_idx])
1774                 return 0;
1775
1776         return alloc_vring_queue(dev, vring_idx);
1777 }
1778
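/*
 * Take the access_lock of every allocated virtqueue so the datapath cannot
 * run while a message that changes device state is being processed.
 */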
1779 static void
1780 vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
1781 {
1782         unsigned int i = 0;
1783         unsigned int vq_num = 0;
1784
1785         while (vq_num < dev->nr_vring) {
1786                 struct vhost_virtqueue *vq = dev->virtqueue[i];
1787
1788                 if (vq) {
1789                         rte_spinlock_lock(&vq->access_lock);
1790                         vq_num++;
1791                 }
1792                 i++;
1793         }
1794 }
1795
1796 static void
1797 vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
1798 {
1799         unsigned int i = 0;
1800         unsigned int vq_num = 0;
1801
1802         while (vq_num < dev->nr_vring) {
1803                 struct vhost_virtqueue *vq = dev->virtqueue[i];
1804
1805                 if (vq) {
1806                         rte_spinlock_unlock(&vq->access_lock);
1807                         vq_num++;
1808                 }
1809                 i++;
1810         }
1811 }
1812
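/*
 * Read and dispatch one vhost-user message: allocate the target vring if
 * needed, lock the queues for state-changing requests, run the external
 * pre- and post-handlers around the per-request handler, send the optional
 * reply-ack, and finally start the device (and configure vDPA) once it
 * becomes ready.
 */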
1813 int
1814 vhost_user_msg_handler(int vid, int fd)
1815 {
1816         struct virtio_net *dev;
1817         struct VhostUserMsg msg;
1818         struct rte_vdpa_device *vdpa_dev;
1819         int did = -1;
1820         int ret;
1821         int unlock_required = 0;
1822         uint32_t skip_master = 0;
1823         int request;
1824
1825         dev = get_device(vid);
1826         if (dev == NULL)
1827                 return -1;
1828
1829         if (!dev->notify_ops) {
1830                 dev->notify_ops = vhost_driver_callback_get(dev->ifname);
1831                 if (!dev->notify_ops) {
1832                         RTE_LOG(ERR, VHOST_CONFIG,
1833                                 "failed to get callback ops for driver %s\n",
1834                                 dev->ifname);
1835                         return -1;
1836                 }
1837         }
1838
1839         ret = read_vhost_message(fd, &msg);
1840         if (ret <= 0 || msg.request.master >= VHOST_USER_MAX) {
1841                 if (ret < 0)
1842                         RTE_LOG(ERR, VHOST_CONFIG,
1843                                 "vhost read message failed\n");
1844                 else if (ret == 0)
1845                         RTE_LOG(INFO, VHOST_CONFIG,
1846                                 "vhost peer closed\n");
1847                 else
1848                         RTE_LOG(ERR, VHOST_CONFIG,
1849                                 "vhost read incorrect message\n");
1850
1851                 return -1;
1852         }
1853
1854         ret = 0;
1855         if (msg.request.master != VHOST_USER_IOTLB_MSG)
1856                 RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
1857                         vhost_message_str[msg.request.master]);
1858         else
1859                 RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n",
1860                         vhost_message_str[msg.request.master]);
1861
1862         ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
1863         if (ret < 0) {
1864                 RTE_LOG(ERR, VHOST_CONFIG,
1865                         "failed to alloc queue\n");
1866                 return -1;
1867         }
1868
1869         /*
1870          * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
1871          * and VHOST_USER_RESET_OWNER, since they are sent when virtio
1872          * stops and the device is destroyed. destroy_device() waits for
1873          * queues to become inactive, so it is safe. Otherwise, taking
1874          * the access_lock would cause a deadlock.
1875          */
1876         switch (msg.request.master) {
1877         case VHOST_USER_SET_FEATURES:
1878         case VHOST_USER_SET_PROTOCOL_FEATURES:
1879         case VHOST_USER_SET_OWNER:
1880         case VHOST_USER_SET_MEM_TABLE:
1881         case VHOST_USER_SET_LOG_BASE:
1882         case VHOST_USER_SET_LOG_FD:
1883         case VHOST_USER_SET_VRING_NUM:
1884         case VHOST_USER_SET_VRING_ADDR:
1885         case VHOST_USER_SET_VRING_BASE:
1886         case VHOST_USER_SET_VRING_KICK:
1887         case VHOST_USER_SET_VRING_CALL:
1888         case VHOST_USER_SET_VRING_ERR:
1889         case VHOST_USER_SET_VRING_ENABLE:
1890         case VHOST_USER_SEND_RARP:
1891         case VHOST_USER_NET_SET_MTU:
1892         case VHOST_USER_SET_SLAVE_REQ_FD:
1893                 vhost_user_lock_all_queue_pairs(dev);
1894                 unlock_required = 1;
1895                 break;
1896         default:
1897                 break;
1898
1899         }
1900
1901         if (dev->extern_ops.pre_msg_handle) {
1902                 ret = (*dev->extern_ops.pre_msg_handle)(dev->vid,
1903                                 (void *)&msg, &skip_master);
1904                 if (ret == VH_RESULT_ERR)
1905                         goto skip_to_reply;
1906                 else if (ret == VH_RESULT_REPLY)
1907                         send_vhost_reply(fd, &msg);
1908
1909                 if (skip_master)
1910                         goto skip_to_post_handle;
1911         }
1912
1913         request = msg.request.master;
1914         if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) {
1915                 if (!vhost_message_handlers[request])
1916                         goto skip_to_post_handle;
1917                 ret = vhost_message_handlers[request](&dev, &msg, fd);
1918
1919                 switch (ret) {
1920                 case VH_RESULT_ERR:
1921                         RTE_LOG(ERR, VHOST_CONFIG,
1922                                 "Processing %s failed.\n",
1923                                 vhost_message_str[request]);
1924                         break;
1925                 case VH_RESULT_OK:
1926                         RTE_LOG(DEBUG, VHOST_CONFIG,
1927                                 "Processing %s succeeded.\n",
1928                                 vhost_message_str[request]);
1929                         break;
1930                 case VH_RESULT_REPLY:
1931                         RTE_LOG(DEBUG, VHOST_CONFIG,
1932                                 "Processing %s succeeded and needs reply.\n",
1933                                 vhost_message_str[request]);
1934                         send_vhost_reply(fd, &msg);
1935                         break;
1936                 }
1937         } else {
1938                 RTE_LOG(ERR, VHOST_CONFIG,
1939                         "Requested invalid message type %d.\n", request);
1940                 ret = VH_RESULT_ERR;
1941         }
1942
1943 skip_to_post_handle:
1944         if (ret != VH_RESULT_ERR && dev->extern_ops.post_msg_handle) {
1945                 ret = (*dev->extern_ops.post_msg_handle)(
1946                                 dev->vid, (void *)&msg);
1947                 if (ret == VH_RESULT_ERR)
1948                         goto skip_to_reply;
1949                 else if (ret == VH_RESULT_REPLY)
1950                         send_vhost_reply(fd, &msg);
1951         }
1952
1953 skip_to_reply:
1954         if (unlock_required)
1955                 vhost_user_unlock_all_queue_pairs(dev);
1956
1957         /*
1958          * If the request required a reply that was already sent,
1959          * this optional reply-ack won't be sent, because the
1960          * VHOST_USER_NEED_REPLY flag was cleared in send_vhost_reply().
1961          */
1962         if (msg.flags & VHOST_USER_NEED_REPLY) {
1963                 msg.payload.u64 = ret == VH_RESULT_ERR;
1964                 msg.size = sizeof(msg.payload.u64);
1965                 msg.fd_num = 0;
1966                 send_vhost_reply(fd, &msg);
1967         } else if (ret == VH_RESULT_ERR) {
1968                 RTE_LOG(ERR, VHOST_CONFIG,
1969                         "vhost message handling failed.\n");
1970                 return -1;
1971         }
1972
1973         if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
1974                 dev->flags |= VIRTIO_DEV_READY;
1975
1976                 if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
1977                         if (dev->dequeue_zero_copy) {
1978                                 RTE_LOG(INFO, VHOST_CONFIG,
1979                                                 "dequeue zero copy is enabled\n");
1980                         }
1981
1982                         if (dev->notify_ops->new_device(dev->vid) == 0)
1983                                 dev->flags |= VIRTIO_DEV_RUNNING;
1984                 }
1985         }
1986
1987         did = dev->vdpa_dev_id;
1988         vdpa_dev = rte_vdpa_get_device(did);
1989         if (vdpa_dev && virtio_is_ready(dev) &&
1990                         !(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) &&
1991                         msg.request.master == VHOST_USER_SET_VRING_ENABLE) {
1992                 if (vdpa_dev->ops->dev_conf)
1993                         vdpa_dev->ops->dev_conf(dev->vid);
1994                 dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED;
1995                 if (vhost_user_host_notifier_ctrl(dev->vid, true) != 0) {
1996                         RTE_LOG(INFO, VHOST_CONFIG,
1997                                 "(%d) software relay is used for vDPA, performance may be low.\n",
1998                                 dev->vid);
1999                 }
2000         }
2001
2002         return 0;
2003 }
2004
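/*
 * Wait for the master's ack on a slave-channel request that set
 * VHOST_USER_NEED_REPLY, and release the slave_req_lock taken in
 * send_vhost_slave_message().
 */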
2005 static int process_slave_message_reply(struct virtio_net *dev,
2006                                        const struct VhostUserMsg *msg)
2007 {
2008         struct VhostUserMsg msg_reply;
2009         int ret;
2010
2011         if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
2012                 return 0;
2013
2014         if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) {
2015                 ret = -1;
2016                 goto out;
2017         }
2018
2019         if (msg_reply.request.slave != msg->request.slave) {
2020                 RTE_LOG(ERR, VHOST_CONFIG,
2021                         "Received unexpected msg type (%u), expected %u\n",
2022                         msg_reply.request.slave, msg->request.slave);
2023                 ret = -1;
2024                 goto out;
2025         }
2026
2027         ret = msg_reply.payload.u64 ? -1 : 0;
2028
2029 out:
2030         rte_spinlock_unlock(&dev->slave_req_lock);
2031         return ret;
2032 }
2033
2034 int
2035 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
2036 {
2037         int ret;
2038         struct VhostUserMsg msg = {
2039                 .request.slave = VHOST_USER_SLAVE_IOTLB_MSG,
2040                 .flags = VHOST_USER_VERSION,
2041                 .size = sizeof(msg.payload.iotlb),
2042                 .payload.iotlb = {
2043                         .iova = iova,
2044                         .perm = perm,
2045                         .type = VHOST_IOTLB_MISS,
2046                 },
2047         };
2048
2049         ret = send_vhost_message(dev->slave_req_fd, &msg);
2050         if (ret < 0) {
2051                 RTE_LOG(ERR, VHOST_CONFIG,
2052                                 "Failed to send IOTLB miss message (%d)\n",
2053                                 ret);
2054                 return ret;
2055         }
2056
2057         return 0;
2058 }
2059
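/*
 * Ask the master to map (or unmap, when fd < 0) the given area of the vDPA
 * device as the vring's host notifier, so guest notifications reach the
 * device directly instead of being relayed in software.
 */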
2060 static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
2061                                                     int index, int fd,
2062                                                     uint64_t offset,
2063                                                     uint64_t size)
2064 {
2065         int ret;
2066         struct VhostUserMsg msg = {
2067                 .request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
2068                 .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
2069                 .size = sizeof(msg.payload.area),
2070                 .payload.area = {
2071                         .u64 = index & VHOST_USER_VRING_IDX_MASK,
2072                         .size = size,
2073                         .offset = offset,
2074                 },
2075         };
2076
2077         if (fd < 0)
2078                 msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
2079         else {
2080                 msg.fds[0] = fd;
2081                 msg.fd_num = 1;
2082         }
2083
2084         ret = send_vhost_slave_message(dev, &msg);
2085         if (ret < 0) {
2086                 RTE_LOG(ERR, VHOST_CONFIG,
2087                         "Failed to set host notifier (%d)\n", ret);
2088                 return ret;
2089         }
2090
2091         return process_slave_message_reply(dev, &msg);
2092 }
2093
2094 int vhost_user_host_notifier_ctrl(int vid, bool enable)
2095 {
2096         struct virtio_net *dev;
2097         struct rte_vdpa_device *vdpa_dev;
2098         int vfio_device_fd, did, ret = 0;
2099         uint64_t offset, size;
2100         unsigned int i;
2101
2102         dev = get_device(vid);
2103         if (!dev)
2104                 return -ENODEV;
2105
2106         did = dev->vdpa_dev_id;
2107         if (did < 0)
2108                 return -EINVAL;
2109
2110         if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
2111             !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) ||
2112             !(dev->protocol_features &
2113                         (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) ||
2114             !(dev->protocol_features &
2115                         (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) ||
2116             !(dev->protocol_features &
2117                         (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)))
2118                 return -ENOTSUP;
2119
2120         vdpa_dev = rte_vdpa_get_device(did);
2121         if (!vdpa_dev)
2122                 return -ENODEV;
2123
2124         RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP);
2125         RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP);
2126
2127         vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid);
2128         if (vfio_device_fd < 0)
2129                 return -ENOTSUP;
2130
2131         if (enable) {
2132                 for (i = 0; i < dev->nr_vring; i++) {
2133                         if (vdpa_dev->ops->get_notify_area(vid, i, &offset,
2134                                         &size) < 0) {
2135                                 ret = -ENOTSUP;
2136                                 goto disable;
2137                         }
2138
2139                         if (vhost_user_slave_set_vring_host_notifier(dev, i,
2140                                         vfio_device_fd, offset, size) < 0) {
2141                                 ret = -EFAULT;
2142                                 goto disable;
2143                         }
2144                 }
2145         } else {
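                /*
                 * Also reached via goto when enabling fails: tear down any
                 * host notifiers that were already set up.
                 */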
2146 disable:
2147                 for (i = 0; i < dev->nr_vring; i++) {
2148                         vhost_user_slave_set_vring_host_notifier(dev, i, -1,
2149                                         0, 0);
2150                 }
2151         }
2152
2153         return ret;
2154 }