vhost: do deep copy while reallocating queue
lib/librte_vhost/vhost_user.c (dpdk.git)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2016 Intel Corporation
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>

#include "iotlb.h"
#include "vhost.h"
#include "vhost_user.h"

#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535

static const char *vhost_message_str[VHOST_USER_MAX] = {
        [VHOST_USER_NONE] = "VHOST_USER_NONE",
        [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
        [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
        [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
        [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
        [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
        [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
        [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
        [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
        [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
        [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
        [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
        [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
        [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
        [VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
        [VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
        [VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
        [VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
        [VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
        [VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
        [VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
        [VHOST_USER_SET_SLAVE_REQ_FD]  = "VHOST_USER_SET_SLAVE_REQ_FD",
        [VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
};

static uint64_t
get_blk_size(int fd)
{
        struct stat stat;
        int ret;

        ret = fstat(fd, &stat);
        return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}

static void
free_mem_region(struct virtio_net *dev)
{
        uint32_t i;
        struct rte_vhost_mem_region *reg;

        if (!dev || !dev->mem)
                return;

        for (i = 0; i < dev->mem->nregions; i++) {
                reg = &dev->mem->regions[i];
                if (reg->host_user_addr) {
                        munmap(reg->mmap_addr, reg->mmap_size);
                        close(reg->fd);
                }
        }
}

void
vhost_backend_cleanup(struct virtio_net *dev)
{
        if (dev->mem) {
                free_mem_region(dev);
                rte_free(dev->mem);
                dev->mem = NULL;
        }

        free(dev->guest_pages);
        dev->guest_pages = NULL;

        if (dev->log_addr) {
                munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
                dev->log_addr = 0;
        }

        if (dev->slave_req_fd >= 0) {
                close(dev->slave_req_fd);
                dev->slave_req_fd = -1;
        }
}

/*
 * This function just returns success at the moment; device ownership
 * is not actually tracked.
 */
static int
vhost_user_set_owner(void)
{
        return 0;
}

static int
vhost_user_reset_owner(struct virtio_net *dev)
{
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
                dev->notify_ops->destroy_device(dev->vid);
        }

        cleanup_device(dev, 0);
        reset_device(dev);
        return 0;
}

/*
 * Return the features that we support.
 */
static uint64_t
vhost_user_get_features(struct virtio_net *dev)
{
        uint64_t features = 0;

        rte_vhost_driver_get_features(dev->ifname, &features);
        return features;
}

/*
 * We receive the features negotiated between us and the virtio device.
 */
static int
vhost_user_set_features(struct virtio_net *dev, uint64_t features)
{
        uint64_t vhost_features = 0;

        rte_vhost_driver_get_features(dev->ifname, &vhost_features);
        if (features & ~vhost_features) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) received invalid negotiated features.\n",
                        dev->vid);
                return -1;
        }

        if (dev->flags & VIRTIO_DEV_RUNNING) {
                if (dev->features == features)
                        return 0;

                /*
                 * Error out if master tries to change features while device is
                 * in running state. The exception being VHOST_F_LOG_ALL, which
                 * is enabled when the live-migration starts.
                 */
                if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "(%d) features changed while device is running.\n",
                                dev->vid);
                        return -1;
                }

                if (dev->notify_ops->features_changed)
                        dev->notify_ops->features_changed(dev->vid, features);
        }

        dev->features = features;
        if (dev->features &
                ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
                dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
        } else {
                dev->vhost_hlen = sizeof(struct virtio_net_hdr);
        }
        LOG_DEBUG(VHOST_CONFIG,
                "(%d) mergeable RX buffers %s, virtio 1 %s\n",
                dev->vid,
                (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
                (dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

        if (!(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
                /*
                 * Remove all but first queue pair if MQ hasn't been
                 * negotiated. This is safe because the device is not
                 * running at this stage.
                 */
                while (dev->nr_vring > 2) {
                        struct vhost_virtqueue *vq;

                        vq = dev->virtqueue[--dev->nr_vring];
                        if (!vq)
                                continue;

                        dev->virtqueue[dev->nr_vring] = NULL;
                        cleanup_vq(vq, 1);
                        free_vq(vq);
                }
        }

        return 0;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net *dev,
                         VhostUserMsg *msg)
{
        struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

        vq->size = msg->payload.state.num;

        if (dev->dequeue_zero_copy) {
                vq->nr_zmbuf = 0;
                vq->last_zmbuf_idx = 0;
                vq->zmbuf_size = vq->size;
                vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
                                         sizeof(struct zcopy_mbuf), 0);
                if (vq->zmbufs == NULL) {
                        RTE_LOG(WARNING, VHOST_CONFIG,
                                "failed to allocate mem for zero copy; "
                                "zero copy is force disabled\n");
                        dev->dequeue_zero_copy = 0;
                }
                TAILQ_INIT(&vq->zmbuf_list);
        }

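        /*
         * Both auxiliary arrays below are sized to the vring, one entry
         * per descriptor: the shadow used ring stages used-ring updates
         * so they can be flushed in batches, and batch_copy_elems stages
         * small copies so they can likewise be performed together.
         */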
        vq->shadow_used_ring = rte_malloc(NULL,
                                vq->size * sizeof(struct vring_used_elem),
                                RTE_CACHE_LINE_SIZE);
        if (!vq->shadow_used_ring) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "failed to allocate memory for shadow used ring.\n");
                return -1;
        }

        vq->batch_copy_elems = rte_malloc(NULL,
                                vq->size * sizeof(struct batch_copy_elem),
                                RTE_CACHE_LINE_SIZE);
        if (!vq->batch_copy_elems) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "failed to allocate memory for batching copy.\n");
                return -1;
        }

        return 0;
}

/*
 * Reallocate the virtio_net and vhost_virtqueue data structures so that
 * they are placed on the same NUMA node as the memory backing the vring
 * descriptors.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
        int oldnode, newnode;
        struct virtio_net *old_dev;
        struct vhost_virtqueue *old_vq, *vq;
        struct zcopy_mbuf *new_zmbuf;
        struct vring_used_elem *new_shadow_used_ring;
        struct batch_copy_elem *new_batch_copy_elems;
        int ret;

        old_dev = dev;
        vq = old_vq = dev->virtqueue[index];

        ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
                            MPOL_F_NODE | MPOL_F_ADDR);

        /* check if we need to reallocate vq */
        ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
                             MPOL_F_NODE | MPOL_F_ADDR);
        if (ret) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "Unable to get vq numa information.\n");
                return dev;
        }
        if (oldnode != newnode) {
                RTE_LOG(INFO, VHOST_CONFIG,
                        "reallocate vq from %d to %d node\n", oldnode, newnode);
                vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
                if (!vq)
                        return dev;

                memcpy(vq, old_vq, sizeof(*vq));
                TAILQ_INIT(&vq->zmbuf_list);

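                /*
                 * The memcpy() above is only a shallow copy: vq->zmbufs,
                 * vq->shadow_used_ring and vq->batch_copy_elems would still
                 * point at allocations made on the old node. Reallocate each
                 * of them on the new node; on allocation failure the old
                 * array is kept, so the queue stays usable either way.
                 */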
                new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size *
                        sizeof(struct zcopy_mbuf), 0, newnode);
                if (new_zmbuf) {
                        rte_free(vq->zmbufs);
                        vq->zmbufs = new_zmbuf;
                }

                new_shadow_used_ring = rte_malloc_socket(NULL,
                        vq->size * sizeof(struct vring_used_elem),
                        RTE_CACHE_LINE_SIZE,
                        newnode);
                if (new_shadow_used_ring) {
                        rte_free(vq->shadow_used_ring);
                        vq->shadow_used_ring = new_shadow_used_ring;
                }

                new_batch_copy_elems = rte_malloc_socket(NULL,
                        vq->size * sizeof(struct batch_copy_elem),
                        RTE_CACHE_LINE_SIZE,
                        newnode);
                if (new_batch_copy_elems) {
                        rte_free(vq->batch_copy_elems);
                        vq->batch_copy_elems = new_batch_copy_elems;
                }

                rte_free(old_vq);
        }

        /* check if we need to reallocate dev */
        ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
                            MPOL_F_NODE | MPOL_F_ADDR);
        if (ret) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "Unable to get dev numa information.\n");
                goto out;
        }
        if (oldnode != newnode) {
                RTE_LOG(INFO, VHOST_CONFIG,
                        "reallocate dev from %d to %d node\n",
                        oldnode, newnode);
                dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
                if (!dev) {
                        dev = old_dev;
                        goto out;
                }

                memcpy(dev, old_dev, sizeof(*dev));
                rte_free(old_dev);
        }

out:
        dev->virtqueue[index] = vq;
        vhost_devices[dev->vid] = dev;

        if (old_vq != vq)
                vhost_user_iotlb_init(dev, index);

        return dev;
}
#else
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index __rte_unused)
{
        return dev;
}
#endif

/* Converts QEMU virtual address to Vhost virtual address. */
static uint64_t
qva_to_vva(struct virtio_net *dev, uint64_t qva)
{
        struct rte_vhost_mem_region *reg;
        uint32_t i;

        /* Find the region where the address lives. */
        for (i = 0; i < dev->mem->nregions; i++) {
                reg = &dev->mem->regions[i];

                if (qva >= reg->guest_user_addr &&
                    qva <  reg->guest_user_addr + reg->size) {
                        return qva - reg->guest_user_addr +
                               reg->host_user_addr;
                }
        }

        return 0;
}


/*
 * Converts ring address to Vhost virtual address.
 * If IOMMU is enabled, the ring address is a guest IO virtual address,
 * else it is a QEMU virtual address.
 */
static uint64_t
ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
                uint64_t ra, uint64_t size)
{
        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
                uint64_t vva;

                vva = vhost_user_iotlb_cache_find(vq, ra,
                                        &size, VHOST_ACCESS_RW);
                if (!vva)
                        vhost_user_iotlb_miss(dev, ra, VHOST_ACCESS_RW);

                return vva;
        }

        return qva_to_vva(dev, ra);
}

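/*
 * Note on the IOMMU path above: when the translation is not yet in the
 * IOTLB cache, ring_addr_to_vva() returns 0 after requesting the entry
 * with vhost_user_iotlb_miss(). translate_ring_addresses() below then
 * bails out, and is retried once the master answers with a
 * VHOST_USER_IOTLB_MSG update (see vhost_user_iotlb_msg()).
 */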
static struct virtio_net *
translate_ring_addresses(struct virtio_net *dev, int vq_index)
{
        struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
        struct vhost_vring_addr *addr = &vq->ring_addrs;

        /* The addresses are converted from QEMU virtual to Vhost virtual. */
        if (vq->desc && vq->avail && vq->used)
                return dev;

        vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
                        vq, addr->desc_user_addr, sizeof(struct vring_desc));
        if (vq->desc == 0) {
                RTE_LOG(DEBUG, VHOST_CONFIG,
                        "(%d) failed to find desc ring address.\n",
                        dev->vid);
                return dev;
        }

        dev = numa_realloc(dev, vq_index);
        vq = dev->virtqueue[vq_index];
        addr = &vq->ring_addrs;

        vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
                        vq, addr->avail_user_addr, sizeof(struct vring_avail));
        if (vq->avail == 0) {
                RTE_LOG(DEBUG, VHOST_CONFIG,
                        "(%d) failed to find avail ring address.\n",
                        dev->vid);
                return dev;
        }

        vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
                        vq, addr->used_user_addr, sizeof(struct vring_used));
        if (vq->used == 0) {
                RTE_LOG(DEBUG, VHOST_CONFIG,
                        "(%d) failed to find used ring address.\n",
                        dev->vid);
                return dev;
        }

        if (vq->last_used_idx != vq->used->idx) {
                RTE_LOG(WARNING, VHOST_CONFIG,
                        "last_used_idx (%u) and vq->used->idx (%u) mismatch; "
                        "some packets may be resent for Tx and dropped for Rx\n",
                        vq->last_used_idx, vq->used->idx);
                vq->last_used_idx  = vq->used->idx;
                vq->last_avail_idx = vq->used->idx;
        }

        vq->log_guest_addr = addr->log_guest_addr;

        LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
                        dev->vid, vq->desc);
        LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
                        dev->vid, vq->avail);
        LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
                        dev->vid, vq->used);
        LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
                        dev->vid, vq->log_guest_addr);

        return dev;
}

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net **pdev, VhostUserMsg *msg)
{
        struct vhost_virtqueue *vq;
        struct vhost_vring_addr *addr = &msg->payload.addr;
        struct virtio_net *dev = *pdev;

        if (dev->mem == NULL)
                return -1;

        /* addr->index refers to the queue index. txq is 1, rxq is 0. */
        vq = dev->virtqueue[msg->payload.addr.index];

        /*
         * Ring addresses should not be interpreted as long as the ring is
         * not started and enabled.
         */
        memcpy(&vq->ring_addrs, addr, sizeof(*addr));

        vring_invalidate(dev, vq);

        if (vq->enabled && (dev->features &
                                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
                dev = translate_ring_addresses(dev, msg->payload.addr.index);
                if (!dev)
                        return -1;

                *pdev = dev;
        }

        return 0;
}

/*
 * The virtio device sends us the available ring last used index.
 */
static int
vhost_user_set_vring_base(struct virtio_net *dev,
                          VhostUserMsg *msg)
{
        dev->virtqueue[msg->payload.state.index]->last_used_idx  =
                        msg->payload.state.num;
        dev->virtqueue[msg->payload.state.index]->last_avail_idx =
                        msg->payload.state.num;

        return 0;
}

static void
add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
                   uint64_t host_phys_addr, uint64_t size)
{
        struct guest_page *page, *last_page;

        if (dev->nr_guest_pages == dev->max_guest_pages) {
                /* Use a temporary so a failed realloc() cannot clobber
                 * dev->guest_pages with NULL (the page is dropped then). */
                struct guest_page *pages = realloc(dev->guest_pages,
                                2 * dev->max_guest_pages * sizeof(*page));
                if (pages == NULL)
                        return;
                dev->guest_pages = pages;
                dev->max_guest_pages *= 2;
        }

        if (dev->nr_guest_pages > 0) {
                last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
                /* merge if the two pages are contiguous */
                if (host_phys_addr == last_page->host_phys_addr +
                                      last_page->size) {
                        last_page->size += size;
                        return;
                }
        }

        page = &dev->guest_pages[dev->nr_guest_pages++];
        page->guest_phys_addr = guest_phys_addr;
        page->host_phys_addr  = host_phys_addr;
        page->size = size;
}

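/*
 * Walk a guest memory region and record its guest-physical to
 * host-physical mapping one host page at a time. For example (with
 * hypothetical numbers), given 2 MB pages and a region whose
 * guest_phys_addr begins 0x1000 bytes into a page, the first recorded
 * chunk covers page_size - 0x1000 bytes, and the remainder is then
 * added page by page; physically contiguous chunks get merged by
 * add_one_guest_page().
 */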
static void
add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
                uint64_t page_size)
{
        uint64_t reg_size = reg->size;
        uint64_t host_user_addr  = reg->host_user_addr;
        uint64_t guest_phys_addr = reg->guest_phys_addr;
        uint64_t host_phys_addr;
        uint64_t size;

        host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
        size = page_size - (guest_phys_addr & (page_size - 1));
        size = RTE_MIN(size, reg_size);

        add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
        host_user_addr  += size;
        guest_phys_addr += size;
        reg_size -= size;

        while (reg_size > 0) {
                size = RTE_MIN(reg_size, page_size);
                host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
                                                  host_user_addr);
                add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);

                host_user_addr  += size;
                guest_phys_addr += size;
                reg_size -= size;
        }
}

#ifdef RTE_LIBRTE_VHOST_DEBUG
/* Only compiled in when RTE_LIBRTE_VHOST_DEBUG is defined. */
static void
dump_guest_pages(struct virtio_net *dev)
{
        uint32_t i;
        struct guest_page *page;

        for (i = 0; i < dev->nr_guest_pages; i++) {
                page = &dev->guest_pages[i];

                RTE_LOG(INFO, VHOST_CONFIG,
                        "guest physical page region %u\n"
                        "\t guest_phys_addr: %" PRIx64 "\n"
                        "\t host_phys_addr : %" PRIx64 "\n"
                        "\t size           : %" PRIx64 "\n",
                        i,
                        page->guest_phys_addr,
                        page->host_phys_addr,
                        page->size);
        }
}
#else
#define dump_guest_pages(dev)
#endif

static bool
vhost_memory_changed(struct VhostUserMemory *new,
                     struct rte_vhost_memory *old)
{
        uint32_t i;

        if (new->nregions != old->nregions)
                return true;

        for (i = 0; i < new->nregions; ++i) {
                VhostUserMemoryRegion *new_r = &new->regions[i];
                struct rte_vhost_mem_region *old_r = &old->regions[i];

                if (new_r->guest_phys_addr != old_r->guest_phys_addr)
                        return true;
                if (new_r->memory_size != old_r->size)
                        return true;
                if (new_r->userspace_addr != old_r->guest_user_addr)
                        return true;
        }

        return false;
}

static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
        struct VhostUserMemory memory = pmsg->payload.memory;
        struct rte_vhost_mem_region *reg;
        void *mmap_addr;
        uint64_t mmap_size;
        uint64_t mmap_offset;
        uint64_t alignment;
        uint32_t i;
        int fd;

        if (dev->mem && !vhost_memory_changed(&memory, dev->mem)) {
                RTE_LOG(INFO, VHOST_CONFIG,
                        "(%d) memory regions not changed\n", dev->vid);

                for (i = 0; i < memory.nregions; i++)
                        close(pmsg->fds[i]);

                return 0;
        }

        if (dev->mem) {
                free_mem_region(dev);
                rte_free(dev->mem);
                dev->mem = NULL;
        }

        dev->nr_guest_pages = 0;
        if (!dev->guest_pages) {
                dev->max_guest_pages = 8;
                dev->guest_pages = malloc(dev->max_guest_pages *
                                                sizeof(struct guest_page));
                if (dev->guest_pages == NULL) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "(%d) failed to allocate memory "
                                "for dev->guest_pages\n",
                                dev->vid);
                        return -1;
                }
        }

        dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
                sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
        if (dev->mem == NULL) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%d) failed to allocate memory for dev->mem\n",
                        dev->vid);
                return -1;
        }
        dev->mem->nregions = memory.nregions;

        for (i = 0; i < memory.nregions; i++) {
                fd  = pmsg->fds[i];
                reg = &dev->mem->regions[i];

                reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
                reg->guest_user_addr = memory.regions[i].userspace_addr;
                reg->size            = memory.regions[i].memory_size;
                reg->fd              = fd;

                mmap_offset = memory.regions[i].mmap_offset;
                mmap_size   = reg->size + mmap_offset;

                /* mmap() without MAP_ANONYMOUS must be called with a length
                 * aligned to the hugepage size on older long-term Linux
                 * kernels (e.g. 2.6.32 and 3.2.72), or it fails with EINVAL.
                 *
                 * To avoid that failure, keep the length aligned here.
                 */
                alignment = get_blk_size(fd);
                if (alignment == (uint64_t)-1) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "couldn't get hugepage size through fstat\n");
                        goto err_mmap;
                }
                mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);

                mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                                 MAP_SHARED | MAP_POPULATE, fd, 0);

                if (mmap_addr == MAP_FAILED) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "mmap region %u failed.\n", i);
                        goto err_mmap;
                }

                reg->mmap_addr = mmap_addr;
                reg->mmap_size = mmap_size;
                reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
                                      mmap_offset;

                if (dev->dequeue_zero_copy)
                        add_guest_pages(dev, reg, alignment);

                RTE_LOG(INFO, VHOST_CONFIG,
                        "guest memory region %u, size: 0x%" PRIx64 "\n"
                        "\t guest physical addr: 0x%" PRIx64 "\n"
                        "\t guest virtual  addr: 0x%" PRIx64 "\n"
                        "\t host  virtual  addr: 0x%" PRIx64 "\n"
                        "\t mmap addr : 0x%" PRIx64 "\n"
                        "\t mmap size : 0x%" PRIx64 "\n"
                        "\t mmap align: 0x%" PRIx64 "\n"
                        "\t mmap off  : 0x%" PRIx64 "\n",
                        i, reg->size,
                        reg->guest_phys_addr,
                        reg->guest_user_addr,
                        reg->host_user_addr,
                        (uint64_t)(uintptr_t)mmap_addr,
                        mmap_size,
                        alignment,
                        mmap_offset);
        }

        dump_guest_pages(dev);

        return 0;

err_mmap:
        free_mem_region(dev);
        rte_free(dev->mem);
        dev->mem = NULL;
        return -1;
}

static int
vq_is_ready(struct vhost_virtqueue *vq)
{
        return vq && vq->desc && vq->avail && vq->used &&
               vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
               vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
}

static int
virtio_is_ready(struct virtio_net *dev)
{
        struct vhost_virtqueue *vq;
        uint32_t i;

        if (dev->nr_vring == 0)
                return 0;

        for (i = 0; i < dev->nr_vring; i++) {
                vq = dev->virtqueue[i];

                if (!vq_is_ready(vq))
                        return 0;
        }

        RTE_LOG(INFO, VHOST_CONFIG,
                "virtio is now ready for processing.\n");
        return 1;
}

static void
vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
        struct vhost_vring_file file;
        struct vhost_virtqueue *vq;

        file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
        if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
                file.fd = VIRTIO_INVALID_EVENTFD;
        else
                file.fd = pmsg->fds[0];
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring call idx:%d file:%d\n", file.index, file.fd);

        vq = dev->virtqueue[file.index];
        if (vq->callfd >= 0)
                close(vq->callfd);

        vq->callfd = file.fd;
}

static void
vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *pmsg)
{
        struct vhost_vring_file file;
        struct vhost_virtqueue *vq;
        struct virtio_net *dev = *pdev;

        file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
        if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
                file.fd = VIRTIO_INVALID_EVENTFD;
        else
                file.fd = pmsg->fds[0];
        RTE_LOG(INFO, VHOST_CONFIG,
                "vring kick idx:%d file:%d\n", file.index, file.fd);

        /* Interpret ring addresses only when ring is started. */
        dev = translate_ring_addresses(dev, file.index);
        if (!dev)
                return;

        *pdev = dev;

        vq = dev->virtqueue[file.index];

        /*
         * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
         * the ring starts already enabled. Otherwise, it is enabled via
         * the SET_VRING_ENABLE message.
         */
        if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)))
                vq->enabled = 1;

        if (vq->kickfd >= 0)
                close(vq->kickfd);
        vq->kickfd = file.fd;
}

static void
free_zmbufs(struct vhost_virtqueue *vq)
{
        struct zcopy_mbuf *zmbuf, *next;

        for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
             zmbuf != NULL; zmbuf = next) {
                next = TAILQ_NEXT(zmbuf, next);

                rte_pktmbuf_free(zmbuf->mbuf);
                TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
        }

        rte_free(vq->zmbufs);
}

/*
 * When virtio is stopped, QEMU sends us the GET_VRING_BASE message.
 */
static int
vhost_user_get_vring_base(struct virtio_net *dev,
                          VhostUserMsg *msg)
{
        struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

        /* We have to stop the queue (virtio) if it is running. */
        if (dev->flags & VIRTIO_DEV_RUNNING) {
                dev->flags &= ~VIRTIO_DEV_RUNNING;
                dev->notify_ops->destroy_device(dev->vid);
        }

        dev->flags &= ~VIRTIO_DEV_READY;

        /* Here we are safe to get the last used index */
        msg->payload.state.num = vq->last_used_idx;

        RTE_LOG(INFO, VHOST_CONFIG,
                "vring base idx:%d file:%d\n", msg->payload.state.index,
                msg->payload.state.num);
        /*
         * Based on the current QEMU vhost-user implementation, this
         * message is sent from vhost_vring_stop, and only from there.
         * TODO: clean up the vring; it isn't usable from this point on.
         */
        if (vq->kickfd >= 0)
                close(vq->kickfd);

        vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

        if (dev->dequeue_zero_copy)
                free_zmbufs(vq);
        rte_free(vq->shadow_used_ring);
        vq->shadow_used_ring = NULL;

        rte_free(vq->batch_copy_elems);
        vq->batch_copy_elems = NULL;

        return 0;
}

/*
 * When the virtio queues are ready to work, QEMU sends us this message
 * to enable the virtio queue pair.
 */
static int
vhost_user_set_vring_enable(struct virtio_net *dev,
                            VhostUserMsg *msg)
{
        int enable = (int)msg->payload.state.num;

        RTE_LOG(INFO, VHOST_CONFIG,
                "set queue enable: %d to qp idx: %d\n",
                enable, msg->payload.state.index);

        if (dev->notify_ops->vring_state_changed)
                dev->notify_ops->vring_state_changed(dev->vid,
                                msg->payload.state.index, enable);

        dev->virtqueue[msg->payload.state.index]->enabled = enable;

        return 0;
}

static void
vhost_user_get_protocol_features(struct virtio_net *dev,
                                 struct VhostUserMsg *msg)
{
        uint64_t features, protocol_features = VHOST_USER_PROTOCOL_FEATURES;

        rte_vhost_driver_get_features(dev->ifname, &features);

        /*
         * The REPLY_ACK protocol feature is currently mandatory only for
         * the IOMMU feature. If IOMMU is explicitly disabled by the
         * application, also disable REPLY_ACK to cope with older buggy
         * QEMU versions (from v2.7.0 to v2.9.0).
         */
        if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
                protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);

        msg->payload.u64 = protocol_features;
        msg->size = sizeof(msg->payload.u64);
}

static void
vhost_user_set_protocol_features(struct virtio_net *dev,
                                 uint64_t protocol_features)
{
        if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
                return;

        dev->protocol_features = protocol_features;
}

static int
vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
{
        int fd = msg->fds[0];
        uint64_t size, off;
        void *addr;

        if (fd < 0) {
                RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
                return -1;
        }

        if (msg->size != sizeof(VhostUserLog)) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "invalid log base msg size: %"PRId32" != %d\n",
                        msg->size, (int)sizeof(VhostUserLog));
                return -1;
        }

        size = msg->payload.log.mmap_size;
        off  = msg->payload.log.mmap_offset;
        RTE_LOG(INFO, VHOST_CONFIG,
                "log mmap size: %"PRId64", offset: %"PRId64"\n",
                size, off);

        /*
         * mmap from offset 0 to work around a hugepage mmap bug: mmap
         * fails when the offset is not page-size aligned.
         */
        addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        close(fd);
        if (addr == MAP_FAILED) {
                RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
                return -1;
        }

        /*
         * Free any previously mapped log memory; VHOST_USER_SET_LOG_BASE
         * can occasionally be sent more than once.
         */
        if (dev->log_addr) {
                munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
        }
        dev->log_addr = (uint64_t)(uintptr_t)addr;
        dev->log_base = dev->log_addr + off;
        dev->log_size = size;

        return 0;
}

/*
 * A RARP packet is constructed and broadcast to notify switches of the
 * new location of the migrated VM, so that packets from outside are not
 * lost after migration.
 *
 * However, we don't actually "send" a RARP packet here; instead, we set
 * the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject it.
 */
static int
vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
{
        uint8_t *mac = (uint8_t *)&msg->payload.u64;

        RTE_LOG(DEBUG, VHOST_CONFIG,
                ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
                mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        memcpy(dev->mac.addr_bytes, mac, 6);

        /*
         * Set the flag to inject a RARP broadcast packet at
         * rte_vhost_dequeue_burst().
         *
         * rte_smp_wmb() is for making sure the mac is copied
         * before the flag is set.
         */
        rte_smp_wmb();
        rte_atomic16_set(&dev->broadcast_rarp, 1);

        return 0;
}

static int
vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
{
        if (msg->payload.u64 < VIRTIO_MIN_MTU ||
                        msg->payload.u64 > VIRTIO_MAX_MTU) {
                RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
                                msg->payload.u64);

                return -1;
        }

        dev->mtu = msg->payload.u64;

        return 0;
}

static int
vhost_user_set_req_fd(struct virtio_net *dev, struct VhostUserMsg *msg)
{
        int fd = msg->fds[0];

        if (fd < 0) {
                RTE_LOG(ERR, VHOST_CONFIG,
                                "Invalid file descriptor for slave channel (%d)\n",
                                fd);
                return -1;
        }

        dev->slave_req_fd = fd;

        return 0;
}

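/*
 * Return 1 if the IOTLB update in 'imsg' covers the start address of any
 * of this queue's three rings; translate_ring_addresses() is then retried
 * for that queue by the caller. The interval [start, end) below is the
 * updated IOVA range.
 */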
static int
is_vring_iotlb_update(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
{
        struct vhost_vring_addr *ra;
        uint64_t start, end;

        start = imsg->iova;
        end = start + imsg->size;

        ra = &vq->ring_addrs;
        if (ra->desc_user_addr >= start && ra->desc_user_addr < end)
                return 1;
        if (ra->avail_user_addr >= start && ra->avail_user_addr < end)
                return 1;
        if (ra->used_user_addr >= start && ra->used_user_addr < end)
                return 1;

        return 0;
}

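/*
 * Return 1 if the invalidated IOVA range [istart, iend] overlaps any of
 * the three rings. Each check below is the standard closed-interval
 * overlap test: two ranges overlap iff each one starts before the other
 * ends, i.e. vstart <= iend && istart <= vend.
 */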
static int
is_vring_iotlb_invalidate(struct vhost_virtqueue *vq,
                                struct vhost_iotlb_msg *imsg)
{
        uint64_t istart, iend, vstart, vend;

        istart = imsg->iova;
        iend = istart + imsg->size - 1;

        vstart = (uintptr_t)vq->desc;
        vend = vstart + sizeof(struct vring_desc) * vq->size - 1;
        if (vstart <= iend && istart <= vend)
                return 1;

        vstart = (uintptr_t)vq->avail;
        vend = vstart + sizeof(struct vring_avail);
        vend += sizeof(uint16_t) * vq->size - 1;
        if (vstart <= iend && istart <= vend)
                return 1;

        vstart = (uintptr_t)vq->used;
        vend = vstart + sizeof(struct vring_used);
        vend += sizeof(struct vring_used_elem) * vq->size - 1;
        if (vstart <= iend && istart <= vend)
                return 1;

        return 0;
}

static int
vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg)
{
        struct virtio_net *dev = *pdev;
        struct vhost_iotlb_msg *imsg = &msg->payload.iotlb;
        uint16_t i;
        uint64_t vva;

        switch (imsg->type) {
        case VHOST_IOTLB_UPDATE:
                vva = qva_to_vva(dev, imsg->uaddr);
                if (!vva)
                        return -1;

                for (i = 0; i < dev->nr_vring; i++) {
                        struct vhost_virtqueue *vq = dev->virtqueue[i];

                        vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
                                        imsg->size, imsg->perm);

                        if (is_vring_iotlb_update(vq, imsg))
                                *pdev = dev = translate_ring_addresses(dev, i);
                }
                break;
        case VHOST_IOTLB_INVALIDATE:
                for (i = 0; i < dev->nr_vring; i++) {
                        struct vhost_virtqueue *vq = dev->virtqueue[i];

                        vhost_user_iotlb_cache_remove(vq, imsg->iova,
                                        imsg->size);

                        if (is_vring_iotlb_invalidate(vq, imsg))
                                vring_invalidate(dev, vq);
                }
                break;
        default:
                RTE_LOG(ERR, VHOST_CONFIG, "Invalid IOTLB message type (%d)\n",
                                imsg->type);
                return -1;
        }

        return 0;
}

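/*
 * Messages arrive in two parts: read_fd_message() pulls in the fixed-size
 * header (along with any file descriptors passed as SCM_RIGHTS ancillary
 * data), and the variable-size payload, if any, is then read separately
 * below.
 */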
/* Return the number of bytes read on success, or a negative value on failure. */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
        int ret;

        ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
                msg->fds, VHOST_MEMORY_MAX_NREGIONS);
        if (ret <= 0)
                return ret;

        if (msg && msg->size) {
                if (msg->size > sizeof(msg->payload)) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "invalid msg size: %d\n", msg->size);
                        return -1;
                }
                ret = read(sockfd, &msg->payload, msg->size);
                if (ret <= 0)
                        return ret;
                if (ret != (int)msg->size) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "read control message failed\n");
                        return -1;
                }
        }

        return ret;
}

static int
send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
        if (!msg)
                return 0;

        return send_fd_message(sockfd, (char *)msg,
                VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
}

static int
send_vhost_reply(int sockfd, struct VhostUserMsg *msg)
{
        if (!msg)
                return 0;

        msg->flags &= ~VHOST_USER_VERSION_MASK;
        msg->flags &= ~VHOST_USER_NEED_REPLY;
        msg->flags |= VHOST_USER_VERSION;
        msg->flags |= VHOST_USER_REPLY_MASK;

        return send_vhost_message(sockfd, msg);
}

/*
 * Allocate the vring queue a message refers to, if it hasn't been
 * allocated yet.
 */
static int
vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
{
        uint16_t vring_idx;

        switch (msg->request.master) {
        case VHOST_USER_SET_VRING_KICK:
        case VHOST_USER_SET_VRING_CALL:
        case VHOST_USER_SET_VRING_ERR:
                vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
                break;
        case VHOST_USER_SET_VRING_NUM:
        case VHOST_USER_SET_VRING_BASE:
        case VHOST_USER_SET_VRING_ENABLE:
                vring_idx = msg->payload.state.index;
                break;
        case VHOST_USER_SET_VRING_ADDR:
                vring_idx = msg->payload.addr.index;
                break;
        default:
                return 0;
        }

        if (vring_idx >= VHOST_MAX_VRING) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "invalid vring index: %u\n", vring_idx);
                return -1;
        }

        if (dev->virtqueue[vring_idx])
                return 0;

        return alloc_vring_queue(dev, vring_idx);
}

int
vhost_user_msg_handler(int vid, int fd)
{
        struct virtio_net *dev;
        struct VhostUserMsg msg;
        int ret;

        dev = get_device(vid);
        if (dev == NULL)
                return -1;

        if (!dev->notify_ops) {
                dev->notify_ops = vhost_driver_callback_get(dev->ifname);
                if (!dev->notify_ops) {
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "failed to get callback ops for driver %s\n",
                                dev->ifname);
                        return -1;
                }
        }

        ret = read_vhost_message(fd, &msg);
        if (ret <= 0 || msg.request.master >= VHOST_USER_MAX) {
                if (ret < 0)
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "vhost read message failed\n");
                else if (ret == 0)
                        RTE_LOG(INFO, VHOST_CONFIG,
                                "vhost peer closed\n");
                else
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "vhost read incorrect message\n");

                return -1;
        }

        ret = 0;
        if (msg.request.master != VHOST_USER_IOTLB_MSG)
                RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
                        vhost_message_str[msg.request.master]);
        else
                RTE_LOG(DEBUG, VHOST_CONFIG, "read message %s\n",
                        vhost_message_str[msg.request.master]);

        ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
        if (ret < 0) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "failed to alloc queue\n");
                return -1;
        }

        switch (msg.request.master) {
        case VHOST_USER_GET_FEATURES:
                msg.payload.u64 = vhost_user_get_features(dev);
                msg.size = sizeof(msg.payload.u64);
                send_vhost_reply(fd, &msg);
                break;
        case VHOST_USER_SET_FEATURES:
                ret = vhost_user_set_features(dev, msg.payload.u64);
                if (ret)
                        return -1;
                break;

        case VHOST_USER_GET_PROTOCOL_FEATURES:
                vhost_user_get_protocol_features(dev, &msg);
                send_vhost_reply(fd, &msg);
                break;
        case VHOST_USER_SET_PROTOCOL_FEATURES:
                vhost_user_set_protocol_features(dev, msg.payload.u64);
                break;

        case VHOST_USER_SET_OWNER:
                vhost_user_set_owner();
                break;
        case VHOST_USER_RESET_OWNER:
                vhost_user_reset_owner(dev);
                break;

        case VHOST_USER_SET_MEM_TABLE:
                ret = vhost_user_set_mem_table(dev, &msg);
                break;

        case VHOST_USER_SET_LOG_BASE:
                vhost_user_set_log_base(dev, &msg);

                /* it needs a reply */
                msg.size = sizeof(msg.payload.u64);
                send_vhost_reply(fd, &msg);
                break;
        case VHOST_USER_SET_LOG_FD:
                close(msg.fds[0]);
                RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
                break;

        case VHOST_USER_SET_VRING_NUM:
                vhost_user_set_vring_num(dev, &msg);
                break;
        case VHOST_USER_SET_VRING_ADDR:
                vhost_user_set_vring_addr(&dev, &msg);
                break;
        case VHOST_USER_SET_VRING_BASE:
                vhost_user_set_vring_base(dev, &msg);
                break;

        case VHOST_USER_GET_VRING_BASE:
                vhost_user_get_vring_base(dev, &msg);
                msg.size = sizeof(msg.payload.state);
                send_vhost_reply(fd, &msg);
                break;

        case VHOST_USER_SET_VRING_KICK:
                vhost_user_set_vring_kick(&dev, &msg);
                break;
        case VHOST_USER_SET_VRING_CALL:
                vhost_user_set_vring_call(dev, &msg);
                break;

        case VHOST_USER_SET_VRING_ERR:
                if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
                        close(msg.fds[0]);
                RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
                break;

        case VHOST_USER_GET_QUEUE_NUM:
                msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
                msg.size = sizeof(msg.payload.u64);
                send_vhost_reply(fd, &msg);
                break;

        case VHOST_USER_SET_VRING_ENABLE:
                vhost_user_set_vring_enable(dev, &msg);
                break;
        case VHOST_USER_SEND_RARP:
                vhost_user_send_rarp(dev, &msg);
                break;

        case VHOST_USER_NET_SET_MTU:
                ret = vhost_user_net_set_mtu(dev, &msg);
                break;

        case VHOST_USER_SET_SLAVE_REQ_FD:
                ret = vhost_user_set_req_fd(dev, &msg);
                break;

        case VHOST_USER_IOTLB_MSG:
                ret = vhost_user_iotlb_msg(&dev, &msg);
                break;

        default:
                ret = -1;
                break;

        }

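        /*
         * If the master asked for an acknowledgement (REPLY_ACK protocol
         * feature), answer with the request status: payload.u64 is 0 on
         * success and 1 on failure.
         */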
        if (msg.flags & VHOST_USER_NEED_REPLY) {
                msg.payload.u64 = !!ret;
                msg.size = sizeof(msg.payload.u64);
                send_vhost_reply(fd, &msg);
        }

        if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
                dev->flags |= VIRTIO_DEV_READY;

                if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
                        if (dev->dequeue_zero_copy) {
                                RTE_LOG(INFO, VHOST_CONFIG,
                                                "dequeue zero copy is enabled\n");
                        }

                        if (dev->notify_ops->new_device(dev->vid) == 0)
                                dev->flags |= VIRTIO_DEV_RUNNING;
                }
        }

        return 0;
}

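/*
 * Ask the vhost-user master to translate 'iova' by sending a
 * VHOST_IOTLB_MISS request on the slave channel; the translation comes
 * back later as a VHOST_USER_IOTLB_MSG update handled above.
 */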
int
vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
{
        int ret;
        struct VhostUserMsg msg = {
                .request.slave = VHOST_USER_SLAVE_IOTLB_MSG,
                .flags = VHOST_USER_VERSION,
                .size = sizeof(msg.payload.iotlb),
                .payload.iotlb = {
                        .iova = iova,
                        .perm = perm,
                        .type = VHOST_IOTLB_MISS,
                },
        };

        ret = send_vhost_message(dev->slave_req_fd, &msg);
        if (ret < 0) {
                RTE_LOG(ERR, VHOST_CONFIG,
                                "Failed to send IOTLB miss message (%d)\n",
                                ret);
                return ret;
        }

        return 0;
}