vhost: prepare for slave requests
lib/librte_vhost/vhost_user.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>

#include "vhost.h"
#include "vhost_user.h"

#define VIRTIO_MIN_MTU 68
#define VIRTIO_MAX_MTU 65535

static const char *vhost_message_str[VHOST_USER_MAX] = {
	[VHOST_USER_NONE] = "VHOST_USER_NONE",
	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
	[VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
	[VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
	[VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
};

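/*
 * Return the block size of the file behind fd; for a hugetlbfs-backed
 * region this is the hugepage size. Returns (uint64_t)-1 on failure.
 */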
static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}

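/*
 * Unmap and close every region of the device's current memory table.
 * The rte_vhost_memory structure itself is freed by the caller.
 */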
static void
free_mem_region(struct virtio_net *dev)
{
	uint32_t i;
	struct rte_vhost_mem_region *reg;

	if (!dev || !dev->mem)
		return;

	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];
		if (reg->host_user_addr) {
			munmap(reg->mmap_addr, reg->mmap_size);
			close(reg->fd);
		}
	}
}

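/*
 * Release all vhost-user resources attached to a device: the guest
 * memory table, the guest page array and the dirty log mapping.
 */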
void
vhost_backend_cleanup(struct virtio_net *dev)
{
	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	free(dev->guest_pages);
	dev->guest_pages = NULL;

	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
		dev->log_addr = 0;
	}
}

/*
 * This function just returns success at the moment; nothing needs to
 * be done when the master takes ownership of the device.
 */
static int
vhost_user_set_owner(void)
{
	return 0;
}

static int
vhost_user_reset_owner(struct virtio_net *dev)
{
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	cleanup_device(dev, 0);
	reset_device(dev);
	return 0;
}

/*
 * The features that we support are requested.
 */
static uint64_t
vhost_user_get_features(struct virtio_net *dev)
{
	uint64_t features = 0;

	rte_vhost_driver_get_features(dev->ifname, &features);
	return features;
}

/*
 * We receive the negotiated features supported by us and the virtio device.
 */
static int
vhost_user_set_features(struct virtio_net *dev, uint64_t features)
{
	uint64_t vhost_features = 0;

	rte_vhost_driver_get_features(dev->ifname, &vhost_features);
	if (features & ~vhost_features) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) received invalid negotiated features.\n",
			dev->vid);
		return -1;
	}

	if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->features != features) {
		if (dev->notify_ops->features_changed)
			dev->notify_ops->features_changed(dev->vid, features);
	}

	dev->features = features;
	if (dev->features &
		((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
	}
	LOG_DEBUG(VHOST_CONFIG,
		"(%d) mergeable RX buffers %s, virtio 1 %s\n",
		dev->vid,
		(dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

	return 0;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net *dev,
			 VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

	vq->size = msg->payload.state.num;

	if (dev->dequeue_zero_copy) {
		vq->nr_zmbuf = 0;
		vq->last_zmbuf_idx = 0;
		vq->zmbuf_size = vq->size;
		vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
					 sizeof(struct zcopy_mbuf), 0);
		if (vq->zmbufs == NULL) {
			RTE_LOG(WARNING, VHOST_CONFIG,
				"failed to allocate mem for zero copy; "
				"zero copy is forcibly disabled\n");
			dev->dequeue_zero_copy = 0;
		}
	}

	vq->shadow_used_ring = rte_malloc(NULL,
				vq->size * sizeof(struct vring_used_elem),
				RTE_CACHE_LINE_SIZE);
	if (!vq->shadow_used_ring) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to allocate memory for shadow used ring.\n");
		return -1;
	}

	vq->batch_copy_elems = rte_malloc(NULL,
				vq->size * sizeof(struct batch_copy_elem),
				RTE_CACHE_LINE_SIZE);
	if (!vq->batch_copy_elems) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to allocate memory for batching copy.\n");
		return -1;
	}

	return 0;
}

/*
 * Reallocate the virtio_net and vhost_virtqueue data structures so that
 * they are on the same NUMA node as the memory of the vring descriptors.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
	int oldnode, newnode;
	struct virtio_net *old_dev;
	struct vhost_virtqueue *old_vq, *vq;
	int ret;

	old_dev = dev;
	vq = old_vq = dev->virtqueue[index];

	ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
			    MPOL_F_NODE | MPOL_F_ADDR);

	/* check if we need to reallocate vq */
	ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
			     MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get vq numa information.\n");
		return dev;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate vq from %d to %d node\n", oldnode, newnode);
		vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
		if (!vq)
			return dev;

		memcpy(vq, old_vq, sizeof(*vq));
		rte_free(old_vq);
	}

	/* check if we need to reallocate dev */
	ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
			    MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get dev numa information.\n");
		goto out;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate dev from %d to %d node\n",
			oldnode, newnode);
		dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
		if (!dev) {
			dev = old_dev;
			goto out;
		}

		memcpy(dev, old_dev, sizeof(*dev));
		rte_free(old_dev);
	}

out:
	dev->virtqueue[index] = vq;
	vhost_devices[dev->vid] = dev;

	return dev;
}
#else
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index __rte_unused)
{
	return dev;
}
#endif

/*
 * Converts QEMU virtual address to Vhost virtual address. This function is
 * used to convert the ring addresses to our address space.
 */
static uint64_t
qva_to_vva(struct virtio_net *dev, uint64_t qva)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	/* Find the region where the address lives. */
	for (i = 0; i < dev->mem->nregions; i++) {
		reg = &dev->mem->regions[i];

		if (qva >= reg->guest_user_addr &&
		    qva <  reg->guest_user_addr + reg->size) {
			return qva - reg->guest_user_addr +
			       reg->host_user_addr;
		}
	}

	return 0;
}

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq;

	if (dev->mem == NULL)
		return -1;

	/* addr->index refers to the queue index. The txq is 1, rxq is 0. */
	vq = dev->virtqueue[msg->payload.addr.index];

	/* The addresses are converted from QEMU virtual to Vhost virtual. */
	vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.desc_user_addr);
	if (vq->desc == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find desc ring address.\n",
			dev->vid);
		return -1;
	}

	dev = numa_realloc(dev, msg->payload.addr.index);
	vq = dev->virtqueue[msg->payload.addr.index];

	vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.avail_user_addr);
	if (vq->avail == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find avail ring address.\n",
			dev->vid);
		return -1;
	}

	vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
			msg->payload.addr.used_user_addr);
	if (vq->used == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find used ring address.\n",
			dev->vid);
		return -1;
	}

	if (vq->last_used_idx != vq->used->idx) {
		RTE_LOG(WARNING, VHOST_CONFIG,
			"last_used_idx (%u) and vq->used->idx (%u) mismatch; "
			"some packets may be resent for Tx and dropped for Rx\n",
			vq->last_used_idx, vq->used->idx);
		vq->last_used_idx  = vq->used->idx;
		vq->last_avail_idx = vq->used->idx;
	}

	vq->log_guest_addr = msg->payload.addr.log_guest_addr;

	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
			dev->vid, vq->desc);
	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
			dev->vid, vq->avail);
	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
			dev->vid, vq->used);
	LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
			dev->vid, vq->log_guest_addr);

	return 0;
}

/*
 * The virtio device sends us the available ring last used index.
 */
static int
vhost_user_set_vring_base(struct virtio_net *dev,
			  VhostUserMsg *msg)
{
	dev->virtqueue[msg->payload.state.index]->last_used_idx  =
			msg->payload.state.num;
	dev->virtqueue[msg->payload.state.index]->last_avail_idx =
			msg->payload.state.num;

	return 0;
}

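/*
 * Record one guest-physical to host-physical page mapping, growing the
 * page array on demand and merging the entry into the previous one when
 * the two host physical ranges are contiguous.
 */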
static void
add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
		   uint64_t host_phys_addr, uint64_t size)
{
	struct guest_page *page, *last_page;

	if (dev->nr_guest_pages == dev->max_guest_pages) {
		dev->max_guest_pages *= 2;
		dev->guest_pages = realloc(dev->guest_pages,
					dev->max_guest_pages * sizeof(*page));
	}

	if (dev->nr_guest_pages > 0) {
		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
		/* merge if the two pages are contiguous */
		if (host_phys_addr == last_page->host_phys_addr +
				      last_page->size) {
			last_page->size += size;
			return;
		}
	}

	page = &dev->guest_pages[dev->nr_guest_pages++];
	page->guest_phys_addr = guest_phys_addr;
	page->host_phys_addr  = host_phys_addr;
	page->size = size;
}

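/*
 * Walk a guest memory region in page_size steps and record the host
 * physical address backing each guest physical page. The first chunk is
 * trimmed so that the walk stays aligned to page_size boundaries.
 */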
static void
add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
		uint64_t page_size)
{
	uint64_t reg_size = reg->size;
	uint64_t host_user_addr  = reg->host_user_addr;
	uint64_t guest_phys_addr = reg->guest_phys_addr;
	uint64_t host_phys_addr;
	uint64_t size;

	host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
	size = page_size - (guest_phys_addr & (page_size - 1));
	size = RTE_MIN(size, reg_size);

	add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
	host_user_addr  += size;
	guest_phys_addr += size;
	reg_size -= size;

	while (reg_size > 0) {
		size = RTE_MIN(reg_size, page_size);
		host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
						  host_user_addr);
		add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);

		host_user_addr  += size;
		guest_phys_addr += size;
		reg_size -= size;
	}
}

#ifdef RTE_LIBRTE_VHOST_DEBUG
/* TODO: enable it only in debug mode? */
static void
dump_guest_pages(struct virtio_net *dev)
{
	uint32_t i;
	struct guest_page *page;

	for (i = 0; i < dev->nr_guest_pages; i++) {
		page = &dev->guest_pages[i];

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest physical page region %u\n"
			"\t guest_phys_addr: %" PRIx64 "\n"
			"\t host_phys_addr : %" PRIx64 "\n"
			"\t size           : %" PRIx64 "\n",
			i,
			page->guest_phys_addr,
			page->host_phys_addr,
			page->size);
	}
}
#else
#define dump_guest_pages(dev)
#endif

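/*
 * Map the guest memory regions described by a VHOST_USER_SET_MEM_TABLE
 * message into our address space, one mmap() per region, using the fds
 * received as ancillary data along with the message.
 */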
static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct VhostUserMemory memory = pmsg->payload.memory;
	struct rte_vhost_mem_region *reg;
	void *mmap_addr;
	uint64_t mmap_size;
	uint64_t mmap_offset;
	uint64_t alignment;
	uint32_t i;
	int fd;

	if (dev->mem) {
		free_mem_region(dev);
		rte_free(dev->mem);
		dev->mem = NULL;
	}

	dev->nr_guest_pages = 0;
	if (!dev->guest_pages) {
		dev->max_guest_pages = 8;
		dev->guest_pages = malloc(dev->max_guest_pages *
						sizeof(struct guest_page));
		if (dev->guest_pages == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"(%d) failed to allocate memory "
				"for dev->guest_pages\n",
				dev->vid);
			return -1;
		}
	}

	dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
		sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
	if (dev->mem == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to allocate memory for dev->mem\n",
			dev->vid);
		return -1;
	}
	dev->mem->nregions = memory.nregions;

	for (i = 0; i < memory.nregions; i++) {
		fd  = pmsg->fds[i];
		reg = &dev->mem->regions[i];

		reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
		reg->guest_user_addr = memory.regions[i].userspace_addr;
		reg->size            = memory.regions[i].memory_size;
		reg->fd              = fd;

		mmap_offset = memory.regions[i].mmap_offset;
		mmap_size   = reg->size + mmap_offset;

		/* Without the MAP_ANONYMOUS flag, mmap() must be called
		 * with a length argument aligned to the hugepage size on
		 * older long-term Linux kernels, such as 2.6.32 and 3.2.72,
		 * or it fails with EINVAL.
		 *
		 * To avoid that failure, keep the length aligned here.
		 */
		alignment = get_blk_size(fd);
		if (alignment == (uint64_t)-1) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"couldn't get hugepage size through fstat\n");
			goto err_mmap;
		}
		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);

		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_POPULATE, fd, 0);

		if (mmap_addr == MAP_FAILED) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"mmap region %u failed.\n", i);
			goto err_mmap;
		}

		reg->mmap_addr = mmap_addr;
		reg->mmap_size = mmap_size;
		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
				      mmap_offset;

		if (dev->dequeue_zero_copy)
			add_guest_pages(dev, reg, alignment);

		RTE_LOG(INFO, VHOST_CONFIG,
			"guest memory region %u, size: 0x%" PRIx64 "\n"
			"\t guest physical addr: 0x%" PRIx64 "\n"
			"\t guest virtual  addr: 0x%" PRIx64 "\n"
			"\t host  virtual  addr: 0x%" PRIx64 "\n"
			"\t mmap addr : 0x%" PRIx64 "\n"
			"\t mmap size : 0x%" PRIx64 "\n"
			"\t mmap align: 0x%" PRIx64 "\n"
			"\t mmap off  : 0x%" PRIx64 "\n",
			i, reg->size,
			reg->guest_phys_addr,
			reg->guest_user_addr,
			reg->host_user_addr,
			(uint64_t)(uintptr_t)mmap_addr,
			mmap_size,
			alignment,
			mmap_offset);
	}

	dump_guest_pages(dev);

	return 0;

err_mmap:
	free_mem_region(dev);
	rte_free(dev->mem);
	dev->mem = NULL;
	return -1;
}

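/*
 * A virtqueue is ready for processing once its descriptor ring has been
 * mapped and both its kick and call eventfds have been received.
 */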
static int
vq_is_ready(struct vhost_virtqueue *vq)
{
	return vq && vq->desc   &&
	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
}

static int
virtio_is_ready(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq;
	uint32_t i;

	if (dev->nr_vring == 0)
		return 0;

	for (i = 0; i < dev->nr_vring; i++) {
		vq = dev->virtqueue[i];

		if (!vq_is_ready(vq))
			return 0;
	}

	RTE_LOG(INFO, VHOST_CONFIG,
		"virtio is now ready for processing.\n");
	return 1;
}

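/*
 * Install the eventfd used to interrupt the guest (the call fd),
 * closing any fd previously installed for the same ring.
 */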
static void
vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring call idx:%d file:%d\n", file.index, file.fd);

	vq = dev->virtqueue[file.index];
	if (vq->callfd >= 0)
		close(vq->callfd);

	vq->callfd = file.fd;
}

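/*
 * Install the eventfd the guest uses to kick us (the kick fd), closing
 * any fd previously installed for the same ring.
 */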
static void
vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring kick idx:%d file:%d\n", file.index, file.fd);

	vq = dev->virtqueue[file.index];
	if (vq->kickfd >= 0)
		close(vq->kickfd);
	vq->kickfd = file.fd;
}

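/*
 * Free all zero-copy mbufs still held by a virtqueue, along with the
 * zmbuf array itself.
 */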
static void
free_zmbufs(struct vhost_virtqueue *vq)
{
	struct zcopy_mbuf *zmbuf, *next;

	for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
	     zmbuf != NULL; zmbuf = next) {
		next = TAILQ_NEXT(zmbuf, next);

		rte_pktmbuf_free(zmbuf->mbuf);
		TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
	}

	rte_free(vq->zmbufs);
}

/*
 * When virtio is stopped, QEMU will send us the GET_VRING_BASE message.
 */
static int
vhost_user_get_vring_base(struct virtio_net *dev,
			  VhostUserMsg *msg)
{
	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];

	/* We have to stop the queue (virtio) if it is running. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		dev->notify_ops->destroy_device(dev->vid);
	}

	dev->flags &= ~VIRTIO_DEV_READY;

	/* Here we are safe to get the last used index */
	msg->payload.state.num = vq->last_used_idx;

	RTE_LOG(INFO, VHOST_CONFIG,
		"vring base idx:%d file:%d\n", msg->payload.state.index,
		msg->payload.state.num);
	/*
	 * Based on the current QEMU vhost-user implementation, this message
	 * is sent, and only sent, from vhost_vring_stop.
	 * TODO: clean up the vring; it isn't usable from this point on.
	 */
	if (vq->kickfd >= 0)
		close(vq->kickfd);

	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

	if (dev->dequeue_zero_copy)
		free_zmbufs(vq);
	rte_free(vq->shadow_used_ring);
	vq->shadow_used_ring = NULL;

	rte_free(vq->batch_copy_elems);
	vq->batch_copy_elems = NULL;

	return 0;
}

/*
 * When the virtio queues are ready to work, QEMU sends us this message
 * to enable or disable a virtio queue pair.
 */
static int
vhost_user_set_vring_enable(struct virtio_net *dev,
			    VhostUserMsg *msg)
{
	int enable = (int)msg->payload.state.num;

	RTE_LOG(INFO, VHOST_CONFIG,
		"set queue enable: %d to qp idx: %d\n",
		enable, msg->payload.state.index);

	if (dev->notify_ops->vring_state_changed)
		dev->notify_ops->vring_state_changed(dev->vid,
				msg->payload.state.index, enable);

	dev->virtqueue[msg->payload.state.index]->enabled = enable;

	return 0;
}

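/*
 * Record the protocol features acked by the master; the message is
 * silently ignored if it carries bits we did not advertise.
 */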
static void
vhost_user_set_protocol_features(struct virtio_net *dev,
				 uint64_t protocol_features)
{
	if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
		return;

	dev->protocol_features = protocol_features;
}

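/*
 * Map the shared memory area used for logging dirty pages during live
 * migration, as described by a VHOST_USER_SET_LOG_BASE message.
 */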
static int
vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	int fd = msg->fds[0];
	uint64_t size, off;
	void *addr;

	if (fd < 0) {
		RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
		return -1;
	}

	if (msg->size != sizeof(VhostUserLog)) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"invalid log base msg size: %"PRId32" != %d\n",
			msg->size, (int)sizeof(VhostUserLog));
		return -1;
	}

	size = msg->payload.log.mmap_size;
	off  = msg->payload.log.mmap_offset;
	RTE_LOG(INFO, VHOST_CONFIG,
		"log mmap size: %"PRId64", offset: %"PRId64"\n",
		size, off);

	/*
	 * mmap from offset 0 to work around a hugepage mmap bug: mmap will
	 * fail when the offset is not page-size aligned.
	 */
	addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	if (addr == MAP_FAILED) {
		RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
		return -1;
	}

	/*
	 * Free any previously mapped log memory, in case
	 * VHOST_USER_SET_LOG_BASE is received more than once.
	 */
	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
	}
	dev->log_addr = (uint64_t)(uintptr_t)addr;
	dev->log_base = dev->log_addr + off;
	dev->log_size = size;

	return 0;
}

/*
 * A RARP packet is constructed and broadcast to notify switches about
 * the new location of the migrated VM, so that packets from outside will
 * not be lost after migration.
 *
 * However, we don't actually "send" a RARP packet here; instead, we set
 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
 */
static int
vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	uint8_t *mac = (uint8_t *)&msg->payload.u64;

	RTE_LOG(DEBUG, VHOST_CONFIG,
		":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	memcpy(dev->mac.addr_bytes, mac, 6);

	/*
	 * Set the flag to inject a RARP broadcast packet at
	 * rte_vhost_dequeue_burst().
	 *
	 * rte_smp_wmb() makes sure the mac is copied before
	 * the flag is set.
	 */
	rte_smp_wmb();
	rte_atomic16_set(&dev->broadcast_rarp, 1);

	return 0;
}

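/*
 * Validate and store the MTU announced by the master; values outside
 * the [VIRTIO_MIN_MTU, VIRTIO_MAX_MTU] range are rejected.
 */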
static int
vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	if (msg->payload.u64 < VIRTIO_MIN_MTU ||
			msg->payload.u64 > VIRTIO_MAX_MTU) {
		RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
				msg->payload.u64);

		return -1;
	}

	dev->mtu = msg->payload.u64;

	return 0;
}

/* Return the number of bytes read on success, a negative value on failure. */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
	if (ret <= 0)
		return ret;

	if (msg && msg->size) {
		if (msg->size > sizeof(msg->payload)) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"invalid msg size: %d\n", msg->size);
			return -1;
		}
		ret = read(sockfd, &msg->payload, msg->size);
		if (ret <= 0)
			return ret;
		if (ret != (int)msg->size) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"read control message failed\n");
			return -1;
		}
	}

	return ret;
}

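/*
 * send_vhost_message() pushes a message onto the socket as-is, while
 * send_vhost_reply() first stamps the flags with the protocol version
 * and the REPLY bit, as expected for answers to master requests.
 */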
static int
send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	if (!msg)
		return 0;

	return send_fd_message(sockfd, (char *)msg,
		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
}

static int
send_vhost_reply(int sockfd, struct VhostUserMsg *msg)
{
	if (!msg)
		return 0;

	msg->flags &= ~VHOST_USER_VERSION_MASK;
	msg->flags &= ~VHOST_USER_NEED_REPLY;
	msg->flags |= VHOST_USER_VERSION;
	msg->flags |= VHOST_USER_REPLY_MASK;

	return send_vhost_message(sockfd, msg);
}

/*
 * Allocate a queue pair if it hasn't been allocated yet
 */
static int
vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
{
	uint16_t vring_idx;

	switch (msg->request) {
	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
		break;
	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		vring_idx = msg->payload.state.index;
		break;
	case VHOST_USER_SET_VRING_ADDR:
		vring_idx = msg->payload.addr.index;
		break;
	default:
		return 0;
	}

	if (vring_idx >= VHOST_MAX_VRING) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"invalid vring index: %u\n", vring_idx);
		return -1;
	}

	if (dev->virtqueue[vring_idx])
		return 0;

	return alloc_vring_queue(dev, vring_idx);
}

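/*
 * Main vhost-user request dispatcher: read one message from the socket,
 * make sure the virtqueue it refers to is allocated, handle the request,
 * send a reply when one is required, and start the device once all of
 * its rings become ready.
 */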
int
vhost_user_msg_handler(int vid, int fd)
{
	struct virtio_net *dev;
	struct VhostUserMsg msg;
	int ret;

	dev = get_device(vid);
	if (dev == NULL)
		return -1;

	if (!dev->notify_ops) {
		dev->notify_ops = vhost_driver_callback_get(dev->ifname);
		if (!dev->notify_ops) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"failed to get callback ops for driver %s\n",
				dev->ifname);
			return -1;
		}
	}

	ret = read_vhost_message(fd, &msg);
	if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
		if (ret < 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read message failed\n");
		else if (ret == 0)
			RTE_LOG(INFO, VHOST_CONFIG,
				"vhost peer closed\n");
		else
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read incorrect message\n");

		return -1;
	}

	ret = 0;
	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
		vhost_message_str[msg.request]);

	ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
	if (ret < 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"failed to alloc queue\n");
		return -1;
	}

	switch (msg.request) {
	case VHOST_USER_GET_FEATURES:
		msg.payload.u64 = vhost_user_get_features(dev);
		msg.size = sizeof(msg.payload.u64);
		send_vhost_reply(fd, &msg);
		break;
	case VHOST_USER_SET_FEATURES:
		vhost_user_set_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_GET_PROTOCOL_FEATURES:
		msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_reply(fd, &msg);
		break;
	case VHOST_USER_SET_PROTOCOL_FEATURES:
		vhost_user_set_protocol_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
		vhost_user_set_owner();
		break;
	case VHOST_USER_RESET_OWNER:
		vhost_user_reset_owner(dev);
		break;

	case VHOST_USER_SET_MEM_TABLE:
		ret = vhost_user_set_mem_table(dev, &msg);
		break;

	case VHOST_USER_SET_LOG_BASE:
		vhost_user_set_log_base(dev, &msg);

		/* it needs a reply */
		msg.size = sizeof(msg.payload.u64);
		send_vhost_reply(fd, &msg);
		break;
	case VHOST_USER_SET_LOG_FD:
		close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
		break;

	case VHOST_USER_SET_VRING_NUM:
		vhost_user_set_vring_num(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_ADDR:
		vhost_user_set_vring_addr(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_BASE:
		vhost_user_set_vring_base(dev, &msg);
		break;

	case VHOST_USER_GET_VRING_BASE:
		vhost_user_get_vring_base(dev, &msg);
		msg.size = sizeof(msg.payload.state);
		send_vhost_reply(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_KICK:
		vhost_user_set_vring_kick(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_CALL:
		vhost_user_set_vring_call(dev, &msg);
		break;

	case VHOST_USER_SET_VRING_ERR:
		if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
			close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
		break;

	case VHOST_USER_GET_QUEUE_NUM:
		msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_reply(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_ENABLE:
		vhost_user_set_vring_enable(dev, &msg);
		break;
	case VHOST_USER_SEND_RARP:
		vhost_user_send_rarp(dev, &msg);
		break;

	case VHOST_USER_NET_SET_MTU:
		ret = vhost_user_net_set_mtu(dev, &msg);
		break;

	default:
		ret = -1;
		break;
	}

	if (msg.flags & VHOST_USER_NEED_REPLY) {
		msg.payload.u64 = !!ret;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_reply(fd, &msg);
	}

	if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
		dev->flags |= VIRTIO_DEV_READY;

		if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
			if (dev->dequeue_zero_copy) {
				RTE_LOG(INFO, VHOST_CONFIG,
						"dequeue zero copy is enabled\n");
			}

			if (dev->notify_ops->new_device(dev->vid) == 0)
				dev->flags |= VIRTIO_DEV_RUNNING;
		}
	}

	return 0;
}