vhost: simplify features set/get
[dpdk.git] lib/librte_vhost/vhost_user.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#ifdef RTE_LIBRTE_VHOST_NUMA
#include <numaif.h>
#endif

#include <rte_common.h>
#include <rte_malloc.h>
#include <rte_log.h>

#include "vhost.h"
#include "vhost_user.h"

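/* Human-readable request names, used when logging incoming messages. */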
static const char *vhost_message_str[VHOST_USER_MAX] = {
	[VHOST_USER_NONE] = "VHOST_USER_NONE",
	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
	[VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
	[VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
};

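/*
 * Bookkeeping for each mmap()ed guest memory region, so that the mapping
 * can be munmap()ed and the backing fd closed later.  The array lives
 * right after the region array in the same allocation; see orig_region().
 */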
struct orig_region_map {
	int fd;
	uint64_t mapped_address;
	uint64_t mapped_size;
	uint64_t blksz;
};

#define orig_region(ptr, nregions) \
	((struct orig_region_map *)RTE_PTR_ADD((ptr), \
		sizeof(struct virtio_memory) + \
		sizeof(struct virtio_memory_regions) * (nregions)))

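/*
 * Return the block size reported by fstat() for fd -- the hugepage size
 * for hugetlbfs-backed fds -- or (uint64_t)-1 on failure.
 */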
static uint64_t
get_blk_size(int fd)
{
	struct stat stat;
	int ret;

	ret = fstat(fd, &stat);
	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}

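/* Unmap every guest memory region and close its backing fd. */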
static void
free_mem_region(struct virtio_net *dev)
{
	struct orig_region_map *region;
	unsigned int idx;

	if (!dev || !dev->mem)
		return;

	region = orig_region(dev->mem, dev->mem->nregions);
	for (idx = 0; idx < dev->mem->nregions; idx++) {
		if (region[idx].mapped_address) {
			munmap((void *)(uintptr_t)region[idx].mapped_address,
					region[idx].mapped_size);
			close(region[idx].fd);
		}
	}
}

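/*
 * Release the resources held for a device: the guest memory mappings
 * and the dirty log area.
 */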
void
vhost_backend_cleanup(struct virtio_net *dev)
{
	if (dev->mem) {
		free_mem_region(dev);
		free(dev->mem);
		dev->mem = NULL;
	}
	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
		dev->log_addr = 0;
	}
}

/*
 * This function just returns success at the moment; nothing needs to
 * be done for VHOST_USER_SET_OWNER.
 */
static int
vhost_user_set_owner(void)
{
	return 0;
}

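/* Stop the device if it is running and reset it to its initial state. */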
static int
vhost_user_reset_owner(struct virtio_net *dev)
{
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		notify_ops->destroy_device(dev->vid);
	}

	cleanup_device(dev, 0);
	reset_device(dev);
	return 0;
}

/*
 * The frontend asks for the feature bits that we support.
 */
static uint64_t
vhost_user_get_features(void)
{
	return VHOST_FEATURES;
}

/*
 * We receive the negotiated feature bits: those supported by both us
 * and the virtio driver.
 */
static int
vhost_user_set_features(struct virtio_net *dev, uint64_t features)
{
	if (features & ~VHOST_FEATURES)
		return -1;

	dev->features = features;
	if (dev->features &
		((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	} else {
		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
	}
	LOG_DEBUG(VHOST_CONFIG,
		"(%d) mergeable RX buffers %s, virtio 1 %s\n",
		dev->vid,
		(dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");

	return 0;
}

/*
 * The virtio device sends us the size of the descriptor ring.
 */
static int
vhost_user_set_vring_num(struct virtio_net *dev,
			 struct vhost_vring_state *state)
{
	dev->virtqueue[state->index]->size = state->num;

	return 0;
}

/*
 * Reallocate the virtio_net and vhost_virtqueue data structures onto the
 * same NUMA node as the memory holding the vring descriptors.
 */
#ifdef RTE_LIBRTE_VHOST_NUMA
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index)
{
	int oldnode, newnode;
	struct virtio_net *old_dev;
	struct vhost_virtqueue *old_vq, *vq;
	int ret;

	/*
	 * vqs are allocated in pairs; only attempt the realloc on the
	 * first queue of a pair.
	 */
	if (index % VIRTIO_QNUM != 0)
		return dev;

	old_dev = dev;
	vq = old_vq = dev->virtqueue[index];

	ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
			    MPOL_F_NODE | MPOL_F_ADDR);

	/* check if we need to reallocate vq */
	ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
			     MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get vq numa information.\n");
		return dev;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate vq from %d to %d node\n", oldnode, newnode);
		vq = rte_malloc_socket(NULL, sizeof(*vq) * VIRTIO_QNUM, 0,
				       newnode);
		if (!vq)
			return dev;

		memcpy(vq, old_vq, sizeof(*vq) * VIRTIO_QNUM);
		rte_free(old_vq);
	}

	/* check if we need to reallocate dev */
	ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
			    MPOL_F_NODE | MPOL_F_ADDR);
	if (ret) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"Unable to get dev numa information.\n");
		goto out;
	}
	if (oldnode != newnode) {
		RTE_LOG(INFO, VHOST_CONFIG,
			"reallocate dev from %d to %d node\n",
			oldnode, newnode);
		dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
		if (!dev) {
			dev = old_dev;
			goto out;
		}

		memcpy(dev, old_dev, sizeof(*dev));
		rte_free(old_dev);
	}

out:
	dev->virtqueue[index] = vq;
	dev->virtqueue[index + 1] = vq + 1;
	vhost_devices[dev->vid] = dev;

	return dev;
}
#else
static struct virtio_net*
numa_realloc(struct virtio_net *dev, int index __rte_unused)
{
	return dev;
}
#endif

/*
 * Converts QEMU virtual address to Vhost virtual address. This function is
 * used to convert the ring addresses to our address space.
 */
static uint64_t
qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
{
	struct virtio_memory_regions *region;
	uint64_t vhost_va = 0;
	uint32_t regionidx = 0;

	/* Find the region where the address lives. */
	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
		region = &dev->mem->regions[regionidx];
		if ((qemu_va >= region->userspace_address) &&
			(qemu_va <= region->userspace_address +
			region->memory_size)) {
			vhost_va = qemu_va + region->guest_phys_address +
				region->address_offset -
				region->userspace_address;
			break;
		}
	}
	return vhost_va;
}

/*
 * The virtio device sends us the desc, used and avail ring addresses.
 * This function then converts these to our address space.
 */
static int
vhost_user_set_vring_addr(struct virtio_net *dev, struct vhost_vring_addr *addr)
{
	struct vhost_virtqueue *vq;

	if (dev->mem == NULL)
		return -1;

	/* addr->index refers to the queue index: rxq is 0, txq is 1. */
	vq = dev->virtqueue[addr->index];

	/* The addresses are converted from QEMU virtual to Vhost virtual. */
	vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
			addr->desc_user_addr);
	if (vq->desc == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find desc ring address.\n",
			dev->vid);
		return -1;
	}

	dev = numa_realloc(dev, addr->index);
	vq = dev->virtqueue[addr->index];

	vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
			addr->avail_user_addr);
	if (vq->avail == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find avail ring address.\n",
			dev->vid);
		return -1;
	}

	vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
			addr->used_user_addr);
	if (vq->used == 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to find used ring address.\n",
			dev->vid);
		return -1;
	}

	if (vq->last_used_idx != vq->used->idx) {
		RTE_LOG(WARNING, VHOST_CONFIG,
			"last_used_idx (%u) and vq->used->idx (%u) mismatch; "
			"some packets may be resent for Tx and dropped for Rx\n",
			vq->last_used_idx, vq->used->idx);
		vq->last_used_idx = vq->used->idx;
	}

	vq->log_guest_addr = addr->log_guest_addr;

	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
			dev->vid, vq->desc);
	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
			dev->vid, vq->avail);
	LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
			dev->vid, vq->used);
	LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
			dev->vid, vq->log_guest_addr);

	return 0;
}

/*
 * The virtio device sends us the last used index of the avail ring.
 */
static int
vhost_user_set_vring_base(struct virtio_net *dev,
			  struct vhost_vring_state *state)
{
	dev->virtqueue[state->index]->last_used_idx = state->num;

	return 0;
}

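/*
 * Register the guest memory regions announced with VHOST_USER_SET_MEM_TABLE:
 * mmap() the fd backing each region and record the mapping so that guest
 * addresses can be translated later.
 */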
static int
vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct VhostUserMemory memory = pmsg->payload.memory;
	struct virtio_memory_regions *pregion;
	uint64_t mapped_address, mapped_size;
	unsigned int idx = 0;
	struct orig_region_map *pregion_orig;
	uint64_t alignment;

	/* Remove from the data plane. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		notify_ops->destroy_device(dev->vid);
	}

	if (dev->mem) {
		free_mem_region(dev);
		free(dev->mem);
		dev->mem = NULL;
	}

	dev->mem = calloc(1,
		sizeof(struct virtio_memory) +
		sizeof(struct virtio_memory_regions) * memory.nregions +
		sizeof(struct orig_region_map) * memory.nregions);
	if (dev->mem == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to allocate memory for dev->mem\n",
			dev->vid);
		return -1;
	}
	dev->mem->nregions = memory.nregions;

	pregion_orig = orig_region(dev->mem, memory.nregions);
	for (idx = 0; idx < memory.nregions; idx++) {
		pregion = &dev->mem->regions[idx];
		pregion->guest_phys_address =
			memory.regions[idx].guest_phys_addr;
		pregion->guest_phys_address_end =
			memory.regions[idx].guest_phys_addr +
			memory.regions[idx].memory_size;
		pregion->memory_size =
			memory.regions[idx].memory_size;
		pregion->userspace_address =
			memory.regions[idx].userspace_addr;

		/* This is ugly */
		mapped_size = memory.regions[idx].memory_size +
			memory.regions[idx].mmap_offset;

		/*
		 * On older long-term Linux kernels (e.g. 2.6.32 and 3.2.72),
		 * mmap() without MAP_ANONYMOUS must be called with a length
		 * aligned to the hugepage size, or it fails with EINVAL.
		 *
		 * To avoid that failure, the caller must keep the length
		 * aligned.
		 */
		alignment = get_blk_size(pmsg->fds[idx]);
		if (alignment == (uint64_t)-1) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"couldn't get hugepage size through fstat\n");
			goto err_mmap;
		}
		mapped_size = RTE_ALIGN_CEIL(mapped_size, alignment);

		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
			mapped_size,
			PROT_READ | PROT_WRITE, MAP_SHARED,
			pmsg->fds[idx],
			0);

		RTE_LOG(INFO, VHOST_CONFIG,
			"mapped region %d fd:%d to:%p sz:0x%"PRIx64" "
			"off:0x%"PRIx64" align:0x%"PRIx64"\n",
			idx, pmsg->fds[idx], (void *)(uintptr_t)mapped_address,
			mapped_size, memory.regions[idx].mmap_offset,
			alignment);

		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"mmap qemu guest failed.\n");
			goto err_mmap;
		}

		pregion_orig[idx].mapped_address = mapped_address;
		pregion_orig[idx].mapped_size = mapped_size;
		pregion_orig[idx].blksz = alignment;
		pregion_orig[idx].fd = pmsg->fds[idx];

		mapped_address += memory.regions[idx].mmap_offset;

		pregion->address_offset = mapped_address -
			pregion->guest_phys_address;

		if (memory.regions[idx].guest_phys_addr == 0) {
			dev->mem->base_address =
				memory.regions[idx].userspace_addr;
			dev->mem->mapped_address =
				pregion->address_offset;
		}

		LOG_DEBUG(VHOST_CONFIG,
			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
			idx,
			(void *)(uintptr_t)pregion->guest_phys_address,
			(void *)(uintptr_t)pregion->userspace_address,
			pregion->memory_size);
	}

	return 0;

err_mmap:
	while (idx--) {
		munmap((void *)(uintptr_t)pregion_orig[idx].mapped_address,
				pregion_orig[idx].mapped_size);
		close(pregion_orig[idx].fd);
	}
	free(dev->mem);
	dev->mem = NULL;
	return -1;
}

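/* A vring is usable once its descriptor table and both eventfds are set. */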
static int
vq_is_ready(struct vhost_virtqueue *vq)
{
	return vq && vq->desc &&
	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD;
}

static int
virtio_is_ready(struct virtio_net *dev)
{
	struct vhost_virtqueue *rvq, *tvq;
	uint32_t i;

	for (i = 0; i < dev->virt_qp_nb; i++) {
		rvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_RXQ];
		tvq = dev->virtqueue[i * VIRTIO_QNUM + VIRTIO_TXQ];

		if (!vq_is_ready(rvq) || !vq_is_ready(tvq)) {
			RTE_LOG(INFO, VHOST_CONFIG,
				"virtio is not ready for processing.\n");
			return 0;
		}
	}

	RTE_LOG(INFO, VHOST_CONFIG,
		"virtio is now ready for processing.\n");
	return 1;
}

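/*
 * The virtio device sends the eventfd we should use to interrupt the
 * guest for this vring.
 */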
static void
vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;
	uint32_t cur_qp_idx;

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring call idx:%d file:%d\n", file.index, file.fd);

	/*
	 * FIXME: VHOST_SET_VRING_CALL is the first per-vring message
	 * we get, so we do vring queue pair allocation here.
	 */
	cur_qp_idx = file.index / VIRTIO_QNUM;
	if (cur_qp_idx + 1 > dev->virt_qp_nb) {
		if (alloc_vring_queue_pair(dev, cur_qp_idx) < 0)
			return;
	}

	vq = dev->virtqueue[file.index];
	assert(vq != NULL);

	if (vq->callfd >= 0)
		close(vq->callfd);

	vq->callfd = file.fd;
}

/*
 * In vhost-user, when we receive a kick message, we test whether the
 * virtio device is ready for packet processing.
 */
static void
vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
{
	struct vhost_vring_file file;
	struct vhost_virtqueue *vq;

	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
		file.fd = VIRTIO_INVALID_EVENTFD;
	else
		file.fd = pmsg->fds[0];
	RTE_LOG(INFO, VHOST_CONFIG,
		"vring kick idx:%d file:%d\n", file.index, file.fd);

	vq = dev->virtqueue[file.index];
	if (vq->kickfd >= 0)
		close(vq->kickfd);
	vq->kickfd = file.fd;

	if (virtio_is_ready(dev) && !(dev->flags & VIRTIO_DEV_RUNNING)) {
		if (notify_ops->new_device(dev->vid) == 0)
			dev->flags |= VIRTIO_DEV_RUNNING;
	}
}

/*
 * When virtio is stopped, QEMU will send us the GET_VRING_BASE message.
 */
static int
vhost_user_get_vring_base(struct virtio_net *dev,
			  struct vhost_vring_state *state)
{
	/* We have to stop the queue (virtio) if it is running. */
	if (dev->flags & VIRTIO_DEV_RUNNING) {
		dev->flags &= ~VIRTIO_DEV_RUNNING;
		notify_ops->destroy_device(dev->vid);
	}

	/* Here we are safe to get the last used index */
	state->num = dev->virtqueue[state->index]->last_used_idx;

	RTE_LOG(INFO, VHOST_CONFIG,
		"vring base idx:%d file:%d\n", state->index, state->num);
	/*
	 * Based on the current QEMU vhost-user implementation, this message
	 * is sent, and only sent, from vhost_vring_stop.
	 * TODO: clean up the vring; it isn't usable from this point on.
	 */
	if (dev->virtqueue[state->index]->kickfd >= 0)
		close(dev->virtqueue[state->index]->kickfd);

	dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

	return 0;
}

/*
 * When the virtio queues are ready to work, QEMU sends us this message
 * to enable the virtio queue pair.
 */
static int
vhost_user_set_vring_enable(struct virtio_net *dev,
			    struct vhost_vring_state *state)
{
	int enable = (int)state->num;

	RTE_LOG(INFO, VHOST_CONFIG,
		"set queue enable: %d to qp idx: %d\n",
		enable, state->index);

	if (notify_ops->vring_state_changed)
		notify_ops->vring_state_changed(dev->vid, state->index, enable);

	dev->virtqueue[state->index]->enabled = enable;

	return 0;
}

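/*
 * Store the protocol features acked by the frontend; a set containing
 * bits outside VHOST_USER_PROTOCOL_FEATURES is silently ignored.
 */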
static void
vhost_user_set_protocol_features(struct virtio_net *dev,
				 uint64_t protocol_features)
{
	if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
		return;

	dev->protocol_features = protocol_features;
}

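/*
 * Map the dirty-page logging area shared by QEMU for live migration
 * (VHOST_USER_SET_LOG_BASE).
 */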
static int
vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	int fd = msg->fds[0];
	uint64_t size, off;
	void *addr;

	if (fd < 0) {
		RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
		return -1;
	}

	if (msg->size != sizeof(VhostUserLog)) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"invalid log base msg size: %"PRId32" != %d\n",
			msg->size, (int)sizeof(VhostUserLog));
		return -1;
	}

	size = msg->payload.log.mmap_size;
	off  = msg->payload.log.mmap_offset;
	RTE_LOG(INFO, VHOST_CONFIG,
		"log mmap size: %"PRId64", offset: %"PRId64"\n",
		size, off);

	/*
	 * mmap from offset 0 to work around a hugepage mmap bug: mmap will
	 * fail when the offset is not page-size aligned.
	 */
	addr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	if (addr == MAP_FAILED) {
		RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
		return -1;
	}

	/*
	 * Free previously mapped log memory; VHOST_USER_SET_LOG_BASE may
	 * occasionally be sent more than once.
	 */
	if (dev->log_addr) {
		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
	}
	dev->log_addr = (uint64_t)(uintptr_t)addr;
	dev->log_base = dev->log_addr + off;
	dev->log_size = size;

	return 0;
}

/*
 * A RARP packet is constructed and broadcast to notify switches about
 * the new location of the migrated VM, so that packets from outside will
 * not be lost after migration.
 *
 * However, we don't actually "send" a RARP packet here; instead, we set
 * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
 */
static int
vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
{
	uint8_t *mac = (uint8_t *)&msg->payload.u64;

	RTE_LOG(DEBUG, VHOST_CONFIG,
		":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	memcpy(dev->mac.addr_bytes, mac, 6);

	/*
	 * Set the flag to inject a RARP broadcast packet at
	 * rte_vhost_dequeue_burst().
	 *
	 * rte_smp_wmb() is for making sure the mac is copied
	 * before the flag is set.
	 */
	rte_smp_wmb();
	rte_atomic16_set(&dev->broadcast_rarp, 1);

	return 0;
}

/* Return the number of bytes read on success, or a negative value on failure. */
static int
read_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
	if (ret <= 0)
		return ret;

	if (msg && msg->size) {
		if (msg->size > sizeof(msg->payload)) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"invalid msg size: %d\n", msg->size);
			return -1;
		}
		ret = read(sockfd, &msg->payload, msg->size);
		if (ret <= 0)
			return ret;
		if (ret != (int)msg->size) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"read control message failed\n");
			return -1;
		}
	}

	return ret;
}

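/* Stamp the version and reply flags, then send the header and payload. */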
static int
send_vhost_message(int sockfd, struct VhostUserMsg *msg)
{
	int ret;

	if (!msg)
		return 0;

	msg->flags &= ~VHOST_USER_VERSION_MASK;
	msg->flags |= VHOST_USER_VERSION;
	msg->flags |= VHOST_USER_REPLY_MASK;

	ret = send_fd_message(sockfd, (char *)msg,
		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);

	return ret;
}

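/*
 * Read one vhost-user message from the socket and dispatch it to the
 * handlers above.  Requests that need a reply (the GET-style messages)
 * send one back on the same socket.
 */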
int
vhost_user_msg_handler(int vid, int fd)
{
	struct virtio_net *dev;
	struct VhostUserMsg msg;
	int ret;

	dev = get_device(vid);
	if (dev == NULL)
		return -1;

	ret = read_vhost_message(fd, &msg);
	if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
		if (ret < 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read message failed\n");
		else if (ret == 0)
			RTE_LOG(INFO, VHOST_CONFIG,
				"vhost peer closed\n");
		else
			RTE_LOG(ERR, VHOST_CONFIG,
				"vhost read incorrect message\n");

		return -1;
	}

	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
		vhost_message_str[msg.request]);
	switch (msg.request) {
	case VHOST_USER_GET_FEATURES:
		msg.payload.u64 = vhost_user_get_features();
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_FEATURES:
		vhost_user_set_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_GET_PROTOCOL_FEATURES:
		msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_PROTOCOL_FEATURES:
		vhost_user_set_protocol_features(dev, msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
		vhost_user_set_owner();
		break;
	case VHOST_USER_RESET_OWNER:
		vhost_user_reset_owner(dev);
		break;

	case VHOST_USER_SET_MEM_TABLE:
		vhost_user_set_mem_table(dev, &msg);
		break;

	case VHOST_USER_SET_LOG_BASE:
		vhost_user_set_log_base(dev, &msg);

		/* it needs a reply */
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;
	case VHOST_USER_SET_LOG_FD:
		close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
		break;

	case VHOST_USER_SET_VRING_NUM:
		vhost_user_set_vring_num(dev, &msg.payload.state);
		break;
	case VHOST_USER_SET_VRING_ADDR:
		vhost_user_set_vring_addr(dev, &msg.payload.addr);
		break;
	case VHOST_USER_SET_VRING_BASE:
		vhost_user_set_vring_base(dev, &msg.payload.state);
		break;

	case VHOST_USER_GET_VRING_BASE:
		ret = vhost_user_get_vring_base(dev, &msg.payload.state);
		msg.size = sizeof(msg.payload.state);
		send_vhost_message(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_KICK:
		vhost_user_set_vring_kick(dev, &msg);
		break;
	case VHOST_USER_SET_VRING_CALL:
		vhost_user_set_vring_call(dev, &msg);
		break;

	case VHOST_USER_SET_VRING_ERR:
		if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
			close(msg.fds[0]);
		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
		break;

	case VHOST_USER_GET_QUEUE_NUM:
		msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
		msg.size = sizeof(msg.payload.u64);
		send_vhost_message(fd, &msg);
		break;

	case VHOST_USER_SET_VRING_ENABLE:
		vhost_user_set_vring_enable(dev, &msg.payload.state);
		break;
	case VHOST_USER_SEND_RARP:
		vhost_user_send_rarp(dev, &msg);
		break;

	default:
		break;
	}

	return 0;
}