/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2019 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <assert.h>
#include <signal.h>
#include <unistd.h>
#include <limits.h>
#include <inttypes.h>
#include <pthread.h>
#include <semaphore.h>
#include <linux/virtio_blk.h>
#include <linux/virtio_ring.h>

#include <rte_eal.h>
#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_malloc.h>
#include <rte_vhost.h>

#include "vhost_blk.h"

#define VIRTQ_DESC_F_NEXT	1
#define VIRTQ_DESC_F_AVAIL	(1 << 7)
#define VIRTQ_DESC_F_USED	(1 << 15)
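/*
 * In a packed ring, descriptor ownership is encoded by the AVAIL and USED
 * flag bits together with the driver/device wrap counters: the driver makes
 * a descriptor available by setting AVAIL equal to its wrap counter and USED
 * to the opposite value; the device marks it used by setting both bits equal
 * to its own wrap counter. The helpers below rely on exactly this convention.
 */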
#define VHOST_BLK_FEATURES ((1ULL << VIRTIO_F_RING_PACKED) | \
			    (1ULL << VIRTIO_F_VERSION_1) | \
			    (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
			    (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))

#define CTRLR_NAME "vhost.socket"
enum CTRLR_WORKER_STATUS {
	WORKER_STATE_START = 0,
	WORKER_STATE_STOP,
};

struct vhost_blk_ctrlr *g_vhost_ctrlr;

/* Path of the vhost-user socket file; built from the current working directory. */
static char dev_pathname[PATH_MAX] = "";
static sem_t exit_sem;
static enum CTRLR_WORKER_STATUS worker_thread_status;
struct vhost_blk_ctrlr *
vhost_blk_ctrlr_find(const char *ctrlr_name)
{
	if (ctrlr_name == NULL)
		return NULL;

	/* currently we only support one socket file per process */
	return g_vhost_ctrlr;
}

static uint64_t
gpa_to_vva(struct vhost_blk_ctrlr *ctrlr, uint64_t gpa, uint64_t *len)
{
	assert(ctrlr->mem != NULL);

	return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len);
}
static void
enqueue_task(struct vhost_blk_task *task)
{
	struct vhost_blk_queue *vq = task->vq;
	struct vring_used *used = vq->vring.used;

	rte_vhost_set_last_inflight_io_split(task->ctrlr->vid,
		vq->id, task->req_idx);

	/* Fill out the next entry in the "used" ring. id = the
	 * index of the descriptor that contained the blk request.
	 * len = the total amount of data transferred for the blk
	 * request. We must report the correct len for variable
	 * length blk requests, where we may return less data than
	 * allocated by the guest VM.
	 */
	used->ring[used->idx & (vq->vring.size - 1)].id = task->req_idx;
	used->ring[used->idx & (vq->vring.size - 1)].len = task->data_len;
	rte_smp_mb();
	used->idx++;
	rte_smp_mb();

	rte_vhost_clr_inflight_desc_split(task->ctrlr->vid,
		vq->id, used->idx, task->req_idx);

	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->ctrlr->vid, vq->id);
}
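/*
 * Note on the inflight calls above: rte_vhost_set_last_inflight_io_split()
 * records that this request is about to be completed, and
 * rte_vhost_clr_inflight_desc_split() drops it from the inflight log once
 * the used ring has been updated. If the backend crashes in between, the
 * shared inflight region still describes the request, so it can be
 * resubmitted after reconnection (see submit_inflight_vq()).
 */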
static void
enqueue_task_packed(struct vhost_blk_task *task)
{
	struct vhost_blk_queue *vq = task->vq;
	struct vring_packed_desc *desc;

	rte_vhost_set_last_inflight_io_packed(task->ctrlr->vid, vq->id,
		task->inflight_idx);

	desc = &vq->vring.desc_packed[vq->last_used_idx];
	desc->id = task->buffer_id;

	rte_smp_mb();
	if (vq->used_wrap_counter)
		desc->flags |= VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED;
	else
		desc->flags &= ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED);
	rte_smp_mb();

	rte_vhost_clr_inflight_desc_packed(task->ctrlr->vid, vq->id,
		task->inflight_idx);

	vq->last_used_idx += task->chain_num;
	if (vq->last_used_idx >= vq->vring.size) {
		vq->last_used_idx -= vq->vring.size;
		vq->used_wrap_counter = !vq->used_wrap_counter;
	}

	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->ctrlr->vid, vq->id);
}
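/*
 * Completion order matters in the packed ring: the descriptor fields must
 * be stored before the final write to desc->flags, because flipping
 * AVAIL/USED to match used_wrap_counter is what hands the descriptor back
 * to the guest. last_used_idx then advances by the whole chain length and
 * the wrap counter toggles each time it walks past the end of the ring;
 * e.g. with size 256, completing a 3-descriptor chain at index 254 leaves
 * last_used_idx at 1 with the counter inverted.
 */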
static bool
descriptor_has_next_packed(struct vring_packed_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
descriptor_has_next_split(struct vring_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}
static int
desc_payload_to_iovs(struct vhost_blk_ctrlr *ctrlr, struct iovec *iovs,
		     uint32_t *iov_index, uintptr_t payload, uint64_t remaining)
{
	void *vva;
	uint64_t len;

	do {
		if (*iov_index >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "VHOST_BLK_MAX_IOVS reached\n");
			return -1;
		}
		len = remaining;
		vva = (void *)(uintptr_t)gpa_to_vva(ctrlr, payload, &len);
		if (!vva || !len) {
			fprintf(stderr, "failed to translate desc address.\n");
			return -1;
		}

		iovs[*iov_index].iov_base = vva;
		iovs[*iov_index].iov_len = len;
		payload += len;
		remaining -= len;
		(*iov_index)++;
	} while (remaining);

	return 0;
}
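/*
 * A single descriptor covers a guest-physically contiguous buffer, but that
 * range may be split across several host mappings. That is why the loop
 * above may emit more than one iovec per descriptor: gpa_to_vva() shrinks
 * *len to the size of the current contiguous mapping and the remainder is
 * translated on the next iteration.
 */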
static struct vring_desc *
vring_get_next_desc(struct vhost_blk_queue *vq, struct vring_desc *desc)
{
	if (descriptor_has_next_split(desc))
		return &vq->vring.desc[desc->next];

	return NULL;
}

static struct vring_packed_desc *
vring_get_next_desc_packed(struct vhost_blk_queue *vq, uint16_t *req_idx)
{
	if (descriptor_has_next_packed(&vq->vring.desc_packed[*req_idx])) {
		*req_idx = (*req_idx + 1) % vq->vring.size;
		return &vq->vring.desc_packed[*req_idx];
	}

	return NULL;
}

static struct rte_vhost_inflight_desc_packed *
vring_get_next_inflight_desc(struct vhost_blk_queue *vq,
		struct rte_vhost_inflight_desc_packed *desc)
{
	if (!!(desc->flags & VRING_DESC_F_NEXT))
		return &vq->inflight_ring.inflight_packed->desc[desc->next];

	return NULL;
}
static int
setup_iovs_from_descs_split(struct vhost_blk_ctrlr *ctrlr,
		struct vhost_blk_queue *vq, uint16_t req_idx,
		struct iovec *iovs, uint32_t *iovs_idx,
		uint32_t *payload)
{
	struct vring_desc *desc = &vq->vring.desc[req_idx];

	do {
		/* does not support indirect descriptors */
		assert((desc->flags & VRING_DESC_F_INDIRECT) == 0);

		if (*iovs_idx >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "Reached VHOST_BLK_MAX_IOVS\n");
			return -1;
		}

		if (desc_payload_to_iovs(ctrlr, iovs, iovs_idx,
			desc->addr, desc->len) != 0) {
			fprintf(stderr, "Failed to convert desc payload to iovs\n");
			return -1;
		}

		*payload += desc->len;

		desc = vring_get_next_desc(vq, desc);
	} while (desc != NULL);

	return 0;
}
static int
setup_iovs_from_descs_packed(struct vhost_blk_ctrlr *ctrlr,
		struct vhost_blk_queue *vq, uint16_t req_idx,
		struct iovec *iovs, uint32_t *iovs_idx,
		uint32_t *payload)
{
	struct vring_packed_desc *desc = &vq->vring.desc_packed[req_idx];

	do {
		/* does not support indirect descriptors */
		assert((desc->flags & VRING_DESC_F_INDIRECT) == 0);

		if (*iovs_idx >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "Reached VHOST_BLK_MAX_IOVS\n");
			return -1;
		}

		if (desc_payload_to_iovs(ctrlr, iovs, iovs_idx,
			desc->addr, desc->len) != 0) {
			fprintf(stderr, "Failed to convert desc payload to iovs\n");
			return -1;
		}

		*payload += desc->len;

		desc = vring_get_next_desc_packed(vq, &req_idx);
	} while (desc != NULL);

	return 0;
}
static int
setup_iovs_from_inflight_desc(struct vhost_blk_ctrlr *ctrlr,
		struct vhost_blk_queue *vq, uint16_t req_idx,
		struct iovec *iovs, uint32_t *iovs_idx,
		uint32_t *payload)
{
	struct rte_vhost_ring_inflight *inflight_vq;
	struct rte_vhost_inflight_desc_packed *desc;

	inflight_vq = &vq->inflight_ring;
	desc = &inflight_vq->inflight_packed->desc[req_idx];

	do {
		/* does not support indirect descriptors */
		assert((desc->flags & VRING_DESC_F_INDIRECT) == 0);

		if (*iovs_idx >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "Reached VHOST_BLK_MAX_IOVS\n");
			return -1;
		}

		if (desc_payload_to_iovs(ctrlr, iovs, iovs_idx,
			desc->addr, desc->len) != 0) {
			fprintf(stderr, "Failed to convert desc payload to iovs\n");
			return -1;
		}

		*payload += desc->len;

		desc = vring_get_next_inflight_desc(vq, desc);
	} while (desc != NULL);

	return 0;
}
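/*
 * Layout of a virtio-blk request as gathered by the helpers above (virtio
 * 1.1 spec, Block Device / Device Operation): the first descriptor holds a
 * struct virtio_blk_outhdr { type; ioprio; sector; }, the middle descriptors
 * hold the data buffers (absent for a flush), and the last descriptor is a
 * single status byte written by the device (VIRTIO_BLK_S_OK /
 * VIRTIO_BLK_S_IOERR / VIRTIO_BLK_S_UNSUPP). process_blk_task() below
 * depends on exactly this ordering.
 */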
static void
process_blk_task(struct vhost_blk_task *task)
{
	uint32_t payload = 0;

	if (task->vq->packed_ring) {
		struct rte_vhost_ring_inflight *inflight_ring;
		struct rte_vhost_resubmit_info *resubmit_inflight;

		inflight_ring = &task->vq->inflight_ring;
		resubmit_inflight = inflight_ring->resubmit_inflight;

		if (resubmit_inflight != NULL &&
		    resubmit_inflight->resubmit_list != NULL) {
			if (setup_iovs_from_inflight_desc(task->ctrlr, task->vq,
				task->req_idx, task->iovs, &task->iovs_cnt,
				&payload)) {
				fprintf(stderr, "Failed to setup iovs\n");
				return;
			}
		} else {
			if (setup_iovs_from_descs_packed(task->ctrlr, task->vq,
				task->req_idx, task->iovs, &task->iovs_cnt,
				&payload)) {
				fprintf(stderr, "Failed to setup iovs\n");
				return;
			}
		}
	} else {
		if (setup_iovs_from_descs_split(task->ctrlr, task->vq,
			task->req_idx, task->iovs, &task->iovs_cnt, &payload)) {
			fprintf(stderr, "Failed to setup iovs\n");
			return;
		}
	}

	/* First IOV must be the request header. */
	task->req = (struct virtio_blk_outhdr *)task->iovs[0].iov_base;
	assert(sizeof(*task->req) == task->iovs[0].iov_len);

	/* Last IOV must be the status byte. */
	task->status = (uint8_t *)task->iovs[task->iovs_cnt - 1].iov_base;
	assert(sizeof(*task->status) == task->iovs[task->iovs_cnt - 1].iov_len);

	/* Payload length excluding the request header and the status byte. */
	task->data_len = payload - task->iovs[0].iov_len -
		task->iovs[task->iovs_cnt - 1].iov_len;

	if (vhost_bdev_process_blk_commands(task->ctrlr->bdev, task))
		/* request failed or unsupported */
		*task->status = VIRTIO_BLK_S_IOERR;
	else
		*task->status = VIRTIO_BLK_S_OK;

	if (task->vq->packed_ring)
		enqueue_task_packed(task);
	else
		enqueue_task(task);
}
static void
blk_task_init(struct vhost_blk_task *task)
{
	/* reset the per-request state before reusing the task slot */
	task->iovs_cnt = 0;
	task->data_len = 0;
	task->req = NULL;
	task->status = NULL;
}
static void
submit_inflight_vq(struct vhost_blk_queue *vq)
{
	struct rte_vhost_ring_inflight *inflight_ring;
	struct rte_vhost_resubmit_info *resubmit_inflight;
	struct vhost_blk_task *task;

	inflight_ring = &vq->inflight_ring;
	resubmit_inflight = inflight_ring->resubmit_inflight;

	if (resubmit_inflight == NULL ||
	    resubmit_inflight->resubmit_num == 0)
		return;

	fprintf(stdout, "Resubmit inflight num is %d\n",
		resubmit_inflight->resubmit_num);

	while (resubmit_inflight->resubmit_num-- > 0) {
		uint16_t desc_idx;

		desc_idx = resubmit_inflight->resubmit_list[
			resubmit_inflight->resubmit_num].index;

		if (vq->packed_ring) {
			uint16_t task_idx;
			struct rte_vhost_inflight_desc_packed *desc;

			desc = inflight_ring->inflight_packed->desc;
			task_idx = desc[desc[desc_idx].last].id;
			task = &vq->tasks[task_idx];

			task->req_idx = desc_idx;
			task->chain_num = desc[desc_idx].num;
			task->buffer_id = task_idx;
			task->inflight_idx = desc_idx;

			vq->last_avail_idx += desc[desc_idx].num;
			if (vq->last_avail_idx >= vq->vring.size) {
				vq->last_avail_idx -= vq->vring.size;
				vq->avail_wrap_counter =
					!vq->avail_wrap_counter;
			}
		} else
			/* In split ring, the desc_idx is the req_id
			 * that was initialized when the task pool was
			 * allocated.
			 */
			task = &vq->tasks[desc_idx];

		blk_task_init(task);
		process_blk_task(task);
	}

	free(resubmit_inflight->resubmit_list);
	resubmit_inflight->resubmit_list = NULL;
}
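/*
 * The resubmit list is produced by the vhost library when the guest
 * reconnects: rte_vhost_get_vhost_ring_inflight() hands back the shared
 * inflight log, and any descriptor that was fetched but never marked
 * completed before the previous backend instance went away ends up in
 * resubmit_inflight->resubmit_list. Replaying those requests before polling
 * for new ones keeps the device state consistent across a backend restart.
 */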
/* Use the buffer_id as the task_idx */
static uint16_t
vhost_blk_vq_get_desc_chain_buffer_id(struct vhost_blk_queue *vq,
		uint16_t *req_head, uint16_t *num)
{
	struct vring_packed_desc *desc = &vq->vring.desc_packed[
		vq->last_avail_idx];

	*req_head = vq->last_avail_idx;
	*num = 1;

	while (descriptor_has_next_packed(desc)) {
		vq->last_avail_idx = (vq->last_avail_idx + 1) % vq->vring.size;
		desc = &vq->vring.desc_packed[vq->last_avail_idx];
		*num += 1;
	}

	/* Point to the next desc */
	vq->last_avail_idx = (vq->last_avail_idx + 1) % vq->vring.size;
	if (vq->last_avail_idx < *req_head)
		vq->avail_wrap_counter = !vq->avail_wrap_counter;

	return desc->id;
}
static uint16_t
vq_get_desc_idx(struct vhost_blk_queue *vq)
{
	uint16_t desc_idx;
	uint16_t last_avail_idx;

	last_avail_idx = vq->last_avail_idx & (vq->vring.size - 1);
	desc_idx = vq->vring.avail->ring[last_avail_idx];
	vq->last_avail_idx++;

	return desc_idx;
}
static bool
vhost_blk_vq_is_avail(struct vhost_blk_queue *vq)
{
	if (vq->packed_ring) {
		uint16_t flags = vq->vring.desc_packed[
			vq->last_avail_idx].flags;
		bool avail_wrap_counter = vq->avail_wrap_counter;

		return (!!(flags & VIRTQ_DESC_F_AVAIL) == avail_wrap_counter &&
			!!(flags & VIRTQ_DESC_F_USED) != avail_wrap_counter);
	}

	return vq->vring.avail->idx != vq->last_avail_idx;
}
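/*
 * Worked example of the packed-ring check above: while the driver's and our
 * avail_wrap_counter are both 1, a freshly published descriptor has AVAIL=1
 * and USED=0, so the test succeeds; once we complete it we set AVAIL=USED=1,
 * and the same test fails until the driver reuses the slot on its next lap
 * with the bits inverted. The split-ring case only needs to compare the
 * driver's avail->idx with our last_avail_idx.
 */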
static void
process_vq(struct vhost_blk_queue *vq)
{
	struct vhost_blk_task *task;

	if (vq->packed_ring) {
		while (vhost_blk_vq_is_avail(vq)) {
			uint16_t task_idx, req_idx, last_idx, chain_num;

			task_idx = vhost_blk_vq_get_desc_chain_buffer_id(vq,
				&req_idx, &chain_num);
			task = &vq->tasks[task_idx];

			blk_task_init(task);
			task->req_idx = req_idx;
			task->chain_num = chain_num;
			task->buffer_id = task_idx;
			last_idx = (req_idx + chain_num - 1) % vq->vring.size;

			rte_vhost_set_inflight_desc_packed(task->ctrlr->vid,
				vq->id, req_idx, last_idx,
				&task->inflight_idx);

			process_blk_task(task);
		}
	} else {
		while (vhost_blk_vq_is_avail(vq)) {
			uint16_t desc_idx;

			desc_idx = vq_get_desc_idx(vq);
			task = &vq->tasks[desc_idx];

			blk_task_init(task);
			rte_vhost_set_inflight_desc_split(task->ctrlr->vid,
				vq->id, desc_idx);
			process_blk_task(task);
		}
	}
}
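/*
 * Ordering note: rte_vhost_set_inflight_desc_split()/_packed() are called
 * before process_blk_task(), so the request is logged as inflight before the
 * backend touches it, and it is cleared again in enqueue_task()/
 * enqueue_task_packed() only after the completion is visible in the ring.
 * This is what makes the resubmit path in submit_inflight_vq() safe.
 */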
static void *
ctrlr_worker(void *arg)
{
	struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg;
	cpu_set_t cpuset;
	pthread_t thread;
	int i;

	fprintf(stdout, "Ctrlr Worker Thread start\n");

	if (ctrlr == NULL || ctrlr->bdev == NULL) {
		fprintf(stderr,
			"%s: Error, invalid argument passed to worker thread\n",
			__func__);
		exit(EXIT_FAILURE);
	}

	/* pin the polling thread to a single core */
	thread = pthread_self();
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++)
		submit_inflight_vq(&ctrlr->queues[i]);

	while (worker_thread_status != WORKER_STATE_STOP)
		for (i = 0; i < NUM_OF_BLK_QUEUES; i++)
			process_vq(&ctrlr->queues[i]);

	fprintf(stdout, "Ctrlr Worker Thread Exiting\n");
	sem_post(&exit_sem);
	return NULL;
}
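/*
 * The worker is a pure busy-poll loop: it never sleeps while the device is
 * live, trading one fully loaded CPU core for low request latency. Guest
 * notifications are disabled in new_device() below, so polling is the only
 * way new descriptors are discovered.
 */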
static int
alloc_task_pool(struct vhost_blk_ctrlr *ctrlr)
{
	struct vhost_blk_queue *vq;
	uint32_t i, j;

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		vq = &ctrlr->queues[i];

		vq->tasks = rte_zmalloc(NULL,
			sizeof(struct vhost_blk_task) * vq->vring.size, 0);
		if (!vq->tasks) {
			fprintf(stderr, "Failed to allocate task memory\n");
			return -1;
		}

		for (j = 0; j < vq->vring.size; j++) {
			vq->tasks[j].req_idx = j;
			vq->tasks[j].ctrlr = ctrlr;
			vq->tasks[j].vq = vq;
		}
	}

	return 0;
}
static void
free_task_pool(struct vhost_blk_ctrlr *ctrlr)
{
	uint32_t i;

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++)
		rte_free(ctrlr->queues[i].tasks);
}
static int
new_device(int vid)
{
	struct vhost_blk_ctrlr *ctrlr;
	struct vhost_blk_queue *vq;
	char path[PATH_MAX];
	uint64_t features;
	pthread_t tid;
	int i, ret;
	bool packed_ring;

	ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
	if (ret) {
		fprintf(stderr, "Failed to get the socket path\n");
		return -1;
	}

	ctrlr = vhost_blk_ctrlr_find(path);
	if (!ctrlr) {
		fprintf(stderr, "Failed to find controller\n");
		return -1;
	}

	if (ctrlr->started)
		return 0;

	ctrlr->vid = vid;
	ret = rte_vhost_get_negotiated_features(vid, &features);
	if (ret) {
		fprintf(stderr, "Failed to get the negotiated features\n");
		return -1;
	}
	packed_ring = !!(features & (1ULL << VIRTIO_F_RING_PACKED));

	/* Disable notifications and init the last indexes */
	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		vq = &ctrlr->queues[i];
		vq->id = i;

		assert(rte_vhost_get_vhost_vring(ctrlr->vid, i,
						 &vq->vring) == 0);
		assert(rte_vhost_get_vring_base(ctrlr->vid, i,
						&vq->last_avail_idx,
						&vq->last_used_idx) == 0);
		assert(rte_vhost_get_vhost_ring_inflight(ctrlr->vid, i,
						&vq->inflight_ring) == 0);

		if (packed_ring) {
			/* for the reconnection */
			assert(rte_vhost_get_vring_base_from_inflight(
				ctrlr->vid, i,
				&vq->last_avail_idx,
				&vq->last_used_idx) == 0);

			vq->avail_wrap_counter = vq->last_avail_idx &
				(1 << 15);
			vq->last_avail_idx = vq->last_avail_idx &
				0x7fff;
			vq->used_wrap_counter = vq->last_used_idx &
				(1 << 15);
			vq->last_used_idx = vq->last_used_idx &
				0x7fff;
		}

		vq->packed_ring = packed_ring;
		rte_vhost_enable_guest_notification(vid, i, 0);
	}
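	/*
	 * When the packed-ring state is recovered from the inflight area, the
	 * most significant bit of the saved index carries the wrap counter,
	 * which is why the loop above splits last_avail_idx/last_used_idx
	 * into a 15-bit index (& 0x7fff) and a wrap-counter bit (& (1 << 15)).
	 * destroy_device() packs them back the same way before handing the
	 * state to the vhost library.
	 */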
	assert(rte_vhost_get_mem_table(vid, &ctrlr->mem) == 0);
	assert(ctrlr->mem != NULL);
	assert(alloc_task_pool(ctrlr) == 0);

	/* start polling the vrings */
	worker_thread_status = WORKER_STATE_START;
	fprintf(stdout, "New Device %s, Device ID %d\n", path, vid);
	if (pthread_create(&tid, NULL, &ctrlr_worker, ctrlr) < 0) {
		fprintf(stderr, "Failed to start the worker thread\n");
		return -1;
	}

	/* device has been started */
	ctrlr->started = 1;
	pthread_detach(tid);

	return 0;
}
static void
destroy_device(int vid)
{
	char path[PATH_MAX];
	struct vhost_blk_ctrlr *ctrlr;
	struct vhost_blk_queue *vq;
	int i, ret;

	ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
	if (ret) {
		fprintf(stderr, "Destroy Ctrlr Failed\n");
		return;
	}

	fprintf(stdout, "Destroy %s Device ID %d\n", path, vid);
	ctrlr = vhost_blk_ctrlr_find(path);
	if (!ctrlr) {
		fprintf(stderr, "Destroy Ctrlr Failed\n");
		return;
	}

	if (!ctrlr->started)
		return;

	worker_thread_status = WORKER_STATE_STOP;
	sem_wait(&exit_sem);

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		vq = &ctrlr->queues[i];
		if (vq->packed_ring) {
			vq->last_avail_idx |= (vq->avail_wrap_counter <<
				15);
			vq->last_used_idx |= (vq->used_wrap_counter <<
				15);
		}

		rte_vhost_set_vring_base(ctrlr->vid, i,
					 vq->last_avail_idx,
					 vq->last_used_idx);
	}

	free_task_pool(ctrlr);
	ctrlr->started = 0;
}
static int
new_connection(int vid)
{
	/* extend the proper features for block device */
	vhost_session_install_rte_compat_hooks(vid);

	return 0;
}

struct vhost_device_ops vhost_blk_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
	.new_connection = new_connection,
};
static struct vhost_block_dev *
vhost_blk_bdev_construct(const char *bdev_name,
	const char *bdev_serial, uint32_t blk_size, uint64_t blk_cnt,
	bool wce_enable)
{
	struct vhost_block_dev *bdev;

	bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE);
	if (!bdev)
		return NULL;

	snprintf(bdev->name, sizeof(bdev->name), "%s", bdev_name);
	snprintf(bdev->product_name, sizeof(bdev->product_name), "%s",
		 bdev_serial);
	bdev->blocklen = blk_size;
	bdev->blockcnt = blk_cnt;
	bdev->write_cache = wce_enable;

	fprintf(stdout, "Blocklen=%"PRIu32", blockcnt=%"PRIu64"\n",
		bdev->blocklen, bdev->blockcnt);

	/* use memory as disk storage space */
	bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0);
	if (!bdev->data) {
		fprintf(stderr, "Not enough reserved hugepage memory for the disk\n");
		rte_free(bdev);
		return NULL;
	}

	return bdev;
}
static struct vhost_blk_ctrlr *
vhost_blk_ctrlr_construct(const char *ctrlr_name)
{
	int ret;
	struct vhost_blk_ctrlr *ctrlr;
	char *path;
	char cwd[PATH_MAX];

	/* always use the current directory */
	path = getcwd(cwd, PATH_MAX);
	if (!path) {
		fprintf(stderr, "Cannot get current working directory\n");
		return NULL;
	}
	snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name);

	unlink(dev_pathname);

	if (rte_vhost_driver_register(dev_pathname, 0) != 0) {
		fprintf(stderr, "Failed to register vhost socket %s\n",
			dev_pathname);
		return NULL;
	}

	ret = rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES);
	if (ret != 0) {
		fprintf(stderr, "Failed to set vhost driver features\n");
		rte_vhost_driver_unregister(dev_pathname);
		return NULL;
	}

	/* set vhost user protocol features */
	vhost_dev_install_rte_compat_hooks(dev_pathname);

	ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE);
	if (!ctrlr) {
		rte_vhost_driver_unregister(dev_pathname);
		return NULL;
	}

	/* hardcoded block device information with 128MiB (4 KiB x 32768) */
	ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0",
						4096, 32768, 0);
	if (!ctrlr->bdev) {
		fprintf(stderr, "Failed to construct the vhost block device\n");
		rte_vhost_driver_unregister(dev_pathname);
		rte_free(ctrlr);
		return NULL;
	}

	rte_vhost_driver_callback_register(dev_pathname,
					   &vhost_blk_device_ops);

	return ctrlr;
}
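/*
 * For illustration only (not part of the original example): once the socket
 * is registered and rte_vhost_driver_start() has been called, a guest can
 * attach to this backend with QEMU's vhost-user-blk device, along the
 * lines of:
 *
 *   qemu-system-x86_64 ... \
 *     -object memory-backend-file,id=mem0,size=1G,mem-path=/dev/hugepages,share=on \
 *     -numa node,memdev=mem0 \
 *     -chardev socket,id=char0,path=<cwd>/vhost.socket \
 *     -device vhost-user-blk-pci,chardev=char0,num-queues=1
 *
 * Shared hugepage-backed guest memory is required so this backend can map
 * the guest's buffers.
 */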
static void
vhost_blk_ctrlr_destroy(struct vhost_blk_ctrlr *ctrlr)
{
	if (ctrlr->bdev != NULL) {
		if (ctrlr->bdev->data != NULL)
			rte_free(ctrlr->bdev->data);

		rte_free(ctrlr->bdev);
		ctrlr->bdev = NULL;
	}
	rte_free(ctrlr);

	rte_vhost_driver_unregister(dev_pathname);
}
static void
signal_handler(__rte_unused int signum)
{
	struct vhost_blk_ctrlr *ctrlr;

	ctrlr = vhost_blk_ctrlr_find(dev_pathname);
	if (ctrlr == NULL)
		return;

	if (ctrlr->started)
		destroy_device(ctrlr->vid);

	vhost_blk_ctrlr_destroy(ctrlr);
	exit(0);
}
int main(int argc, char *argv[])
{
	int ret;

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

	g_vhost_ctrlr = vhost_blk_ctrlr_construct(CTRLR_NAME);
	if (g_vhost_ctrlr == NULL) {
		fprintf(stderr, "Failed to construct the vhost-blk controller\n");
		return -1;
	}

	if (sem_init(&exit_sem, 0, 0) < 0) {
		fprintf(stderr, "Failed to initialize exit_sem\n");
		return -1;
	}

	signal(SIGINT, signal_handler);

	rte_vhost_driver_start(dev_pathname);

	/* loop until the application is terminated by SIGINT */
	while (1)
		sleep(1);

	return 0;
}
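/*
 * Typical invocation (illustrative; the binary name depends on how the
 * example is built, e.g. "dpdk-vhost_blk" with the meson build):
 *
 *   ./dpdk-vhost_blk -l 0-1 -m 1024
 *
 * The application then creates ./vhost.socket in the current directory and
 * waits for a vhost-user master (such as QEMU, see the note after
 * vhost_blk_ctrlr_construct()) to connect. Ctrl-C triggers signal_handler(),
 * which tears the device down and exits.
 */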