/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2019 Intel Corporation
 */
#include <assert.h>
#include <limits.h>
#include <pthread.h>
#include <semaphore.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/virtio_blk.h>
#include <linux/virtio_ring.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_malloc.h>
#include <rte_vhost.h>

#include "vhost_blk.h"
#define VIRTQ_DESC_F_NEXT	1
#define VIRTQ_DESC_F_AVAIL	(1 << 7)
#define VIRTQ_DESC_F_USED	(1 << 15)
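/*
 * Packed-ring descriptor flag convention (VIRTIO 1.1): a descriptor is
 * available to the device when its AVAIL bit equals the current wrap
 * counter and its USED bit does not; the device marks it used by setting
 * both bits equal to its own wrap counter.  vhost_blk_vq_is_avail() and
 * enqueue_task_packed() below implement the two halves of this protocol.
 */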
#define VHOST_BLK_FEATURES ((1ULL << VIRTIO_F_RING_PACKED) | \
			    (1ULL << VIRTIO_F_VERSION_1) | \
			    (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
			    (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))

#define CTRLR_NAME "vhost.socket"
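/*
 * The vhost-user socket is created as "<cwd>/vhost.socket" (see
 * vhost_blk_ctrlr_construct()).  As a rough, illustrative example only, a
 * guest could be attached with something like:
 *   qemu-system-x86_64 ... \
 *     -object memory-backend-file,id=mem0,size=1G,mem-path=/dev/hugepages,share=on \
 *     -numa node,memdev=mem0 \
 *     -chardev socket,id=char0,path=./vhost.socket \
 *     -device vhost-user-blk-pci,chardev=char0
 * The exact QEMU options depend on the QEMU version and machine setup.
 */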
enum CTRLR_WORKER_STATUS {
	WORKER_STATE_START = 0,
	WORKER_STATE_STOP,
};
struct vhost_blk_ctrlr *g_vhost_ctrlr;

/* Path of the vhost-user socket file. The directory can be set by the user. */
static char dev_pathname[PATH_MAX] = "";
static sem_t exit_sem;
static enum CTRLR_WORKER_STATUS worker_thread_status;
struct vhost_blk_ctrlr *
vhost_blk_ctrlr_find(const char *ctrlr_name)
	if (ctrlr_name == NULL)

	/* currently we only support 1 socket file fd */
gpa_to_vva(struct vhost_blk_ctrlr *ctrlr, uint64_t gpa, uint64_t *len)
	assert(ctrlr->mem != NULL);

	return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len);
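/*
 * Note: rte_vhost_va_from_guest_pa() updates *len to the number of bytes
 * that are contiguously mapped starting at gpa, which may be less than the
 * caller asked for; desc_payload_to_iovs() below may therefore have to
 * split a single descriptor payload across several iovecs.
 */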
enqueue_task(struct vhost_blk_task *task)
	struct vhost_blk_queue *vq = task->vq;
	struct vring_used *used = vq->vring.used;

	rte_vhost_set_last_inflight_io_split(task->ctrlr->vid,
					     vq->id, task->req_idx);
	/* Fill out the next entry in the "used" ring.  id = the
	 * index of the descriptor that contained the blk request.
	 * len = the total amount of data transferred for the blk
	 * request.  We must report the correct len, for variable
	 * length blk requests, where we may return less data than
	 * was allocated by the guest VM.
	 */
	used->ring[used->idx & (vq->vring.size - 1)].id = task->req_idx;
	used->ring[used->idx & (vq->vring.size - 1)].len = task->data_len;
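	/*
	 * Publish the entry: used->idx is a free-running counter, so it is
	 * masked with (size - 1) above (split virtqueues are always a power
	 * of two in size).  The fences below make sure the id/len words are
	 * visible to the guest before the new used index is.
	 */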
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
	used->idx++;
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
	rte_vhost_clr_inflight_desc_split(task->ctrlr->vid,
					  vq->id, used->idx, task->req_idx);
	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->ctrlr->vid, vq->id);
enqueue_task_packed(struct vhost_blk_task *task)
	struct vhost_blk_queue *vq = task->vq;
	struct vring_packed_desc *desc;

	rte_vhost_set_last_inflight_io_packed(task->ctrlr->vid, vq->id,
					      task->inflight_idx);

	desc = &vq->vring.desc_packed[vq->last_used_idx];
	desc->id = task->buffer_id;

	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
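	/*
	 * The AVAIL/USED bits are flipped only after the rest of the
	 * descriptor has been written (fence above), and a second fence
	 * follows before the inflight entry is cleared: the guest considers
	 * the descriptor used as soon as both bits match the device's wrap
	 * counter, so the flag update must be the last store it can observe.
	 */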
	if (vq->used_wrap_counter)
		desc->flags |= VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED;
	else
		desc->flags &= ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED);
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
	rte_vhost_clr_inflight_desc_packed(task->ctrlr->vid, vq->id,
					   task->inflight_idx);

	vq->last_used_idx += task->chain_num;
	if (vq->last_used_idx >= vq->vring.size) {
		vq->last_used_idx -= vq->vring.size;
		vq->used_wrap_counter = !vq->used_wrap_counter;
	}
	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->ctrlr->vid, vq->id);
descriptor_has_next_packed(struct vring_packed_desc *cur_desc)
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);

descriptor_has_next_split(struct vring_desc *cur_desc)
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);
desc_payload_to_iovs(struct vhost_blk_ctrlr *ctrlr, struct iovec *iovs,
		     uint32_t *iov_index, uintptr_t payload, uint64_t remaining)
	if (*iov_index >= VHOST_BLK_MAX_IOVS) {
		fprintf(stderr, "VHOST_BLK_MAX_IOVS reached\n");
	}

	vva = (void *)(uintptr_t)gpa_to_vva(ctrlr,
					    payload, &len);
		fprintf(stderr, "failed to translate desc address.\n");

	iovs[*iov_index].iov_base = vva;
	iovs[*iov_index].iov_len = len;
static struct vring_desc *
vring_get_next_desc(struct vhost_blk_queue *vq, struct vring_desc *desc)
	if (descriptor_has_next_split(desc))
		return &vq->vring.desc[desc->next];
static struct vring_packed_desc *
vring_get_next_desc_packed(struct vhost_blk_queue *vq, uint16_t *req_idx)
	if (descriptor_has_next_packed(&vq->vring.desc_packed[*req_idx])) {
		*req_idx = (*req_idx + 1) % vq->vring.size;
		return &vq->vring.desc_packed[*req_idx];
	}
static struct rte_vhost_inflight_desc_packed *
vring_get_next_inflight_desc(struct vhost_blk_queue *vq,
			     struct rte_vhost_inflight_desc_packed *desc)
	if (!!(desc->flags & VRING_DESC_F_NEXT))
		return &vq->inflight_ring.inflight_packed->desc[desc->next];
setup_iovs_from_descs_split(struct vhost_blk_ctrlr *ctrlr,
			    struct vhost_blk_queue *vq, uint16_t req_idx,
			    struct iovec *iovs, uint32_t *iovs_idx,
			    uint32_t *payload)
	struct vring_desc *desc = &vq->vring.desc[req_idx];

	do {
		/* does not support indirect descriptors */
		assert((desc->flags & VRING_DESC_F_INDIRECT) == 0);
		if (*iovs_idx >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "Reached VHOST_BLK_MAX_IOVS\n");
		}

		if (desc_payload_to_iovs(ctrlr, iovs, iovs_idx,
					 desc->addr, desc->len) != 0) {
			fprintf(stderr, "Failed to convert desc payload to iovs\n");
		}

		*payload += desc->len;
		desc = vring_get_next_desc(vq, desc);
	} while (desc != NULL);
setup_iovs_from_descs_packed(struct vhost_blk_ctrlr *ctrlr,
			     struct vhost_blk_queue *vq, uint16_t req_idx,
			     struct iovec *iovs, uint32_t *iovs_idx,
			     uint32_t *payload)
	struct vring_packed_desc *desc = &vq->vring.desc_packed[req_idx];

	do {
		/* does not support indirect descriptors */
		assert((desc->flags & VRING_DESC_F_INDIRECT) == 0);
		if (*iovs_idx >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "Reached VHOST_BLK_MAX_IOVS\n");
		}

		if (desc_payload_to_iovs(ctrlr, iovs, iovs_idx,
					 desc->addr, desc->len) != 0) {
			fprintf(stderr, "Failed to convert desc payload to iovs\n");
		}

		*payload += desc->len;
		desc = vring_get_next_desc_packed(vq, &req_idx);
	} while (desc != NULL);
setup_iovs_from_inflight_desc(struct vhost_blk_ctrlr *ctrlr,
			      struct vhost_blk_queue *vq, uint16_t req_idx,
			      struct iovec *iovs, uint32_t *iovs_idx,
			      uint32_t *payload)
	struct rte_vhost_ring_inflight *inflight_vq;
	struct rte_vhost_inflight_desc_packed *desc;

	inflight_vq = &vq->inflight_ring;
	desc = &inflight_vq->inflight_packed->desc[req_idx];

	do {
		/* does not support indirect descriptors */
		assert((desc->flags & VRING_DESC_F_INDIRECT) == 0);
		if (*iovs_idx >= VHOST_BLK_MAX_IOVS) {
			fprintf(stderr, "Reached VHOST_BLK_MAX_IOVS\n");
		}

		if (desc_payload_to_iovs(ctrlr, iovs, iovs_idx,
					 desc->addr, desc->len) != 0) {
			fprintf(stderr, "Failed to convert desc payload to iovs\n");
		}

		*payload += desc->len;
		desc = vring_get_next_inflight_desc(vq, desc);
	} while (desc != NULL);
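/*
 * A virtio-blk request, once gathered into iovecs, looks roughly like:
 *   iovs[0]           struct virtio_blk_outhdr (type, ioprio, sector)
 *   iovs[1..cnt-2]    data buffers for the read/write payload
 *   iovs[cnt-1]       one status byte written back by the device
 * process_blk_task() below relies on this layout when it picks the request
 * header out of the first iovec and the status byte out of the last one.
 */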
process_blk_task(struct vhost_blk_task *task)
	uint32_t payload = 0;

	if (task->vq->packed_ring) {
		struct rte_vhost_ring_inflight *inflight_ring;
		struct rte_vhost_resubmit_info *resubmit_inflight;

		inflight_ring = &task->vq->inflight_ring;
		resubmit_inflight = inflight_ring->resubmit_inflight;

		if (resubmit_inflight != NULL &&
		    resubmit_inflight->resubmit_list != NULL) {
			if (setup_iovs_from_inflight_desc(task->ctrlr, task->vq,
					task->req_idx, task->iovs, &task->iovs_cnt,
					&payload)) {
				fprintf(stderr, "Failed to setup iovs\n");
			}
		} else {
			if (setup_iovs_from_descs_packed(task->ctrlr, task->vq,
					task->req_idx, task->iovs, &task->iovs_cnt,
					&payload)) {
				fprintf(stderr, "Failed to setup iovs\n");
			}
		}
	} else {
		if (setup_iovs_from_descs_split(task->ctrlr, task->vq,
				task->req_idx, task->iovs, &task->iovs_cnt, &payload)) {
			fprintf(stderr, "Failed to setup iovs\n");
		}
	}
	/* First IOV must be the req head. */
	task->req = (struct virtio_blk_outhdr *)task->iovs[0].iov_base;
	assert(sizeof(*task->req) == task->iovs[0].iov_len);

	/* Last IOV must be the status tail. */
	task->status = (uint8_t *)task->iovs[task->iovs_cnt - 1].iov_base;
	assert(sizeof(*task->status) == task->iovs[task->iovs_cnt - 1].iov_len);

	/* Transport data len */
	task->data_len = payload - task->iovs[0].iov_len -
			 task->iovs[task->iovs_cnt - 1].iov_len;

	if (vhost_bdev_process_blk_commands(task->ctrlr->bdev, task))
		/* invalid response */
		*task->status = VIRTIO_BLK_S_IOERR;
	else
		*task->status = VIRTIO_BLK_S_OK;

	if (task->vq->packed_ring)
		enqueue_task_packed(task);
	else
		enqueue_task(task);
blk_task_init(struct vhost_blk_task *task)

submit_inflight_vq(struct vhost_blk_queue *vq)
	struct rte_vhost_ring_inflight *inflight_ring;
	struct rte_vhost_resubmit_info *resubmit_inflight;
	struct vhost_blk_task *task;

	inflight_ring = &vq->inflight_ring;
	resubmit_inflight = inflight_ring->resubmit_inflight;

	if (resubmit_inflight == NULL ||
	    resubmit_inflight->resubmit_num == 0)
		return;

	fprintf(stdout, "Resubmit inflight num is %d\n",
		resubmit_inflight->resubmit_num);

	while (resubmit_inflight->resubmit_num-- > 0) {
		desc_idx = resubmit_inflight->resubmit_list[
				resubmit_inflight->resubmit_num].index;
		if (vq->packed_ring) {
			struct rte_vhost_inflight_desc_packed *desc;

			desc = inflight_ring->inflight_packed->desc;
			task_idx = desc[desc[desc_idx].last].id;
			task = &vq->tasks[task_idx];

			task->req_idx = desc_idx;
			task->chain_num = desc[desc_idx].num;
			task->buffer_id = task_idx;
			task->inflight_idx = desc_idx;

			vq->last_avail_idx += desc[desc_idx].num;
			if (vq->last_avail_idx >= vq->vring.size) {
				vq->last_avail_idx -= vq->vring.size;
				vq->avail_wrap_counter =
					!vq->avail_wrap_counter;
			}
		} else {
			/* In split ring, the desc_idx is the req_id
			 * which was initialized when the task pool was
			 * allocated.
			 */
			task = &vq->tasks[desc_idx];
		}

		process_blk_task(task);
	}

	free(resubmit_inflight->resubmit_list);
	resubmit_inflight->resubmit_list = NULL;
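/*
 * Inflight resubmission: when VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is
 * negotiated, descriptors handed to the backend are logged in a shared
 * memory region owned by the frontend.  After a backend restart or
 * reconnection, the requests recorded there are replayed by
 * submit_inflight_vq() above before any new descriptors are polled, so no
 * I/O the guest already submitted is lost.
 */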
/* Use the buffer_id as the task_idx */
vhost_blk_vq_get_desc_chain_buffer_id(struct vhost_blk_queue *vq,
				      uint16_t *req_head, uint16_t *num)
	struct vring_packed_desc *desc = &vq->vring.desc_packed[
					vq->last_avail_idx];

	*req_head = vq->last_avail_idx;

	while (descriptor_has_next_packed(desc)) {
		vq->last_avail_idx = (vq->last_avail_idx + 1) % vq->vring.size;
		desc = &vq->vring.desc_packed[vq->last_avail_idx];
	}

	/* Point to next desc */
	vq->last_avail_idx = (vq->last_avail_idx + 1) % vq->vring.size;
	if (vq->last_avail_idx < *req_head)
		vq->avail_wrap_counter = !vq->avail_wrap_counter;
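/*
 * In a packed ring the buffer id is carried only by the last descriptor of
 * a chain, so the walk above stops on that descriptor; its id is what the
 * caller uses as the index into the per-queue task array.
 */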
vq_get_desc_idx(struct vhost_blk_queue *vq)
	uint16_t last_avail_idx;

	last_avail_idx = vq->last_avail_idx & (vq->vring.size - 1);
	desc_idx = vq->vring.avail->ring[last_avail_idx];
	vq->last_avail_idx++;
vhost_blk_vq_is_avail(struct vhost_blk_queue *vq)
	if (vq->packed_ring) {
		uint16_t flags = vq->vring.desc_packed[
				vq->last_avail_idx].flags;
		bool avail_wrap_counter = vq->avail_wrap_counter;

		return (!!(flags & VIRTQ_DESC_F_AVAIL) == avail_wrap_counter &&
			!!(flags & VIRTQ_DESC_F_USED) != avail_wrap_counter);
	}

	if (vq->vring.avail->idx != vq->last_avail_idx)
process_vq(struct vhost_blk_queue *vq)
	struct vhost_blk_task *task;

	if (vq->packed_ring) {
		while (vhost_blk_vq_is_avail(vq)) {
			uint16_t task_idx, req_idx, last_idx, chain_num;

			task_idx = vhost_blk_vq_get_desc_chain_buffer_id(vq,
					&req_idx, &chain_num);
			task = &vq->tasks[task_idx];

			task->req_idx = req_idx;
			task->chain_num = chain_num;
			task->buffer_id = task_idx;
			last_idx = (req_idx + chain_num - 1) % vq->vring.size;
			rte_vhost_set_inflight_desc_packed(task->ctrlr->vid,
					vq->id, req_idx, last_idx,
					&task->inflight_idx);

			process_blk_task(task);
		}
	} else {
		while (vhost_blk_vq_is_avail(vq)) {
			desc_idx = vq_get_desc_idx(vq);
			task = &vq->tasks[desc_idx];

			rte_vhost_set_inflight_desc_split(task->ctrlr->vid,
							  vq->id, desc_idx);

			process_blk_task(task);
		}
	}
ctrlr_worker(void *arg)
	struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg;

	fprintf(stdout, "Ctrlr Worker Thread started\n");

	if (ctrlr == NULL || ctrlr->bdev == NULL) {
		fprintf(stderr,
			"%s: Error, invalid argument passed to worker thread\n",
			__func__);
	}
	thread = pthread_self();
	pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
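	/*
	 * The worker pins itself to the cpuset prepared above, replays any
	 * I/O that was still in flight across a reconnection, and then
	 * busy-polls every queue until destroy_device() flips
	 * worker_thread_status to WORKER_STATE_STOP.
	 */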
	for (i = 0; i < NUM_OF_BLK_QUEUES; i++)
		submit_inflight_vq(&ctrlr->queues[i]);

	while (worker_thread_status != WORKER_STATE_STOP)
		for (i = 0; i < NUM_OF_BLK_QUEUES; i++)
			process_vq(&ctrlr->queues[i]);

	fprintf(stdout, "Ctrlr Worker Thread Exiting\n");
alloc_task_pool(struct vhost_blk_ctrlr *ctrlr)
	struct vhost_blk_queue *vq;

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		vq = &ctrlr->queues[i];

		vq->tasks = rte_zmalloc(NULL,
			sizeof(struct vhost_blk_task) * vq->vring.size, 0);
			fprintf(stderr, "Failed to allocate task memory\n");

		for (j = 0; j < vq->vring.size; j++) {
			vq->tasks[j].req_idx = j;
			vq->tasks[j].ctrlr = ctrlr;
			vq->tasks[j].vq = vq;
		}
	}
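/*
 * Pre-setting req_idx to the task's own index matters for the split ring:
 * submit_inflight_vq() looks tasks up directly by descriptor index on the
 * resubmission path and relies on this initialization.
 */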
free_task_pool(struct vhost_blk_ctrlr *ctrlr)
	for (i = 0; i < NUM_OF_BLK_QUEUES; i++)
		rte_free(ctrlr->queues[i].tasks);
static int
new_device(int vid)
	struct vhost_blk_ctrlr *ctrlr;
	struct vhost_blk_queue *vq;
	uint64_t features, protocol_features;
	bool packed_ring, inflight_shmfd;
	ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
		fprintf(stderr, "Failed to get the socket path\n");

	ctrlr = vhost_blk_ctrlr_find(path);
		fprintf(stderr, "Failed to find controller\n");

	ret = rte_vhost_get_negotiated_features(vid, &features);
		fprintf(stderr, "Failed to get the negotiated features\n");
	packed_ring = !!(features & (1ULL << VIRTIO_F_RING_PACKED));

	ret = rte_vhost_get_negotiated_protocol_features(
			vid, &protocol_features);
		fprintf(stderr,
			"Failed to get the negotiated protocol features\n");
	inflight_shmfd = !!(protocol_features &
			(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD));
	/* Disable Notifications and init last idx */
	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		vq = &ctrlr->queues[i];

		assert(rte_vhost_get_vhost_vring(ctrlr->vid, i,
						 &vq->vring) == 0);
		assert(rte_vhost_get_vring_base(ctrlr->vid, i,
						&vq->last_avail_idx,
						&vq->last_used_idx) == 0);
		assert(rte_vhost_get_vhost_ring_inflight(ctrlr->vid, i,
					&vq->inflight_ring) == 0);

		if (packed_ring && inflight_shmfd) {
			/* for the reconnection */
			assert(rte_vhost_get_vring_base_from_inflight(
					ctrlr->vid, i,
					&vq->last_avail_idx,
					&vq->last_used_idx) == 0);

			/* bit 15 of the saved index holds the wrap counter */
			vq->avail_wrap_counter = vq->last_avail_idx &
				(1 << 15);
			vq->last_avail_idx = vq->last_avail_idx &
				0x7fff;
			vq->used_wrap_counter = vq->last_used_idx &
				(1 << 15);
			vq->last_used_idx = vq->last_used_idx &
				0x7fff;
		}

		vq->packed_ring = packed_ring;
		rte_vhost_enable_guest_notification(vid, i, 0);
	}
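	/*
	 * Note: the worker thread created below busy-polls every queue, so
	 * guest notifications (kicks) are not needed and are switched off
	 * above.
	 */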
	assert(rte_vhost_get_mem_table(vid, &ctrlr->mem) == 0);
	assert(ctrlr->mem != NULL);
	assert(alloc_task_pool(ctrlr) == 0);

	/* start polling vring */
	worker_thread_status = WORKER_STATE_START;
	fprintf(stdout, "New Device %s, Device ID %d\n", path, vid);
	if (rte_ctrl_thread_create(&tid, "vhostblk-ctrlr", NULL,
				   &ctrlr_worker, ctrlr) != 0) {
		fprintf(stderr, "Failed to start the worker thread\n");
	}

	/* device has been started */
destroy_device(int vid)
	struct vhost_blk_ctrlr *ctrlr;
	struct vhost_blk_queue *vq;

	ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
		fprintf(stderr, "Destroy Ctrlr Failed\n");

	fprintf(stdout, "Destroy %s Device ID %d\n", path, vid);
	ctrlr = vhost_blk_ctrlr_find(path);
		fprintf(stderr, "Destroy Ctrlr Failed\n");

	worker_thread_status = WORKER_STATE_STOP;
	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		vq = &ctrlr->queues[i];
		if (vq->packed_ring) {
			vq->last_avail_idx |= (vq->avail_wrap_counter <<
				15);
			vq->last_used_idx |= (vq->used_wrap_counter <<
				15);
		}

		rte_vhost_set_vring_base(ctrlr->vid, i,
					 vq->last_avail_idx,
					 vq->last_used_idx);
	}

	free_task_pool(ctrlr);
new_connection(int vid)
	/* extend the proper features for block device */
	vhost_session_install_rte_compat_hooks(vid);

struct vhost_device_ops vhost_blk_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
	.new_connection = new_connection,
};
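/*
 * These callbacks are invoked by the vhost-user library: new_connection()
 * when a frontend opens the socket, new_device() once the vrings are set
 * up and the device is started, and destroy_device() when it is stopped.
 * They are registered below via rte_vhost_driver_callback_register().
 */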
static struct vhost_block_dev *
vhost_blk_bdev_construct(const char *bdev_name,
			 const char *bdev_serial, uint32_t blk_size, uint64_t blk_cnt,
			 bool wce_enable)
	struct vhost_block_dev *bdev;

	bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE);

	snprintf(bdev->name, sizeof(bdev->name), "%s", bdev_name);
	snprintf(bdev->product_name, sizeof(bdev->product_name), "%s",
		 bdev_serial);
	bdev->blocklen = blk_size;
	bdev->blockcnt = blk_cnt;
	bdev->write_cache = wce_enable;

	fprintf(stdout, "Blocklen=%d, blockcnt=%"PRIx64"\n", bdev->blocklen,
		bdev->blockcnt);

	/* use memory as disk storage space */
	bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0);
		fprintf(stderr, "Not enough reserved hugepage memory for the disk\n");
static struct vhost_blk_ctrlr *
vhost_blk_ctrlr_construct(const char *ctrlr_name)
	struct vhost_blk_ctrlr *ctrlr;

	/* always use current directory */
	path = getcwd(cwd, PATH_MAX);
		fprintf(stderr, "Cannot get current working directory\n");

	snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name);

	unlink(dev_pathname);

	if (rte_vhost_driver_register(dev_pathname, 0) != 0) {
		fprintf(stderr, "Socket %s already exists\n", dev_pathname);
	}
	ret = rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES);
		fprintf(stderr, "Failed to set vhost driver features\n");
		rte_vhost_driver_unregister(dev_pathname);

	/* set vhost user protocol features */
	vhost_dev_install_rte_compat_hooks(dev_pathname);

	ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE);
		rte_vhost_driver_unregister(dev_pathname);

	/* hardcoded block device information with 128MiB */
	ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0",
		rte_vhost_driver_unregister(dev_pathname);

	rte_vhost_driver_callback_register(dev_pathname,
					   &vhost_blk_device_ops);
vhost_blk_ctrlr_destroy(struct vhost_blk_ctrlr *ctrlr)
	if (ctrlr->bdev != NULL) {
		if (ctrlr->bdev->data != NULL)
			rte_free(ctrlr->bdev->data);

		rte_free(ctrlr->bdev);
	}

	rte_vhost_driver_unregister(dev_pathname);
signal_handler(__rte_unused int signum)
	struct vhost_blk_ctrlr *ctrlr;

	ctrlr = vhost_blk_ctrlr_find(dev_pathname);

	destroy_device(ctrlr->vid);

	vhost_blk_ctrlr_destroy(ctrlr);
int main(int argc, char *argv[])
	ret = rte_eal_init(argc, argv);
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

	g_vhost_ctrlr = vhost_blk_ctrlr_construct(CTRLR_NAME);
	if (g_vhost_ctrlr == NULL) {
		fprintf(stderr, "Failed to construct the vhost blk controller\n");
	}

	if (sem_init(&exit_sem, 0, 0) < 0) {
		fprintf(stderr, "Failed to initialize exit_sem\n");
	}

	signal(SIGINT, signal_handler);

	ret = rte_vhost_driver_start(dev_pathname);
		fprintf(stderr, "Failed to start vhost driver.\n");

	/* block here until the application is told to exit */

	/* clean up the EAL */