/* examples/vhost_blk/vhost_blk.c (dpdk.git) */
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2019 Intel Corporation
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>
#include <unistd.h>
#include <stdbool.h>
#include <signal.h>
#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <linux/virtio_blk.h>
#include <linux/virtio_ring.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_log.h>
#include <rte_malloc.h>
#include <rte_vhost.h>

#include "vhost_blk.h"
#include "blk_spec.h"

#define VIRTQ_DESC_F_NEXT       1
#define VIRTQ_DESC_F_AVAIL      (1 << 7)
#define VIRTQ_DESC_F_USED       (1 << 15)

#define MAX_TASK                12

#define VHOST_BLK_FEATURES ((1ULL << VIRTIO_F_RING_PACKED) | \
                            (1ULL << VIRTIO_F_VERSION_1) | \
                            (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
                            (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))

/* Path of the vhost-user socket; built from the current working
 * directory and the socket name at controller construction time.
 */
static char dev_pathname[PATH_MAX] = "";
static sem_t exit_sem;
/* Worker state: -1 not started, 0 running, 1 stop requested, 2 stopped.
 * Marked volatile because other threads busy-wait on it.
 */
static volatile int g_should_stop = -1;

struct vhost_blk_ctrlr *
vhost_blk_ctrlr_find(const char *ctrlr_name)
{
        if (ctrlr_name == NULL)
                return NULL;

        /* currently we only support one socket file */
        return g_vhost_ctrlr;
}

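/* Translate a guest physical address into a host virtual address through
 * the controller's memory table. On return *len holds the contiguously
 * mapped length, which callers compare against the descriptor length.
 */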
static uint64_t gpa_to_vva(int vid, uint64_t gpa, uint64_t *len)
{
        char path[PATH_MAX];
        struct vhost_blk_ctrlr *ctrlr;
        int ret = 0;

        ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
        if (ret) {
                fprintf(stderr, "Cannot get socket name\n");
                assert(ret == 0);
        }

        ctrlr = vhost_blk_ctrlr_find(path);
        if (!ctrlr) {
                fprintf(stderr, "Controller is not ready\n");
                assert(ctrlr != NULL);
        }

        assert(ctrlr->mem != NULL);

        return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len);
}

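/* Packed-ring chain walking: descriptors of one chain occupy consecutive
 * ring slots, so the next descriptor is simply the next slot (modulo the
 * ring size); the split ring below uses an explicit next index instead.
 */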
static struct vring_packed_desc *
descriptor_get_next_packed(struct rte_vhost_vring *vq,
                             uint16_t *idx)
{
        if (vq->desc_packed[*idx % vq->size].flags & VIRTQ_DESC_F_NEXT) {
                *idx += 1;
                return &vq->desc_packed[*idx % vq->size];
        }

        return NULL;
}

static bool
descriptor_has_next_packed(struct vring_packed_desc *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
descriptor_is_wr_packed(struct vring_packed_desc *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

static struct rte_vhost_inflight_desc_packed *
inflight_desc_get_next(struct rte_vhost_inflight_info_packed *inflight_packed,
                               struct rte_vhost_inflight_desc_packed *cur_desc)
{
        if (!!(cur_desc->flags & VIRTQ_DESC_F_NEXT))
                return &inflight_packed->desc[cur_desc->next];

        return NULL;
}

static bool
inflight_desc_has_next(struct rte_vhost_inflight_desc_packed *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
inflight_desc_is_wr(struct rte_vhost_inflight_desc_packed *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

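/* Inflight recovery: when the backend reconnects, requests that were still
 * in flight are replayed from the shared inflight region negotiated with
 * the vhost library, using the rte_vhost_inflight_* descriptors above.
 */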
static void
inflight_process_payload_chain_packed(struct inflight_blk_task *task)
{
        void *data;
        uint64_t chunck_len;
        struct vhost_blk_task *blk_task;
        struct rte_vhost_inflight_desc_packed *desc;

        blk_task = &task->blk_task;
        blk_task->iovs_cnt = 0;

        do {
                desc = task->inflight_desc;
                chunck_len = desc->len;
                data = (void *)(uintptr_t)gpa_to_vva(blk_task->bdev->vid,
                                                     desc->addr,
                                                     &chunck_len);
                if (!data || chunck_len != desc->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        return;
                }

                blk_task->iovs[blk_task->iovs_cnt].iov_base = data;
                blk_task->iovs[blk_task->iovs_cnt].iov_len = desc->len;
                blk_task->data_len += desc->len;
                blk_task->iovs_cnt++;
                task->inflight_desc = inflight_desc_get_next(
                                        task->inflight_packed, desc);
        } while (inflight_desc_has_next(task->inflight_desc));

        /* the final descriptor of the chain carries the status byte */
        chunck_len = task->inflight_desc->len;
        blk_task->status = (void *)(uintptr_t)gpa_to_vva(
                blk_task->bdev->vid, task->inflight_desc->addr, &chunck_len);
        if (!blk_task->status || chunck_len != task->inflight_desc->len)
                fprintf(stderr, "failed to translate desc address.\n");
}

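/* Completion write-back for the packed ring (this and the non-inflight
 * variant below): write the buffer id first, then flip the wrap-dependent
 * AVAIL/USED bits, with barriers on both sides so the guest never sees the
 * flags before the id. The used index then advances past every slot the
 * chain occupied (header + payload iovecs + status, i.e. iovs_cnt + 2).
 */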
static void
inflight_submit_completion_packed(struct inflight_blk_task *task,
                                              uint32_t q_idx, uint16_t *used_id,
                                              bool *used_wrap_counter)
{
        struct vhost_blk_ctrlr *ctrlr;
        struct rte_vhost_vring *vq;
        struct vring_packed_desc *desc;
        int ret;

        ctrlr = vhost_blk_ctrlr_find(dev_pathname);
        vq = task->blk_task.vq;

        ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx,
                                                    task->blk_task.head_idx);
        if (ret != 0)
                fprintf(stderr, "failed to set last inflight io\n");

        desc = &vq->desc_packed[*used_id];
        desc->id = task->blk_task.buffer_id;
        rte_smp_mb();
        if (*used_wrap_counter)
                desc->flags |= VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED;
        else
                desc->flags &= ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED);
        rte_smp_mb();

        *used_id += task->blk_task.iovs_cnt + 2;
        if (*used_id >= vq->size) {
                *used_id -= vq->size;
                *used_wrap_counter = !(*used_wrap_counter);
        }

        ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx,
                                                 task->blk_task.head_idx);
        if (ret != 0)
                fprintf(stderr, "failed to clear inflight io\n");

        /* Send an interrupt back to the guest VM so that it knows
         * a completion is ready to be processed.
         */
        rte_vhost_vring_call(task->blk_task.bdev->vid, q_idx);
}

static void
submit_completion_packed(struct vhost_blk_task *task, uint32_t q_idx,
                                  uint16_t *used_id, bool *used_wrap_counter)
{
        struct vhost_blk_ctrlr *ctrlr;
        struct rte_vhost_vring *vq;
        struct vring_packed_desc *desc;
        int ret;

        ctrlr = vhost_blk_ctrlr_find(dev_pathname);
        vq = task->vq;

        ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx,
                                                    task->inflight_idx);
        if (ret != 0)
                fprintf(stderr, "failed to set last inflight io\n");

        desc = &vq->desc_packed[*used_id];
        desc->id = task->buffer_id;
        rte_smp_mb();
        if (*used_wrap_counter)
                desc->flags |= VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED;
        else
                desc->flags &= ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED);
        rte_smp_mb();

        *used_id += task->iovs_cnt + 2;
        if (*used_id >= vq->size) {
                *used_id -= vq->size;
                *used_wrap_counter = !(*used_wrap_counter);
        }

        ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx,
                                                 task->inflight_idx);
        if (ret != 0)
                fprintf(stderr, "failed to clear inflight io\n");

        /* Send an interrupt back to the guest VM so that it knows
         * a completion is ready to be processed.
         */
        rte_vhost_vring_call(task->bdev->vid, q_idx);
}

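/* Gather the payload descriptors of a live (non-inflight) packed chain into
 * the task's iovec array; the last descriptor holds the status byte the
 * guest will read back.
 */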
static void
vhost_process_payload_chain_packed(struct vhost_blk_task *task,
        uint16_t *idx)
{
        void *data;
        uint64_t chunck_len;

        task->iovs_cnt = 0;

        do {
                chunck_len = task->desc_packed->len;
                data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                                                     task->desc_packed->addr,
                                                     &chunck_len);
                if (!data || chunck_len != task->desc_packed->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        return;
                }

                task->iovs[task->iovs_cnt].iov_base = data;
                task->iovs[task->iovs_cnt].iov_len = task->desc_packed->len;
                task->data_len += task->desc_packed->len;
                task->iovs_cnt++;
                task->desc_packed = descriptor_get_next_packed(task->vq, idx);
        } while (descriptor_has_next_packed(task->desc_packed));

        task->last_idx = *idx % task->vq->size;
        chunck_len = task->desc_packed->len;
        task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                                                     task->desc_packed->addr,
                                                     &chunck_len);
        if (!task->status || chunck_len != task->desc_packed->len)
                fprintf(stderr, "failed to translate desc address.\n");
}

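/* A packed descriptor is available when its AVAIL bit equals the driver's
 * wrap counter while its USED bit differs; both bits invert their meaning
 * on each wrap-around of the ring.
 */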
static int
descriptor_is_available(struct rte_vhost_vring *vring, uint16_t idx,
                                        bool avail_wrap_counter)
{
        uint16_t flags = vring->desc_packed[idx].flags;

        return ((!!(flags & VIRTQ_DESC_F_AVAIL) == avail_wrap_counter) &&
                (!!(flags & VIRTQ_DESC_F_USED) != avail_wrap_counter));
}

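/* Poll one packed request queue: for each available chain, map the request
 * header, the optional payload and the status byte, record the request in
 * the inflight region, run it through the block backend, and complete it.
 */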
static void
process_requestq_packed(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx)
{
        bool avail_wrap_counter, used_wrap_counter;
        uint16_t avail_idx, used_idx;
        int ret;
        uint64_t chunck_len;
        struct vhost_blk_queue *blk_vq;
        struct rte_vhost_vring *vq;
        struct vhost_blk_task *task;

        blk_vq = &ctrlr->bdev->queues[q_idx];
        vq = &blk_vq->vq;

        avail_idx = blk_vq->last_avail_idx;
        avail_wrap_counter = blk_vq->avail_wrap_counter;
        used_idx = blk_vq->last_used_idx;
        used_wrap_counter = blk_vq->used_wrap_counter;

        task = rte_zmalloc(NULL, sizeof(*task), 0);
        assert(task != NULL);
        task->vq = vq;
        task->bdev = ctrlr->bdev;

        while (descriptor_is_available(vq, avail_idx, avail_wrap_counter)) {
                task->head_idx = avail_idx;
                task->desc_packed = &task->vq->desc_packed[task->head_idx];
                task->iovs_cnt = 0;
                task->data_len = 0;
                task->req = NULL;
                task->status = NULL;

                /* does not support indirect descriptors */
                assert((task->desc_packed->flags & VRING_DESC_F_INDIRECT) == 0);

                chunck_len = task->desc_packed->len;
                task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                        task->desc_packed->addr, &chunck_len);
                if (!task->req || chunck_len != task->desc_packed->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        rte_free(task);
                        return;
                }

                task->desc_packed = descriptor_get_next_packed(task->vq,
                                                                &avail_idx);
                assert(task->desc_packed != NULL);
                if (!descriptor_has_next_packed(task->desc_packed)) {
                        /* no payload: the header is followed by the status */
                        task->dxfer_dir = BLK_DIR_NONE;
                        task->last_idx = avail_idx % vq->size;
                        chunck_len = task->desc_packed->len;
                        task->status = (void *)(uintptr_t)
                                              gpa_to_vva(task->bdev->vid,
                                                        task->desc_packed->addr,
                                                        &chunck_len);
                        if (!task->status ||
                                chunck_len != task->desc_packed->len) {
                                fprintf(stderr,
                                        "failed to translate desc address.\n");
                                rte_free(task);
                                return;
                        }
                } else {
                        task->readtype = descriptor_is_wr_packed(
                                                        task->desc_packed);
                        vhost_process_payload_chain_packed(task, &avail_idx);
                }
                task->buffer_id = vq->desc_packed[task->last_idx].id;
                rte_vhost_set_inflight_desc_packed(ctrlr->bdev->vid, q_idx,
                                                   task->head_idx,
                                                   task->last_idx,
                                                   &task->inflight_idx);

                if (++avail_idx >= vq->size) {
                        avail_idx -= vq->size;
                        avail_wrap_counter = !avail_wrap_counter;
                }
                blk_vq->last_avail_idx = avail_idx;
                blk_vq->avail_wrap_counter = avail_wrap_counter;

                ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task);
                if (ret) {
                        /* request failed */
                        *task->status = VIRTIO_BLK_S_IOERR;
                } else {
                        /* request succeeded */
                        *task->status = VIRTIO_BLK_S_OK;
                }

                submit_completion_packed(task, q_idx, &used_idx,
                                                &used_wrap_counter);
                blk_vq->last_used_idx = used_idx;
                blk_vq->used_wrap_counter = used_wrap_counter;
        }

        rte_free(task);
}

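/* Resubmit packed-ring requests that were left inflight across a restart.
 * The resubmit list comes from the vhost library; the avail/used indexes
 * are advanced here the same way the normal request path would have done.
 */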
static void
submit_inflight_vq_packed(struct vhost_blk_ctrlr *ctrlr,
        uint16_t q_idx)
{
        bool used_wrap_counter;
        int req_idx, ret;
        uint16_t used_idx;
        uint64_t chunck_len;
        struct vhost_blk_queue *blk_vq;
        struct rte_vhost_ring_inflight *inflight_vq;
        struct rte_vhost_resubmit_info *resubmit_info;
        struct rte_vhost_vring *vq;
        struct inflight_blk_task *task;
        struct vhost_blk_task *blk_task;
        struct rte_vhost_inflight_info_packed *inflight_info;

        blk_vq = &ctrlr->bdev->queues[q_idx];
        vq = &blk_vq->vq;
        inflight_vq = &blk_vq->inflight_vq;
        resubmit_info = inflight_vq->resubmit_inflight;
        inflight_info = inflight_vq->inflight_packed;
        used_idx = blk_vq->last_used_idx;
        used_wrap_counter = blk_vq->used_wrap_counter;

        task = rte_malloc(NULL, sizeof(*task), 0);
        if (!task) {
                fprintf(stderr, "failed to allocate memory\n");
                return;
        }
        blk_task = &task->blk_task;
        blk_task->vq = vq;
        blk_task->bdev = ctrlr->bdev;
        task->inflight_packed = inflight_vq->inflight_packed;

        while (resubmit_info->resubmit_num-- > 0) {
                req_idx = resubmit_info->resubmit_num;
                blk_task->head_idx =
                        resubmit_info->resubmit_list[req_idx].index;
                task->inflight_desc =
                        &inflight_info->desc[blk_task->head_idx];
                task->blk_task.iovs_cnt = 0;
                task->blk_task.data_len = 0;
                task->blk_task.req = NULL;
                task->blk_task.status = NULL;

                /* update the avail index too, since its initial
                 * value equals the used index
                 */
                blk_vq->last_avail_idx += task->inflight_desc->num;
                if (blk_vq->last_avail_idx >= vq->size) {
                        blk_vq->last_avail_idx -= vq->size;
                        blk_vq->avail_wrap_counter =
                                !blk_vq->avail_wrap_counter;
                }

                /* does not support indirect descriptors */
                assert(task->inflight_desc != NULL);
                assert((task->inflight_desc->flags &
                        VRING_DESC_F_INDIRECT) == 0);

                chunck_len = task->inflight_desc->len;
                blk_task->req = (void *)(uintptr_t)
                                     gpa_to_vva(blk_task->bdev->vid,
                                                task->inflight_desc->addr,
                                                &chunck_len);
                if (!blk_task->req ||
                        chunck_len != task->inflight_desc->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        rte_free(task);
                        return;
                }

                task->inflight_desc = inflight_desc_get_next(
                        task->inflight_packed, task->inflight_desc);
                assert(task->inflight_desc != NULL);
                if (!inflight_desc_has_next(task->inflight_desc)) {
                        blk_task->dxfer_dir = BLK_DIR_NONE;
                        chunck_len = task->inflight_desc->len;
                        blk_task->status = (void *)(uintptr_t)
                                gpa_to_vva(blk_task->bdev->vid,
                                                task->inflight_desc->addr,
                                                &chunck_len);
                        if (!blk_task->status ||
                            chunck_len != task->inflight_desc->len) {
                                fprintf(stderr,
                                        "failed to translate desc address.\n");
                                rte_free(task);
                                return;
                        }
                } else {
                        blk_task->readtype =
                                inflight_desc_is_wr(task->inflight_desc);
                        inflight_process_payload_chain_packed(task);
                }

                blk_task->buffer_id = task->inflight_desc->id;

                ret = vhost_bdev_process_blk_commands(ctrlr->bdev, blk_task);
                if (ret)
                        /* request failed */
                        *blk_task->status = VIRTIO_BLK_S_IOERR;
                else
                        /* request succeeded */
                        *blk_task->status = VIRTIO_BLK_S_OK;

                inflight_submit_completion_packed(task, q_idx, &used_idx,
                                                  &used_wrap_counter);

                blk_vq->last_used_idx = used_idx;
                blk_vq->used_wrap_counter = used_wrap_counter;
        }

        rte_free(task);
}

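/* Split-ring helpers: here a chain is linked through the descriptor's
 * explicit next field rather than through adjacent ring slots.
 */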
static struct vring_desc *
descriptor_get_next_split(struct vring_desc *vq_desc,
                                   struct vring_desc *cur_desc)
{
        return &vq_desc[cur_desc->next];
}

static bool
descriptor_has_next_split(struct vring_desc *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
descriptor_is_wr_split(struct vring_desc *cur_desc)
{
        return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

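/* Split-ring counterpart of the packed payload walk above: collect the
 * data descriptors into iovecs and map the trailing status byte.
 */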
static void
vhost_process_payload_chain_split(struct vhost_blk_task *task)
{
        void *data;
        uint64_t chunck_len;

        task->iovs_cnt = 0;

        do {
                chunck_len = task->desc_split->len;
                data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                                                     task->desc_split->addr,
                                                     &chunck_len);
                if (!data || chunck_len != task->desc_split->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        return;
                }

                task->iovs[task->iovs_cnt].iov_base = data;
                task->iovs[task->iovs_cnt].iov_len = task->desc_split->len;
                task->data_len += task->desc_split->len;
                task->iovs_cnt++;
                task->desc_split =
                        descriptor_get_next_split(task->vq->desc,
                                                  task->desc_split);
        } while (descriptor_has_next_split(task->desc_split));

        chunck_len = task->desc_split->len;
        task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                                                     task->desc_split->addr,
                                                     &chunck_len);
        if (!task->status || chunck_len != task->desc_split->len)
                fprintf(stderr, "failed to translate desc address.\n");
}

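/* Complete a split-ring request by filling the next used-ring entry. The
 * "used->idx & (vq->size - 1)" masking assumes the ring size is a power of
 * two, which the virtio split ring guarantees.
 */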
static void
submit_completion_split(struct vhost_blk_task *task, uint32_t vid,
        uint32_t q_idx)
{
        struct rte_vhost_vring *vq;
        struct vring_used *used;

        vq = task->vq;
        used = vq->used;

        rte_vhost_set_last_inflight_io_split(vid, q_idx, task->req_idx);

        /* Fill out the next entry in the "used" ring: id is the index of
         * the descriptor that contained the blk request, len is the total
         * amount of data transferred. The correct len must be reported,
         * since a request may return less data than the guest VM
         * allocated for it.
         */
        used->ring[used->idx & (vq->size - 1)].id = task->req_idx;
        used->ring[used->idx & (vq->size - 1)].len = task->data_len;
        rte_smp_mb();
        used->idx++;
        rte_smp_mb();

        rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, task->req_idx);

        /* Send an interrupt back to the guest VM so that it knows
         * a completion is ready to be processed.
         */
        rte_vhost_vring_call(task->bdev->vid, q_idx);
}

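/* Split-ring inflight resubmission, mirroring the packed-ring version:
 * replay each request recorded in the resubmit list, then complete it.
 */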
static void
submit_inflight_vq_split(struct vhost_blk_ctrlr *ctrlr,
        uint32_t q_idx)
{
        struct vhost_blk_queue *blk_vq;
        struct rte_vhost_ring_inflight *inflight_vq;
        struct rte_vhost_resubmit_info *resubmit_inflight;
        struct rte_vhost_resubmit_desc *resubmit_list;
        struct vhost_blk_task *task;
        int req_idx;
        uint64_t chunck_len;
        int ret;

        blk_vq = &ctrlr->bdev->queues[q_idx];
        inflight_vq = &blk_vq->inflight_vq;
        resubmit_inflight = inflight_vq->resubmit_inflight;
        resubmit_list = resubmit_inflight->resubmit_list;

        task = rte_zmalloc(NULL, sizeof(*task), 0);
        assert(task != NULL);

        task->ctrlr = ctrlr;
        task->bdev = ctrlr->bdev;
        task->vq = &blk_vq->vq;

        while (resubmit_inflight->resubmit_num-- > 0) {
                req_idx = resubmit_list[resubmit_inflight->resubmit_num].index;
                task->req_idx = req_idx;
                task->desc_split = &task->vq->desc[task->req_idx];
                task->iovs_cnt = 0;
                task->data_len = 0;
                task->req = NULL;
                task->status = NULL;

                /* does not support indirect descriptors */
                assert(task->desc_split != NULL);
                assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0);

                chunck_len = task->desc_split->len;
                task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                                task->desc_split->addr, &chunck_len);
                if (!task->req || chunck_len != task->desc_split->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        rte_free(task);
                        return;
                }

                task->desc_split = descriptor_get_next_split(task->vq->desc,
                                                             task->desc_split);
                if (!descriptor_has_next_split(task->desc_split)) {
                        task->dxfer_dir = BLK_DIR_NONE;
                        chunck_len = task->desc_split->len;
                        task->status = (void *)(uintptr_t)
                                       gpa_to_vva(task->bdev->vid,
                                                  task->desc_split->addr,
                                                  &chunck_len);
                        if (!task->status ||
                                chunck_len != task->desc_split->len) {
                                fprintf(stderr,
                                        "failed to translate desc address.\n");
                                rte_free(task);
                                return;
                        }
                } else {
                        task->readtype =
                                descriptor_is_wr_split(task->desc_split);
                        vhost_process_payload_chain_split(task);
                }

                ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task);
                if (ret) {
                        /* request failed */
                        *task->status = VIRTIO_BLK_S_IOERR;
                } else {
                        /* request succeeded */
                        *task->status = VIRTIO_BLK_S_OK;
                }
                submit_completion_split(task, ctrlr->bdev->vid, q_idx);
        }

        rte_free(task);
}

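/* Poll one split request queue: consume new entries from the avail ring,
 * record each as inflight, map its chain, run it through the block backend
 * and push the result onto the used ring.
 */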
static void
process_requestq_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx)
{
        int ret;
        int req_idx;
        uint16_t last_idx;
        uint64_t chunck_len;
        struct vhost_blk_queue *blk_vq;
        struct rte_vhost_vring *vq;
        struct vhost_blk_task *task;

        blk_vq = &ctrlr->bdev->queues[q_idx];
        vq = &blk_vq->vq;

        task = rte_zmalloc(NULL, sizeof(*task), 0);
        assert(task != NULL);
        task->ctrlr = ctrlr;
        task->bdev = ctrlr->bdev;
        task->vq = vq;

        while (vq->avail->idx != blk_vq->last_avail_idx) {
                last_idx = blk_vq->last_avail_idx & (vq->size - 1);
                req_idx = vq->avail->ring[last_idx];
                task->req_idx = req_idx;
                task->desc_split = &task->vq->desc[task->req_idx];
                task->iovs_cnt = 0;
                task->data_len = 0;
                task->req = NULL;
                task->status = NULL;

                rte_vhost_set_inflight_desc_split(ctrlr->bdev->vid, q_idx,
                                                        task->req_idx);

                /* does not support indirect descriptors */
                assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0);

                chunck_len = task->desc_split->len;
                task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
                                task->desc_split->addr, &chunck_len);
                if (!task->req || chunck_len != task->desc_split->len) {
                        fprintf(stderr, "failed to translate desc address.\n");
                        rte_free(task);
                        return;
                }

                task->desc_split = descriptor_get_next_split(task->vq->desc,
                                                             task->desc_split);
                if (!descriptor_has_next_split(task->desc_split)) {
                        task->dxfer_dir = BLK_DIR_NONE;
                        chunck_len = task->desc_split->len;
                        task->status = (void *)(uintptr_t)
                                              gpa_to_vva(task->bdev->vid,
                                                         task->desc_split->addr,
                                                         &chunck_len);
                        if (!task->status ||
                                chunck_len != task->desc_split->len) {
                                fprintf(stderr,
                                        "failed to translate desc address.\n");
                                rte_free(task);
                                return;
                        }
                } else {
                        task->readtype =
                                descriptor_is_wr_split(task->desc_split);
                        vhost_process_payload_chain_split(task);
                }
                blk_vq->last_avail_idx++;

                ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task);
                if (ret) {
                        /* request failed */
                        *task->status = VIRTIO_BLK_S_IOERR;
                } else {
                        /* request succeeded */
                        *task->status = VIRTIO_BLK_S_OK;
                }

                submit_completion_split(task, ctrlr->bdev->vid, q_idx);
        }

        rte_free(task);
}

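/* Worker thread: pinned to core 0, it first resubmits any inflight
 * requests left over from a previous incarnation, then busy-polls every
 * queue until destroy_device() or the signal handler asks it to stop.
 */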
static void *
ctrlr_worker(void *arg)
{
        struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg;
        struct vhost_blk_queue *blk_vq;
        struct rte_vhost_ring_inflight *inflight_vq;
        cpu_set_t cpuset;
        pthread_t thread;
        int i;

        fprintf(stdout, "Ctrlr Worker Thread start\n");

        if (ctrlr == NULL || ctrlr->bdev == NULL) {
                fprintf(stderr,
                        "%s: Error, invalid argument passed to worker thread\n",
                        __func__);
                exit(1);
        }

        thread = pthread_self();
        CPU_ZERO(&cpuset);
        CPU_SET(0, &cpuset);
        pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);

        for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
                blk_vq = &ctrlr->bdev->queues[i];
                inflight_vq = &blk_vq->inflight_vq;
                if (inflight_vq->resubmit_inflight != NULL &&
                    inflight_vq->resubmit_inflight->resubmit_num != 0) {
                        if (ctrlr->packed_ring)
                                submit_inflight_vq_packed(ctrlr, i);
                        else
                                submit_inflight_vq_split(ctrlr, i);
                }
        }

        while (!g_should_stop && ctrlr->bdev != NULL) {
                for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
                        if (ctrlr->packed_ring)
                                process_requestq_packed(ctrlr, i);
                        else
                                process_requestq_split(ctrlr, i);
                }
        }

        g_should_stop = 2;
        fprintf(stdout, "Ctrlr Worker Thread Exiting\n");
        sem_post(&exit_sem);
        return NULL;
}

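/* new_device(): invoked by the vhost library once a guest driver finishes
 * negotiation. It caches the memory table and per-queue vring state
 * (recovering indexes and wrap counters from the inflight region on
 * reconnection), disables guest notifications and spawns the worker.
 */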
static int
new_device(int vid)
{
        struct vhost_blk_ctrlr *ctrlr;
        struct vhost_blk_queue *blk_vq;
        struct rte_vhost_vring *vq;
        uint64_t features;
        pthread_t tid;
        int i, ret;

        ctrlr = vhost_blk_ctrlr_find(dev_pathname);
        if (!ctrlr) {
                fprintf(stderr, "Controller is not ready\n");
                return -1;
        }

        if (ctrlr->started)
                return 0;

        ctrlr->bdev->vid = vid;
        ret = rte_vhost_get_negotiated_features(vid, &features);
        if (ret) {
                fprintf(stderr, "failed to get the negotiated features\n");
                return -1;
        }
        ctrlr->packed_ring = !!(features & (1ULL << VIRTIO_F_RING_PACKED));

        ret = rte_vhost_get_mem_table(vid, &ctrlr->mem);
        if (ret)
                fprintf(stderr, "Get Controller memory region failed\n");
        assert(ctrlr->mem != NULL);

        /* Disable Notifications and init last idx */
        for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
                blk_vq = &ctrlr->bdev->queues[i];
                vq = &blk_vq->vq;

                ret = rte_vhost_get_vhost_vring(ctrlr->bdev->vid, i, vq);
                assert(ret == 0);

                ret = rte_vhost_get_vring_base(ctrlr->bdev->vid, i,
                                               &blk_vq->last_avail_idx,
                                               &blk_vq->last_used_idx);
                assert(ret == 0);

                ret = rte_vhost_get_vhost_ring_inflight(ctrlr->bdev->vid, i,
                                                        &blk_vq->inflight_vq);
                assert(ret == 0);

                if (ctrlr->packed_ring) {
                        /* for reconnection: bit 15 of each saved index
                         * stores the wrap counter, the low 15 bits hold
                         * the ring index itself
                         */
                        ret = rte_vhost_get_vring_base_from_inflight(
                                ctrlr->bdev->vid, i,
                                &blk_vq->last_avail_idx,
                                &blk_vq->last_used_idx);
                        assert(ret == 0);

                        blk_vq->avail_wrap_counter = blk_vq->last_avail_idx &
                                (1 << 15);
                        blk_vq->last_avail_idx = blk_vq->last_avail_idx &
                                0x7fff;
                        blk_vq->used_wrap_counter = blk_vq->last_used_idx &
                                (1 << 15);
                        blk_vq->last_used_idx = blk_vq->last_used_idx &
                                0x7fff;
                }

                rte_vhost_enable_guest_notification(vid, i, 0);
        }

        /* start polling vring */
        g_should_stop = 0;
        fprintf(stdout, "New Device %s, Device ID %d\n", dev_pathname, vid);
        /* pthread_create() returns 0 on success, an error number otherwise */
        if (pthread_create(&tid, NULL, &ctrlr_worker, ctrlr) != 0) {
                fprintf(stderr, "Worker thread start failed\n");
                return -1;
        }

        /* device has been started */
        ctrlr->started = 1;
        pthread_detach(tid);
        return 0;
}

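/* destroy_device(): stop the worker, then save each queue's ring state
 * back to the vhost library (re-packing the wrap counters into bit 15 for
 * packed rings) so a future reconnection can resume where this left off.
 */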
static void
destroy_device(int vid)
{
        char path[PATH_MAX];
        struct vhost_blk_ctrlr *ctrlr;
        struct vhost_blk_queue *blk_vq;
        int i, ret;

        ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
        if (ret) {
                fprintf(stderr, "Destroy Ctrlr Failed\n");
                return;
        }

        fprintf(stdout, "Destroy %s Device ID %d\n", path, vid);
        ctrlr = vhost_blk_ctrlr_find(path);
        if (!ctrlr) {
                fprintf(stderr, "Destroy Ctrlr Failed\n");
                return;
        }

        if (!ctrlr->started)
                return;

        /* wait for the worker thread to acknowledge the stop request */
        g_should_stop = 1;
        while (g_should_stop != 2)
                ;

        for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
                blk_vq = &ctrlr->bdev->queues[i];
                if (ctrlr->packed_ring) {
                        blk_vq->last_avail_idx |= (blk_vq->avail_wrap_counter <<
                                15);
                        blk_vq->last_used_idx |= (blk_vq->used_wrap_counter <<
                                15);
                }
                rte_vhost_set_vring_base(ctrlr->bdev->vid, i,
                                         blk_vq->last_avail_idx,
                                         blk_vq->last_used_idx);
        }

        /* the memory table is allocated with malloc() inside
         * rte_vhost_get_mem_table(), so free() (not rte_free()) is used
         */
        free(ctrlr->mem);

        ctrlr->started = 0;
        sem_wait(&exit_sem);
}

static int
new_connection(int vid)
{
        /* extend the proper features for block device */
        vhost_session_install_rte_compat_hooks(vid);

        return 0;
}

struct vhost_device_ops vhost_blk_device_ops = {
        .new_device = new_device,
        .destroy_device = destroy_device,
        .new_connection = new_connection,
};

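/* Create an in-memory block device: the "disk" is a single rte_zmalloc'd
 * buffer of blk_cnt * blk_size bytes served straight from hugepage memory.
 */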
static struct vhost_block_dev *
vhost_blk_bdev_construct(const char *bdev_name,
        const char *bdev_serial, uint32_t blk_size, uint64_t blk_cnt,
        bool wce_enable)
{
        struct vhost_block_dev *bdev;

        bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE);
        if (!bdev)
                return NULL;

        /* leave the final byte for the NUL terminator (bdev is zeroed) */
        strncpy(bdev->name, bdev_name, sizeof(bdev->name) - 1);
        strncpy(bdev->product_name, bdev_serial,
                sizeof(bdev->product_name) - 1);
        bdev->blocklen = blk_size;
        bdev->blockcnt = blk_cnt;
        bdev->write_cache = wce_enable;

        fprintf(stdout, "blocklen=%u, blockcnt=%"PRIx64"\n", bdev->blocklen,
                bdev->blockcnt);

        /* use memory as disk storage space */
        bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0);
        if (!bdev->data) {
                fprintf(stderr, "not enough reserved hugepage memory for disk\n");
                rte_free(bdev);
                return NULL;
        }

        return bdev;
}

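/* Build the controller: register a vhost-user socket in the current
 * working directory, advertise VHOST_BLK_FEATURES, and attach a 128 MiB
 * (32768 x 4 KiB blocks) memory-backed device to it.
 */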
static struct vhost_blk_ctrlr *
vhost_blk_ctrlr_construct(const char *ctrlr_name)
{
        int ret;
        struct vhost_blk_ctrlr *ctrlr;
        char *path;
        char cwd[PATH_MAX];

        /* always use the current directory */
        path = getcwd(cwd, PATH_MAX);
        if (!path) {
                fprintf(stderr, "Cannot get current working directory\n");
                return NULL;
        }
        snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name);

        /* remove a stale socket left behind by a previous run */
        unlink(dev_pathname);

        if (rte_vhost_driver_register(dev_pathname, 0) != 0) {
                fprintf(stderr, "failed to register vhost driver on %s\n",
                        dev_pathname);
                return NULL;
        }

        ret = rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES);
        if (ret != 0) {
                fprintf(stderr, "Set vhost driver features failed\n");
                rte_vhost_driver_unregister(dev_pathname);
                return NULL;
        }

        /* set proper features */
        vhost_dev_install_rte_compat_hooks(dev_pathname);

        ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE);
        if (!ctrlr) {
                rte_vhost_driver_unregister(dev_pathname);
                return NULL;
        }

        /* hardcoded block device information: 128 MiB */
        ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0",
                                                4096, 32768, 0);
        if (!ctrlr->bdev) {
                rte_free(ctrlr);
                rte_vhost_driver_unregister(dev_pathname);
                return NULL;
        }

        rte_vhost_driver_callback_register(dev_pathname,
                                           &vhost_blk_device_ops);

        return ctrlr;
}

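/* SIGINT handler: stop the worker if it is running, release the device and
 * controller memory, and unregister the socket before exiting.
 */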
static void
signal_handler(__rte_unused int signum)
{
        struct vhost_blk_ctrlr *ctrlr;

        unlink(dev_pathname);

        if (g_should_stop != -1) {
                g_should_stop = 1;
                while (g_should_stop != 2)
                        ;
        }

        ctrlr = vhost_blk_ctrlr_find(dev_pathname);
        if (ctrlr != NULL) {
                if (ctrlr->bdev != NULL) {
                        rte_free(ctrlr->bdev->data);
                        rte_free(ctrlr->bdev);
                }
                rte_free(ctrlr);
        }

        rte_vhost_driver_unregister(dev_pathname);
        exit(0);
}

int main(int argc, char *argv[])
{
        int ret;

        signal(SIGINT, signal_handler);

        /* init EAL */
        ret = rte_eal_init(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

        g_vhost_ctrlr = vhost_blk_ctrlr_construct("vhost.socket");
        if (g_vhost_ctrlr == NULL) {
                fprintf(stderr, "Construct vhost blk controller failed\n");
                return 0;
        }

        if (sem_init(&exit_sem, 0, 0) < 0) {
                fprintf(stderr, "Error init exit_sem\n");
                return -1;
        }

        ret = rte_vhost_driver_start(dev_pathname);
        if (ret < 0) {
                fprintf(stderr, "Failed to start vhost driver\n");
                return -1;
        }

        /* sleep until the application is terminated by SIGINT */
        while (1)
                sleep(1);

        return 0;
}