dpdk.git: drivers/vdpa/ifc/ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
13
14 #include <rte_eal_paging.h>
15 #include <rte_malloc.h>
16 #include <rte_memory.h>
17 #include <rte_bus_pci.h>
18 #include <rte_vhost.h>
19 #include <rte_vdpa.h>
20 #include <vdpa_driver.h>
21 #include <rte_vfio.h>
22 #include <rte_spinlock.h>
23 #include <rte_log.h>
24 #include <rte_kvargs.h>
25 #include <rte_devargs.h>
26
27 #include "base/ifcvf.h"
28
29 RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
30 #define DRV_LOG(level, fmt, args...) \
31         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
32                 "IFCVF %s(): " fmt "\n", __func__, ##args)
33
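/* Used ring length: flags, idx and avail_event (3 * u16) plus the used elems. */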
34 #define IFCVF_USED_RING_LEN(size) \
35         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
36
37 #define IFCVF_VDPA_MODE         "vdpa"
38 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
39
40 #define THREAD_NAME_LEN 16
41
42 static const char * const ifcvf_valid_arguments[] = {
43         IFCVF_VDPA_MODE,
44         IFCVF_SW_FALLBACK_LM,
45         NULL
46 };
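/* Example devargs (PCI address is illustrative):
 *   -a 0000:06:00.1,vdpa=1,sw-live-migration=1
 */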
47
48 struct ifcvf_internal {
49         struct rte_pci_device *pdev;
50         struct ifcvf_hw hw;
51         int configured;
52         int vfio_container_fd;
53         int vfio_group_fd;
54         int vfio_dev_fd;
55         pthread_t tid;  /* thread for notify relay */
56         pthread_t intr_tid; /* thread for config space change interrupt relay */
57         int epfd;
58         int csc_epfd;
59         int vid;
60         struct rte_vdpa_device *vdev;
61         uint16_t max_queues;
62         uint64_t features;
63         rte_atomic32_t started;
64         rte_atomic32_t dev_attached;
65         rte_atomic32_t running;
66         rte_spinlock_t lock;
67         bool sw_lm;
68         bool sw_fallback_running;
69         /* mediated vring for sw fallback */
70         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
71         /* eventfd for used ring interrupt */
72         int intr_fd[IFCVF_MAX_QUEUES * 2];
73 };
74
75 struct internal_list {
76         TAILQ_ENTRY(internal_list) next;
77         struct ifcvf_internal *internal;
78 };
79
80 /* vdpa device info includes device features and device operations. */
81 struct rte_vdpa_dev_info {
82         uint64_t features;
83         struct rte_vdpa_dev_ops *ops;
84 };
85
86 TAILQ_HEAD(internal_list_head, internal_list);
87 static struct internal_list_head internal_list =
88         TAILQ_HEAD_INITIALIZER(internal_list);
89
90 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
91
92 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
93
94 static struct internal_list *
95 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
96 {
97         int found = 0;
98         struct internal_list *list;
99
100         pthread_mutex_lock(&internal_list_lock);
101
102         TAILQ_FOREACH(list, &internal_list, next) {
103                 if (vdev == list->internal->vdev) {
104                         found = 1;
105                         break;
106                 }
107         }
108
109         pthread_mutex_unlock(&internal_list_lock);
110
111         if (!found)
112                 return NULL;
113
114         return list;
115 }
116
117 static struct internal_list *
118 find_internal_resource_by_dev(struct rte_pci_device *pdev)
119 {
120         int found = 0;
121         struct internal_list *list;
122
123         pthread_mutex_lock(&internal_list_lock);
124
125         TAILQ_FOREACH(list, &internal_list, next) {
126                 if (!rte_pci_addr_cmp(&pdev->addr,
127                                         &list->internal->pdev->addr)) {
128                         found = 1;
129                         break;
130                 }
131         }
132
133         pthread_mutex_unlock(&internal_list_lock);
134
135         if (!found)
136                 return NULL;
137
138         return list;
139 }
140
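/* Create a VFIO container for the device, bind its IOMMU group to it,
 * map the PCI BARs and cache their addresses in the ifcvf_hw struct.
 */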
141 static int
142 ifcvf_vfio_setup(struct ifcvf_internal *internal)
143 {
144         struct rte_pci_device *dev = internal->pdev;
145         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
146         int iommu_group_num;
147         int i, ret;
148
149         internal->vfio_dev_fd = -1;
150         internal->vfio_group_fd = -1;
151         internal->vfio_container_fd = -1;
152
153         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
154         ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
155                         &iommu_group_num);
156         if (ret <= 0) {
157                 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
158                 return -1;
159         }
160
161         internal->vfio_container_fd = rte_vfio_container_create();
162         if (internal->vfio_container_fd < 0)
163                 return -1;
164
165         internal->vfio_group_fd = rte_vfio_container_group_bind(
166                         internal->vfio_container_fd, iommu_group_num);
167         if (internal->vfio_group_fd < 0)
168                 goto err;
169
170         if (rte_pci_map_device(dev))
171                 goto err;
172
173         internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
174
175         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
176                         i++) {
177                 internal->hw.mem_resource[i].addr =
178                         internal->pdev->mem_resource[i].addr;
179                 internal->hw.mem_resource[i].phys_addr =
180                         internal->pdev->mem_resource[i].phys_addr;
181                 internal->hw.mem_resource[i].len =
182                         internal->pdev->mem_resource[i].len;
183         }
184
185         return 0;
186
187 err:
188         rte_vfio_container_destroy(internal->vfio_container_fd);
189         return -1;
190 }
191
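/* Map or unmap every region of the guest memory table in the VFIO
 * container, using guest physical addresses as IOVAs so the device
 * can DMA directly into guest memory.
 */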
192 static int
193 ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map)
194 {
195         uint32_t i;
196         int ret;
197         struct rte_vhost_memory *mem = NULL;
198         int vfio_container_fd;
199
200         ret = rte_vhost_get_mem_table(internal->vid, &mem);
201         if (ret < 0) {
202                 DRV_LOG(ERR, "failed to get VM memory layout.");
203                 goto exit;
204         }
205
206         vfio_container_fd = internal->vfio_container_fd;
207
208         for (i = 0; i < mem->nregions; i++) {
209                 struct rte_vhost_mem_region *reg;
210
211                 reg = &mem->regions[i];
212                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
213                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
214                         do_map ? "DMA map" : "DMA unmap", i,
215                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
216
217                 if (do_map) {
218                         ret = rte_vfio_container_dma_map(vfio_container_fd,
219                                 reg->host_user_addr, reg->guest_phys_addr,
220                                 reg->size);
221                         if (ret < 0) {
222                                 DRV_LOG(ERR, "DMA map failed.");
223                                 goto exit;
224                         }
225                 } else {
226                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
227                                 reg->host_user_addr, reg->guest_phys_addr,
228                                 reg->size);
229                         if (ret < 0) {
230                                 DRV_LOG(ERR, "DMA unmap failed.");
231                                 goto exit;
232                         }
233                 }
234         }
235
236 exit:
237         free(mem);
238         return ret;
239 }
240
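/* Translate a host virtual address to a guest physical address via the
 * guest memory table; returns 0 if no region contains the address.
 */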
241 static uint64_t
242 hva_to_gpa(int vid, uint64_t hva)
243 {
244         struct rte_vhost_memory *mem = NULL;
245         struct rte_vhost_mem_region *reg;
246         uint32_t i;
247         uint64_t gpa = 0;
248
249         if (rte_vhost_get_mem_table(vid, &mem) < 0)
250                 goto exit;
251
252         for (i = 0; i < mem->nregions; i++) {
253                 reg = &mem->regions[i];
254
255                 if (hva >= reg->host_user_addr &&
256                                 hva < reg->host_user_addr + reg->size) {
257                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
258                         break;
259                 }
260         }
261
262 exit:
263         free(mem);
264         return gpa;
265 }
266
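/* Program the hardware rings from the vhost layout: translate the
 * desc/avail/used addresses to GPAs, restore the last_avail/last_used
 * indexes, then start the device.
 */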
267 static int
268 vdpa_ifcvf_start(struct ifcvf_internal *internal)
269 {
270         struct ifcvf_hw *hw = &internal->hw;
271         int i, nr_vring;
272         int vid;
273         struct rte_vhost_vring vq;
274         uint64_t gpa;
275
276         vid = internal->vid;
277         nr_vring = rte_vhost_get_vring_num(vid);
278         rte_vhost_get_negotiated_features(vid, &hw->req_features);
279
280         for (i = 0; i < nr_vring; i++) {
281                 rte_vhost_get_vhost_vring(vid, i, &vq);
282                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
283                 if (gpa == 0) {
284                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
285                         return -1;
286                 }
287                 hw->vring[i].desc = gpa;
288
289                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
290                 if (gpa == 0) {
291                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
292                         return -1;
293                 }
294                 hw->vring[i].avail = gpa;
295
296                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
297                 if (gpa == 0) {
298                         DRV_LOG(ERR, "Failed to get GPA for used ring.");
299                         return -1;
300                 }
301                 hw->vring[i].used = gpa;
302
303                 hw->vring[i].size = vq.size;
304                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
305                                 &hw->vring[i].last_used_idx);
306         }
307         hw->nr_vring = i;
308
309         return ifcvf_start_hw(&internal->hw);
310 }
311
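/* Stop the device: drain a blk device until its rings are idle, stop
 * the hardware, save the ring indexes back to vhost and, if dirty
 * logging was on, mark the used rings dirty on the device's behalf.
 */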
312 static void
313 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
314 {
315         struct ifcvf_hw *hw = &internal->hw;
316         uint32_t i;
317         int vid;
318         uint64_t features = 0;
319         uint64_t log_base = 0, log_size = 0;
320         uint64_t len;
321         u32 ring_state = 0;
322
323         vid = internal->vid;
324
325         /* To make sure no in-flight request is lost for a blk device,
326          * do not stop until last_avail_idx == last_used_idx.
327          */
328         if (internal->hw.device_type == IFCVF_BLK) {
329                 for (i = 0; i < hw->nr_vring; i++) {
330                         do {
331                                 if (hw->lm_cfg != NULL)
332                                         ring_state = *(u32 *)(hw->lm_cfg +
333                                                 IFCVF_LM_RING_STATE_OFFSET +
334                                                 i * IFCVF_LM_CFG_SIZE);
335                                 hw->vring[i].last_avail_idx =
336                                         (u16)(ring_state & IFCVF_16_BIT_MASK);
337                                 hw->vring[i].last_used_idx =
338                                         (u16)(ring_state >> 16);
339                                 if (hw->vring[i].last_avail_idx !=
340                                         hw->vring[i].last_used_idx) {
341                                         ifcvf_notify_queue(hw, i);
342                                         usleep(10);
343                                 }
344                         } while (hw->vring[i].last_avail_idx !=
345                                 hw->vring[i].last_used_idx);
346                 }
347         }
348
349         ifcvf_stop_hw(hw);
350
351         for (i = 0; i < hw->nr_vring; i++)
352                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
353                                 hw->vring[i].last_used_idx);
354
355         if (internal->sw_lm)
356                 return;
357
358         rte_vhost_get_negotiated_features(vid, &features);
359         if (RTE_VHOST_NEED_LOG(features)) {
360                 ifcvf_disable_logging(hw);
361                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
362                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
363                                 log_base, IFCVF_LOG_BASE, log_size);
364                 /*
365                  * IFCVF marks dirty memory pages only for packet buffers,
366                  * so SW helps by marking the used rings dirty after the device stops.
367                  */
368                 for (i = 0; i < hw->nr_vring; i++) {
369                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
370                         rte_vhost_log_used_vring(vid, i, 0, len);
371                 }
372         }
373 }
374
375 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
376                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
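/* Route the device's MSI-X vectors through VFIO: vector 0 fires on
 * config space changes, the others on vring callfds. With m_rx set,
 * an intermediate eventfd is installed for the relayed queues so the
 * driver sees used ring interrupts before the guest does.
 */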
377 static int
378 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
379 {
380         int ret;
381         uint32_t i, nr_vring;
382         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
383         struct vfio_irq_set *irq_set;
384         int *fd_ptr;
385         struct rte_vhost_vring vring;
386         int fd;
387
388         vring.callfd = -1;
389
390         nr_vring = rte_vhost_get_vring_num(internal->vid);
391
392         irq_set = (struct vfio_irq_set *)irq_set_buf;
393         irq_set->argsz = sizeof(irq_set_buf);
394         irq_set->count = nr_vring + 1;
395         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
396                          VFIO_IRQ_SET_ACTION_TRIGGER;
397         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
398         irq_set->start = 0;
399         fd_ptr = (int *)&irq_set->data;
400         /* The first interrupt vector is for the config space change notification */
401         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
402                 rte_intr_fd_get(internal->pdev->intr_handle);
403
404         for (i = 0; i < nr_vring; i++)
405                 internal->intr_fd[i] = -1;
406
407         for (i = 0; i < nr_vring; i++) {
408                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
409                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
410                 if (m_rx == true &&
411                         ((i & 1) == 0 || internal->hw.device_type == IFCVF_BLK)) {
412                         /* For net devices we only need to relay the RX
413                          * queues, since only they modify VM memory.
414                          * For blk devices we need to relay every queue,
415                          * to catch all read commands.
416                          */
417                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
418                         if (fd < 0) {
419                                 DRV_LOG(ERR, "can't setup eventfd: %s",
420                                         strerror(errno));
421                                 return -1;
422                         }
423                         internal->intr_fd[i] = fd;
424                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
425                 }
426         }
427
428         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
429         if (ret) {
430                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
431                                 strerror(errno));
432                 return -1;
433         }
434
435         return 0;
436 }
437
438 static int
439 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
440 {
441         int ret;
442         uint32_t i, nr_vring;
443         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
444         struct vfio_irq_set *irq_set;
445
446         irq_set = (struct vfio_irq_set *)irq_set_buf;
447         irq_set->argsz = sizeof(irq_set_buf);
448         irq_set->count = 0;
449         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
450         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
451         irq_set->start = 0;
452
453         nr_vring = rte_vhost_get_vring_num(internal->vid);
454         for (i = 0; i < nr_vring; i++) {
455                 if (internal->intr_fd[i] >= 0)
456                         close(internal->intr_fd[i]);
457                 internal->intr_fd[i] = -1;
458         }
459
460         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
461         if (ret) {
462                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
463                                 strerror(errno));
464                 return -1;
465         }
466
467         return 0;
468 }
469
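/* Notify relay thread: epoll on every vring kickfd and forward each
 * guest kick to the device's notify register.
 */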
470 static void *
471 notify_relay(void *arg)
472 {
473         int i, kickfd, epfd, nfds = 0;
474         uint32_t qid, q_num;
475         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
476         struct epoll_event ev;
477         uint64_t buf;
478         int nbytes;
479         struct rte_vhost_vring vring;
480         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
481         struct ifcvf_hw *hw = &internal->hw;
482
483         q_num = rte_vhost_get_vring_num(internal->vid);
484
485         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
486         if (epfd < 0) {
487                 DRV_LOG(ERR, "failed to create epoll instance.");
488                 return NULL;
489         }
490         internal->epfd = epfd;
491
492         vring.kickfd = -1;
493         for (qid = 0; qid < q_num; qid++) {
494                 ev.events = EPOLLIN | EPOLLPRI;
495                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
496                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
497                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
498                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
499                         return NULL;
500                 }
501         }
502
503         for (;;) {
504                 nfds = epoll_wait(epfd, events, q_num, -1);
505                 if (nfds < 0) {
506                         if (errno == EINTR)
507                                 continue;
508                         DRV_LOG(ERR, "epoll_wait returned failure.");
509                         return NULL;
510                 }
511
512                 for (i = 0; i < nfds; i++) {
513                         qid = events[i].data.u32;
514                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
515                         do {
516                                 nbytes = read(kickfd, &buf, 8);
517                                 if (nbytes < 0) {
518                                         if (errno == EINTR ||
519                                             errno == EWOULDBLOCK ||
520                                             errno == EAGAIN)
521                                                 continue;
522                                         DRV_LOG(INFO, "Error reading "
523                                                 "kickfd: %s",
524                                                 strerror(errno));
525                                 }
526                                 break;
527                         } while (1);
528
529                         ifcvf_notify_queue(hw, qid);
530                 }
531         }
532
533         return NULL;
534 }
535
536 static int
537 setup_notify_relay(struct ifcvf_internal *internal)
538 {
539         char name[THREAD_NAME_LEN];
540         int ret;
541
542         snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
543         ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
544                                      (void *)internal);
545         if (ret != 0) {
546                 DRV_LOG(ERR, "failed to create notify relay pthread.");
547                 return -1;
548         }
549
550         return 0;
551 }
552
553 static int
554 unset_notify_relay(struct ifcvf_internal *internal)
555 {
556         void *status;
557
558         if (internal->tid) {
559                 pthread_cancel(internal->tid);
560                 pthread_join(internal->tid, &status);
561         }
562         internal->tid = 0;
563
564         if (internal->epfd >= 0)
565                 close(internal->epfd);
566         internal->epfd = -1;
567
568         return 0;
569 }
570
571 static void
572 virtio_interrupt_handler(struct ifcvf_internal *internal)
573 {
574         int vid = internal->vid;
575         int ret;
576
577         ret = rte_vhost_slave_config_change(vid, 1);
578         if (ret)
579                 DRV_LOG(ERR, "failed to notify the guest about configuration space change.");
580 }
581
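/* Interrupt relay thread: epoll on the device's config space change
 * interrupt fd and forward each event to the vhost-user frontend.
 */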
582 static void *
583 intr_relay(void *arg)
584 {
585         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
586         struct epoll_event csc_event;
587         struct epoll_event ev;
588         uint64_t buf;
589         int nbytes;
590         int csc_epfd, csc_val = 0;
591
592         csc_epfd = epoll_create(1);
593         if (csc_epfd < 0) {
594                 DRV_LOG(ERR, "failed to create epoll for config space change.");
595                 return NULL;
596         }
597
598         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
599         ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle);
600         if (epoll_ctl(csc_epfd, EPOLL_CTL_ADD,
601                 rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) {
602                 DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
603                 goto out;
604         }
605
606         internal->csc_epfd = csc_epfd;
607
608         for (;;) {
609                 csc_val = epoll_wait(csc_epfd, &csc_event, 1, -1);
610                 if (csc_val < 0) {
611                         if (errno == EINTR)
612                                 continue;
613                         DRV_LOG(ERR, "epoll_wait returned failure.");
614                         goto out;
615                 } else if (csc_val == 0) {
616                         continue;
617                 } else {
618                         /* csc_val > 0 */
619                         nbytes = read(csc_event.data.fd, &buf, 8);
620                         if (nbytes < 0) {
621                                 if (errno == EINTR ||
622                                     errno == EWOULDBLOCK ||
623                                     errno == EAGAIN)
624                                         continue;
625                                 DRV_LOG(ERR, "Error reading from file descriptor %d: %s",
626                                         csc_event.data.fd,
627                                         strerror(errno));
628                                 goto out;
629                         } else if (nbytes == 0) {
630                                 DRV_LOG(ERR, "Read nothing from file descriptor %d",
631                                         csc_event.data.fd);
632                                 continue;
633                         } else {
634                                 virtio_interrupt_handler(internal);
635                         }
636                 }
637         }
638
639 out:
640         if (csc_epfd >= 0)
641                 close(csc_epfd);
642         internal->csc_epfd = -1;
643
644         return NULL;
645 }
646
647 static int
648 setup_intr_relay(struct ifcvf_internal *internal)
649 {
650         char name[THREAD_NAME_LEN];
651         int ret;
652
653         snprintf(name, sizeof(name), "ifc-intr-%d", internal->vid);
654         ret = rte_ctrl_thread_create(&internal->intr_tid, name, NULL,
655                                      intr_relay, (void *)internal);
656         if (ret) {
657                 DRV_LOG(ERR, "failed to create interrupt relay pthread.");
658                 return -1;
659         }
660         return 0;
661 }
662
663 static void
664 unset_intr_relay(struct ifcvf_internal *internal)
665 {
666         void *status;
667
668         if (internal->intr_tid) {
669                 pthread_cancel(internal->intr_tid);
670                 pthread_join(internal->intr_tid, &status);
671         }
672         internal->intr_tid = 0;
673
674         if (internal->csc_epfd >= 0)
675                 close(internal->csc_epfd);
676         internal->csc_epfd = -1;
677 }
678
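/* Reconcile the datapath with the started/dev_attached flags under the
 * internal lock: bring the whole pipeline up (DMA map, interrupts,
 * hardware start, relay threads) or tear it down again.
 */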
679 static int
680 update_datapath(struct ifcvf_internal *internal)
681 {
682         int ret;
683
684         rte_spinlock_lock(&internal->lock);
685
686         if (!rte_atomic32_read(&internal->running) &&
687             (rte_atomic32_read(&internal->started) &&
688              rte_atomic32_read(&internal->dev_attached))) {
689                 ret = ifcvf_dma_map(internal, true);
690                 if (ret)
691                         goto err;
692
693                 ret = vdpa_enable_vfio_intr(internal, false);
694                 if (ret)
695                         goto err;
696
697                 ret = vdpa_ifcvf_start(internal);
698                 if (ret)
699                         goto err;
700
701                 ret = setup_notify_relay(internal);
702                 if (ret)
703                         goto err;
704
705                 ret = setup_intr_relay(internal);
706                 if (ret)
707                         goto err;
708
709                 rte_atomic32_set(&internal->running, 1);
710         } else if (rte_atomic32_read(&internal->running) &&
711                    (!rte_atomic32_read(&internal->started) ||
712                     !rte_atomic32_read(&internal->dev_attached))) {
713                 unset_intr_relay(internal);
714
715                 ret = unset_notify_relay(internal);
716                 if (ret)
717                         goto err;
718
719                 vdpa_ifcvf_stop(internal);
720
721                 ret = vdpa_disable_vfio_intr(internal);
722                 if (ret)
723                         goto err;
724
725                 ret = ifcvf_dma_map(internal, false);
726                 if (ret)
727                         goto err;
728
729                 rte_atomic32_set(&internal->running, 0);
730         }
731
732         rte_spinlock_unlock(&internal->lock);
733         return 0;
734 err:
735         rte_spinlock_unlock(&internal->lock);
736         return ret;
737 }
738
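/* Start the VF with mediated vrings for the SW live-migration
 * fallback: allocate shadow rings, map them at the IFCVF_MEDIATED_VRING
 * IOVA, and point the relayed used rings at them instead of at guest
 * memory.
 */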
739 static int
740 m_ifcvf_start(struct ifcvf_internal *internal)
741 {
742         struct ifcvf_hw *hw = &internal->hw;
743         uint32_t i, nr_vring;
744         int vid, ret;
745         struct rte_vhost_vring vq;
746         void *vring_buf;
747         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
748         uint64_t size;
749         uint64_t gpa;
750
751         memset(&vq, 0, sizeof(vq));
752         vid = internal->vid;
753         nr_vring = rte_vhost_get_vring_num(vid);
754         rte_vhost_get_negotiated_features(vid, &hw->req_features);
755
756         for (i = 0; i < nr_vring; i++) {
757                 rte_vhost_get_vhost_vring(vid, i, &vq);
758
759                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
760                                 rte_mem_page_size());
761                 vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
762                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
763                                 rte_mem_page_size());
764
765                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
766                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
767                 if (ret < 0) {
768                         DRV_LOG(ERR, "mediated vring DMA map failed.");
769                         goto error;
770                 }
771
772                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
773                 if (gpa == 0) {
774                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
775                         return -1;
776                 }
777                 hw->vring[i].desc = gpa;
778
779                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
780                 if (gpa == 0) {
781                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
782                         return -1;
783                 }
784                 hw->vring[i].avail = gpa;
785
786                 /* NET: Direct I/O for Tx queue, relay for Rx queue
787                  * BLK: relay every queue
788                  */
789                 if ((internal->hw.device_type == IFCVF_NET) && (i & 1)) {
790                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
791                         if (gpa == 0) {
792                                 DRV_LOG(ERR, "Failed to get GPA for used ring.");
793                                 return -1;
794                         }
795                         hw->vring[i].used = gpa;
796                 } else {
797                         hw->vring[i].used = m_vring_iova +
798                                 (char *)internal->m_vring[i].used -
799                                 (char *)internal->m_vring[i].desc;
800                 }
801
802                 hw->vring[i].size = vq.size;
803
804                 rte_vhost_get_vring_base(vid, i,
805                                 &internal->m_vring[i].avail->idx,
806                                 &internal->m_vring[i].used->idx);
807
808                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
809                                 &hw->vring[i].last_used_idx);
810
811                 m_vring_iova += size;
812         }
813         hw->nr_vring = nr_vring;
814
815         return ifcvf_start_hw(&internal->hw);
816
817 error:
818         for (i = 0; i < nr_vring; i++)
819                 rte_free(internal->m_vring[i].desc);
820
821         return -1;
822 }
823
824 static int
825 m_ifcvf_stop(struct ifcvf_internal *internal)
826 {
827         int vid;
828         uint32_t i;
829         struct rte_vhost_vring vq;
830         struct ifcvf_hw *hw = &internal->hw;
831         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
832         uint64_t size, len;
833
834         vid = internal->vid;
835         ifcvf_stop_hw(hw);
836
837         for (i = 0; i < hw->nr_vring; i++) {
838                 /* synchronize remaining new used entries if any */
839                 if (internal->hw.device_type == IFCVF_NET) {
840                         if ((i & 1) == 0)
841                                 update_used_ring(internal, i);
842                 } else if (internal->hw.device_type == IFCVF_BLK) {
843                         update_used_ring(internal, i);
844                 }
845
846                 rte_vhost_get_vhost_vring(vid, i, &vq);
847                 len = IFCVF_USED_RING_LEN(vq.size);
848                 rte_vhost_log_used_vring(vid, i, 0, len);
849
850                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
851                                 rte_mem_page_size());
852                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
853                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
854                         m_vring_iova, size);
855
856                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
857                                 hw->vring[i].last_used_idx);
858                 rte_free(internal->m_vring[i].desc);
859                 m_vring_iova += size;
860         }
861
862         return 0;
863 }
864
865 static void
866 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
867 {
868         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
869         rte_vhost_vring_call(internal->vid, qid);
870 }
871
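/* Vring relay thread for the SW fallback: forward guest kicks to the
 * device, and on device interrupts copy new used entries from the
 * mediated ring back to the guest ring. Bit 0 of the epoll user data
 * distinguishes interrupt fds from kickfds.
 */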
872 static void *
873 vring_relay(void *arg)
874 {
875         int i, vid, epfd, fd, nfds;
876         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
877         struct rte_vhost_vring vring;
878         uint16_t qid, q_num;
879         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
880         struct epoll_event ev;
881         int nbytes;
882         uint64_t buf;
883
884         vid = internal->vid;
885         q_num = rte_vhost_get_vring_num(vid);
886
887         /* add notify fd and interrupt fd to epoll */
888         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
889         if (epfd < 0) {
890                 DRV_LOG(ERR, "failed to create epoll instance.");
891                 return NULL;
892         }
893         internal->epfd = epfd;
894
895         vring.kickfd = -1;
896         for (qid = 0; qid < q_num; qid++) {
897                 ev.events = EPOLLIN | EPOLLPRI;
898                 rte_vhost_get_vhost_vring(vid, qid, &vring);
899                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
900                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
901                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
902                         return NULL;
903                 }
904         }
905
906         for (qid = 0; qid < q_num; qid += 1) {
907                 if ((internal->hw.device_type == IFCVF_NET) && (qid & 1))
908                         continue;
909                 ev.events = EPOLLIN | EPOLLPRI;
910                 /* set bit 0 as a flag to mark this fd as an interrupt fd */
911                 ev.data.u64 = 1 | qid << 1 |
912                         (uint64_t)internal->intr_fd[qid] << 32;
913                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
914                                 < 0) {
915                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
916                         return NULL;
917                 }
918                 update_used_ring(internal, qid);
919         }
920
921         /* start relay with a first kick */
922         for (qid = 0; qid < q_num; qid++)
923                 ifcvf_notify_queue(&internal->hw, qid);
924
925         /* listen to the events and react accordingly */
926         for (;;) {
927                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
928                 if (nfds < 0) {
929                         if (errno == EINTR)
930                                 continue;
931                         DRV_LOG(ERR, "epoll_wait returned failure.");
932                         return NULL;
933                 }
934
935                 for (i = 0; i < nfds; i++) {
936                         fd = (uint32_t)(events[i].data.u64 >> 32);
937                         do {
938                                 nbytes = read(fd, &buf, 8);
939                                 if (nbytes < 0) {
940                                         if (errno == EINTR ||
941                                             errno == EWOULDBLOCK ||
942                                             errno == EAGAIN)
943                                                 continue;
944                                         DRV_LOG(INFO, "Error reading "
945                                                 "relay fd: %s",
946                                                 strerror(errno));
947                                 }
948                                 break;
949                         } while (1);
950
951                         qid = events[i].data.u32 >> 1;
952
953                         if (events[i].data.u32 & 1)
954                                 update_used_ring(internal, qid);
955                         else
956                                 ifcvf_notify_queue(&internal->hw, qid);
957                 }
958         }
959
960         return NULL;
961 }
962
963 static int
964 setup_vring_relay(struct ifcvf_internal *internal)
965 {
966         char name[THREAD_NAME_LEN];
967         int ret;
968
969         snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
970         ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
971                                      (void *)internal);
972         if (ret != 0) {
973                 DRV_LOG(ERR, "failed to create ring relay pthread.");
974                 return -1;
975         }
976
977         return 0;
978 }
979
980 static int
981 unset_vring_relay(struct ifcvf_internal *internal)
982 {
983         void *status;
984
985         if (internal->tid) {
986                 pthread_cancel(internal->tid);
987                 pthread_join(internal->tid, &status);
988         }
989         internal->tid = 0;
990
991         if (internal->epfd >= 0)
992                 close(internal->epfd);
993         internal->epfd = -1;
994
995         return 0;
996 }
997
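/* Switch a running device from the direct I/O datapath to the SW relay
 * datapath; called when live migration starts and SW fallback is
 * configured.
 */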
998 static int
999 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
1000 {
1001         int ret;
1002         int vid = internal->vid;
1003
1004         /* stop the direct IO data path */
1005         unset_notify_relay(internal);
1006         vdpa_ifcvf_stop(internal);
1007
1008         unset_intr_relay(internal);
1009
1010         vdpa_disable_vfio_intr(internal);
1011
1012         ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
1013         if (ret && ret != -ENOTSUP)
1014                 goto error;
1015
1016         /* set up interrupt for interrupt relay */
1017         ret = vdpa_enable_vfio_intr(internal, true);
1018         if (ret)
1019                 goto unmap;
1020
1021         /* config the VF */
1022         ret = m_ifcvf_start(internal);
1023         if (ret)
1024                 goto unset_intr;
1025
1026         /* set up vring relay thread */
1027         ret = setup_vring_relay(internal);
1028         if (ret)
1029                 goto stop_vf;
1030
1031         rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
1032
1033         internal->sw_fallback_running = true;
1034
1035         return 0;
1036
1037 stop_vf:
1038         m_ifcvf_stop(internal);
1039 unset_intr:
1040         vdpa_disable_vfio_intr(internal);
1041 unmap:
1042         ifcvf_dma_map(internal, false);
1043 error:
1044         return -1;
1045 }
1046
1047 static int
1048 ifcvf_dev_config(int vid)
1049 {
1050         struct rte_vdpa_device *vdev;
1051         struct internal_list *list;
1052         struct ifcvf_internal *internal;
1053
1054         vdev = rte_vhost_get_vdpa_device(vid);
1055         list = find_internal_resource_by_vdev(vdev);
1056         if (list == NULL) {
1057                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1058                 return -1;
1059         }
1060
1061         internal = list->internal;
1062         internal->vid = vid;
1063         rte_atomic32_set(&internal->dev_attached, 1);
1064         update_datapath(internal);
1065
1066         if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
1067                 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
1068                                 vdev->device->name);
1069
1070         internal->configured = 1;
1071         return 0;
1072 }
1073
1074 static int
1075 ifcvf_dev_close(int vid)
1076 {
1077         struct rte_vdpa_device *vdev;
1078         struct internal_list *list;
1079         struct ifcvf_internal *internal;
1080
1081         vdev = rte_vhost_get_vdpa_device(vid);
1082         list = find_internal_resource_by_vdev(vdev);
1083         if (list == NULL) {
1084                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1085                 return -1;
1086         }
1087
1088         internal = list->internal;
1089
1090         if (internal->sw_fallback_running) {
1091                 /* unset ring relay */
1092                 unset_vring_relay(internal);
1093
1094                 /* reset VF */
1095                 m_ifcvf_stop(internal);
1096
1097                 /* remove interrupt setting */
1098                 vdpa_disable_vfio_intr(internal);
1099
1100                 /* unset DMA map for guest memory */
1101                 ifcvf_dma_map(internal, false);
1102
1103                 internal->sw_fallback_running = false;
1104         } else {
1105                 rte_atomic32_set(&internal->dev_attached, 0);
1106                 update_datapath(internal);
1107         }
1108
1109         internal->configured = 0;
1110         return 0;
1111 }
1112
1113 static int
1114 ifcvf_set_features(int vid)
1115 {
1116         uint64_t features = 0;
1117         struct rte_vdpa_device *vdev;
1118         struct internal_list *list;
1119         struct ifcvf_internal *internal;
1120         uint64_t log_base = 0, log_size = 0;
1121
1122         vdev = rte_vhost_get_vdpa_device(vid);
1123         list = find_internal_resource_by_vdev(vdev);
1124         if (list == NULL) {
1125                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1126                 return -1;
1127         }
1128
1129         internal = list->internal;
1130         rte_vhost_get_negotiated_features(vid, &features);
1131
1132         if (!RTE_VHOST_NEED_LOG(features))
1133                 return 0;
1134
1135         if (internal->sw_lm) {
1136                 ifcvf_sw_fallback_switchover(internal);
1137         } else {
1138                 rte_vhost_get_log_base(vid, &log_base, &log_size);
1139                 rte_vfio_container_dma_map(internal->vfio_container_fd,
1140                                 log_base, IFCVF_LOG_BASE, log_size);
1141                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
1142         }
1143
1144         return 0;
1145 }
1146
1147 static int
1148 ifcvf_get_vfio_group_fd(int vid)
1149 {
1150         struct rte_vdpa_device *vdev;
1151         struct internal_list *list;
1152
1153         vdev = rte_vhost_get_vdpa_device(vid);
1154         list = find_internal_resource_by_vdev(vdev);
1155         if (list == NULL) {
1156                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1157                 return -1;
1158         }
1159
1160         return list->internal->vfio_group_fd;
1161 }
1162
1163 static int
1164 ifcvf_get_vfio_device_fd(int vid)
1165 {
1166         struct rte_vdpa_device *vdev;
1167         struct internal_list *list;
1168
1169         vdev = rte_vhost_get_vdpa_device(vid);
1170         list = find_internal_resource_by_vdev(vdev);
1171         if (list == NULL) {
1172                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1173                 return -1;
1174         }
1175
1176         return list->internal->vfio_dev_fd;
1177 }
1178
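/* Report the offset and size of this queue's notify (doorbell) area
 * within the device BAR, used by vhost to set up the host notifier.
 */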
1179 static int
1180 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1181 {
1182         struct rte_vdpa_device *vdev;
1183         struct internal_list *list;
1184         struct ifcvf_internal *internal;
1185         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1186         int ret;
1187
1188         vdev = rte_vhost_get_vdpa_device(vid);
1189         list = find_internal_resource_by_vdev(vdev);
1190         if (list == NULL) {
1191                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1192                 return -1;
1193         }
1194
1195         internal = list->internal;
1196
1197         reg.index = ifcvf_get_notify_region(&internal->hw);
1198         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1199         if (ret) {
1200                 DRV_LOG(ERR, "Cannot get device region info: %s",
1201                                 strerror(errno));
1202                 return -1;
1203         }
1204
1205         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1206         *size = 0x1000;
1207
1208         return 0;
1209 }
1210
1211 static int
1212 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1213 {
1214         struct internal_list *list;
1215
1216         list = find_internal_resource_by_vdev(vdev);
1217         if (list == NULL) {
1218                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1219                 return -1;
1220         }
1221
1222         *queue_num = list->internal->max_queues;
1223
1224         return 0;
1225 }
1226
1227 static int
1228 ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1229 {
1230         struct internal_list *list;
1231
1232         list = find_internal_resource_by_vdev(vdev);
1233         if (list == NULL) {
1234                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1235                 return -1;
1236         }
1237
1238         *features = list->internal->features;
1239
1240         return 0;
1241 }
1242
1243 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1244                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1245                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1246                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1247                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1248                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
1249                  1ULL << VHOST_USER_PROTOCOL_F_STATUS)
1250
1251 #define VDPA_BLK_PROTOCOL_FEATURES \
1252                 (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
1253
1254 static int
1255 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1256 {
1257         RTE_SET_USED(vdev);
1258
1259         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1260         return 0;
1261 }
1262
1263 static int
1264 ifcvf_set_vring_state(int vid, int vring, int state)
1265 {
1266         struct rte_vdpa_device *vdev;
1267         struct internal_list *list;
1268         struct ifcvf_internal *internal;
1269         struct ifcvf_hw *hw;
1270         struct ifcvf_pci_common_cfg *cfg;
1271         int ret = 0;
1272
1273         vdev = rte_vhost_get_vdpa_device(vid);
1274         list = find_internal_resource_by_vdev(vdev);
1275         if (list == NULL) {
1276                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1277                 return -1;
1278         }
1279
1280         internal = list->internal;
1281         if (vring < 0 || vring >= internal->max_queues * 2) {
1282                 DRV_LOG(ERR, "Vring index %d not correct", vring);
1283                 return -1;
1284         }
1285
1286         hw = &internal->hw;
1287         if (!internal->configured)
1288                 goto exit;
1289
1290         cfg = hw->common_cfg;
1291         IFCVF_WRITE_REG16(vring, &cfg->queue_select);
1292         IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);
1293
1294         if (!state && hw->vring[vring].enable) {
1295                 ret = vdpa_disable_vfio_intr(internal);
1296                 if (ret)
1297                         return ret;
1298         }
1299
1300         if (state && !hw->vring[vring].enable) {
1301                 ret = vdpa_enable_vfio_intr(internal, false);
1302                 if (ret)
1303                         return ret;
1304         }
1305
1306 exit:
1307         hw->vring[vring].enable = !!state;
1308         return 0;
1309 }
1310
1311 static int
1312 ifcvf_get_device_type(struct rte_vdpa_device *vdev,
1313         uint32_t *type)
1314 {
1315         struct ifcvf_internal *internal;
1316         struct internal_list *list;
1317
1318         list = find_internal_resource_by_vdev(vdev);
1319         if (list == NULL) {
1320                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1321                 return -1;
1322         }
1323
1324         internal = list->internal;
1325
1326         if (internal->hw.device_type == IFCVF_BLK)
1327                 *type = RTE_VHOST_VDPA_DEVICE_TYPE_BLK;
1328         else
1329                 *type = RTE_VHOST_VDPA_DEVICE_TYPE_NET;
1330
1331         return 0;
1332 }
1333
1334 static struct rte_vdpa_dev_ops ifcvf_net_ops = {
1335         .get_queue_num = ifcvf_get_queue_num,
1336         .get_features = ifcvf_get_vdpa_features,
1337         .get_protocol_features = ifcvf_get_protocol_features,
1338         .dev_conf = ifcvf_dev_config,
1339         .dev_close = ifcvf_dev_close,
1340         .set_vring_state = ifcvf_set_vring_state,
1341         .set_features = ifcvf_set_features,
1342         .migration_done = NULL,
1343         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1344         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1345         .get_notify_area = ifcvf_get_notify_area,
1346         .get_dev_type = ifcvf_get_device_type,
1347 };
1348
1349 static inline int
1350 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1351 {
1352         uint16_t *n = extra_args;
1353
1354         if (value == NULL || extra_args == NULL)
1355                 return -EINVAL;
1356
1357         *n = (uint16_t)strtoul(value, NULL, 0);
1358         if (*n == USHRT_MAX && errno == ERANGE)
1359                 return -1;
1360
1361         return 0;
1362 }
1363
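/* Derive the virtio device ID from the PCI device ID, following the
 * virtio spec rules for transitional (< 0x1040) and modern devices.
 */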
1364 static int16_t
1365 ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev)
1366 {
1367         uint16_t pci_device_id = pci_dev->id.device_id;
1368         uint16_t device_id;
1369
1370         if (pci_device_id < 0x1000 || pci_device_id > 0x107f) {
1371                 DRV_LOG(ERR, "Probed device is not a virtio device.");
1372                 return -1;
1373         }
1374
1375         if (pci_device_id < 0x1040) {
1376                 /* Transitional devices: use the PCI subsystem device id as
1377                  * virtio device id, same as legacy driver always did.
1378                  */
1379                 device_id = pci_dev->id.subsystem_device_id;
1380         } else {
1381                 /* Modern devices: simply use PCI device id,
1382                  * but start from 0x1040.
1383                  */
1384                 device_id = pci_device_id - 0x1040;
1385         }
1386
1387         return device_id;
1388 }
1389
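/* Copy the device's virtio-blk config space to the caller and log it
 * at DEBUG level; the 64-bit capacity register is read byte by byte.
 */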
1390 static int
1391 ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t size)
1392 {
1393         struct virtio_blk_config *dev_cfg;
1394         struct ifcvf_internal *internal;
1395         struct rte_vdpa_device *vdev;
1396         struct internal_list *list;
1397         uint32_t i;
1398         uint64_t capacity = 0;
1399         uint8_t *byte;
1400
1401         if (size != sizeof(struct virtio_blk_config)) {
1402                 DRV_LOG(ERR, "Invalid len: %u, required: %u",
1403                         size, (uint32_t)sizeof(struct virtio_blk_config));
1404                 return -1;
1405         }
1406
1407         vdev = rte_vhost_get_vdpa_device(vid);
1408         if (vdev == NULL) {
1409                 DRV_LOG(ERR, "Invalid vDPA device vid: %d", vid);
1410                 return -1;
1411         }
1412
1413         list = find_internal_resource_by_vdev(vdev);
1414         if (list == NULL) {
1415                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1416                 return -1;
1417         }
1418
1419         internal = list->internal;
1420
1421         for (i = 0; i < sizeof(struct virtio_blk_config); i++)
1422                 config[i] = *((u8 *)internal->hw.blk_cfg + i);
1423
1424         dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;
1425
1426         /* cannot read 64-bit register in one attempt, so read byte by byte. */
1427         for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1428                 byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1429                 capacity |= (uint64_t)*byte << (i * 8);
1430         }
1431         /* The capacity is a number of 512-byte sectors.
1432          * Right shift by 1 bit to get KiB,
1433          * by another 10 bits to get MiB,
1434          * and by 10 more bits to get GiB.
1435          * So to show the capacity in GiB, right shift by 21 bits in total.
1436          */
1437         DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);
1438
1439         DRV_LOG(DEBUG, "size_max  : 0x%08x", dev_cfg->size_max);
1440         DRV_LOG(DEBUG, "seg_max   : 0x%08x", dev_cfg->seg_max);
1441         DRV_LOG(DEBUG, "blk_size  : 0x%08x", dev_cfg->blk_size);
1442         DRV_LOG(DEBUG, "geometry");
1443         DRV_LOG(DEBUG, "      cylinders: %u", dev_cfg->geometry.cylinders);
1444         DRV_LOG(DEBUG, "      heads    : %u", dev_cfg->geometry.heads);
1445         DRV_LOG(DEBUG, "      sectors  : %u", dev_cfg->geometry.sectors);
1446         DRV_LOG(DEBUG, "num_queues: 0x%08x", dev_cfg->num_queues);
1447
1448         DRV_LOG(DEBUG, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]",
1449                 config[0], config[1], config[2], config[3], config[4],
1450                 config[5], config[6], config[7]);
1451         return 0;
1452 }
1453
1454 static int
1455 ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
1456         uint64_t *features)
1457 {
1458         RTE_SET_USED(vdev);
1459
1460         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1461         *features |= VDPA_BLK_PROTOCOL_FEATURES;
1462         return 0;
1463 }
1464
1465 static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
1466         .get_queue_num = ifcvf_get_queue_num,
1467         .get_features = ifcvf_get_vdpa_features,
1468         .set_features = ifcvf_set_features,
1469         .get_protocol_features = ifcvf_blk_get_protocol_features,
1470         .dev_conf = ifcvf_dev_config,
1471         .dev_close = ifcvf_dev_close,
1472         .set_vring_state = ifcvf_set_vring_state,
1473         .migration_done = NULL,
1474         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1475         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1476         .get_notify_area = ifcvf_get_notify_area,
1477         .get_config = ifcvf_blk_get_config,
1478         .get_dev_type = ifcvf_get_device_type,
1479 };
1480
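/* Per-device-type features and ops, indexed by IFCVF_NET / IFCVF_BLK. */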
1481 struct rte_vdpa_dev_info dev_info[] = {
1482         {
1483                 .features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1484                             (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1485                             (1ULL << VIRTIO_NET_F_STATUS) |
1486                             (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1487                             (1ULL << VHOST_F_LOG_ALL),
1488                 .ops = &ifcvf_net_ops,
1489         },
1490         {
1491                 .features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1492                             (1ULL << VHOST_F_LOG_ALL),
1493                 .ops = &ifcvf_blk_ops,
1494         },
1495 };
1496
1497 static int
1498 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1499                 struct rte_pci_device *pci_dev)
1500 {
1501         uint64_t features;
1502         struct ifcvf_internal *internal = NULL;
1503         struct internal_list *list = NULL;
1504         int vdpa_mode = 0;
1505         int sw_fallback_lm = 0;
1506         struct rte_kvargs *kvlist = NULL;
1507         int ret = 0;
1508         int16_t device_id;
1509         uint64_t capacity = 0;
1510         uint8_t *byte;
1511         uint32_t i;
1512
1513         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1514                 return 0;
1515
1516         if (!pci_dev->device.devargs)
1517                 return 1;
1518
1519         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1520                         ifcvf_valid_arguments);
1521         if (kvlist == NULL)
1522                 return 1;
1523
1524         /* probe only when vdpa mode is specified */
1525         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1526                 rte_kvargs_free(kvlist);
1527                 return 1;
1528         }
1529
1530         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1531                         &vdpa_mode);
1532         if (ret < 0 || vdpa_mode == 0) {
1533                 rte_kvargs_free(kvlist);
1534                 return 1;
1535         }
1536
1537         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1538         if (list == NULL)
1539                 goto error;
1540
1541         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1542         if (internal == NULL)
1543                 goto error;
1544
1545         internal->pdev = pci_dev;
1546         rte_spinlock_init(&internal->lock);
1547
1548         if (ifcvf_vfio_setup(internal) < 0) {
1549                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1550                 goto error;
1551         }
1552
1553         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1554                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1555                 goto error;
1556         }
1557
1558         internal->configured = 0;
1559         internal->max_queues = IFCVF_MAX_QUEUES;
1560         features = ifcvf_get_features(&internal->hw);
1561
1562         device_id = ifcvf_pci_get_device_type(pci_dev);
1563         if (device_id < 0) {
1564                 DRV_LOG(ERR, "failed to get device %s type", pci_dev->name);
1565                 goto error;
1566         }
1567
1568         if (device_id == VIRTIO_ID_NET) {
1569                 internal->hw.device_type = IFCVF_NET;
1570                 internal->features = features &
1571                                         ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1572                 internal->features |= dev_info[IFCVF_NET].features;
1573         } else if (device_id == VIRTIO_ID_BLOCK) {
1574                 internal->hw.device_type = IFCVF_BLK;
1575                 internal->features = features &
1576                                         ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1577                 internal->features |= dev_info[IFCVF_BLK].features;
1578
1579                 /* cannot read 64-bit register in one attempt,
1580                  * so read byte by byte.
1581                  */
1582                 for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1583                         byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1584                         capacity |= (uint64_t)*byte << (i * 8);
1585                 }
1586                 /* The capacity is a number of 512-byte sectors.
1587                  * Right shift by 1 bit to get KiB,
1588                  * by another 10 bits to get MiB,
1589                  * and by 10 more bits to get GiB.
1590                  * So to show the capacity in GiB, right shift by 21 bits in total.
1591                  */
1592                 DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);
1593
1594                 DRV_LOG(DEBUG, "size_max  : 0x%08x",
1595                         internal->hw.blk_cfg->size_max);
1596                 DRV_LOG(DEBUG, "seg_max   : 0x%08x",
1597                         internal->hw.blk_cfg->seg_max);
1598                 DRV_LOG(DEBUG, "blk_size  : 0x%08x",
1599                         internal->hw.blk_cfg->blk_size);
1600                 DRV_LOG(DEBUG, "geometry");
1601                 DRV_LOG(DEBUG, "    cylinders: %u",
1602                         internal->hw.blk_cfg->geometry.cylinders);
1603                 DRV_LOG(DEBUG, "    heads    : %u",
1604                         internal->hw.blk_cfg->geometry.heads);
1605                 DRV_LOG(DEBUG, "    sectors  : %u",
1606                         internal->hw.blk_cfg->geometry.sectors);
1607                 DRV_LOG(DEBUG, "num_queues: 0x%08x",
1608                         internal->hw.blk_cfg->num_queues);
1609         }
1610
1611         list->internal = internal;
1612
1613         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1614                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1615                                 &open_int, &sw_fallback_lm);
1616                 if (ret < 0)
1617                         goto error;
1618         }
1619         internal->sw_lm = sw_fallback_lm;
1620
1621         internal->vdev = rte_vdpa_register_device(&pci_dev->device,
1622                                 dev_info[internal->hw.device_type].ops);
1623         if (internal->vdev == NULL) {
1624                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1625                 goto error;
1626         }
1627
1628         pthread_mutex_lock(&internal_list_lock);
1629         TAILQ_INSERT_TAIL(&internal_list, list, next);
1630         pthread_mutex_unlock(&internal_list_lock);
1631
1632         rte_atomic32_set(&internal->started, 1);
1633         update_datapath(internal);
1634
1635         rte_kvargs_free(kvlist);
1636         return 0;
1637
1638 error:
1639         rte_kvargs_free(kvlist);
1640         rte_free(list);
1641         rte_free(internal);
1642         return -1;
1643 }
1644
1645 static int
1646 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1647 {
1648         struct ifcvf_internal *internal;
1649         struct internal_list *list;
1650
1651         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1652                 return 0;
1653
1654         list = find_internal_resource_by_dev(pci_dev);
1655         if (list == NULL) {
1656                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1657                 return -1;
1658         }
1659
1660         internal = list->internal;
1661         rte_atomic32_set(&internal->started, 0);
1662         update_datapath(internal);
1663
1664         rte_pci_unmap_device(internal->pdev);
1665         rte_vfio_container_destroy(internal->vfio_container_fd);
1666         rte_vdpa_unregister_device(internal->vdev);
1667
1668         pthread_mutex_lock(&internal_list_lock);
1669         TAILQ_REMOVE(&internal_list, list, next);
1670         pthread_mutex_unlock(&internal_list_lock);
1671
1672         rte_free(list);
1673         rte_free(internal);
1674
1675         return 0;
1676 }
1677
1678 /*
1679  * IFCVF has the same vendor ID and device ID as virtio net PCI
1680  * device, with its specific subsystem vendor ID and device ID.
1681  */
1682 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1683         { .class_id = RTE_CLASS_ANY_ID,
1684           .vendor_id = IFCVF_VENDOR_ID,
1685           .device_id = IFCVF_NET_DEVICE_ID,
1686           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1687           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1688         },
1689
1690         { .class_id = RTE_CLASS_ANY_ID,
1691           .vendor_id = IFCVF_VENDOR_ID,
1692           .device_id = IFCVF_BLK_TRANSITIONAL_DEVICE_ID,
1693           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1694           .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
1695         },
1696
1697         { .class_id = RTE_CLASS_ANY_ID,
1698           .vendor_id = IFCVF_VENDOR_ID,
1699           .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
1700           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1701           .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
1702         },
1703
1704         { .vendor_id = 0, /* sentinel */
1705         },
1706 };
1707
1708 static struct rte_pci_driver rte_ifcvf_vdpa = {
1709         .id_table = pci_id_ifcvf_map,
1710         .drv_flags = 0,
1711         .probe = ifcvf_pci_probe,
1712         .remove = ifcvf_pci_remove,
1713 };
1714
1715 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1716 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1717 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");