vdpa/ifc: fix build with GCC 12
[dpdk.git] / drivers / vdpa / ifc / ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
13
14 #include <rte_eal_paging.h>
15 #include <rte_malloc.h>
16 #include <rte_memory.h>
17 #include <rte_bus_pci.h>
18 #include <rte_vhost.h>
19 #include <rte_vdpa.h>
20 #include <vdpa_driver.h>
21 #include <rte_vfio.h>
22 #include <rte_spinlock.h>
23 #include <rte_log.h>
24 #include <rte_kvargs.h>
25 #include <rte_devargs.h>
26
27 #include "base/ifcvf.h"
28
29 RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
30 #define DRV_LOG(level, fmt, args...) \
31         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
32                 "IFCVF %s(): " fmt "\n", __func__, ##args)
33
34 #define IFCVF_USED_RING_LEN(size) \
35         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
36
37 #define IFCVF_VDPA_MODE         "vdpa"
38 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
39
40 #define THREAD_NAME_LEN 16
41
42 static const char * const ifcvf_valid_arguments[] = {
43         IFCVF_VDPA_MODE,
44         IFCVF_SW_FALLBACK_LM,
45         NULL
46 };
47
48 struct ifcvf_internal {
49         struct rte_pci_device *pdev;
50         struct ifcvf_hw hw;
51         int configured;
52         int vfio_container_fd;
53         int vfio_group_fd;
54         int vfio_dev_fd;
55         pthread_t tid;  /* thread for notify relay */
56         pthread_t intr_tid; /* thread for config space change interrupt relay */
57         int epfd;
58         int csc_epfd;
59         int vid;
60         struct rte_vdpa_device *vdev;
61         uint16_t max_queues;
62         uint64_t features;
63         rte_atomic32_t started;
64         rte_atomic32_t dev_attached;
65         rte_atomic32_t running;
66         rte_spinlock_t lock;
67         bool sw_lm;
68         bool sw_fallback_running;
69         /* mediated vring for sw fallback */
70         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
71         /* eventfd for used ring interrupt */
72         int intr_fd[IFCVF_MAX_QUEUES * 2];
73 };
74
75 struct internal_list {
76         TAILQ_ENTRY(internal_list) next;
77         struct ifcvf_internal *internal;
78 };
79
80 /* vdpa device info includes device features and devcic operation. */
81 struct rte_vdpa_dev_info {
82         uint64_t features;
83         struct rte_vdpa_dev_ops *ops;
84 };
85
86 TAILQ_HEAD(internal_list_head, internal_list);
87 static struct internal_list_head internal_list =
88         TAILQ_HEAD_INITIALIZER(internal_list);
89
90 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
91
92 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
93
94 static struct internal_list *
95 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
96 {
97         int found = 0;
98         struct internal_list *list;
99
100         pthread_mutex_lock(&internal_list_lock);
101
102         TAILQ_FOREACH(list, &internal_list, next) {
103                 if (vdev == list->internal->vdev) {
104                         found = 1;
105                         break;
106                 }
107         }
108
109         pthread_mutex_unlock(&internal_list_lock);
110
111         if (!found)
112                 return NULL;
113
114         return list;
115 }
116
117 static struct internal_list *
118 find_internal_resource_by_dev(struct rte_pci_device *pdev)
119 {
120         int found = 0;
121         struct internal_list *list;
122
123         pthread_mutex_lock(&internal_list_lock);
124
125         TAILQ_FOREACH(list, &internal_list, next) {
126                 if (!rte_pci_addr_cmp(&pdev->addr,
127                                         &list->internal->pdev->addr)) {
128                         found = 1;
129                         break;
130                 }
131         }
132
133         pthread_mutex_unlock(&internal_list_lock);
134
135         if (!found)
136                 return NULL;
137
138         return list;
139 }
140
141 static int
142 ifcvf_vfio_setup(struct ifcvf_internal *internal)
143 {
144         struct rte_pci_device *dev = internal->pdev;
145         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
146         int iommu_group_num;
147         int i, ret;
148
149         internal->vfio_dev_fd = -1;
150         internal->vfio_group_fd = -1;
151         internal->vfio_container_fd = -1;
152
153         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
154         ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
155                         &iommu_group_num);
156         if (ret <= 0) {
157                 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
158                 return -1;
159         }
160
161         internal->vfio_container_fd = rte_vfio_container_create();
162         if (internal->vfio_container_fd < 0)
163                 return -1;
164
165         internal->vfio_group_fd = rte_vfio_container_group_bind(
166                         internal->vfio_container_fd, iommu_group_num);
167         if (internal->vfio_group_fd < 0)
168                 goto err;
169
170         if (rte_pci_map_device(dev))
171                 goto err;
172
173         internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
174
175         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
176                         i++) {
177                 internal->hw.mem_resource[i].addr =
178                         internal->pdev->mem_resource[i].addr;
179                 internal->hw.mem_resource[i].phys_addr =
180                         internal->pdev->mem_resource[i].phys_addr;
181                 internal->hw.mem_resource[i].len =
182                         internal->pdev->mem_resource[i].len;
183         }
184
185         return 0;
186
187 err:
188         rte_vfio_container_destroy(internal->vfio_container_fd);
189         return -1;
190 }
191
192 static int
193 ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map)
194 {
195         uint32_t i;
196         int ret;
197         struct rte_vhost_memory *mem = NULL;
198         int vfio_container_fd;
199
200         ret = rte_vhost_get_mem_table(internal->vid, &mem);
201         if (ret < 0) {
202                 DRV_LOG(ERR, "failed to get VM memory layout.");
203                 goto exit;
204         }
205
206         vfio_container_fd = internal->vfio_container_fd;
207
208         for (i = 0; i < mem->nregions; i++) {
209                 struct rte_vhost_mem_region *reg;
210
211                 reg = &mem->regions[i];
212                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
213                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
214                         do_map ? "DMA map" : "DMA unmap", i,
215                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
216
217                 if (do_map) {
218                         ret = rte_vfio_container_dma_map(vfio_container_fd,
219                                 reg->host_user_addr, reg->guest_phys_addr,
220                                 reg->size);
221                         if (ret < 0) {
222                                 DRV_LOG(ERR, "DMA map failed.");
223                                 goto exit;
224                         }
225                 } else {
226                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
227                                 reg->host_user_addr, reg->guest_phys_addr,
228                                 reg->size);
229                         if (ret < 0) {
230                                 DRV_LOG(ERR, "DMA unmap failed.");
231                                 goto exit;
232                         }
233                 }
234         }
235
236 exit:
237         free(mem);
238         return ret;
239 }
240
241 static uint64_t
242 hva_to_gpa(int vid, uint64_t hva)
243 {
244         struct rte_vhost_memory *mem = NULL;
245         struct rte_vhost_mem_region *reg;
246         uint32_t i;
247         uint64_t gpa = 0;
248
249         if (rte_vhost_get_mem_table(vid, &mem) < 0)
250                 goto exit;
251
252         for (i = 0; i < mem->nregions; i++) {
253                 reg = &mem->regions[i];
254
255                 if (hva >= reg->host_user_addr &&
256                                 hva < reg->host_user_addr + reg->size) {
257                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
258                         break;
259                 }
260         }
261
262 exit:
263         free(mem);
264         return gpa;
265 }
266
267 static int
268 vdpa_ifcvf_start(struct ifcvf_internal *internal)
269 {
270         struct ifcvf_hw *hw = &internal->hw;
271         int i, nr_vring;
272         int vid;
273         struct rte_vhost_vring vq;
274         uint64_t gpa;
275
276         vid = internal->vid;
277         nr_vring = rte_vhost_get_vring_num(vid);
278         rte_vhost_get_negotiated_features(vid, &hw->req_features);
279
280         for (i = 0; i < nr_vring; i++) {
281                 rte_vhost_get_vhost_vring(vid, i, &vq);
282                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
283                 if (gpa == 0) {
284                         DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
285                         return -1;
286                 }
287                 hw->vring[i].desc = gpa;
288
289                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
290                 if (gpa == 0) {
291                         DRV_LOG(ERR, "Fail to get GPA for available ring.");
292                         return -1;
293                 }
294                 hw->vring[i].avail = gpa;
295
296                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
297                 if (gpa == 0) {
298                         DRV_LOG(ERR, "Fail to get GPA for used ring.");
299                         return -1;
300                 }
301                 hw->vring[i].used = gpa;
302
303                 hw->vring[i].size = vq.size;
304                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
305                                 &hw->vring[i].last_used_idx);
306         }
307         hw->nr_vring = i;
308
309         return ifcvf_start_hw(&internal->hw);
310 }
311
312 static void
313 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
314 {
315         struct ifcvf_hw *hw = &internal->hw;
316         uint32_t i;
317         int vid;
318         uint64_t features = 0;
319         uint64_t log_base = 0, log_size = 0;
320         uint64_t len;
321         u32 ring_state = 0;
322
323         vid = internal->vid;
324
325         /* to make sure no packet is lost for blk device
326          * do not stop until last_avail_idx == last_used_idx
327          */
328         if (internal->hw.device_type == IFCVF_BLK) {
329                 for (i = 0; i < hw->nr_vring; i++) {
330                         do {
331                                 if (hw->lm_cfg != NULL)
332                                         ring_state = *(u32 *)(hw->lm_cfg +
333                                                 IFCVF_LM_RING_STATE_OFFSET +
334                                                 i * IFCVF_LM_CFG_SIZE);
335                                 hw->vring[i].last_avail_idx =
336                                         (u16)(ring_state & IFCVF_16_BIT_MASK);
337                                 hw->vring[i].last_used_idx =
338                                         (u16)(ring_state >> 16);
339                                 if (hw->vring[i].last_avail_idx !=
340                                         hw->vring[i].last_used_idx) {
341                                         ifcvf_notify_queue(hw, i);
342                                         usleep(10);
343                                 }
344                         } while (hw->vring[i].last_avail_idx !=
345                                 hw->vring[i].last_used_idx);
346                 }
347         }
348
349         ifcvf_stop_hw(hw);
350
351         for (i = 0; i < hw->nr_vring; i++)
352                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
353                                 hw->vring[i].last_used_idx);
354
355         if (internal->sw_lm)
356                 return;
357
358         rte_vhost_get_negotiated_features(vid, &features);
359         if (RTE_VHOST_NEED_LOG(features)) {
360                 ifcvf_disable_logging(hw);
361                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
362                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
363                                 log_base, IFCVF_LOG_BASE, log_size);
364                 /*
365                  * IFCVF marks dirty memory pages for only packet buffer,
366                  * SW helps to mark the used ring as dirty after device stops.
367                  */
368                 for (i = 0; i < hw->nr_vring; i++) {
369                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
370                         rte_vhost_log_used_vring(vid, i, 0, len);
371                 }
372         }
373 }
374
375 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
376                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
377 static int
378 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
379 {
380         int ret;
381         uint32_t i, nr_vring;
382         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
383         struct vfio_irq_set *irq_set;
384         int *fd_ptr;
385         struct rte_vhost_vring vring;
386         int fd;
387
388         vring.callfd = -1;
389
390         nr_vring = rte_vhost_get_vring_num(internal->vid);
391         if (nr_vring > IFCVF_MAX_QUEUES * 2)
392                 return -1;
393
394         irq_set = (struct vfio_irq_set *)irq_set_buf;
395         irq_set->argsz = sizeof(irq_set_buf);
396         irq_set->count = nr_vring + 1;
397         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
398                          VFIO_IRQ_SET_ACTION_TRIGGER;
399         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
400         irq_set->start = 0;
401         fd_ptr = (int *)&irq_set->data;
402         /* The first interrupt is for the configure space change notification */
403         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
404                 rte_intr_fd_get(internal->pdev->intr_handle);
405
406         for (i = 0; i < nr_vring; i++)
407                 internal->intr_fd[i] = -1;
408
409         for (i = 0; i < nr_vring; i++) {
410                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
411                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
412                 if (m_rx == true &&
413                         ((i & 1) == 0 || internal->hw.device_type == IFCVF_BLK)) {
414                         /* For the net we only need to relay rx queue,
415                          * which will change the mem of VM.
416                          * For the blk we need to relay all the read cmd
417                          * of each queue
418                          */
419                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
420                         if (fd < 0) {
421                                 DRV_LOG(ERR, "can't setup eventfd: %s",
422                                         strerror(errno));
423                                 return -1;
424                         }
425                         internal->intr_fd[i] = fd;
426                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
427                 }
428         }
429
430         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
431         if (ret) {
432                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
433                                 strerror(errno));
434                 return -1;
435         }
436
437         return 0;
438 }
439
440 static int
441 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
442 {
443         int ret;
444         uint32_t i, nr_vring;
445         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
446         struct vfio_irq_set *irq_set;
447
448         irq_set = (struct vfio_irq_set *)irq_set_buf;
449         irq_set->argsz = sizeof(irq_set_buf);
450         irq_set->count = 0;
451         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
452         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
453         irq_set->start = 0;
454
455         nr_vring = rte_vhost_get_vring_num(internal->vid);
456         for (i = 0; i < nr_vring; i++) {
457                 if (internal->intr_fd[i] >= 0)
458                         close(internal->intr_fd[i]);
459                 internal->intr_fd[i] = -1;
460         }
461
462         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
463         if (ret) {
464                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
465                                 strerror(errno));
466                 return -1;
467         }
468
469         return 0;
470 }
471
472 static void *
473 notify_relay(void *arg)
474 {
475         int i, kickfd, epfd, nfds = 0;
476         uint32_t qid, q_num;
477         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
478         struct epoll_event ev;
479         uint64_t buf;
480         int nbytes;
481         struct rte_vhost_vring vring;
482         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
483         struct ifcvf_hw *hw = &internal->hw;
484
485         q_num = rte_vhost_get_vring_num(internal->vid);
486
487         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
488         if (epfd < 0) {
489                 DRV_LOG(ERR, "failed to create epoll instance.");
490                 return NULL;
491         }
492         internal->epfd = epfd;
493
494         vring.kickfd = -1;
495         for (qid = 0; qid < q_num; qid++) {
496                 ev.events = EPOLLIN | EPOLLPRI;
497                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
498                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
499                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
500                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
501                         return NULL;
502                 }
503         }
504
505         for (;;) {
506                 nfds = epoll_wait(epfd, events, q_num, -1);
507                 if (nfds < 0) {
508                         if (errno == EINTR)
509                                 continue;
510                         DRV_LOG(ERR, "epoll_wait return fail\n");
511                         return NULL;
512                 }
513
514                 for (i = 0; i < nfds; i++) {
515                         qid = events[i].data.u32;
516                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
517                         do {
518                                 nbytes = read(kickfd, &buf, 8);
519                                 if (nbytes < 0) {
520                                         if (errno == EINTR ||
521                                             errno == EWOULDBLOCK ||
522                                             errno == EAGAIN)
523                                                 continue;
524                                         DRV_LOG(INFO, "Error reading "
525                                                 "kickfd: %s",
526                                                 strerror(errno));
527                                 }
528                                 break;
529                         } while (1);
530
531                         ifcvf_notify_queue(hw, qid);
532                 }
533         }
534
535         return NULL;
536 }
537
538 static int
539 setup_notify_relay(struct ifcvf_internal *internal)
540 {
541         char name[THREAD_NAME_LEN];
542         int ret;
543
544         snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
545         ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
546                                      (void *)internal);
547         if (ret != 0) {
548                 DRV_LOG(ERR, "failed to create notify relay pthread.");
549                 return -1;
550         }
551
552         return 0;
553 }
554
555 static int
556 unset_notify_relay(struct ifcvf_internal *internal)
557 {
558         void *status;
559
560         if (internal->tid) {
561                 pthread_cancel(internal->tid);
562                 pthread_join(internal->tid, &status);
563         }
564         internal->tid = 0;
565
566         if (internal->epfd >= 0)
567                 close(internal->epfd);
568         internal->epfd = -1;
569
570         return 0;
571 }
572
573 static void
574 virtio_interrupt_handler(struct ifcvf_internal *internal)
575 {
576         int vid = internal->vid;
577         int ret;
578
579         ret = rte_vhost_slave_config_change(vid, 1);
580         if (ret)
581                 DRV_LOG(ERR, "failed to notify the guest about configuration space change.");
582 }
583
584 static void *
585 intr_relay(void *arg)
586 {
587         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
588         struct epoll_event csc_event;
589         struct epoll_event ev;
590         uint64_t buf;
591         int nbytes;
592         int csc_epfd, csc_val = 0;
593
594         csc_epfd = epoll_create(1);
595         if (csc_epfd < 0) {
596                 DRV_LOG(ERR, "failed to create epoll for config space change.");
597                 return NULL;
598         }
599
600         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
601         ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle);
602         if (epoll_ctl(csc_epfd, EPOLL_CTL_ADD,
603                 rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) {
604                 DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
605                 goto out;
606         }
607
608         internal->csc_epfd = csc_epfd;
609
610         for (;;) {
611                 csc_val = epoll_wait(csc_epfd, &csc_event, 1, -1);
612                 if (csc_val < 0) {
613                         if (errno == EINTR)
614                                 continue;
615                         DRV_LOG(ERR, "epoll_wait return fail.");
616                         goto out;
617                 } else if (csc_val == 0) {
618                         continue;
619                 } else {
620                         /* csc_val > 0 */
621                         nbytes = read(csc_event.data.fd, &buf, 8);
622                         if (nbytes < 0) {
623                                 if (errno == EINTR ||
624                                     errno == EWOULDBLOCK ||
625                                     errno == EAGAIN)
626                                         continue;
627                                 DRV_LOG(ERR, "Error reading from file descriptor %d: %s\n",
628                                         csc_event.data.fd,
629                                         strerror(errno));
630                                 goto out;
631                         } else if (nbytes == 0) {
632                                 DRV_LOG(ERR, "Read nothing from file descriptor %d\n",
633                                         csc_event.data.fd);
634                                 continue;
635                         } else {
636                                 virtio_interrupt_handler(internal);
637                         }
638                 }
639         }
640
641 out:
642         if (csc_epfd >= 0)
643                 close(csc_epfd);
644         internal->csc_epfd = -1;
645
646         return NULL;
647 }
648
649 static int
650 setup_intr_relay(struct ifcvf_internal *internal)
651 {
652         char name[THREAD_NAME_LEN];
653         int ret;
654
655         snprintf(name, sizeof(name), "ifc-intr-%d", internal->vid);
656         ret = rte_ctrl_thread_create(&internal->intr_tid, name, NULL,
657                                      intr_relay, (void *)internal);
658         if (ret) {
659                 DRV_LOG(ERR, "failed to create notify relay pthread.");
660                 return -1;
661         }
662         return 0;
663 }
664
665 static void
666 unset_intr_relay(struct ifcvf_internal *internal)
667 {
668         void *status;
669
670         if (internal->intr_tid) {
671                 pthread_cancel(internal->intr_tid);
672                 pthread_join(internal->intr_tid, &status);
673         }
674         internal->intr_tid = 0;
675
676         if (internal->csc_epfd >= 0)
677                 close(internal->csc_epfd);
678         internal->csc_epfd = -1;
679 }
680
681 static int
682 update_datapath(struct ifcvf_internal *internal)
683 {
684         int ret;
685
686         rte_spinlock_lock(&internal->lock);
687
688         if (!rte_atomic32_read(&internal->running) &&
689             (rte_atomic32_read(&internal->started) &&
690              rte_atomic32_read(&internal->dev_attached))) {
691                 ret = ifcvf_dma_map(internal, true);
692                 if (ret)
693                         goto err;
694
695                 ret = vdpa_enable_vfio_intr(internal, false);
696                 if (ret)
697                         goto err;
698
699                 ret = vdpa_ifcvf_start(internal);
700                 if (ret)
701                         goto err;
702
703                 ret = setup_notify_relay(internal);
704                 if (ret)
705                         goto err;
706
707                 ret = setup_intr_relay(internal);
708                 if (ret)
709                         goto err;
710
711                 rte_atomic32_set(&internal->running, 1);
712         } else if (rte_atomic32_read(&internal->running) &&
713                    (!rte_atomic32_read(&internal->started) ||
714                     !rte_atomic32_read(&internal->dev_attached))) {
715                 unset_intr_relay(internal);
716
717                 ret = unset_notify_relay(internal);
718                 if (ret)
719                         goto err;
720
721                 vdpa_ifcvf_stop(internal);
722
723                 ret = vdpa_disable_vfio_intr(internal);
724                 if (ret)
725                         goto err;
726
727                 ret = ifcvf_dma_map(internal, false);
728                 if (ret)
729                         goto err;
730
731                 rte_atomic32_set(&internal->running, 0);
732         }
733
734         rte_spinlock_unlock(&internal->lock);
735         return 0;
736 err:
737         rte_spinlock_unlock(&internal->lock);
738         return ret;
739 }
740
741 static int
742 m_ifcvf_start(struct ifcvf_internal *internal)
743 {
744         struct ifcvf_hw *hw = &internal->hw;
745         uint32_t i, nr_vring;
746         int vid, ret;
747         struct rte_vhost_vring vq;
748         void *vring_buf;
749         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
750         uint64_t size;
751         uint64_t gpa;
752
753         memset(&vq, 0, sizeof(vq));
754         vid = internal->vid;
755         nr_vring = rte_vhost_get_vring_num(vid);
756         rte_vhost_get_negotiated_features(vid, &hw->req_features);
757
758         for (i = 0; i < nr_vring; i++) {
759                 rte_vhost_get_vhost_vring(vid, i, &vq);
760
761                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
762                                 rte_mem_page_size());
763                 vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
764                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
765                                 rte_mem_page_size());
766
767                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
768                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
769                 if (ret < 0) {
770                         DRV_LOG(ERR, "mediated vring DMA map failed.");
771                         goto error;
772                 }
773
774                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
775                 if (gpa == 0) {
776                         DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
777                         return -1;
778                 }
779                 hw->vring[i].desc = gpa;
780
781                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
782                 if (gpa == 0) {
783                         DRV_LOG(ERR, "Fail to get GPA for available ring.");
784                         return -1;
785                 }
786                 hw->vring[i].avail = gpa;
787
788                 /* NET: Direct I/O for Tx queue, relay for Rx queue
789                  * BLK: relay every queue
790                  */
791                 if ((internal->hw.device_type == IFCVF_NET) && (i & 1)) {
792                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
793                         if (gpa == 0) {
794                                 DRV_LOG(ERR, "Fail to get GPA for used ring.");
795                                 return -1;
796                         }
797                         hw->vring[i].used = gpa;
798                 } else {
799                         hw->vring[i].used = m_vring_iova +
800                                 (char *)internal->m_vring[i].used -
801                                 (char *)internal->m_vring[i].desc;
802                 }
803
804                 hw->vring[i].size = vq.size;
805
806                 rte_vhost_get_vring_base(vid, i,
807                                 &internal->m_vring[i].avail->idx,
808                                 &internal->m_vring[i].used->idx);
809
810                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
811                                 &hw->vring[i].last_used_idx);
812
813                 m_vring_iova += size;
814         }
815         hw->nr_vring = nr_vring;
816
817         return ifcvf_start_hw(&internal->hw);
818
819 error:
820         for (i = 0; i < nr_vring; i++)
821                 rte_free(internal->m_vring[i].desc);
822
823         return -1;
824 }
825
826 static int
827 m_ifcvf_stop(struct ifcvf_internal *internal)
828 {
829         int vid;
830         uint32_t i;
831         struct rte_vhost_vring vq;
832         struct ifcvf_hw *hw = &internal->hw;
833         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
834         uint64_t size, len;
835
836         vid = internal->vid;
837         ifcvf_stop_hw(hw);
838
839         for (i = 0; i < hw->nr_vring; i++) {
840                 /* synchronize remaining new used entries if any */
841                 if (internal->hw.device_type == IFCVF_NET) {
842                         if ((i & 1) == 0)
843                                 update_used_ring(internal, i);
844                 } else if (internal->hw.device_type == IFCVF_BLK) {
845                         update_used_ring(internal, i);
846                 }
847
848                 rte_vhost_get_vhost_vring(vid, i, &vq);
849                 len = IFCVF_USED_RING_LEN(vq.size);
850                 rte_vhost_log_used_vring(vid, i, 0, len);
851
852                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
853                                 rte_mem_page_size());
854                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
855                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
856                         m_vring_iova, size);
857
858                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
859                                 hw->vring[i].last_used_idx);
860                 rte_free(internal->m_vring[i].desc);
861                 m_vring_iova += size;
862         }
863
864         return 0;
865 }
866
867 static void
868 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
869 {
870         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
871         rte_vhost_vring_call(internal->vid, qid);
872 }
873
874 static void *
875 vring_relay(void *arg)
876 {
877         int i, vid, epfd, fd, nfds;
878         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
879         struct rte_vhost_vring vring;
880         uint16_t qid, q_num;
881         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
882         struct epoll_event ev;
883         int nbytes;
884         uint64_t buf;
885
886         vid = internal->vid;
887         q_num = rte_vhost_get_vring_num(vid);
888
889         /* add notify fd and interrupt fd to epoll */
890         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
891         if (epfd < 0) {
892                 DRV_LOG(ERR, "failed to create epoll instance.");
893                 return NULL;
894         }
895         internal->epfd = epfd;
896
897         vring.kickfd = -1;
898         for (qid = 0; qid < q_num; qid++) {
899                 ev.events = EPOLLIN | EPOLLPRI;
900                 rte_vhost_get_vhost_vring(vid, qid, &vring);
901                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
902                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
903                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
904                         return NULL;
905                 }
906         }
907
908         for (qid = 0; qid < q_num; qid += 1) {
909                 if ((internal->hw.device_type == IFCVF_NET) && (qid & 1))
910                         continue;
911                 ev.events = EPOLLIN | EPOLLPRI;
912                 /* leave a flag to mark it's for interrupt */
913                 ev.data.u64 = 1 | qid << 1 |
914                         (uint64_t)internal->intr_fd[qid] << 32;
915                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
916                                 < 0) {
917                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
918                         return NULL;
919                 }
920                 update_used_ring(internal, qid);
921         }
922
923         /* start relay with a first kick */
924         for (qid = 0; qid < q_num; qid++)
925                 ifcvf_notify_queue(&internal->hw, qid);
926
927         /* listen to the events and react accordingly */
928         for (;;) {
929                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
930                 if (nfds < 0) {
931                         if (errno == EINTR)
932                                 continue;
933                         DRV_LOG(ERR, "epoll_wait return fail.");
934                         return NULL;
935                 }
936
937                 for (i = 0; i < nfds; i++) {
938                         fd = (uint32_t)(events[i].data.u64 >> 32);
939                         do {
940                                 nbytes = read(fd, &buf, 8);
941                                 if (nbytes < 0) {
942                                         if (errno == EINTR ||
943                                             errno == EWOULDBLOCK ||
944                                             errno == EAGAIN)
945                                                 continue;
946                                         DRV_LOG(INFO, "Error reading "
947                                                 "kickfd: %s",
948                                                 strerror(errno));
949                                 }
950                                 break;
951                         } while (1);
952
953                         qid = events[i].data.u32 >> 1;
954
955                         if (events[i].data.u32 & 1)
956                                 update_used_ring(internal, qid);
957                         else
958                                 ifcvf_notify_queue(&internal->hw, qid);
959                 }
960         }
961
962         return NULL;
963 }
964
965 static int
966 setup_vring_relay(struct ifcvf_internal *internal)
967 {
968         char name[THREAD_NAME_LEN];
969         int ret;
970
971         snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
972         ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
973                                      (void *)internal);
974         if (ret != 0) {
975                 DRV_LOG(ERR, "failed to create ring relay pthread.");
976                 return -1;
977         }
978
979         return 0;
980 }
981
982 static int
983 unset_vring_relay(struct ifcvf_internal *internal)
984 {
985         void *status;
986
987         if (internal->tid) {
988                 pthread_cancel(internal->tid);
989                 pthread_join(internal->tid, &status);
990         }
991         internal->tid = 0;
992
993         if (internal->epfd >= 0)
994                 close(internal->epfd);
995         internal->epfd = -1;
996
997         return 0;
998 }
999
1000 static int
1001 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
1002 {
1003         int ret;
1004         int vid = internal->vid;
1005
1006         /* stop the direct IO data path */
1007         unset_notify_relay(internal);
1008         vdpa_ifcvf_stop(internal);
1009
1010         unset_intr_relay(internal);
1011
1012         vdpa_disable_vfio_intr(internal);
1013
1014         ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
1015         if (ret && ret != -ENOTSUP)
1016                 goto error;
1017
1018         /* set up interrupt for interrupt relay */
1019         ret = vdpa_enable_vfio_intr(internal, true);
1020         if (ret)
1021                 goto unmap;
1022
1023         /* config the VF */
1024         ret = m_ifcvf_start(internal);
1025         if (ret)
1026                 goto unset_intr;
1027
1028         /* set up vring relay thread */
1029         ret = setup_vring_relay(internal);
1030         if (ret)
1031                 goto stop_vf;
1032
1033         rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
1034
1035         internal->sw_fallback_running = true;
1036
1037         return 0;
1038
1039 stop_vf:
1040         m_ifcvf_stop(internal);
1041 unset_intr:
1042         vdpa_disable_vfio_intr(internal);
1043 unmap:
1044         ifcvf_dma_map(internal, false);
1045 error:
1046         return -1;
1047 }
1048
1049 static int
1050 ifcvf_dev_config(int vid)
1051 {
1052         struct rte_vdpa_device *vdev;
1053         struct internal_list *list;
1054         struct ifcvf_internal *internal;
1055
1056         vdev = rte_vhost_get_vdpa_device(vid);
1057         list = find_internal_resource_by_vdev(vdev);
1058         if (list == NULL) {
1059                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1060                 return -1;
1061         }
1062
1063         internal = list->internal;
1064         internal->vid = vid;
1065         rte_atomic32_set(&internal->dev_attached, 1);
1066         update_datapath(internal);
1067
1068         if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
1069                 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
1070                                 vdev->device->name);
1071
1072         internal->configured = 1;
1073         return 0;
1074 }
1075
1076 static int
1077 ifcvf_dev_close(int vid)
1078 {
1079         struct rte_vdpa_device *vdev;
1080         struct internal_list *list;
1081         struct ifcvf_internal *internal;
1082
1083         vdev = rte_vhost_get_vdpa_device(vid);
1084         list = find_internal_resource_by_vdev(vdev);
1085         if (list == NULL) {
1086                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1087                 return -1;
1088         }
1089
1090         internal = list->internal;
1091
1092         if (internal->sw_fallback_running) {
1093                 /* unset ring relay */
1094                 unset_vring_relay(internal);
1095
1096                 /* reset VF */
1097                 m_ifcvf_stop(internal);
1098
1099                 /* remove interrupt setting */
1100                 vdpa_disable_vfio_intr(internal);
1101
1102                 /* unset DMA map for guest memory */
1103                 ifcvf_dma_map(internal, false);
1104
1105                 internal->sw_fallback_running = false;
1106         } else {
1107                 rte_atomic32_set(&internal->dev_attached, 0);
1108                 update_datapath(internal);
1109         }
1110
1111         internal->configured = 0;
1112         return 0;
1113 }
1114
1115 static int
1116 ifcvf_set_features(int vid)
1117 {
1118         uint64_t features = 0;
1119         struct rte_vdpa_device *vdev;
1120         struct internal_list *list;
1121         struct ifcvf_internal *internal;
1122         uint64_t log_base = 0, log_size = 0;
1123
1124         vdev = rte_vhost_get_vdpa_device(vid);
1125         list = find_internal_resource_by_vdev(vdev);
1126         if (list == NULL) {
1127                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1128                 return -1;
1129         }
1130
1131         internal = list->internal;
1132         rte_vhost_get_negotiated_features(vid, &features);
1133
1134         if (!RTE_VHOST_NEED_LOG(features))
1135                 return 0;
1136
1137         if (internal->sw_lm) {
1138                 ifcvf_sw_fallback_switchover(internal);
1139         } else {
1140                 rte_vhost_get_log_base(vid, &log_base, &log_size);
1141                 rte_vfio_container_dma_map(internal->vfio_container_fd,
1142                                 log_base, IFCVF_LOG_BASE, log_size);
1143                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
1144         }
1145
1146         return 0;
1147 }
1148
1149 static int
1150 ifcvf_get_vfio_group_fd(int vid)
1151 {
1152         struct rte_vdpa_device *vdev;
1153         struct internal_list *list;
1154
1155         vdev = rte_vhost_get_vdpa_device(vid);
1156         list = find_internal_resource_by_vdev(vdev);
1157         if (list == NULL) {
1158                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1159                 return -1;
1160         }
1161
1162         return list->internal->vfio_group_fd;
1163 }
1164
1165 static int
1166 ifcvf_get_vfio_device_fd(int vid)
1167 {
1168         struct rte_vdpa_device *vdev;
1169         struct internal_list *list;
1170
1171         vdev = rte_vhost_get_vdpa_device(vid);
1172         list = find_internal_resource_by_vdev(vdev);
1173         if (list == NULL) {
1174                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1175                 return -1;
1176         }
1177
1178         return list->internal->vfio_dev_fd;
1179 }
1180
1181 static int
1182 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1183 {
1184         struct rte_vdpa_device *vdev;
1185         struct internal_list *list;
1186         struct ifcvf_internal *internal;
1187         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1188         int ret;
1189
1190         vdev = rte_vhost_get_vdpa_device(vid);
1191         list = find_internal_resource_by_vdev(vdev);
1192         if (list == NULL) {
1193                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1194                 return -1;
1195         }
1196
1197         internal = list->internal;
1198
1199         reg.index = ifcvf_get_notify_region(&internal->hw);
1200         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1201         if (ret) {
1202                 DRV_LOG(ERR, "Get not get device region info: %s",
1203                                 strerror(errno));
1204                 return -1;
1205         }
1206
1207         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1208         *size = 0x1000;
1209
1210         return 0;
1211 }
1212
1213 static int
1214 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1215 {
1216         struct internal_list *list;
1217
1218         list = find_internal_resource_by_vdev(vdev);
1219         if (list == NULL) {
1220                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1221                 return -1;
1222         }
1223
1224         *queue_num = list->internal->max_queues;
1225
1226         return 0;
1227 }
1228
1229 static int
1230 ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1231 {
1232         struct internal_list *list;
1233
1234         list = find_internal_resource_by_vdev(vdev);
1235         if (list == NULL) {
1236                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1237                 return -1;
1238         }
1239
1240         *features = list->internal->features;
1241
1242         return 0;
1243 }
1244
1245 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1246                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1247                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1248                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1249                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1250                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
1251                  1ULL << VHOST_USER_PROTOCOL_F_STATUS)
1252
1253 #define VDPA_BLK_PROTOCOL_FEATURES \
1254                 (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
1255
1256 static int
1257 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1258 {
1259         RTE_SET_USED(vdev);
1260
1261         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1262         return 0;
1263 }
1264
1265 static int
1266 ifcvf_set_vring_state(int vid, int vring, int state)
1267 {
1268         struct rte_vdpa_device *vdev;
1269         struct internal_list *list;
1270         struct ifcvf_internal *internal;
1271         struct ifcvf_hw *hw;
1272         struct ifcvf_pci_common_cfg *cfg;
1273         int ret = 0;
1274
1275         vdev = rte_vhost_get_vdpa_device(vid);
1276         list = find_internal_resource_by_vdev(vdev);
1277         if (list == NULL) {
1278                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1279                 return -1;
1280         }
1281
1282         internal = list->internal;
1283         if (vring < 0 || vring >= internal->max_queues * 2) {
1284                 DRV_LOG(ERR, "Vring index %d not correct", vring);
1285                 return -1;
1286         }
1287
1288         hw = &internal->hw;
1289         if (!internal->configured)
1290                 goto exit;
1291
1292         cfg = hw->common_cfg;
1293         IFCVF_WRITE_REG16(vring, &cfg->queue_select);
1294         IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);
1295
1296         if (!state && hw->vring[vring].enable) {
1297                 ret = vdpa_disable_vfio_intr(internal);
1298                 if (ret)
1299                         return ret;
1300         }
1301
1302         if (state && !hw->vring[vring].enable) {
1303                 ret = vdpa_enable_vfio_intr(internal, false);
1304                 if (ret)
1305                         return ret;
1306         }
1307
1308 exit:
1309         hw->vring[vring].enable = !!state;
1310         return 0;
1311 }
1312
1313 static int
1314 ifcvf_get_device_type(struct rte_vdpa_device *vdev,
1315         uint32_t *type)
1316 {
1317         struct ifcvf_internal *internal;
1318         struct internal_list *list;
1319
1320         list = find_internal_resource_by_vdev(vdev);
1321         if (list == NULL) {
1322                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1323                 return -1;
1324         }
1325
1326         internal = list->internal;
1327
1328         if (internal->hw.device_type == IFCVF_BLK)
1329                 *type = RTE_VHOST_VDPA_DEVICE_TYPE_BLK;
1330         else
1331                 *type = RTE_VHOST_VDPA_DEVICE_TYPE_NET;
1332
1333         return 0;
1334 }
1335
1336 static struct rte_vdpa_dev_ops ifcvf_net_ops = {
1337         .get_queue_num = ifcvf_get_queue_num,
1338         .get_features = ifcvf_get_vdpa_features,
1339         .get_protocol_features = ifcvf_get_protocol_features,
1340         .dev_conf = ifcvf_dev_config,
1341         .dev_close = ifcvf_dev_close,
1342         .set_vring_state = ifcvf_set_vring_state,
1343         .set_features = ifcvf_set_features,
1344         .migration_done = NULL,
1345         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1346         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1347         .get_notify_area = ifcvf_get_notify_area,
1348         .get_dev_type = ifcvf_get_device_type,
1349 };
1350
1351 static inline int
1352 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1353 {
1354         uint16_t *n = extra_args;
1355
1356         if (value == NULL || extra_args == NULL)
1357                 return -EINVAL;
1358
1359         *n = (uint16_t)strtoul(value, NULL, 0);
1360         if (*n == USHRT_MAX && errno == ERANGE)
1361                 return -1;
1362
1363         return 0;
1364 }
1365
1366 static int16_t
1367 ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev)
1368 {
1369         uint16_t pci_device_id = pci_dev->id.device_id;
1370         uint16_t device_id;
1371
1372         if (pci_device_id < 0x1000 || pci_device_id > 0x107f) {
1373                 DRV_LOG(ERR, "Probe device is not a virtio device\n");
1374                 return -1;
1375         }
1376
1377         if (pci_device_id < 0x1040) {
1378                 /* Transitional devices: use the PCI subsystem device id as
1379                  * virtio device id, same as legacy driver always did.
1380                  */
1381                 device_id = pci_dev->id.subsystem_device_id;
1382         } else {
1383                 /* Modern devices: simply use PCI device id,
1384                  * but start from 0x1040.
1385                  */
1386                 device_id = pci_device_id - 0x1040;
1387         }
1388
1389         return device_id;
1390 }
1391
1392 static int
1393 ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t size)
1394 {
1395         struct virtio_blk_config *dev_cfg;
1396         struct ifcvf_internal *internal;
1397         struct rte_vdpa_device *vdev;
1398         struct internal_list *list;
1399         uint32_t i;
1400         uint64_t capacity = 0;
1401         uint8_t *byte;
1402
1403         if (size != sizeof(struct virtio_blk_config)) {
1404                 DRV_LOG(ERR, "Invalid len: %u, required: %u",
1405                         size, (uint32_t)sizeof(struct virtio_blk_config));
1406                 return -1;
1407         }
1408
1409         vdev = rte_vhost_get_vdpa_device(vid);
1410         if (vdev == NULL) {
1411                 DRV_LOG(ERR, "Invalid vDPA device vid: %d", vid);
1412                 return -1;
1413         }
1414
1415         list = find_internal_resource_by_vdev(vdev);
1416         if (list == NULL) {
1417                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1418                 return -1;
1419         }
1420
1421         internal = list->internal;
1422
1423         for (i = 0; i < sizeof(struct virtio_blk_config); i++)
1424                 config[i] = *((u8 *)internal->hw.blk_cfg + i);
1425
1426         dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;
1427
1428         /* cannot read 64-bit register in one attempt, so read byte by byte. */
1429         for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1430                 byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1431                 capacity |= (uint64_t)*byte << (i * 8);
1432         }
1433         /* The capacity is number of sectors in 512-byte.
1434          * So right shift 1 bit  we get in K,
1435          * another right shift 10 bits we get in M,
1436          * right shift 10 more bits, we get in G.
1437          * To show capacity in G, we right shift 21 bits in total.
1438          */
1439         DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);
1440
1441         DRV_LOG(DEBUG, "size_max  : 0x%08x", dev_cfg->size_max);
1442         DRV_LOG(DEBUG, "seg_max   : 0x%08x", dev_cfg->seg_max);
1443         DRV_LOG(DEBUG, "blk_size  : 0x%08x", dev_cfg->blk_size);
1444         DRV_LOG(DEBUG, "geometry");
1445         DRV_LOG(DEBUG, "      cylinders: %u", dev_cfg->geometry.cylinders);
1446         DRV_LOG(DEBUG, "      heads    : %u", dev_cfg->geometry.heads);
1447         DRV_LOG(DEBUG, "      sectors  : %u", dev_cfg->geometry.sectors);
1448         DRV_LOG(DEBUG, "num_queues: 0x%08x", dev_cfg->num_queues);
1449
1450         DRV_LOG(DEBUG, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]\n",
1451                 config[0], config[1], config[2], config[3], config[4],
1452                 config[5], config[6], config[7]);
1453         return 0;
1454 }
1455
1456 static int
1457 ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
1458         uint64_t *features)
1459 {
1460         RTE_SET_USED(vdev);
1461
1462         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1463         *features |= VDPA_BLK_PROTOCOL_FEATURES;
1464         return 0;
1465 }
1466
1467 static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
1468         .get_queue_num = ifcvf_get_queue_num,
1469         .get_features = ifcvf_get_vdpa_features,
1470         .set_features = ifcvf_set_features,
1471         .get_protocol_features = ifcvf_blk_get_protocol_features,
1472         .dev_conf = ifcvf_dev_config,
1473         .dev_close = ifcvf_dev_close,
1474         .set_vring_state = ifcvf_set_vring_state,
1475         .migration_done = NULL,
1476         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1477         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1478         .get_notify_area = ifcvf_get_notify_area,
1479         .get_config = ifcvf_blk_get_config,
1480         .get_dev_type = ifcvf_get_device_type,
1481 };
1482
1483 struct rte_vdpa_dev_info dev_info[] = {
1484         {
1485                 .features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1486                             (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1487                             (1ULL << VIRTIO_NET_F_STATUS) |
1488                             (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1489                             (1ULL << VHOST_F_LOG_ALL),
1490                 .ops = &ifcvf_net_ops,
1491         },
1492         {
1493                 .features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1494                             (1ULL << VHOST_F_LOG_ALL),
1495                 .ops = &ifcvf_blk_ops,
1496         },
1497 };
1498
1499 static int
1500 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1501                 struct rte_pci_device *pci_dev)
1502 {
1503         uint64_t features;
1504         struct ifcvf_internal *internal = NULL;
1505         struct internal_list *list = NULL;
1506         int vdpa_mode = 0;
1507         int sw_fallback_lm = 0;
1508         struct rte_kvargs *kvlist = NULL;
1509         int ret = 0;
1510         int16_t device_id;
1511         uint64_t capacity = 0;
1512         uint8_t *byte;
1513         uint32_t i;
1514
1515         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1516                 return 0;
1517
1518         if (!pci_dev->device.devargs)
1519                 return 1;
1520
1521         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1522                         ifcvf_valid_arguments);
1523         if (kvlist == NULL)
1524                 return 1;
1525
1526         /* probe only when vdpa mode is specified */
1527         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1528                 rte_kvargs_free(kvlist);
1529                 return 1;
1530         }
1531
1532         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1533                         &vdpa_mode);
1534         if (ret < 0 || vdpa_mode == 0) {
1535                 rte_kvargs_free(kvlist);
1536                 return 1;
1537         }
1538
1539         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1540         if (list == NULL)
1541                 goto error;
1542
1543         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1544         if (internal == NULL)
1545                 goto error;
1546
1547         internal->pdev = pci_dev;
1548         rte_spinlock_init(&internal->lock);
1549
1550         if (ifcvf_vfio_setup(internal) < 0) {
1551                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1552                 goto error;
1553         }
1554
1555         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1556                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1557                 goto error;
1558         }
1559
1560         internal->configured = 0;
1561         internal->max_queues = IFCVF_MAX_QUEUES;
1562         features = ifcvf_get_features(&internal->hw);
1563
1564         device_id = ifcvf_pci_get_device_type(pci_dev);
1565         if (device_id < 0) {
1566                 DRV_LOG(ERR, "failed to get device %s type", pci_dev->name);
1567                 goto error;
1568         }
1569
1570         if (device_id == VIRTIO_ID_NET) {
1571                 internal->hw.device_type = IFCVF_NET;
1572                 internal->features = features &
1573                                         ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1574                 internal->features |= dev_info[IFCVF_NET].features;
1575         } else if (device_id == VIRTIO_ID_BLOCK) {
1576                 internal->hw.device_type = IFCVF_BLK;
1577                 internal->features = features &
1578                                         ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1579                 internal->features |= dev_info[IFCVF_BLK].features;
1580
1581                 /* cannot read 64-bit register in one attempt,
1582                  * so read byte by byte.
1583                  */
1584                 for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1585                         byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1586                         capacity |= (uint64_t)*byte << (i * 8);
1587                 }
1588                 /* The capacity is number of sectors in 512-byte.
1589                  * So right shift 1 bit  we get in K,
1590                  * another right shift 10 bits we get in M,
1591                  * right shift 10 more bits, we get in G.
1592                  * To show capacity in G, we right shift 21 bits in total.
1593                  */
1594                 DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);
1595
1596                 DRV_LOG(DEBUG, "size_max  : 0x%08x",
1597                         internal->hw.blk_cfg->size_max);
1598                 DRV_LOG(DEBUG, "seg_max   : 0x%08x",
1599                         internal->hw.blk_cfg->seg_max);
1600                 DRV_LOG(DEBUG, "blk_size  : 0x%08x",
1601                         internal->hw.blk_cfg->blk_size);
1602                 DRV_LOG(DEBUG, "geometry");
1603                 DRV_LOG(DEBUG, "    cylinders: %u",
1604                         internal->hw.blk_cfg->geometry.cylinders);
1605                 DRV_LOG(DEBUG, "    heads    : %u",
1606                         internal->hw.blk_cfg->geometry.heads);
1607                 DRV_LOG(DEBUG, "    sectors  : %u",
1608                         internal->hw.blk_cfg->geometry.sectors);
1609                 DRV_LOG(DEBUG, "num_queues: 0x%08x",
1610                         internal->hw.blk_cfg->num_queues);
1611         }
1612
1613         list->internal = internal;
1614
1615         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1616                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1617                                 &open_int, &sw_fallback_lm);
1618                 if (ret < 0)
1619                         goto error;
1620         }
1621         internal->sw_lm = sw_fallback_lm;
1622
1623         internal->vdev = rte_vdpa_register_device(&pci_dev->device,
1624                                 dev_info[internal->hw.device_type].ops);
1625         if (internal->vdev == NULL) {
1626                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1627                 goto error;
1628         }
1629
1630         pthread_mutex_lock(&internal_list_lock);
1631         TAILQ_INSERT_TAIL(&internal_list, list, next);
1632         pthread_mutex_unlock(&internal_list_lock);
1633
1634         rte_atomic32_set(&internal->started, 1);
1635         update_datapath(internal);
1636
1637         rte_kvargs_free(kvlist);
1638         return 0;
1639
1640 error:
1641         rte_kvargs_free(kvlist);
1642         rte_free(list);
1643         rte_free(internal);
1644         return -1;
1645 }
1646
1647 static int
1648 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1649 {
1650         struct ifcvf_internal *internal;
1651         struct internal_list *list;
1652
1653         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1654                 return 0;
1655
1656         list = find_internal_resource_by_dev(pci_dev);
1657         if (list == NULL) {
1658                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1659                 return -1;
1660         }
1661
1662         internal = list->internal;
1663         rte_atomic32_set(&internal->started, 0);
1664         update_datapath(internal);
1665
1666         rte_pci_unmap_device(internal->pdev);
1667         rte_vfio_container_destroy(internal->vfio_container_fd);
1668         rte_vdpa_unregister_device(internal->vdev);
1669
1670         pthread_mutex_lock(&internal_list_lock);
1671         TAILQ_REMOVE(&internal_list, list, next);
1672         pthread_mutex_unlock(&internal_list_lock);
1673
1674         rte_free(list);
1675         rte_free(internal);
1676
1677         return 0;
1678 }
1679
1680 /*
1681  * IFCVF has the same vendor ID and device ID as virtio net PCI
1682  * device, with its specific subsystem vendor ID and device ID.
1683  */
1684 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1685         { .class_id = RTE_CLASS_ANY_ID,
1686           .vendor_id = IFCVF_VENDOR_ID,
1687           .device_id = IFCVF_NET_DEVICE_ID,
1688           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1689           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1690         },
1691
1692         { .class_id = RTE_CLASS_ANY_ID,
1693           .vendor_id = IFCVF_VENDOR_ID,
1694           .device_id = IFCVF_BLK_TRANSITIONAL_DEVICE_ID,
1695           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1696           .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
1697         },
1698
1699         { .class_id = RTE_CLASS_ANY_ID,
1700           .vendor_id = IFCVF_VENDOR_ID,
1701           .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
1702           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1703           .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
1704         },
1705
1706         { .vendor_id = 0, /* sentinel */
1707         },
1708 };
1709
1710 static struct rte_pci_driver rte_ifcvf_vdpa = {
1711         .id_table = pci_id_ifcvf_map,
1712         .drv_flags = 0,
1713         .probe = ifcvf_pci_probe,
1714         .remove = ifcvf_pci_remove,
1715 };
1716
1717 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1718 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1719 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");