[dpdk.git] drivers/vdpa/ifc/ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
   #include <sys/eventfd.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
13
14 #include <rte_malloc.h>
15 #include <rte_memory.h>
16 #include <rte_bus_pci.h>
17 #include <rte_vhost.h>
18 #include <rte_vdpa.h>
19 #include <rte_vdpa_dev.h>
20 #include <rte_vfio.h>
21 #include <rte_spinlock.h>
22 #include <rte_log.h>
23 #include <rte_kvargs.h>
24 #include <rte_devargs.h>
25
26 #include "base/ifcvf.h"
27
28 #define DRV_LOG(level, fmt, args...) \
29         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
30                 "IFCVF %s(): " fmt "\n", __func__, ##args)
31
32 #ifndef PAGE_SIZE
33 #define PAGE_SIZE 4096
34 #endif
35
36 #define IFCVF_USED_RING_LEN(size) \
37         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
38
39 #define IFCVF_VDPA_MODE         "vdpa"
40 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
41
42 static const char * const ifcvf_valid_arguments[] = {
43         IFCVF_VDPA_MODE,
44         IFCVF_SW_FALLBACK_LM,
45         NULL
46 };
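   /*
    * A minimal usage sketch (the PCI address is illustrative only): bind
    * the VF to vfio-pci and pass it to the EAL with the "vdpa" devarg,
    * optionally enabling the software live-migration fallback, e.g.
    *   0000:06:00.3,vdpa=1,sw-live-migration=1
    * Without vdpa=1, the probe below leaves the device alone.
    */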
47
48 static int ifcvf_vdpa_logtype;
49
50 struct ifcvf_internal {
51         struct rte_pci_device *pdev;
52         struct ifcvf_hw hw;
53         int vfio_container_fd;
54         int vfio_group_fd;
55         int vfio_dev_fd;
56         pthread_t tid;  /* thread for notify relay */
57         int epfd;
58         int vid;
59         struct rte_vdpa_device *vdev;
60         uint16_t max_queues;
61         uint64_t features;
62         rte_atomic32_t started;
63         rte_atomic32_t dev_attached;
64         rte_atomic32_t running;
65         rte_spinlock_t lock;
66         bool sw_lm;
67         bool sw_fallback_running;
68         /* mediated vring for sw fallback */
69         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
70         /* eventfd for used ring interrupt */
71         int intr_fd[IFCVF_MAX_QUEUES * 2];
72 };
73
74 struct internal_list {
75         TAILQ_ENTRY(internal_list) next;
76         struct ifcvf_internal *internal;
77 };
78
79 TAILQ_HEAD(internal_list_head, internal_list);
80 static struct internal_list_head internal_list =
81         TAILQ_HEAD_INITIALIZER(internal_list);
82
83 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
84
85 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
86
87 static struct internal_list *
88 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
89 {
90         int found = 0;
91         struct internal_list *list;
92
93         pthread_mutex_lock(&internal_list_lock);
94
95         TAILQ_FOREACH(list, &internal_list, next) {
96                 if (vdev == list->internal->vdev) {
97                         found = 1;
98                         break;
99                 }
100         }
101
102         pthread_mutex_unlock(&internal_list_lock);
103
104         if (!found)
105                 return NULL;
106
107         return list;
108 }
109
110 static struct internal_list *
111 find_internal_resource_by_dev(struct rte_pci_device *pdev)
112 {
113         int found = 0;
114         struct internal_list *list;
115
116         pthread_mutex_lock(&internal_list_lock);
117
118         TAILQ_FOREACH(list, &internal_list, next) {
119                 if (!rte_pci_addr_cmp(&pdev->addr,
120                                         &list->internal->pdev->addr)) {
121                         found = 1;
122                         break;
123                 }
124         }
125
126         pthread_mutex_unlock(&internal_list_lock);
127
128         if (!found)
129                 return NULL;
130
131         return list;
132 }
133
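    /*
     * Bind the VF's IOMMU group to a dedicated VFIO container, map the PCI
     * device and mirror its BAR addresses and lengths into the ifcvf_hw
     * layout used by the base code.
     */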
134 static int
135 ifcvf_vfio_setup(struct ifcvf_internal *internal)
136 {
137         struct rte_pci_device *dev = internal->pdev;
138         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
139         int iommu_group_num;
140         int i, ret;
141
142         internal->vfio_dev_fd = -1;
143         internal->vfio_group_fd = -1;
144         internal->vfio_container_fd = -1;
145
146         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
147         ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
148                         &iommu_group_num);
149         if (ret <= 0) {
150                 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
151                 return -1;
152         }
153
154         internal->vfio_container_fd = rte_vfio_container_create();
155         if (internal->vfio_container_fd < 0)
156                 return -1;
157
158         internal->vfio_group_fd = rte_vfio_container_group_bind(
159                         internal->vfio_container_fd, iommu_group_num);
160         if (internal->vfio_group_fd < 0)
161                 goto err;
162
163         if (rte_pci_map_device(dev))
164                 goto err;
165
166         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
167
168         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
169                         i++) {
170                 internal->hw.mem_resource[i].addr =
171                         internal->pdev->mem_resource[i].addr;
172                 internal->hw.mem_resource[i].phys_addr =
173                         internal->pdev->mem_resource[i].phys_addr;
174                 internal->hw.mem_resource[i].len =
175                         internal->pdev->mem_resource[i].len;
176         }
177
178         return 0;
179
180 err:
181         rte_vfio_container_destroy(internal->vfio_container_fd);
182         return -1;
183 }
184
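    /*
     * Map (do_map != 0) or unmap every guest memory region in the VFIO
     * container, using guest physical addresses as IOVA so the VF can DMA
     * with the addresses it finds in the virtqueues.
     */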
185 static int
186 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
187 {
188         uint32_t i;
189         int ret;
190         struct rte_vhost_memory *mem = NULL;
191         int vfio_container_fd;
192
193         ret = rte_vhost_get_mem_table(internal->vid, &mem);
194         if (ret < 0) {
195                 DRV_LOG(ERR, "failed to get VM memory layout.");
196                 goto exit;
197         }
198
199         vfio_container_fd = internal->vfio_container_fd;
200
201         for (i = 0; i < mem->nregions; i++) {
202                 struct rte_vhost_mem_region *reg;
203
204                 reg = &mem->regions[i];
205                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
206                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
207                         do_map ? "DMA map" : "DMA unmap", i,
208                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
209
210                 if (do_map) {
211                         ret = rte_vfio_container_dma_map(vfio_container_fd,
212                                 reg->host_user_addr, reg->guest_phys_addr,
213                                 reg->size);
214                         if (ret < 0) {
215                                 DRV_LOG(ERR, "DMA map failed.");
216                                 goto exit;
217                         }
218                 } else {
219                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
220                                 reg->host_user_addr, reg->guest_phys_addr,
221                                 reg->size);
222                         if (ret < 0) {
223                                 DRV_LOG(ERR, "DMA unmap failed.");
224                                 goto exit;
225                         }
226                 }
227         }
228
229 exit:
230         if (mem)
231                 free(mem);
232         return ret;
233 }
234
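    /*
     * Translate a host virtual address to a guest physical address via the
     * vhost memory table; returns 0 when no region backs the address.
     */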
235 static uint64_t
236 hva_to_gpa(int vid, uint64_t hva)
237 {
238         struct rte_vhost_memory *mem = NULL;
239         struct rte_vhost_mem_region *reg;
240         uint32_t i;
241         uint64_t gpa = 0;
242
243         if (rte_vhost_get_mem_table(vid, &mem) < 0)
244                 goto exit;
245
246         for (i = 0; i < mem->nregions; i++) {
247                 reg = &mem->regions[i];
248
249                 if (hva >= reg->host_user_addr &&
250                                 hva < reg->host_user_addr + reg->size) {
251                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
252                         break;
253                 }
254         }
255
256 exit:
257         if (mem)
258                 free(mem);
259         return gpa;
260 }
261
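    /*
     * Program the negotiated features, the guest physical vring addresses
     * and the last avail/used indexes into the VF, then start the hardware
     * datapath.
     */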
262 static int
263 vdpa_ifcvf_start(struct ifcvf_internal *internal)
264 {
265         struct ifcvf_hw *hw = &internal->hw;
266         int i, nr_vring;
267         int vid;
268         struct rte_vhost_vring vq;
269         uint64_t gpa;
270
271         vid = internal->vid;
272         nr_vring = rte_vhost_get_vring_num(vid);
273         rte_vhost_get_negotiated_features(vid, &hw->req_features);
274
275         for (i = 0; i < nr_vring; i++) {
276                 rte_vhost_get_vhost_vring(vid, i, &vq);
277                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
278                 if (gpa == 0) {
279                         DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
280                         return -1;
281                 }
282                 hw->vring[i].desc = gpa;
283
284                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
285                 if (gpa == 0) {
286                         DRV_LOG(ERR, "Fail to get GPA for available ring.");
287                         return -1;
288                 }
289                 hw->vring[i].avail = gpa;
290
291                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
292                 if (gpa == 0) {
293                         DRV_LOG(ERR, "Fail to get GPA for used ring.");
294                         return -1;
295                 }
296                 hw->vring[i].used = gpa;
297
298                 hw->vring[i].size = vq.size;
299                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
300                                 &hw->vring[i].last_used_idx);
301         }
302         hw->nr_vring = i;
303
304         return ifcvf_start_hw(&internal->hw);
305 }
306
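    /*
     * Stop the hardware datapath and hand the ring indexes back to vhost.
     * If dirty logging was negotiated (and no software fallback is used),
     * disable it, unmap the log buffer and mark the used rings dirty.
     */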
307 static void
308 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
309 {
310         struct ifcvf_hw *hw = &internal->hw;
311         uint32_t i;
312         int vid;
313         uint64_t features = 0;
314         uint64_t log_base = 0, log_size = 0;
315         uint64_t len;
316
317         vid = internal->vid;
318         ifcvf_stop_hw(hw);
319
320         for (i = 0; i < hw->nr_vring; i++)
321                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
322                                 hw->vring[i].last_used_idx);
323
324         if (internal->sw_lm)
325                 return;
326
327         rte_vhost_get_negotiated_features(vid, &features);
328         if (RTE_VHOST_NEED_LOG(features)) {
329                 ifcvf_disable_logging(hw);
330                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
331                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
332                                 log_base, IFCVF_LOG_BASE, log_size);
333                 /*
334                  * IFCVF only marks packet-buffer pages as dirty; software
335                  * marks the used rings dirty here, after the device stops.
336                  */
337                 for (i = 0; i < hw->nr_vring; i++) {
338                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
339                         rte_vhost_log_used_vring(vid, i, 0, len);
340                 }
341         }
342 }
343
344 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
345                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
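    /*
     * Route the device config interrupt and every vring interrupt to MSI-X
     * through VFIO. With m_rx set (software fallback), RX vrings get a
     * driver-owned eventfd instead of the guest callfd so the relay thread
     * can intercept used-ring interrupts.
     */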
346 static int
347 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
348 {
349         int ret;
350         uint32_t i, nr_vring;
351         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
352         struct vfio_irq_set *irq_set;
353         int *fd_ptr;
354         struct rte_vhost_vring vring;
355         int fd;
356
357         vring.callfd = -1;
358
359         nr_vring = rte_vhost_get_vring_num(internal->vid);
360
361         irq_set = (struct vfio_irq_set *)irq_set_buf;
362         irq_set->argsz = sizeof(irq_set_buf);
363         irq_set->count = nr_vring + 1;
364         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
365                          VFIO_IRQ_SET_ACTION_TRIGGER;
366         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
367         irq_set->start = 0;
368         fd_ptr = (int *)&irq_set->data;
369         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
370
371         for (i = 0; i < nr_vring; i++)
372                 internal->intr_fd[i] = -1;
373
374         for (i = 0; i < nr_vring; i++) {
375                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
376                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
377                 if ((i & 1) == 0 && m_rx) {
378                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
379                         if (fd < 0) {
380                                 DRV_LOG(ERR, "can't setup eventfd: %s",
381                                         strerror(errno));
382                                 return -1;
383                         }
384                         internal->intr_fd[i] = fd;
385                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
386                 }
387         }
388
389         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
390         if (ret) {
391                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
392                                 strerror(errno));
393                 return -1;
394         }
395
396         return 0;
397 }
398
399 static int
400 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
401 {
402         int ret;
403         uint32_t i, nr_vring;
404         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
405         struct vfio_irq_set *irq_set;
406
407         irq_set = (struct vfio_irq_set *)irq_set_buf;
408         irq_set->argsz = sizeof(irq_set_buf);
409         irq_set->count = 0;
410         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
411         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
412         irq_set->start = 0;
413
414         nr_vring = rte_vhost_get_vring_num(internal->vid);
415         for (i = 0; i < nr_vring; i++) {
416                 if (internal->intr_fd[i] >= 0)
417                         close(internal->intr_fd[i]);
418                 internal->intr_fd[i] = -1;
419         }
420
421         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
422         if (ret) {
423                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
424                                 strerror(errno));
425                 return -1;
426         }
427
428         return 0;
429 }
430
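    /*
     * Notify relay thread: wait on every vring's kickfd with epoll, drain
     * the eventfd and forward each guest kick to the VF notify register.
     */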
431 static void *
432 notify_relay(void *arg)
433 {
434         int i, kickfd, epfd, nfds = 0;
435         uint32_t qid, q_num;
436         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
437         struct epoll_event ev;
438         uint64_t buf;
439         int nbytes;
440         struct rte_vhost_vring vring;
441         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
442         struct ifcvf_hw *hw = &internal->hw;
443
444         q_num = rte_vhost_get_vring_num(internal->vid);
445
446         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
447         if (epfd < 0) {
448                 DRV_LOG(ERR, "failed to create epoll instance.");
449                 return NULL;
450         }
451         internal->epfd = epfd;
452
453         vring.kickfd = -1;
454         for (qid = 0; qid < q_num; qid++) {
455                 ev.events = EPOLLIN | EPOLLPRI;
456                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
457                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
458                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
459                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
460                         return NULL;
461                 }
462         }
463
464         for (;;) {
465                 nfds = epoll_wait(epfd, events, q_num, -1);
466                 if (nfds < 0) {
467                         if (errno == EINTR)
468                                 continue;
469                         DRV_LOG(ERR, "epoll_wait returned failure.");
470                         return NULL;
471                 }
472
473                 for (i = 0; i < nfds; i++) {
474                         qid = events[i].data.u32;
475                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
476                         do {
477                                 nbytes = read(kickfd, &buf, 8);
478                                 if (nbytes < 0) {
479                                         if (errno == EINTR ||
480                                             errno == EWOULDBLOCK ||
481                                             errno == EAGAIN)
482                                                 continue;
483                                         DRV_LOG(INFO, "Error reading "
484                                                 "kickfd: %s",
485                                                 strerror(errno));
486                                 }
487                                 break;
488                         } while (1);
489
490                         ifcvf_notify_queue(hw, qid);
491                 }
492         }
493
494         return NULL;
495 }
496
497 static int
498 setup_notify_relay(struct ifcvf_internal *internal)
499 {
500         int ret;
501
502         ret = pthread_create(&internal->tid, NULL, notify_relay,
503                         (void *)internal);
504         if (ret) {
505                 DRV_LOG(ERR, "failed to create notify relay pthread.");
506                 return -1;
507         }
508         return 0;
509 }
510
511 static int
512 unset_notify_relay(struct ifcvf_internal *internal)
513 {
514         void *status;
515
516         if (internal->tid) {
517                 pthread_cancel(internal->tid);
518                 pthread_join(internal->tid, &status);
519         }
520         internal->tid = 0;
521
522         if (internal->epfd >= 0)
523                 close(internal->epfd);
524         internal->epfd = -1;
525
526         return 0;
527 }
528
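    /*
     * Reconcile the datapath state under the lock: bring it up (DMA map,
     * MSI-X setup, HW start, notify relay) once the device is both started
     * and attached, and tear it down in reverse order otherwise.
     */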
529 static int
530 update_datapath(struct ifcvf_internal *internal)
531 {
532         int ret;
533
534         rte_spinlock_lock(&internal->lock);
535
536         if (!rte_atomic32_read(&internal->running) &&
537             (rte_atomic32_read(&internal->started) &&
538              rte_atomic32_read(&internal->dev_attached))) {
539                 ret = ifcvf_dma_map(internal, 1);
540                 if (ret)
541                         goto err;
542
543                 ret = vdpa_enable_vfio_intr(internal, false);
544                 if (ret)
545                         goto err;
546
547                 ret = vdpa_ifcvf_start(internal);
548                 if (ret)
549                         goto err;
550
551                 ret = setup_notify_relay(internal);
552                 if (ret)
553                         goto err;
554
555                 rte_atomic32_set(&internal->running, 1);
556         } else if (rte_atomic32_read(&internal->running) &&
557                    (!rte_atomic32_read(&internal->started) ||
558                     !rte_atomic32_read(&internal->dev_attached))) {
559                 ret = unset_notify_relay(internal);
560                 if (ret)
561                         goto err;
562
563                 vdpa_ifcvf_stop(internal);
564
565                 ret = vdpa_disable_vfio_intr(internal);
566                 if (ret)
567                         goto err;
568
569                 ret = ifcvf_dma_map(internal, 0);
570                 if (ret)
571                         goto err;
572
573                 rte_atomic32_set(&internal->running, 0);
574         }
575
576         rte_spinlock_unlock(&internal->lock);
577         return 0;
578 err:
579         rte_spinlock_unlock(&internal->lock);
580         return ret;
581 }
582
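    /*
     * Start the VF for the software live-migration fallback: allocate a
     * mediated vring per queue and DMA-map it at the IFCVF_MEDIATED_VRING
     * IOVA. TX queues keep using the guest used ring directly, while RX
     * used rings point at the mediated copy.
     */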
583 static int
584 m_ifcvf_start(struct ifcvf_internal *internal)
585 {
586         struct ifcvf_hw *hw = &internal->hw;
587         uint32_t i, nr_vring;
588         int vid, ret;
589         struct rte_vhost_vring vq;
590         void *vring_buf;
591         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
592         uint64_t size;
593         uint64_t gpa;
594
595         memset(&vq, 0, sizeof(vq));
596         vid = internal->vid;
597         nr_vring = rte_vhost_get_vring_num(vid);
598         rte_vhost_get_negotiated_features(vid, &hw->req_features);
599
600         for (i = 0; i < nr_vring; i++) {
601                 rte_vhost_get_vhost_vring(vid, i, &vq);
602
603                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
604                                 PAGE_SIZE);
605                 vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                    if (vring_buf == NULL)
                            goto error;
606                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
607                                 PAGE_SIZE);
608
609                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
610                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
611                 if (ret < 0) {
612                         DRV_LOG(ERR, "mediated vring DMA map failed.");
613                         goto error;
614                 }
615
616                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
617                 if (gpa == 0) {
618                         DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
619                         return -1;
620                 }
621                 hw->vring[i].desc = gpa;
622
623                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
624                 if (gpa == 0) {
625                         DRV_LOG(ERR, "Fail to get GPA for available ring.");
626                         return -1;
627                 }
628                 hw->vring[i].avail = gpa;
629
630                 /* Direct I/O for Tx queue, relay for Rx queue */
631                 if (i & 1) {
632                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
633                         if (gpa == 0) {
634                                 DRV_LOG(ERR, "Fail to get GPA for used ring.");
635                                 return -1;
636                         }
637                         hw->vring[i].used = gpa;
638                 } else {
639                         hw->vring[i].used = m_vring_iova +
640                                 (char *)internal->m_vring[i].used -
641                                 (char *)internal->m_vring[i].desc;
642                 }
643
644                 hw->vring[i].size = vq.size;
645
646                 rte_vhost_get_vring_base(vid, i,
647                                 &internal->m_vring[i].avail->idx,
648                                 &internal->m_vring[i].used->idx);
649
650                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
651                                 &hw->vring[i].last_used_idx);
652
653                 m_vring_iova += size;
654         }
655         hw->nr_vring = nr_vring;
656
657         return ifcvf_start_hw(&internal->hw);
658
659 error:
660         for (i = 0; i < nr_vring; i++)
661                 if (internal->m_vring[i].desc)
662                         rte_free(internal->m_vring[i].desc);
663
664         return -1;
665 }
666
667 static int
668 m_ifcvf_stop(struct ifcvf_internal *internal)
669 {
670         int vid;
671         uint32_t i;
672         struct rte_vhost_vring vq;
673         struct ifcvf_hw *hw = &internal->hw;
674         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
675         uint64_t size, len;
676
677         vid = internal->vid;
678         ifcvf_stop_hw(hw);
679
680         for (i = 0; i < hw->nr_vring; i++) {
681                 /* synchronize remaining new used entries if any */
682                 if ((i & 1) == 0)
683                         update_used_ring(internal, i);
684
685                 rte_vhost_get_vhost_vring(vid, i, &vq);
686                 len = IFCVF_USED_RING_LEN(vq.size);
687                 rte_vhost_log_used_vring(vid, i, 0, len);
688
689                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
690                                 PAGE_SIZE);
691                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
692                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
693                         m_vring_iova, size);
694
695                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
696                                 hw->vring[i].last_used_idx);
697                 rte_free(internal->m_vring[i].desc);
698                 m_vring_iova += size;
699         }
700
701         return 0;
702 }
703
704 static void
705 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
706 {
707         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
708         rte_vhost_vring_call(internal->vid, qid);
709 }
710
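    /*
     * Vring relay thread for the software fallback: forward guest kicks to
     * the VF and, on RX interrupts, relay used entries from the mediated
     * ring to the guest ring (logging dirty pages) before calling the
     * guest.
     */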
711 static void *
712 vring_relay(void *arg)
713 {
714         int i, vid, epfd, fd, nfds;
715         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
716         struct rte_vhost_vring vring;
717         uint16_t qid, q_num;
718         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
719         struct epoll_event ev;
720         int nbytes;
721         uint64_t buf;
722
723         vid = internal->vid;
724         q_num = rte_vhost_get_vring_num(vid);
725
726         /* add notify fd and interrupt fd to epoll */
727         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
728         if (epfd < 0) {
729                 DRV_LOG(ERR, "failed to create epoll instance.");
730                 return NULL;
731         }
732         internal->epfd = epfd;
733
734         vring.kickfd = -1;
735         for (qid = 0; qid < q_num; qid++) {
736                 ev.events = EPOLLIN | EPOLLPRI;
737                 rte_vhost_get_vhost_vring(vid, qid, &vring);
738                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
739                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
740                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
741                         return NULL;
742                 }
743         }
744
745         for (qid = 0; qid < q_num; qid += 2) {
746                 ev.events = EPOLLIN | EPOLLPRI;
747                 /* set bit 0 to flag this entry as an interrupt fd */
748                 ev.data.u64 = 1 | qid << 1 |
749                         (uint64_t)internal->intr_fd[qid] << 32;
750                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
751                                 < 0) {
752                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
753                         return NULL;
754                 }
755                 update_used_ring(internal, qid);
756         }
757
758         /* start relay with a first kick */
759         for (qid = 0; qid < q_num; qid++)
760                 ifcvf_notify_queue(&internal->hw, qid);
761
762         /* listen to the events and react accordingly */
763         for (;;) {
764                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
765                 if (nfds < 0) {
766                         if (errno == EINTR)
767                                 continue;
768                         DRV_LOG(ERR, "epoll_wait returned failure.");
769                         return NULL;
770                 }
771
772                 for (i = 0; i < nfds; i++) {
773                         fd = (uint32_t)(events[i].data.u64 >> 32);
774                         do {
775                                 nbytes = read(fd, &buf, 8);
776                                 if (nbytes < 0) {
777                                         if (errno == EINTR ||
778                                             errno == EWOULDBLOCK ||
779                                             errno == EAGAIN)
780                                                 continue;
781                                         DRV_LOG(INFO, "Error reading "
782                                                 "kickfd: %s",
783                                                 strerror(errno));
784                                 }
785                                 break;
786                         } while (1);
787
788                         qid = events[i].data.u32 >> 1;
789
790                         if (events[i].data.u32 & 1)
791                                 update_used_ring(internal, qid);
792                         else
793                                 ifcvf_notify_queue(&internal->hw, qid);
794                 }
795         }
796
797         return NULL;
798 }
799
800 static int
801 setup_vring_relay(struct ifcvf_internal *internal)
802 {
803         int ret;
804
805         ret = pthread_create(&internal->tid, NULL, vring_relay,
806                         (void *)internal);
807         if (ret) {
808                 DRV_LOG(ERR, "failed to create ring relay pthread.");
809                 return -1;
810         }
811         return 0;
812 }
813
814 static int
815 unset_vring_relay(struct ifcvf_internal *internal)
816 {
817         void *status;
818
819         if (internal->tid) {
820                 pthread_cancel(internal->tid);
821                 pthread_join(internal->tid, &status);
822         }
823         internal->tid = 0;
824
825         if (internal->epfd >= 0)
826                 close(internal->epfd);
827         internal->epfd = -1;
828
829         return 0;
830 }
831
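    /*
     * Switch from the direct hardware datapath to the mediated relay
     * datapath so dirty pages can be tracked in software during live
     * migration.
     */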
832 static int
833 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
834 {
835         int ret;
836         int vid = internal->vid;
837
838         /* stop the direct IO data path */
839         unset_notify_relay(internal);
840         vdpa_ifcvf_stop(internal);
841         vdpa_disable_vfio_intr(internal);
842
843         ret = rte_vhost_host_notifier_ctrl(vid, false);
844         if (ret && ret != -ENOTSUP)
845                 goto error;
846
847         /* set up interrupt for interrupt relay */
848         ret = vdpa_enable_vfio_intr(internal, true);
849         if (ret)
850                 goto unmap;
851
852         /* config the VF */
853         ret = m_ifcvf_start(internal);
854         if (ret)
855                 goto unset_intr;
856
857         /* set up vring relay thread */
858         ret = setup_vring_relay(internal);
859         if (ret)
860                 goto stop_vf;
861
862         rte_vhost_host_notifier_ctrl(vid, true);
863
864         internal->sw_fallback_running = true;
865
866         return 0;
867
868 stop_vf:
869         m_ifcvf_stop(internal);
870 unset_intr:
871         vdpa_disable_vfio_intr(internal);
872 unmap:
873         ifcvf_dma_map(internal, 0);
874 error:
875         return -1;
876 }
877
878 static int
879 ifcvf_dev_config(int vid)
880 {
881         struct rte_vdpa_device *vdev;
882         struct internal_list *list;
883         struct ifcvf_internal *internal;
884
885         vdev = rte_vhost_get_vdpa_device(vid);
886         list = find_internal_resource_by_vdev(vdev);
887         if (list == NULL) {
888                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
889                 return -1;
890         }
891
892         internal = list->internal;
893         internal->vid = vid;
894         rte_atomic32_set(&internal->dev_attached, 1);
895         update_datapath(internal);
896
897         if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
898                 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
899                                 vdev->device->name);
900
901         return 0;
902 }
903
904 static int
905 ifcvf_dev_close(int vid)
906 {
907         struct rte_vdpa_device *vdev;
908         struct internal_list *list;
909         struct ifcvf_internal *internal;
910
911         vdev = rte_vhost_get_vdpa_device(vid);
912         list = find_internal_resource_by_vdev(vdev);
913         if (list == NULL) {
914                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
915                 return -1;
916         }
917
918         internal = list->internal;
919
920         if (internal->sw_fallback_running) {
921                 /* unset ring relay */
922                 unset_vring_relay(internal);
923
924                 /* reset VF */
925                 m_ifcvf_stop(internal);
926
927                 /* remove interrupt setting */
928                 vdpa_disable_vfio_intr(internal);
929
930                 /* unset DMA map for guest memory */
931                 ifcvf_dma_map(internal, 0);
932
933                 internal->sw_fallback_running = false;
934         } else {
935                 rte_atomic32_set(&internal->dev_attached, 0);
936                 update_datapath(internal);
937         }
938
939         return 0;
940 }
941
942 static int
943 ifcvf_set_features(int vid)
944 {
945         uint64_t features = 0;
946         struct rte_vdpa_device *vdev;
947         struct internal_list *list;
948         struct ifcvf_internal *internal;
949         uint64_t log_base = 0, log_size = 0;
950
951         vdev = rte_vhost_get_vdpa_device(vid);
952         list = find_internal_resource_by_vdev(vdev);
953         if (list == NULL) {
954                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
955                 return -1;
956         }
957
958         internal = list->internal;
959         rte_vhost_get_negotiated_features(vid, &features);
960
961         if (!RTE_VHOST_NEED_LOG(features))
962                 return 0;
963
964         if (internal->sw_lm) {
965                 ifcvf_sw_fallback_switchover(internal);
966         } else {
967                 rte_vhost_get_log_base(vid, &log_base, &log_size);
968                 rte_vfio_container_dma_map(internal->vfio_container_fd,
969                                 log_base, IFCVF_LOG_BASE, log_size);
970                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
971         }
972
973         return 0;
974 }
975
976 static int
977 ifcvf_get_vfio_group_fd(int vid)
978 {
979         struct rte_vdpa_device *vdev;
980         struct internal_list *list;
981
982         vdev = rte_vhost_get_vdpa_device(vid);
983         list = find_internal_resource_by_vdev(vdev);
984         if (list == NULL) {
985                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
986                 return -1;
987         }
988
989         return list->internal->vfio_group_fd;
990 }
991
992 static int
993 ifcvf_get_vfio_device_fd(int vid)
994 {
995         struct rte_vdpa_device *vdev;
996         struct internal_list *list;
997
998         vdev = rte_vhost_get_vdpa_device(vid);
999         list = find_internal_resource_by_vdev(vdev);
1000         if (list == NULL) {
1001                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1002                 return -1;
1003         }
1004
1005         return list->internal->vfio_dev_fd;
1006 }
1007
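     /*
      * Report the VFIO region offset and size of this queue's notify
      * (doorbell) area so vhost can map it into the guest as a host
      * notifier.
      */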
1008 static int
1009 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1010 {
1011         struct rte_vdpa_device *vdev;
1012         struct internal_list *list;
1013         struct ifcvf_internal *internal;
1014         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1015         int ret;
1016
1017         vdev = rte_vhost_get_vdpa_device(vid);
1018         list = find_internal_resource_by_vdev(vdev);
1019         if (list == NULL) {
1020                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1021                 return -1;
1022         }
1023
1024         internal = list->internal;
1025
1026         reg.index = ifcvf_get_notify_region(&internal->hw);
1027         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1028         if (ret) {
1029                 DRV_LOG(ERR, "Failed to get device region info: %s",
1030                                 strerror(errno));
1031                 return -1;
1032         }
1033
1034         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1035         *size = 0x1000;
1036
1037         return 0;
1038 }
1039
1040 static int
1041 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1042 {
1043         struct internal_list *list;
1044
1045         list = find_internal_resource_by_vdev(vdev);
1046         if (list == NULL) {
1047                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1048                 return -1;
1049         }
1050
1051         *queue_num = list->internal->max_queues;
1052
1053         return 0;
1054 }
1055
1056 static int
1057 ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1058 {
1059         struct internal_list *list;
1060
1061         list = find_internal_resource_by_vdev(vdev);
1062         if (list == NULL) {
1063                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1064                 return -1;
1065         }
1066
1067         *features = list->internal->features;
1068
1069         return 0;
1070 }
1071
1072 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1073                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1074                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1075                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1076                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1077                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
1078 static int
1079 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1080 {
1081         RTE_SET_USED(vdev);
1082
1083         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1084         return 0;
1085 }
1086
1087 static struct rte_vdpa_dev_ops ifcvf_ops = {
1088         .get_queue_num = ifcvf_get_queue_num,
1089         .get_features = ifcvf_get_vdpa_features,
1090         .get_protocol_features = ifcvf_get_protocol_features,
1091         .dev_conf = ifcvf_dev_config,
1092         .dev_close = ifcvf_dev_close,
1093         .set_vring_state = NULL,
1094         .set_features = ifcvf_set_features,
1095         .migration_done = NULL,
1096         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1097         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1098         .get_notify_area = ifcvf_get_notify_area,
1099 };
1100
1101 static inline int
1102 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1103 {
1104         uint16_t *n = extra_args;
1105
1106         if (value == NULL || extra_args == NULL)
1107                 return -EINVAL;
1108
1109         *n = (uint16_t)strtoul(value, NULL, 0);
1110         if (*n == USHRT_MAX && errno == ERANGE)
1111                 return -1;
1112
1113         return 0;
1114 }
1115
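     /*
      * Probe only in the primary process and only when the "vdpa" devarg is
      * given; set up VFIO, read the device features and register the VF as
      * a vDPA device with the vhost library.
      */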
1116 static int
1117 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1118                 struct rte_pci_device *pci_dev)
1119 {
1120         uint64_t features;
1121         struct ifcvf_internal *internal = NULL;
1122         struct internal_list *list = NULL;
1123         int vdpa_mode = 0;
1124         int sw_fallback_lm = 0;
1125         struct rte_kvargs *kvlist = NULL;
1126         int ret = 0;
1127
1128         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1129                 return 0;
1130
1131         if (!pci_dev->device.devargs)
1132                 return 1;
1133
1134         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1135                         ifcvf_valid_arguments);
1136         if (kvlist == NULL)
1137                 return 1;
1138
1139         /* probe only when vdpa mode is specified */
1140         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1141                 rte_kvargs_free(kvlist);
1142                 return 1;
1143         }
1144
1145         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1146                         &vdpa_mode);
1147         if (ret < 0 || vdpa_mode == 0) {
1148                 rte_kvargs_free(kvlist);
1149                 return 1;
1150         }
1151
1152         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1153         if (list == NULL)
1154                 goto error;
1155
1156         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1157         if (internal == NULL)
1158                 goto error;
1159
1160         internal->pdev = pci_dev;
1161         rte_spinlock_init(&internal->lock);
1162
1163         if (ifcvf_vfio_setup(internal) < 0) {
1164                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1165                 goto error;
1166         }
1167
1168         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1169                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1170                 goto error;
1171         }
1172
1173         internal->max_queues = IFCVF_MAX_QUEUES;
1174         features = ifcvf_get_features(&internal->hw);
1175         internal->features = (features &
1176                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1177                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1178                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1179                 (1ULL << VIRTIO_NET_F_STATUS) |
1180                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1181                 (1ULL << VHOST_F_LOG_ALL);
1182
1183         list->internal = internal;
1184
1185         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1186                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1187                                 &open_int, &sw_fallback_lm);
1188                 if (ret < 0)
1189                         goto error;
1190         }
1191         internal->sw_lm = sw_fallback_lm;
1192
1193         internal->vdev = rte_vdpa_register_device(&pci_dev->device, &ifcvf_ops);
1194         if (internal->vdev == NULL) {
1195                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1196                 goto error;
1197         }
1198
1199         pthread_mutex_lock(&internal_list_lock);
1200         TAILQ_INSERT_TAIL(&internal_list, list, next);
1201         pthread_mutex_unlock(&internal_list_lock);
1202
1203         rte_atomic32_set(&internal->started, 1);
1204         update_datapath(internal);
1205
1206         rte_kvargs_free(kvlist);
1207         return 0;
1208
1209 error:
1210         rte_kvargs_free(kvlist);
1211         rte_free(list);
1212         rte_free(internal);
1213         return -1;
1214 }
1215
1216 static int
1217 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1218 {
1219         struct ifcvf_internal *internal;
1220         struct internal_list *list;
1221
1222         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1223                 return 0;
1224
1225         list = find_internal_resource_by_dev(pci_dev);
1226         if (list == NULL) {
1227                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1228                 return -1;
1229         }
1230
1231         internal = list->internal;
1232         rte_atomic32_set(&internal->started, 0);
1233         update_datapath(internal);
1234
1235         rte_pci_unmap_device(internal->pdev);
1236         rte_vfio_container_destroy(internal->vfio_container_fd);
1237         rte_vdpa_unregister_device(internal->vdev);
1238
1239         pthread_mutex_lock(&internal_list_lock);
1240         TAILQ_REMOVE(&internal_list, list, next);
1241         pthread_mutex_unlock(&internal_list_lock);
1242
1243         rte_free(list);
1244         rte_free(internal);
1245
1246         return 0;
1247 }
1248
1249 /*
1250  * IFCVF has the same vendor ID and device ID as virtio net PCI
1251  * device, with its specific subsystem vendor ID and device ID.
1252  */
1253 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1254         { .class_id = RTE_CLASS_ANY_ID,
1255           .vendor_id = IFCVF_VENDOR_ID,
1256           .device_id = IFCVF_DEVICE_ID,
1257           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1258           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1259         },
1260
1261         { .vendor_id = 0, /* sentinel */
1262         },
1263 };
1264
1265 static struct rte_pci_driver rte_ifcvf_vdpa = {
1266         .id_table = pci_id_ifcvf_map,
1267         .drv_flags = 0,
1268         .probe = ifcvf_pci_probe,
1269         .remove = ifcvf_pci_remove,
1270 };
1271
1272 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1273 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1274 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
1275
1276 RTE_INIT(ifcvf_vdpa_init_log)
1277 {
1278         ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
1279         if (ifcvf_vdpa_logtype >= 0)
1280                 rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
1281 }