drivers: move ifc to vDPA directory
dpdk.git: drivers/vdpa/ifc/ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
   #include <errno.h>
   #include <limits.h>
   #include <sys/eventfd.h>
13
14 #include <rte_malloc.h>
15 #include <rte_memory.h>
16 #include <rte_bus_pci.h>
17 #include <rte_vhost.h>
18 #include <rte_vdpa.h>
19 #include <rte_vfio.h>
20 #include <rte_spinlock.h>
21 #include <rte_log.h>
22 #include <rte_kvargs.h>
23 #include <rte_devargs.h>
24
25 #include "base/ifcvf.h"
26
27 #define DRV_LOG(level, fmt, args...) \
28         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
29                 "IFCVF %s(): " fmt "\n", __func__, ##args)
30
31 #ifndef PAGE_SIZE
32 #define PAGE_SIZE 4096
33 #endif
34
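   /*
    * Byte length of a used ring with "size" entries: the array of used
    * elements plus the flags, idx and trailing event index fields
    * (three uint16_t in total).
    */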
35 #define IFCVF_USED_RING_LEN(size) \
36         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
37
38 #define IFCVF_VDPA_MODE         "vdpa"
39 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
40
41 static const char * const ifcvf_valid_arguments[] = {
42         IFCVF_VDPA_MODE,
43         IFCVF_SW_FALLBACK_LM,
44         NULL
45 };
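   /*
    * Devargs controlling this driver. An illustrative invocation (the PCI
    * address below is only an example) could look like:
    *     -w 0000:af:10.0,vdpa=1,sw-live-migration=1
    * "vdpa=1" selects vDPA mode for the VF; "sw-live-migration=1" enables
    * the software relay fallback instead of the device's dirty-page logging.
    */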
46
47 static int ifcvf_vdpa_logtype;
48
49 struct ifcvf_internal {
50         struct rte_vdpa_dev_addr dev_addr;
51         struct rte_pci_device *pdev;
52         struct ifcvf_hw hw;
53         int vfio_container_fd;
54         int vfio_group_fd;
55         int vfio_dev_fd;
56         pthread_t tid;  /* thread for notify/vring relay */
57         int epfd;
58         int vid;
59         int did;
60         uint16_t max_queues;
61         uint64_t features;
62         rte_atomic32_t started;
63         rte_atomic32_t dev_attached;
64         rte_atomic32_t running;
65         rte_spinlock_t lock;
66         bool sw_lm;
67         bool sw_fallback_running;
68         /* mediated vring for sw fallback */
69         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
70         /* eventfd for used ring interrupt */
71         int intr_fd[IFCVF_MAX_QUEUES * 2];
72 };
73
74 struct internal_list {
75         TAILQ_ENTRY(internal_list) next;
76         struct ifcvf_internal *internal;
77 };
78
79 TAILQ_HEAD(internal_list_head, internal_list);
80 static struct internal_list_head internal_list =
81         TAILQ_HEAD_INITIALIZER(internal_list);
82
83 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
84
85 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
86
87 static struct internal_list *
88 find_internal_resource_by_did(int did)
89 {
90         int found = 0;
91         struct internal_list *list;
92
93         pthread_mutex_lock(&internal_list_lock);
94
95         TAILQ_FOREACH(list, &internal_list, next) {
96                 if (did == list->internal->did) {
97                         found = 1;
98                         break;
99                 }
100         }
101
102         pthread_mutex_unlock(&internal_list_lock);
103
104         if (!found)
105                 return NULL;
106
107         return list;
108 }
109
110 static struct internal_list *
111 find_internal_resource_by_dev(struct rte_pci_device *pdev)
112 {
113         int found = 0;
114         struct internal_list *list;
115
116         pthread_mutex_lock(&internal_list_lock);
117
118         TAILQ_FOREACH(list, &internal_list, next) {
119                 if (pdev == list->internal->pdev) {
120                         found = 1;
121                         break;
122                 }
123         }
124
125         pthread_mutex_unlock(&internal_list_lock);
126
127         if (!found)
128                 return NULL;
129
130         return list;
131 }
132
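    /*
     * Bind the VF to its own VFIO container so the DMA mappings set up for
     * this device (guest memory, mediated rings, dirty log) stay isolated
     * from other devices, then cache the BAR addresses for the base code.
     */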
133 static int
134 ifcvf_vfio_setup(struct ifcvf_internal *internal)
135 {
136         struct rte_pci_device *dev = internal->pdev;
137         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
138         int iommu_group_num;
139         int i, ret;
140
141         internal->vfio_dev_fd = -1;
142         internal->vfio_group_fd = -1;
143         internal->vfio_container_fd = -1;
144
145         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
146         ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
147                         &iommu_group_num);
148         if (ret <= 0) {
149                 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
150                 return -1;
151         }
152
153         internal->vfio_container_fd = rte_vfio_container_create();
154         if (internal->vfio_container_fd < 0)
155                 return -1;
156
157         internal->vfio_group_fd = rte_vfio_container_group_bind(
158                         internal->vfio_container_fd, iommu_group_num);
159         if (internal->vfio_group_fd < 0)
160                 goto err;
161
162         if (rte_pci_map_device(dev))
163                 goto err;
164
165         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
166
167         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
168                         i++) {
169                 internal->hw.mem_resource[i].addr =
170                         internal->pdev->mem_resource[i].addr;
171                 internal->hw.mem_resource[i].phys_addr =
172                         internal->pdev->mem_resource[i].phys_addr;
173                 internal->hw.mem_resource[i].len =
174                         internal->pdev->mem_resource[i].len;
175         }
176
177         return 0;
178
179 err:
180         rte_vfio_container_destroy(internal->vfio_container_fd);
181         return -1;
182 }
183
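    /*
     * Map or unmap all guest memory regions in the device's VFIO container.
     * Each region is mapped with its guest physical address as IOVA, so the
     * VF can DMA directly with the addresses it finds in the virtio rings.
     */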
184 static int
185 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
186 {
187         uint32_t i;
188         int ret;
189         struct rte_vhost_memory *mem = NULL;
190         int vfio_container_fd;
191
192         ret = rte_vhost_get_mem_table(internal->vid, &mem);
193         if (ret < 0) {
194                 DRV_LOG(ERR, "failed to get VM memory layout.");
195                 goto exit;
196         }
197
198         vfio_container_fd = internal->vfio_container_fd;
199
200         for (i = 0; i < mem->nregions; i++) {
201                 struct rte_vhost_mem_region *reg;
202
203                 reg = &mem->regions[i];
204                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
205                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
206                         do_map ? "DMA map" : "DMA unmap", i,
207                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
208
209                 if (do_map) {
210                         ret = rte_vfio_container_dma_map(vfio_container_fd,
211                                 reg->host_user_addr, reg->guest_phys_addr,
212                                 reg->size);
213                         if (ret < 0) {
214                                 DRV_LOG(ERR, "DMA map failed.");
215                                 goto exit;
216                         }
217                 } else {
218                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
219                                 reg->host_user_addr, reg->guest_phys_addr,
220                                 reg->size);
221                         if (ret < 0) {
222                                 DRV_LOG(ERR, "DMA unmap failed.");
223                                 goto exit;
224                         }
225                 }
226         }
227
228 exit:
229         if (mem)
230                 free(mem);
231         return ret;
232 }
233
234 static uint64_t
235 hva_to_gpa(int vid, uint64_t hva)
236 {
237         struct rte_vhost_memory *mem = NULL;
238         struct rte_vhost_mem_region *reg;
239         uint32_t i;
240         uint64_t gpa = 0;
241
242         if (rte_vhost_get_mem_table(vid, &mem) < 0)
243                 goto exit;
244
245         for (i = 0; i < mem->nregions; i++) {
246                 reg = &mem->regions[i];
247
248                 if (hva >= reg->host_user_addr &&
249                                 hva < reg->host_user_addr + reg->size) {
250                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
251                         break;
252                 }
253         }
254
255 exit:
256         if (mem)
257                 free(mem);
258         return gpa;
259 }
260
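    /*
     * Program the VF with the guest's vring addresses. The HVAs returned by
     * the vhost library are translated back to guest physical addresses,
     * which match the IOVAs set up in ifcvf_dma_map().
     */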
261 static int
262 vdpa_ifcvf_start(struct ifcvf_internal *internal)
263 {
264         struct ifcvf_hw *hw = &internal->hw;
265         int i, nr_vring;
266         int vid;
267         struct rte_vhost_vring vq;
268         uint64_t gpa;
269
270         vid = internal->vid;
271         nr_vring = rte_vhost_get_vring_num(vid);
272         rte_vhost_get_negotiated_features(vid, &hw->req_features);
273
274         for (i = 0; i < nr_vring; i++) {
275                 rte_vhost_get_vhost_vring(vid, i, &vq);
276                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
277                 if (gpa == 0) {
278                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
279                         return -1;
280                 }
281                 hw->vring[i].desc = gpa;
282
283                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
284                 if (gpa == 0) {
285                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
286                         return -1;
287                 }
288                 hw->vring[i].avail = gpa;
289
290                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
291                 if (gpa == 0) {
292                         DRV_LOG(ERR, "Failed to get GPA for used ring.");
293                         return -1;
294                 }
295                 hw->vring[i].used = gpa;
296
297                 hw->vring[i].size = vq.size;
298                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
299                                 &hw->vring[i].last_used_idx);
300         }
301         hw->nr_vring = i;
302
303         return ifcvf_start_hw(&internal->hw);
304 }
305
306 static void
307 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
308 {
309         struct ifcvf_hw *hw = &internal->hw;
310         uint32_t i;
311         int vid;
312         uint64_t features = 0;
313         uint64_t log_base = 0, log_size = 0;
314         uint64_t len;
315
316         vid = internal->vid;
317         ifcvf_stop_hw(hw);
318
319         for (i = 0; i < hw->nr_vring; i++)
320                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
321                                 hw->vring[i].last_used_idx);
322
323         if (internal->sw_lm)
324                 return;
325
326         rte_vhost_get_negotiated_features(vid, &features);
327         if (RTE_VHOST_NEED_LOG(features)) {
328                 ifcvf_disable_logging(hw);
329                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
330                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
331                                 log_base, IFCVF_LOG_BASE, log_size);
332                 /*
333                  * IFCVF marks dirty pages only for the packet buffers;
334                  * software marks the used rings as dirty after the device stops.
335                  */
336                 for (i = 0; i < hw->nr_vring; i++) {
337                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
338                         rte_vhost_log_used_vring(vid, i, 0, len);
339                 }
340         }
341 }
342
343 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
344                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
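    /*
     * Vector 0 carries the device's config/misc interrupt; vectors 1..n are
     * bound to the per-queue callfds. When m_rx is true, each even (Rx)
     * queue gets a private eventfd instead, so the relay thread can consume
     * the interrupt, sync the mediated used ring and only then call into
     * the guest.
     */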
345 static int
346 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
347 {
348         int ret;
349         uint32_t i, nr_vring;
350         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
351         struct vfio_irq_set *irq_set;
352         int *fd_ptr;
353         struct rte_vhost_vring vring;
354         int fd;
355
356         vring.callfd = -1;
357
358         nr_vring = rte_vhost_get_vring_num(internal->vid);
359
360         irq_set = (struct vfio_irq_set *)irq_set_buf;
361         irq_set->argsz = sizeof(irq_set_buf);
362         irq_set->count = nr_vring + 1;
363         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364                          VFIO_IRQ_SET_ACTION_TRIGGER;
365         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
366         irq_set->start = 0;
367         fd_ptr = (int *)&irq_set->data;
368         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
369
370         for (i = 0; i < nr_vring; i++)
371                 internal->intr_fd[i] = -1;
372
373         for (i = 0; i < nr_vring; i++) {
374                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
375                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
376                 if ((i & 1) == 0 && m_rx == true) {
377                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
378                         if (fd < 0) {
379                                 DRV_LOG(ERR, "can't setup eventfd: %s",
380                                         strerror(errno));
381                                 return -1;
382                         }
383                         internal->intr_fd[i] = fd;
384                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
385                 }
386         }
387
388         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
389         if (ret) {
390                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
391                                 strerror(errno));
392                 return -1;
393         }
394
395         return 0;
396 }
397
398 static int
399 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
400 {
401         int ret;
402         uint32_t i, nr_vring;
403         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
404         struct vfio_irq_set *irq_set;
405
406         irq_set = (struct vfio_irq_set *)irq_set_buf;
407         irq_set->argsz = sizeof(irq_set_buf);
408         irq_set->count = 0;
409         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
410         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
411         irq_set->start = 0;
412
413         nr_vring = rte_vhost_get_vring_num(internal->vid);
414         for (i = 0; i < nr_vring; i++) {
415                 if (internal->intr_fd[i] >= 0)
416                         close(internal->intr_fd[i]);
417                 internal->intr_fd[i] = -1;
418         }
419
420         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
421         if (ret) {
422                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
423                                 strerror(errno));
424                 return -1;
425         }
426
427         return 0;
428 }
429
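    /*
     * Notify relay thread: epoll on every vring kickfd and turn each guest
     * kick into a doorbell write on the VF. The queue id is kept in the low
     * 32 bits of the epoll data, the kickfd in the high 32 bits.
     */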
430 static void *
431 notify_relay(void *arg)
432 {
433         int i, kickfd, epfd, nfds = 0;
434         uint32_t qid, q_num;
435         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
436         struct epoll_event ev;
437         uint64_t buf;
438         int nbytes;
439         struct rte_vhost_vring vring;
440         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
441         struct ifcvf_hw *hw = &internal->hw;
442
443         q_num = rte_vhost_get_vring_num(internal->vid);
444
445         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
446         if (epfd < 0) {
447                 DRV_LOG(ERR, "failed to create epoll instance.");
448                 return NULL;
449         }
450         internal->epfd = epfd;
451
452         vring.kickfd = -1;
453         for (qid = 0; qid < q_num; qid++) {
454                 ev.events = EPOLLIN | EPOLLPRI;
455                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
456                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
457                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
458                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
459                         return NULL;
460                 }
461         }
462
463         for (;;) {
464                 nfds = epoll_wait(epfd, events, q_num, -1);
465                 if (nfds < 0) {
466                         if (errno == EINTR)
467                                 continue;
468                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
469                         return NULL;
470                 }
471
472                 for (i = 0; i < nfds; i++) {
473                         qid = events[i].data.u32;
474                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
475                         do {
476                                 nbytes = read(kickfd, &buf, 8);
477                                 if (nbytes < 0) {
478                                         if (errno == EINTR ||
479                                             errno == EWOULDBLOCK ||
480                                             errno == EAGAIN)
481                                                 continue;
482                                         DRV_LOG(INFO, "Error reading "
483                                                 "kickfd: %s",
484                                                 strerror(errno));
485                                 }
486                                 break;
487                         } while (1);
488
489                         ifcvf_notify_queue(hw, qid);
490                 }
491         }
492
493         return NULL;
494 }
495
496 static int
497 setup_notify_relay(struct ifcvf_internal *internal)
498 {
499         int ret;
500
501         ret = pthread_create(&internal->tid, NULL, notify_relay,
502                         (void *)internal);
503         if (ret) {
504                 DRV_LOG(ERR, "failed to create notify relay pthread.");
505                 return -1;
506         }
507         return 0;
508 }
509
510 static int
511 unset_notify_relay(struct ifcvf_internal *internal)
512 {
513         void *status;
514
515         if (internal->tid) {
516                 pthread_cancel(internal->tid);
517                 pthread_join(internal->tid, &status);
518         }
519         internal->tid = 0;
520
521         if (internal->epfd >= 0)
522                 close(internal->epfd);
523         internal->epfd = -1;
524
525         return 0;
526 }
527
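    /*
     * Reconcile the datapath with the "started" (device probed) and
     * "dev_attached" (vhost connection configured) flags under the lock:
     * bring the hardware datapath up when both are set, tear it down in
     * reverse order when either one is cleared.
     */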
528 static int
529 update_datapath(struct ifcvf_internal *internal)
530 {
531         int ret;
532
533         rte_spinlock_lock(&internal->lock);
534
535         if (!rte_atomic32_read(&internal->running) &&
536             (rte_atomic32_read(&internal->started) &&
537              rte_atomic32_read(&internal->dev_attached))) {
538                 ret = ifcvf_dma_map(internal, 1);
539                 if (ret)
540                         goto err;
541
542                 ret = vdpa_enable_vfio_intr(internal, 0);
543                 if (ret)
544                         goto err;
545
546                 ret = vdpa_ifcvf_start(internal);
547                 if (ret)
548                         goto err;
549
550                 ret = setup_notify_relay(internal);
551                 if (ret)
552                         goto err;
553
554                 rte_atomic32_set(&internal->running, 1);
555         } else if (rte_atomic32_read(&internal->running) &&
556                    (!rte_atomic32_read(&internal->started) ||
557                     !rte_atomic32_read(&internal->dev_attached))) {
558                 ret = unset_notify_relay(internal);
559                 if (ret)
560                         goto err;
561
562                 vdpa_ifcvf_stop(internal);
563
564                 ret = vdpa_disable_vfio_intr(internal);
565                 if (ret)
566                         goto err;
567
568                 ret = ifcvf_dma_map(internal, 0);
569                 if (ret)
570                         goto err;
571
572                 rte_atomic32_set(&internal->running, 0);
573         }
574
575         rte_spinlock_unlock(&internal->lock);
576         return 0;
577 err:
578         rte_spinlock_unlock(&internal->lock);
579         return ret;
580 }
581
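    /*
     * Software live-migration fallback: the VF keeps using the guest's
     * descriptor and available rings, but each even (Rx) queue has its used
     * ring redirected to a host-allocated "mediated" ring mapped at
     * IFCVF_MEDIATED_VRING, so software can relay used entries back to the
     * guest and log the dirty pages. Odd (Tx) queues stay in direct I/O.
     */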
582 static int
583 m_ifcvf_start(struct ifcvf_internal *internal)
584 {
585         struct ifcvf_hw *hw = &internal->hw;
586         uint32_t i, nr_vring;
587         int vid, ret;
588         struct rte_vhost_vring vq;
589         void *vring_buf;
590         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
591         uint64_t size;
592         uint64_t gpa;
593
594         memset(&vq, 0, sizeof(vq));
595         vid = internal->vid;
596         nr_vring = rte_vhost_get_vring_num(vid);
597         rte_vhost_get_negotiated_features(vid, &hw->req_features);
598
599         for (i = 0; i < nr_vring; i++) {
600                 rte_vhost_get_vhost_vring(vid, i, &vq);
601
602                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
603                                 PAGE_SIZE);
604                 vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                    if (vring_buf == NULL) {
                            DRV_LOG(ERR, "Failed to allocate mediated vring.");
                            goto error;
                    }
605                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
606                                 PAGE_SIZE);
607
608                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
609                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
610                 if (ret < 0) {
611                         DRV_LOG(ERR, "mediated vring DMA map failed.");
612                         goto error;
613                 }
614
615                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
616                 if (gpa == 0) {
617                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
618                         return -1;
619                 }
620                 hw->vring[i].desc = gpa;
621
622                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
623                 if (gpa == 0) {
624                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
625                         return -1;
626                 }
627                 hw->vring[i].avail = gpa;
628
629                 /* Direct I/O for Tx queue, relay for Rx queue */
630                 if (i & 1) {
631                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
632                         if (gpa == 0) {
633                                 DRV_LOG(ERR, "Failed to get GPA for used ring.");
634                                 return -1;
635                         }
636                         hw->vring[i].used = gpa;
637                 } else {
638                         hw->vring[i].used = m_vring_iova +
639                                 (char *)internal->m_vring[i].used -
640                                 (char *)internal->m_vring[i].desc;
641                 }
642
643                 hw->vring[i].size = vq.size;
644
645                 rte_vhost_get_vring_base(vid, i,
646                                 &internal->m_vring[i].avail->idx,
647                                 &internal->m_vring[i].used->idx);
648
649                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
650                                 &hw->vring[i].last_used_idx);
651
652                 m_vring_iova += size;
653         }
654         hw->nr_vring = nr_vring;
655
656         return ifcvf_start_hw(&internal->hw);
657
658 error:
659         for (i = 0; i < nr_vring; i++)
660                 if (internal->m_vring[i].desc)
661                         rte_free(internal->m_vring[i].desc);
662
663         return -1;
664 }
665
666 static int
667 m_ifcvf_stop(struct ifcvf_internal *internal)
668 {
669         int vid;
670         uint32_t i;
671         struct rte_vhost_vring vq;
672         struct ifcvf_hw *hw = &internal->hw;
673         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
674         uint64_t size, len;
675
676         vid = internal->vid;
677         ifcvf_stop_hw(hw);
678
679         for (i = 0; i < hw->nr_vring; i++) {
680                 /* synchronize remaining new used entries if any */
681                 if ((i & 1) == 0)
682                         update_used_ring(internal, i);
683
684                 rte_vhost_get_vhost_vring(vid, i, &vq);
685                 len = IFCVF_USED_RING_LEN(vq.size);
686                 rte_vhost_log_used_vring(vid, i, 0, len);
687
688                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
689                                 PAGE_SIZE);
690                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
691                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
692                         m_vring_iova, size);
693
694                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
695                                 hw->vring[i].last_used_idx);
696                 rte_free(internal->m_vring[i].desc);
697                 m_vring_iova += size;
698         }
699
700         return 0;
701 }
702
703 static void
704 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
705 {
706         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
707         rte_vhost_vring_call(internal->vid, qid);
708 }
709
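    /*
     * Vring relay thread for the software fallback. Bit 0 of the epoll data
     * tells interrupts apart from kicks: kickfds (bit 0 clear) are relayed
     * to the device doorbell, interrupt eventfds (bit 0 set) trigger a used
     * ring sync and a guest call. The fd lives in the high 32 bits, the
     * queue id in bits 1..31.
     */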
710 static void *
711 vring_relay(void *arg)
712 {
713         int i, vid, epfd, fd, nfds;
714         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
715         struct rte_vhost_vring vring;
716         uint16_t qid, q_num;
717         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
718         struct epoll_event ev;
719         int nbytes;
720         uint64_t buf;
721
722         vid = internal->vid;
723         q_num = rte_vhost_get_vring_num(vid);
724
725         /* add notify fd and interrupt fd to epoll */
726         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
727         if (epfd < 0) {
728                 DRV_LOG(ERR, "failed to create epoll instance.");
729                 return NULL;
730         }
731         internal->epfd = epfd;
732
733         vring.kickfd = -1;
734         for (qid = 0; qid < q_num; qid++) {
735                 ev.events = EPOLLIN | EPOLLPRI;
736                 rte_vhost_get_vhost_vring(vid, qid, &vring);
737                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
738                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
739                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
740                         return NULL;
741                 }
742         }
743
744         for (qid = 0; qid < q_num; qid += 2) {
745                 ev.events = EPOLLIN | EPOLLPRI;
746                 /* set bit 0 to flag this fd as the interrupt eventfd */
747                 ev.data.u64 = 1 | qid << 1 |
748                         (uint64_t)internal->intr_fd[qid] << 32;
749                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
750                                 < 0) {
751                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
752                         return NULL;
753                 }
754                 update_used_ring(internal, qid);
755         }
756
757         /* start relay with a first kick */
758         for (qid = 0; qid < q_num; qid++)
759                 ifcvf_notify_queue(&internal->hw, qid);
760
761         /* listen to the events and react accordingly */
762         for (;;) {
763                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
764                 if (nfds < 0) {
765                         if (errno == EINTR)
766                                 continue;
767                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
768                         return NULL;
769                 }
770
771                 for (i = 0; i < nfds; i++) {
772                         fd = (uint32_t)(events[i].data.u64 >> 32);
773                         do {
774                                 nbytes = read(fd, &buf, 8);
775                                 if (nbytes < 0) {
776                                         if (errno == EINTR ||
777                                             errno == EWOULDBLOCK ||
778                                             errno == EAGAIN)
779                                                 continue;
780                                         DRV_LOG(INFO, "Error reading "
781                                                 "kickfd: %s",
782                                                 strerror(errno));
783                                 }
784                                 break;
785                         } while (1);
786
787                         qid = events[i].data.u32 >> 1;
788
789                         if (events[i].data.u32 & 1)
790                                 update_used_ring(internal, qid);
791                         else
792                                 ifcvf_notify_queue(&internal->hw, qid);
793                 }
794         }
795
796         return NULL;
797 }
798
799 static int
800 setup_vring_relay(struct ifcvf_internal *internal)
801 {
802         int ret;
803
804         ret = pthread_create(&internal->tid, NULL, vring_relay,
805                         (void *)internal);
806         if (ret) {
807                 DRV_LOG(ERR, "failed to create ring relay pthread.");
808                 return -1;
809         }
810         return 0;
811 }
812
813 static int
814 unset_vring_relay(struct ifcvf_internal *internal)
815 {
816         void *status;
817
818         if (internal->tid) {
819                 pthread_cancel(internal->tid);
820                 pthread_join(internal->tid, &status);
821         }
822         internal->tid = 0;
823
824         if (internal->epfd >= 0)
825                 close(internal->epfd);
826         internal->epfd = -1;
827
828         return 0;
829 }
830
831 static int
832 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
833 {
834         int ret;
835         int vid = internal->vid;
836
837         /* stop the direct IO data path */
838         unset_notify_relay(internal);
839         vdpa_ifcvf_stop(internal);
840         vdpa_disable_vfio_intr(internal);
841
842         ret = rte_vhost_host_notifier_ctrl(vid, false);
843         if (ret && ret != -ENOTSUP)
844                 goto error;
845
846         /* set up interrupt for interrupt relay */
847         ret = vdpa_enable_vfio_intr(internal, 1);
848         if (ret)
849                 goto unmap;
850
851         /* config the VF */
852         ret = m_ifcvf_start(internal);
853         if (ret)
854                 goto unset_intr;
855
856         /* set up vring relay thread */
857         ret = setup_vring_relay(internal);
858         if (ret)
859                 goto stop_vf;
860
861         rte_vhost_host_notifier_ctrl(vid, true);
862
863         internal->sw_fallback_running = true;
864
865         return 0;
866
867 stop_vf:
868         m_ifcvf_stop(internal);
869 unset_intr:
870         vdpa_disable_vfio_intr(internal);
871 unmap:
872         ifcvf_dma_map(internal, 0);
873 error:
874         return -1;
875 }
876
877 static int
878 ifcvf_dev_config(int vid)
879 {
880         int did;
881         struct internal_list *list;
882         struct ifcvf_internal *internal;
883
884         did = rte_vhost_get_vdpa_device_id(vid);
885         list = find_internal_resource_by_did(did);
886         if (list == NULL) {
887                 DRV_LOG(ERR, "Invalid device id: %d", did);
888                 return -1;
889         }
890
891         internal = list->internal;
892         internal->vid = vid;
893         rte_atomic32_set(&internal->dev_attached, 1);
894         update_datapath(internal);
895
896         if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
897                 DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);
898
899         return 0;
900 }
901
902 static int
903 ifcvf_dev_close(int vid)
904 {
905         int did;
906         struct internal_list *list;
907         struct ifcvf_internal *internal;
908
909         did = rte_vhost_get_vdpa_device_id(vid);
910         list = find_internal_resource_by_did(did);
911         if (list == NULL) {
912                 DRV_LOG(ERR, "Invalid device id: %d", did);
913                 return -1;
914         }
915
916         internal = list->internal;
917
918         if (internal->sw_fallback_running) {
919                 /* unset ring relay */
920                 unset_vring_relay(internal);
921
922                 /* reset VF */
923                 m_ifcvf_stop(internal);
924
925                 /* remove interrupt setting */
926                 vdpa_disable_vfio_intr(internal);
927
928                 /* unset DMA map for guest memory */
929                 ifcvf_dma_map(internal, 0);
930
931                 internal->sw_fallback_running = false;
932         } else {
933                 rte_atomic32_set(&internal->dev_attached, 0);
934                 update_datapath(internal);
935         }
936
937         return 0;
938 }
939
940 static int
941 ifcvf_set_features(int vid)
942 {
943         uint64_t features = 0;
944         int did;
945         struct internal_list *list;
946         struct ifcvf_internal *internal;
947         uint64_t log_base = 0, log_size = 0;
948
949         did = rte_vhost_get_vdpa_device_id(vid);
950         list = find_internal_resource_by_did(did);
951         if (list == NULL) {
952                 DRV_LOG(ERR, "Invalid device id: %d", did);
953                 return -1;
954         }
955
956         internal = list->internal;
957         rte_vhost_get_negotiated_features(vid, &features);
958
959         if (!RTE_VHOST_NEED_LOG(features))
960                 return 0;
961
962         if (internal->sw_lm) {
963                 ifcvf_sw_fallback_switchover(internal);
964         } else {
965                 rte_vhost_get_log_base(vid, &log_base, &log_size);
966                 rte_vfio_container_dma_map(internal->vfio_container_fd,
967                                 log_base, IFCVF_LOG_BASE, log_size);
968                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
969         }
970
971         return 0;
972 }
973
974 static int
975 ifcvf_get_vfio_group_fd(int vid)
976 {
977         int did;
978         struct internal_list *list;
979
980         did = rte_vhost_get_vdpa_device_id(vid);
981         list = find_internal_resource_by_did(did);
982         if (list == NULL) {
983                 DRV_LOG(ERR, "Invalid device id: %d", did);
984                 return -1;
985         }
986
987         return list->internal->vfio_group_fd;
988 }
989
990 static int
991 ifcvf_get_vfio_device_fd(int vid)
992 {
993         int did;
994         struct internal_list *list;
995
996         did = rte_vhost_get_vdpa_device_id(vid);
997         list = find_internal_resource_by_did(did);
998         if (list == NULL) {
999                 DRV_LOG(ERR, "Invalid device id: %d", did);
1000                 return -1;
1001         }
1002
1003         return list->internal->vfio_dev_fd;
1004 }
1005
1006 static int
1007 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1008 {
1009         int did;
1010         struct internal_list *list;
1011         struct ifcvf_internal *internal;
1012         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1013         int ret;
1014
1015         did = rte_vhost_get_vdpa_device_id(vid);
1016         list = find_internal_resource_by_did(did);
1017         if (list == NULL) {
1018                 DRV_LOG(ERR, "Invalid device id: %d", did);
1019                 return -1;
1020         }
1021
1022         internal = list->internal;
1023
1024         reg.index = ifcvf_get_notify_region(&internal->hw);
1025         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1026         if (ret) {
1027                 DRV_LOG(ERR, "Can not get device region info: %s",
1028                                 strerror(errno));
1029                 return -1;
1030         }
1031
1032         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1033         *size = 0x1000;
1034
1035         return 0;
1036 }
1037
1038 static int
1039 ifcvf_get_queue_num(int did, uint32_t *queue_num)
1040 {
1041         struct internal_list *list;
1042
1043         list = find_internal_resource_by_did(did);
1044         if (list == NULL) {
1045                 DRV_LOG(ERR, "Invalid device id: %d", did);
1046                 return -1;
1047         }
1048
1049         *queue_num = list->internal->max_queues;
1050
1051         return 0;
1052 }
1053
1054 static int
1055 ifcvf_get_vdpa_features(int did, uint64_t *features)
1056 {
1057         struct internal_list *list;
1058
1059         list = find_internal_resource_by_did(did);
1060         if (list == NULL) {
1061                 DRV_LOG(ERR, "Invalid device id: %d", did);
1062                 return -1;
1063         }
1064
1065         *features = list->internal->features;
1066
1067         return 0;
1068 }
1069
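     /*
      * Vhost-user protocol features this driver relies on: the slave channel
      * and host notifier messages let the frontend map the VF doorbell into
      * the guest, while LOG_SHMFD and REPLY_ACK are needed for live migration.
      */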
1070 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1071                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1072                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1073                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1074                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1075                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
1076 static int
1077 ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
1078 {
1079         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1080         return 0;
1081 }
1082
1083 static struct rte_vdpa_dev_ops ifcvf_ops = {
1084         .get_queue_num = ifcvf_get_queue_num,
1085         .get_features = ifcvf_get_vdpa_features,
1086         .get_protocol_features = ifcvf_get_protocol_features,
1087         .dev_conf = ifcvf_dev_config,
1088         .dev_close = ifcvf_dev_close,
1089         .set_vring_state = NULL,
1090         .set_features = ifcvf_set_features,
1091         .migration_done = NULL,
1092         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1093         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1094         .get_notify_area = ifcvf_get_notify_area,
1095 };
1096
1097 static inline int
1098 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1099 {
1100         uint16_t *n = extra_args;
1101
1102         if (value == NULL || extra_args == NULL)
1103                 return -EINVAL;
1104
1105         *n = (uint16_t)strtoul(value, NULL, 0);
1106         if (*n == USHRT_MAX && errno == ERANGE)
1107                 return -1;
1108
1109         return 0;
1110 }
1111
1112 static int
1113 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1114                 struct rte_pci_device *pci_dev)
1115 {
1116         uint64_t features;
1117         struct ifcvf_internal *internal = NULL;
1118         struct internal_list *list = NULL;
1119         int vdpa_mode = 0;
1120         int sw_fallback_lm = 0;
1121         struct rte_kvargs *kvlist = NULL;
1122         int ret = 0;
1123
1124         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1125                 return 0;
1126
1127         if (!pci_dev->device.devargs)
1128                 return 1;
1129
1130         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1131                         ifcvf_valid_arguments);
1132         if (kvlist == NULL)
1133                 return 1;
1134
1135         /* probe only when vdpa mode is specified */
1136         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1137                 rte_kvargs_free(kvlist);
1138                 return 1;
1139         }
1140
1141         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1142                         &vdpa_mode);
1143         if (ret < 0 || vdpa_mode == 0) {
1144                 rte_kvargs_free(kvlist);
1145                 return 1;
1146         }
1147
1148         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1149         if (list == NULL)
1150                 goto error;
1151
1152         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1153         if (internal == NULL)
1154                 goto error;
1155
1156         internal->pdev = pci_dev;
1157         rte_spinlock_init(&internal->lock);
1158
1159         if (ifcvf_vfio_setup(internal) < 0) {
1160                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1161                 goto error;
1162         }
1163
1164         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1165                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1166                 goto error;
1167         }
1168
1169         internal->max_queues = IFCVF_MAX_QUEUES;
1170         features = ifcvf_get_features(&internal->hw);
1171         internal->features = (features &
1172                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1173                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1174                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1175                 (1ULL << VIRTIO_NET_F_STATUS) |
1176                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1177                 (1ULL << VHOST_F_LOG_ALL);
1178
1179         internal->dev_addr.pci_addr = pci_dev->addr;
1180         internal->dev_addr.type = PCI_ADDR;
1181         list->internal = internal;
1182
1183         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1184                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1185                                 &open_int, &sw_fallback_lm);
1186                 if (ret < 0)
1187                         goto error;
1188         }
1189         internal->sw_lm = sw_fallback_lm;
1190
1191         internal->did = rte_vdpa_register_device(&internal->dev_addr,
1192                                 &ifcvf_ops);
1193         if (internal->did < 0) {
1194                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1195                 goto error;
1196         }
1197
1198         pthread_mutex_lock(&internal_list_lock);
1199         TAILQ_INSERT_TAIL(&internal_list, list, next);
1200         pthread_mutex_unlock(&internal_list_lock);
1201
1202         rte_atomic32_set(&internal->started, 1);
1203         update_datapath(internal);
1204
1205         rte_kvargs_free(kvlist);
1206         return 0;
1207
1208 error:
1209         rte_kvargs_free(kvlist);
1210         rte_free(list);
1211         rte_free(internal);
1212         return -1;
1213 }
1214
1215 static int
1216 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1217 {
1218         struct ifcvf_internal *internal;
1219         struct internal_list *list;
1220
1221         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1222                 return 0;
1223
1224         list = find_internal_resource_by_dev(pci_dev);
1225         if (list == NULL) {
1226                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1227                 return -1;
1228         }
1229
1230         internal = list->internal;
1231         rte_atomic32_set(&internal->started, 0);
1232         update_datapath(internal);
1233
1234         rte_pci_unmap_device(internal->pdev);
1235         rte_vfio_container_destroy(internal->vfio_container_fd);
1236         rte_vdpa_unregister_device(internal->did);
1237
1238         pthread_mutex_lock(&internal_list_lock);
1239         TAILQ_REMOVE(&internal_list, list, next);
1240         pthread_mutex_unlock(&internal_list_lock);
1241
1242         rte_free(list);
1243         rte_free(internal);
1244
1245         return 0;
1246 }
1247
1248 /*
1249  * IFCVF has the same vendor ID and device ID as virtio net PCI
1250  * device, with its specific subsystem vendor ID and device ID.
1251  */
1252 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1253         { .class_id = RTE_CLASS_ANY_ID,
1254           .vendor_id = IFCVF_VENDOR_ID,
1255           .device_id = IFCVF_DEVICE_ID,
1256           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1257           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1258         },
1259
1260         { .vendor_id = 0, /* sentinel */
1261         },
1262 };
1263
1264 static struct rte_pci_driver rte_ifcvf_vdpa = {
1265         .id_table = pci_id_ifcvf_map,
1266         .drv_flags = 0,
1267         .probe = ifcvf_pci_probe,
1268         .remove = ifcvf_pci_remove,
1269 };
1270
1271 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1272 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1273 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
1274
1275 RTE_INIT(ifcvf_vdpa_init_log)
1276 {
1277         ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
1278         if (ifcvf_vdpa_logtype >= 0)
1279                 rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
1280 }