drivers/vdpa/ifc/ifcvf_vdpa.c (dpdk.git)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <linux/virtio_net.h>
#include <stdbool.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vdpa_dev.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.net.ifcvf_vdpa, NOTICE);
#define DRV_LOG(level, fmt, args...) \
        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
                "IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_USED_RING_LEN(size) \
        ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)

#define IFCVF_VDPA_MODE         "vdpa"
#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"

static const char * const ifcvf_valid_arguments[] = {
        IFCVF_VDPA_MODE,
        IFCVF_SW_FALLBACK_LM,
        NULL
};

struct ifcvf_internal {
        struct rte_pci_device *pdev;
        struct ifcvf_hw hw;
        int vfio_container_fd;
        int vfio_group_fd;
        int vfio_dev_fd;
        pthread_t tid;  /* thread for notify relay */
        int epfd;
        int vid;
        struct rte_vdpa_device *vdev;
        uint16_t max_queues;
        uint64_t features;
        rte_atomic32_t started;
        rte_atomic32_t dev_attached;
        rte_atomic32_t running;
        rte_spinlock_t lock;
        bool sw_lm;
        bool sw_fallback_running;
        /* mediated vring for sw fallback */
        struct vring m_vring[IFCVF_MAX_QUEUES * 2];
        /* eventfd for used ring interrupt */
        int intr_fd[IFCVF_MAX_QUEUES * 2];
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);

static struct internal_list *
find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (vdev == list->internal->vdev) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (!rte_pci_addr_cmp(&pdev->addr,
                                        &list->internal->pdev->addr)) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

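/*
 * Bind the VF to a dedicated VFIO container, map its PCI BARs and record
 * the BAR addresses in the ifcvf_hw structure so the base code can program
 * the device registers.
 */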
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
        struct rte_pci_device *dev = internal->pdev;
        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
        int iommu_group_num;
        int i, ret;

        internal->vfio_dev_fd = -1;
        internal->vfio_group_fd = -1;
        internal->vfio_container_fd = -1;

        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
        ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
                        &iommu_group_num);
        if (ret <= 0) {
                DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
                return -1;
        }

        internal->vfio_container_fd = rte_vfio_container_create();
        if (internal->vfio_container_fd < 0)
                return -1;

        internal->vfio_group_fd = rte_vfio_container_group_bind(
                        internal->vfio_container_fd, iommu_group_num);
        if (internal->vfio_group_fd < 0)
                goto err;

        if (rte_pci_map_device(dev))
                goto err;

        internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
                        i++) {
                internal->hw.mem_resource[i].addr =
                        internal->pdev->mem_resource[i].addr;
                internal->hw.mem_resource[i].phys_addr =
                        internal->pdev->mem_resource[i].phys_addr;
                internal->hw.mem_resource[i].len =
                        internal->pdev->mem_resource[i].len;
        }

        return 0;

err:
        rte_vfio_container_destroy(internal->vfio_container_fd);
        return -1;
}

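/*
 * Walk the guest memory table reported by vhost and DMA map (or unmap)
 * each region in the VF's VFIO container, using the guest physical address
 * as IOVA so the device can DMA with guest addresses directly.
 */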
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
        uint32_t i;
        int ret;
        struct rte_vhost_memory *mem = NULL;
        int vfio_container_fd;

        ret = rte_vhost_get_mem_table(internal->vid, &mem);
        if (ret < 0) {
                DRV_LOG(ERR, "failed to get VM memory layout.");
                goto exit;
        }

        vfio_container_fd = internal->vfio_container_fd;

        for (i = 0; i < mem->nregions; i++) {
                struct rte_vhost_mem_region *reg;

                reg = &mem->regions[i];
                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
                        do_map ? "DMA map" : "DMA unmap", i,
                        reg->host_user_addr, reg->guest_phys_addr, reg->size);

                if (do_map) {
                        ret = rte_vfio_container_dma_map(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA map failed.");
                                goto exit;
                        }
                } else {
                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA unmap failed.");
                                goto exit;
                        }
                }
        }

exit:
        if (mem)
                free(mem);
        return ret;
}

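/*
 * Translate a host virtual address into a guest physical address by
 * searching the vhost memory regions; returns 0 if the address does not
 * fall into any region.
 */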
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
        struct rte_vhost_memory *mem = NULL;
        struct rte_vhost_mem_region *reg;
        uint32_t i;
        uint64_t gpa = 0;

        if (rte_vhost_get_mem_table(vid, &mem) < 0)
                goto exit;

        for (i = 0; i < mem->nregions; i++) {
                reg = &mem->regions[i];

                if (hva >= reg->host_user_addr &&
                                hva < reg->host_user_addr + reg->size) {
                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
                        break;
                }
        }

exit:
        if (mem)
                free(mem);
        return gpa;
}

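/*
 * Program the hardware rings with the guest physical addresses of the
 * desc/avail/used rings negotiated through vhost-user, restore the last
 * ring indexes, then start the device datapath.
 */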
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        int i, nr_vring;
        int vid;
        struct rte_vhost_vring vq;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for used ring.");
                        return -1;
                }
                hw->vring[i].used = gpa;

                hw->vring[i].size = vq.size;
                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);
        }
        hw->nr_vring = i;

        return ifcvf_start_hw(&internal->hw);
}

static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i;
        int vid;
        uint64_t features = 0;
        uint64_t log_base = 0, log_size = 0;
        uint64_t len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++)
                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);

        if (internal->sw_lm)
                return;

        rte_vhost_get_negotiated_features(vid, &features);
        if (RTE_VHOST_NEED_LOG(features)) {
                ifcvf_disable_logging(hw);
                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                /*
                 * IFCVF marks dirty memory pages only for packet buffers;
                 * SW helps to mark the used ring as dirty after the device
                 * stops.
                 */
                for (i = 0; i < hw->nr_vring; i++) {
                        len = IFCVF_USED_RING_LEN(hw->vring[i].size);
                        rte_vhost_log_used_vring(vid, i, 0, len);
                }
        }
}

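/*
 * Bind each vring's callfd (and the config interrupt) to a device MSI-X
 * vector through VFIO. When m_rx is true, RX queues get a driver-owned
 * eventfd instead so used-ring interrupts can be relayed in software.
 */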
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;
        struct rte_vhost_vring vring;
        int fd;

        vring.callfd = -1;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = nr_vring + 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *)&irq_set->data;
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

        for (i = 0; i < nr_vring; i++)
                internal->intr_fd[i] = -1;

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
                if ((i & 1) == 0 && m_rx == true) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                DRV_LOG(ERR, "can't setup eventfd: %s",
                                        strerror(errno));
                                return -1;
                        }
                        internal->intr_fd[i] = fd;
                        fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
                }
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        nr_vring = rte_vhost_get_vring_num(internal->vid);
        for (i = 0; i < nr_vring; i++) {
                if (internal->intr_fd[i] >= 0)
                        close(internal->intr_fd[i]);
                internal->intr_fd[i] = -1;
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

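/*
 * Relay thread for the HW datapath: wait on every vring's kickfd with epoll
 * and forward each guest kick to the device notify register via
 * ifcvf_notify_queue().
 */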
static void *
notify_relay(void *arg)
{
        int i, kickfd, epfd, nfds = 0;
        uint32_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
        struct epoll_event ev;
        uint64_t buf;
        int nbytes;
        struct rte_vhost_vring vring;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct ifcvf_hw *hw = &internal->hw;

        q_num = rte_vhost_get_vring_num(internal->vid);

        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        vring.kickfd = -1;
        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (;;) {
                nfds = epoll_wait(epfd, events, q_num, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        qid = events[i].data.u32;
                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(kickfd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        ifcvf_notify_queue(hw, qid);
                }
        }

        return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, notify_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create notify relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

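/*
 * Reconcile the datapath with the current state flags: bring it up when both
 * 'started' and 'dev_attached' are set (DMA map, interrupts, HW start, notify
 * relay), and tear it down in the reverse order when either flag is cleared.
 * Protected by internal->lock.
 */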
static int
update_datapath(struct ifcvf_internal *internal)
{
        int ret;

        rte_spinlock_lock(&internal->lock);

        if (!rte_atomic32_read(&internal->running) &&
            (rte_atomic32_read(&internal->started) &&
             rte_atomic32_read(&internal->dev_attached))) {
                ret = ifcvf_dma_map(internal, 1);
                if (ret)
                        goto err;

                ret = vdpa_enable_vfio_intr(internal, 0);
                if (ret)
                        goto err;

                ret = vdpa_ifcvf_start(internal);
                if (ret)
                        goto err;

                ret = setup_notify_relay(internal);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 1);
        } else if (rte_atomic32_read(&internal->running) &&
                   (!rte_atomic32_read(&internal->started) ||
                    !rte_atomic32_read(&internal->dev_attached))) {
                ret = unset_notify_relay(internal);
                if (ret)
                        goto err;

                vdpa_ifcvf_stop(internal);

                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = ifcvf_dma_map(internal, 0);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 0);
        }

        rte_spinlock_unlock(&internal->lock);
        return 0;
err:
        rte_spinlock_unlock(&internal->lock);
        return ret;
}
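
/*
 * Start the device in software live-migration (mediated) mode: allocate a
 * mediated vring per queue and DMA map it at IFCVF_MEDIATED_VRING. RX queues
 * have their HW used ring pointed at the mediated copy so the relay thread
 * can forward used entries to the guest, while TX queues keep direct I/O.
 */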
static int
m_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i, nr_vring;
        int vid, ret;
        struct rte_vhost_vring vq;
        void *vring_buf;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size;
        uint64_t gpa;

        memset(&vq, 0, sizeof(vq));
        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
                                PAGE_SIZE);
                vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                if (vring_buf == NULL) {
                        DRV_LOG(ERR, "Failed to allocate mediated vring.");
                        goto error;
                }
                vring_init(&internal->m_vring[i], vq.size, vring_buf,
                                PAGE_SIZE);

                ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
                if (ret < 0) {
                        DRV_LOG(ERR, "mediated vring DMA map failed.");
                        goto error;
                }

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                /* Direct I/O for Tx queue, relay for Rx queue */
                if (i & 1) {
                        gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                        if (gpa == 0) {
                                DRV_LOG(ERR, "Failed to get GPA for used ring.");
                                return -1;
                        }
                        hw->vring[i].used = gpa;
                } else {
                        hw->vring[i].used = m_vring_iova +
                                (char *)internal->m_vring[i].used -
                                (char *)internal->m_vring[i].desc;
                }

                hw->vring[i].size = vq.size;

                rte_vhost_get_vring_base(vid, i,
                                &internal->m_vring[i].avail->idx,
                                &internal->m_vring[i].used->idx);

                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);

                m_vring_iova += size;
        }
        hw->nr_vring = nr_vring;

        return ifcvf_start_hw(&internal->hw);

error:
        for (i = 0; i < nr_vring; i++)
                if (internal->m_vring[i].desc)
                        rte_free(internal->m_vring[i].desc);

        return -1;
}

static int
m_ifcvf_stop(struct ifcvf_internal *internal)
{
        int vid;
        uint32_t i;
        struct rte_vhost_vring vq;
        struct ifcvf_hw *hw = &internal->hw;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size, len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++) {
                /* synchronize remaining new used entries if any */
                if ((i & 1) == 0)
                        update_used_ring(internal, i);

                rte_vhost_get_vhost_vring(vid, i, &vq);
                len = IFCVF_USED_RING_LEN(vq.size);
                rte_vhost_log_used_vring(vid, i, 0, len);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
                                PAGE_SIZE);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)internal->m_vring[i].desc,
                        m_vring_iova, size);

                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);
                rte_free(internal->m_vring[i].desc);
                m_vring_iova += size;
        }

        return 0;
}

static void
update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
{
        rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
        rte_vhost_vring_call(internal->vid, qid);
}

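/*
 * Relay thread for software fallback mode: forward guest kicks to the device
 * and, on device interrupts for RX queues, copy used entries from the
 * mediated ring back to the guest ring and trigger the guest callfd.
 */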
static void *
vring_relay(void *arg)
{
        int i, vid, epfd, fd, nfds;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct rte_vhost_vring vring;
        uint16_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 4];
        struct epoll_event ev;
        int nbytes;
        uint64_t buf;

        vid = internal->vid;
        q_num = rte_vhost_get_vring_num(vid);

        /* add notify fd and interrupt fd to epoll */
        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        vring.kickfd = -1;
        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(vid, qid, &vring);
                ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (qid = 0; qid < q_num; qid += 2) {
                ev.events = EPOLLIN | EPOLLPRI;
                /* leave a flag to mark it's for interrupt */
                ev.data.u64 = 1 | qid << 1 |
                        (uint64_t)internal->intr_fd[qid] << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
                                < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
                update_used_ring(internal, qid);
        }

        /* start relay with a first kick */
        for (qid = 0; qid < q_num; qid++)
                ifcvf_notify_queue(&internal->hw, qid);

        /* listen to the events and react accordingly */
        for (;;) {
                nfds = epoll_wait(epfd, events, q_num * 2, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        fd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(fd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        qid = events[i].data.u32 >> 1;

                        if (events[i].data.u32 & 1)
                                update_used_ring(internal, qid);
                        else
                                ifcvf_notify_queue(&internal->hw, qid);
                }
        }

        return NULL;
}

static int
setup_vring_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, vring_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create ring relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_vring_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

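/*
 * Switch a running device from the direct HW datapath to the software
 * fallback datapath used for live migration: stop the HW path, re-arm
 * interrupts for relay, restart the VF with mediated rings and spawn the
 * vring relay thread.
 */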
static int
ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
{
        int ret;
        int vid = internal->vid;

        /* stop the direct IO data path */
        unset_notify_relay(internal);
        vdpa_ifcvf_stop(internal);
        vdpa_disable_vfio_intr(internal);

        ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
        if (ret && ret != -ENOTSUP)
                goto error;

        /* set up interrupt for interrupt relay */
        ret = vdpa_enable_vfio_intr(internal, 1);
        if (ret)
                goto unmap;

        /* config the VF */
        ret = m_ifcvf_start(internal);
        if (ret)
                goto unset_intr;

        /* set up vring relay thread */
        ret = setup_vring_relay(internal);
        if (ret)
                goto stop_vf;

        rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);

        internal->sw_fallback_running = true;

        return 0;

stop_vf:
        m_ifcvf_stop(internal);
unset_intr:
        vdpa_disable_vfio_intr(internal);
unmap:
        ifcvf_dma_map(internal, 0);
error:
        return -1;
}

static int
ifcvf_dev_config(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;
        internal->vid = vid;
        rte_atomic32_set(&internal->dev_attached, 1);
        update_datapath(internal);

        if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
                DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
                                vdev->device->name);

        return 0;
}

static int
ifcvf_dev_close(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;

        if (internal->sw_fallback_running) {
                /* unset ring relay */
                unset_vring_relay(internal);

                /* reset VF */
                m_ifcvf_stop(internal);

                /* remove interrupt setting */
                vdpa_disable_vfio_intr(internal);

                /* unset DMA map for guest memory */
                ifcvf_dma_map(internal, 0);

                internal->sw_fallback_running = false;
        } else {
                rte_atomic32_set(&internal->dev_attached, 0);
                update_datapath(internal);
        }

        return 0;
}

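/*
 * Called when the guest (re)negotiates features. If dirty-page logging is
 * requested, either switch to the software fallback datapath (sw_lm) or map
 * the vhost log buffer for the VF and enable HW logging.
 */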
static int
ifcvf_set_features(int vid)
{
        uint64_t features = 0;
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        uint64_t log_base = 0, log_size = 0;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;
        rte_vhost_get_negotiated_features(vid, &features);

        if (!RTE_VHOST_NEED_LOG(features))
                return 0;

        if (internal->sw_lm) {
                ifcvf_sw_fallback_switchover(internal);
        } else {
                rte_vhost_get_log_base(vid, &log_base, &log_size);
                rte_vfio_container_dma_map(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
        }

        return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        return list->internal->vfio_dev_fd;
}

static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        int ret;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;

        reg.index = ifcvf_get_notify_region(&internal->hw);
        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
        if (ret) {
                DRV_LOG(ERR, "Failed to get device region info: %s",
                                strerror(errno));
                return -1;
        }

        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
        *size = 0x1000;

        return 0;
}

static int
ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
{
        struct internal_list *list;

        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        *queue_num = list->internal->max_queues;

        return 0;
}

static int
ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
        struct internal_list *list;

        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        *features = list->internal->features;

        return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
        RTE_SET_USED(vdev);

        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
        return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
        .get_queue_num = ifcvf_get_queue_num,
        .get_features = ifcvf_get_vdpa_features,
        .get_protocol_features = ifcvf_get_protocol_features,
        .dev_conf = ifcvf_dev_config,
        .dev_close = ifcvf_dev_close,
        .set_vring_state = NULL,
        .set_features = ifcvf_set_features,
        .migration_done = NULL,
        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
        .get_notify_area = ifcvf_get_notify_area,
};

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

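/*
 * Probe is skipped unless the device is passed to DPDK with the "vdpa=1"
 * devarg; "sw-live-migration=1" additionally selects the software fallback
 * for dirty-page logging. Example devargs (the EAL option form may vary by
 * DPDK version): -a 0000:06:00.3,vdpa=1,sw-live-migration=1
 */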
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                struct rte_pci_device *pci_dev)
{
        uint64_t features;
        struct ifcvf_internal *internal = NULL;
        struct internal_list *list = NULL;
        int vdpa_mode = 0;
        int sw_fallback_lm = 0;
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        if (!pci_dev->device.devargs)
                return 1;

        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
                        ifcvf_valid_arguments);
        if (kvlist == NULL)
                return 1;

        /* probe only when vdpa mode is specified */
        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
                        &vdpa_mode);
        if (ret < 0 || vdpa_mode == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
        if (list == NULL)
                goto error;

        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
        if (internal == NULL)
                goto error;

        internal->pdev = pci_dev;
        rte_spinlock_init(&internal->lock);

        if (ifcvf_vfio_setup(internal) < 0) {
                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
                goto error;
        }

        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
                goto error;
        }

        internal->max_queues = IFCVF_MAX_QUEUES;
        features = ifcvf_get_features(&internal->hw);
        internal->features = (features &
                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
                (1ULL << VIRTIO_NET_F_STATUS) |
                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
                (1ULL << VHOST_F_LOG_ALL);

        list->internal = internal;

        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
                                &open_int, &sw_fallback_lm);
                if (ret < 0)
                        goto error;
        }
        internal->sw_lm = sw_fallback_lm;

        internal->vdev = rte_vdpa_register_device(&pci_dev->device, &ifcvf_ops);
        if (internal->vdev == NULL) {
                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
                goto error;
        }

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_atomic32_set(&internal->started, 1);
        update_datapath(internal);

        rte_kvargs_free(kvlist);
        return 0;

error:
        rte_kvargs_free(kvlist);
        rte_free(list);
        rte_free(internal);
        return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
        struct ifcvf_internal *internal;
        struct internal_list *list;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        list = find_internal_resource_by_dev(pci_dev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->started, 0);
        update_datapath(internal);

        rte_pci_unmap_device(internal->pdev);
        rte_vfio_container_destroy(internal->vfio_container_fd);
        rte_vdpa_unregister_device(internal->vdev);

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_free(list);
        rte_free(internal);

        return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as virtio net PCI
 * device, with its specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
        { .class_id = RTE_CLASS_ANY_ID,
          .vendor_id = IFCVF_VENDOR_ID,
          .device_id = IFCVF_DEVICE_ID,
          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
        },

        { .vendor_id = 0, /* sentinel */
        },
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
        .id_table = pci_id_ifcvf_map,
        .drv_flags = 0,
        .probe = ifcvf_pci_probe,
        .remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");