/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <linux/virtio_net.h>
#include <stdbool.h>

#include <rte_eal_paging.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <vdpa_driver.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
#define DRV_LOG(level, fmt, args...) \
        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
                "IFCVF %s(): " fmt "\n", __func__, ##args)

#define IFCVF_USED_RING_LEN(size) \
        ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)

#define IFCVF_VDPA_MODE         "vdpa"
#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"

#define THREAD_NAME_LEN 16

static const char * const ifcvf_valid_arguments[] = {
        IFCVF_VDPA_MODE,
        IFCVF_SW_FALLBACK_LM,
        NULL
};

struct ifcvf_internal {
        struct rte_pci_device *pdev;
        struct ifcvf_hw hw;
        int configured;
        int vfio_container_fd;
        int vfio_group_fd;
        int vfio_dev_fd;
        pthread_t tid;  /* thread for notify relay */
        int epfd;
        int vid;
        struct rte_vdpa_device *vdev;
        uint16_t max_queues;
        uint64_t features;
        rte_atomic32_t started;
        rte_atomic32_t dev_attached;
        rte_atomic32_t running;
        rte_spinlock_t lock;
        bool sw_lm;
        bool sw_fallback_running;
        /* mediated vring for sw fallback */
        struct vring m_vring[IFCVF_MAX_QUEUES * 2];
        /* eventfd for used ring interrupt */
        int intr_fd[IFCVF_MAX_QUEUES * 2];
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);

static struct internal_list *
find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (vdev == list->internal->vdev) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (!rte_pci_addr_cmp(&pdev->addr,
                                        &list->internal->pdev->addr)) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
        struct rte_pci_device *dev = internal->pdev;
        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
        int iommu_group_num;
        int i, ret;

        internal->vfio_dev_fd = -1;
        internal->vfio_group_fd = -1;
        internal->vfio_container_fd = -1;

        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
        ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
                        &iommu_group_num);
        if (ret <= 0) {
                DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
                return -1;
        }

        internal->vfio_container_fd = rte_vfio_container_create();
        if (internal->vfio_container_fd < 0)
                return -1;

        internal->vfio_group_fd = rte_vfio_container_group_bind(
                        internal->vfio_container_fd, iommu_group_num);
        if (internal->vfio_group_fd < 0)
                goto err;

        if (rte_pci_map_device(dev))
                goto err;

        internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);

        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
                        i++) {
                internal->hw.mem_resource[i].addr =
                        internal->pdev->mem_resource[i].addr;
                internal->hw.mem_resource[i].phys_addr =
                        internal->pdev->mem_resource[i].phys_addr;
                internal->hw.mem_resource[i].len =
                        internal->pdev->mem_resource[i].len;
        }

        return 0;

err:
        rte_vfio_container_destroy(internal->vfio_container_fd);
        return -1;
}

static int
ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map)
{
        uint32_t i;
        int ret;
        struct rte_vhost_memory *mem = NULL;
        int vfio_container_fd;

        ret = rte_vhost_get_mem_table(internal->vid, &mem);
        if (ret < 0) {
                DRV_LOG(ERR, "failed to get VM memory layout.");
                goto exit;
        }

        vfio_container_fd = internal->vfio_container_fd;

        for (i = 0; i < mem->nregions; i++) {
                struct rte_vhost_mem_region *reg;

                reg = &mem->regions[i];
                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
                        do_map ? "DMA map" : "DMA unmap", i,
                        reg->host_user_addr, reg->guest_phys_addr, reg->size);

                if (do_map) {
                        ret = rte_vfio_container_dma_map(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA map failed.");
                                goto exit;
                        }
                } else {
                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA unmap failed.");
                                goto exit;
                        }
                }
        }

exit:
        free(mem);
        return ret;
}

static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
        struct rte_vhost_memory *mem = NULL;
        struct rte_vhost_mem_region *reg;
        uint32_t i;
        uint64_t gpa = 0;

        if (rte_vhost_get_mem_table(vid, &mem) < 0)
                goto exit;

        for (i = 0; i < mem->nregions; i++) {
                reg = &mem->regions[i];

                if (hva >= reg->host_user_addr &&
                                hva < reg->host_user_addr + reg->size) {
                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
                        break;
                }
        }

exit:
        free(mem);
        return gpa;
}

static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        int i, nr_vring;
        int vid;
        struct rte_vhost_vring vq;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for used ring.");
                        return -1;
                }
                hw->vring[i].used = gpa;

                hw->vring[i].size = vq.size;
                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);
        }
        hw->nr_vring = i;

        return ifcvf_start_hw(&internal->hw);
}

static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i;
        int vid;
        uint64_t features = 0;
        uint64_t log_base = 0, log_size = 0;
        uint64_t len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++)
                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);

        if (internal->sw_lm)
                return;

        rte_vhost_get_negotiated_features(vid, &features);
        if (RTE_VHOST_NEED_LOG(features)) {
                ifcvf_disable_logging(hw);
                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
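/*
 * Devargs accepted by this driver, e.g. (PCI address is illustrative):
 *   -a <pci_bdf>,vdpa=1[,sw-live-migration=1]
 */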
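/* Find the internal device state backing a given vDPA device handle. */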
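/* Find the internal device state matching a PCI device by bus address. */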
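/*
 * Bind the VF to a dedicated VFIO container and group, map its BARs and
 * cache the BAR addresses in the HW abstraction for later register access.
 */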
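/*
 * Map (or unmap) every guest memory region into the device's VFIO
 * container, using the guest physical address as IOVA so the device can
 * DMA directly on guest buffer addresses.
 */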
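/*
 * Translate a host virtual address into a guest physical address using the
 * vhost memory table. Returns 0 when no region contains the address.
 */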
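/*
 * Program the guest physical addresses, sizes and last ring indexes of all
 * virtqueues into the VF, then start the hardware datapath.
 */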
                /*
                 * IFCVF only marks dirty pages for packet buffers;
                 * software marks the used rings as dirty here, after
                 * the device has stopped.
                 */
                for (i = 0; i < hw->nr_vring; i++) {
                        len = IFCVF_USED_RING_LEN(hw->vring[i].size);
                        rte_vhost_log_used_vring(vid, i, 0, len);
                }
        }
}

#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;
        struct rte_vhost_vring vring;
        int fd;

        vring.callfd = -1;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = nr_vring + 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *)&irq_set->data;
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
                rte_intr_fd_get(internal->pdev->intr_handle);

        for (i = 0; i < nr_vring; i++)
                internal->intr_fd[i] = -1;

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
                if ((i & 1) == 0 && m_rx == true) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                DRV_LOG(ERR, "can't setup eventfd: %s",
                                        strerror(errno));
                                return -1;
                        }
                        internal->intr_fd[i] = fd;
                        fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
                }
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        nr_vring = rte_vhost_get_vring_num(internal->vid);
        for (i = 0; i < nr_vring; i++) {
                if (internal->intr_fd[i] >= 0)
                        close(internal->intr_fd[i]);
                internal->intr_fd[i] = -1;
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static void *
notify_relay(void *arg)
{
        int i, kickfd, epfd, nfds = 0;
        uint32_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
        struct epoll_event ev;
        uint64_t buf;
        int nbytes;
        struct rte_vhost_vring vring;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct ifcvf_hw *hw = &internal->hw;

        q_num = rte_vhost_get_vring_num(internal->vid);

        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        vring.kickfd = -1;
        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (;;) {
                nfds = epoll_wait(epfd, events, q_num, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
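/*
 * Route the device MSI-X vectors: vector 0 to the device config interrupt
 * and one vector per vring to its vhost callfd. With m_rx set (software
 * fallback), RX vrings get a driver-owned eventfd instead, so used-ring
 * updates can be relayed before the guest is interrupted.
 */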
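/*
 * Relay thread for the notify path: poll all vring kickfds with epoll and
 * forward each guest kick to the VF's queue notify register.
 */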
                        DRV_LOG(ERR, "epoll_wait returned error.");
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        qid = events[i].data.u32;
                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(kickfd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        ifcvf_notify_queue(hw, qid);
                }
        }

        return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
        char name[THREAD_NAME_LEN];
        int ret;

        snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
        ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
                                     (void *)internal);
        if (ret != 0) {
                DRV_LOG(ERR, "failed to create notify relay pthread.");
                return -1;
        }

        return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

static int
update_datapath(struct ifcvf_internal *internal)
{
        int ret;

        rte_spinlock_lock(&internal->lock);

        if (!rte_atomic32_read(&internal->running) &&
            (rte_atomic32_read(&internal->started) &&
             rte_atomic32_read(&internal->dev_attached))) {
                ret = ifcvf_dma_map(internal, true);
                if (ret)
                        goto err;

                ret = vdpa_enable_vfio_intr(internal, false);
                if (ret)
                        goto err;

                ret = vdpa_ifcvf_start(internal);
                if (ret)
                        goto err;

                ret = setup_notify_relay(internal);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 1);
        } else if (rte_atomic32_read(&internal->running) &&
                   (!rte_atomic32_read(&internal->started) ||
                    !rte_atomic32_read(&internal->dev_attached))) {
                ret = unset_notify_relay(internal);
                if (ret)
                        goto err;

                vdpa_ifcvf_stop(internal);

                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = ifcvf_dma_map(internal, false);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 0);
        }

        rte_spinlock_unlock(&internal->lock);
        return 0;
err:
        rte_spinlock_unlock(&internal->lock);
        return ret;
}

static int
m_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i, nr_vring;
        int vid, ret;
        struct rte_vhost_vring vq;
        void *vring_buf;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size;
        uint64_t gpa;

        memset(&vq, 0, sizeof(vq));
        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
                                rte_mem_page_size());
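/*
 * Reconcile the datapath state: start the hardware datapath once the device
 * is both started and attached to a vhost connection, and tear it down when
 * either condition goes away. Protected by internal->lock.
 */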
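/*
 * Start the VF for software-assisted live migration: TX queues keep direct
 * I/O, while each RX queue gets a mediated used ring allocated in host
 * memory and mapped at IFCVF_MEDIATED_VRING, so the driver can relay used
 * entries (and dirty-page logging) into the guest ring.
 */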
                vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
                if (vring_buf == NULL) {
                        DRV_LOG(ERR, "Failed to allocate mediated vring.");
                        goto error;
                }
                vring_init(&internal->m_vring[i], vq.size, vring_buf,
                                rte_mem_page_size());

                ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
                if (ret < 0) {
                        DRV_LOG(ERR, "mediated vring DMA map failed.");
                        goto error;
                }

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                /* Direct I/O for Tx queue, relay for Rx queue */
                if (i & 1) {
                        gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                        if (gpa == 0) {
                                DRV_LOG(ERR, "Fail to get GPA for used ring.");
                                return -1;
                        }
                        hw->vring[i].used = gpa;
                } else {
                        hw->vring[i].used = m_vring_iova +
                                (char *)internal->m_vring[i].used -
                                (char *)internal->m_vring[i].desc;
                }

                hw->vring[i].size = vq.size;

                rte_vhost_get_vring_base(vid, i,
                                &internal->m_vring[i].avail->idx,
                                &internal->m_vring[i].used->idx);

                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);

                m_vring_iova += size;
        }
        hw->nr_vring = nr_vring;

        return ifcvf_start_hw(&internal->hw);

error:
        for (i = 0; i < nr_vring; i++)
                rte_free(internal->m_vring[i].desc);

        return -1;
}

static int
m_ifcvf_stop(struct ifcvf_internal *internal)
{
        int vid;
        uint32_t i;
        struct rte_vhost_vring vq;
        struct ifcvf_hw *hw = &internal->hw;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size, len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++) {
                /* synchronize remaining new used entries if any */
                if ((i & 1) == 0)
                        update_used_ring(internal, i);

                rte_vhost_get_vhost_vring(vid, i, &vq);
                len = IFCVF_USED_RING_LEN(vq.size);
                rte_vhost_log_used_vring(vid, i, 0, len);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
                                rte_mem_page_size());
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)internal->m_vring[i].desc,
                        m_vring_iova, size);

                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);
                rte_free(internal->m_vring[i].desc);
                m_vring_iova += size;
        }

        return 0;
}

static void
update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
{
        rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
        rte_vhost_vring_call(internal->vid, qid);
}

static void *
vring_relay(void *arg)
{
        int i, vid, epfd, fd, nfds;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct rte_vhost_vring vring;
        uint16_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 4];
        struct epoll_event ev;
        int nbytes;
        uint64_t buf;

        vid = internal->vid;
        q_num = rte_vhost_get_vring_num(vid);

        /* add notify fd and interrupt fd to epoll */
        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        vring.kickfd = -1;
        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(vid, qid, &vring);
                ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (qid = 0; qid < q_num; qid += 2) {
                ev.events = EPOLLIN | EPOLLPRI;
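/*
 * Relay thread for the software fallback: forwards guest kicks to the VF
 * and, on RX interrupts, copies new used entries from the mediated ring to
 * the guest ring before issuing the vring call.
 */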
                /* set bit 0 to mark this entry as an interrupt eventfd */
                ev.data.u64 = 1 | qid << 1 |
                        (uint64_t)internal->intr_fd[qid] << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
                                < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
                update_used_ring(internal, qid);
        }

        /* start relay with a first kick */
        for (qid = 0; qid < q_num; qid++)
                ifcvf_notify_queue(&internal->hw, qid);

        /* listen to the events and react accordingly */
        for (;;) {
                nfds = epoll_wait(epfd, events, q_num * 2, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait returned error.");
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        fd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(fd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        qid = events[i].data.u32 >> 1;

                        if (events[i].data.u32 & 1)
                                update_used_ring(internal, qid);
                        else
                                ifcvf_notify_queue(&internal->hw, qid);
                }
        }

        return NULL;
}

static int
setup_vring_relay(struct ifcvf_internal *internal)
{
        char name[THREAD_NAME_LEN];
        int ret;

        snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
        ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
                                     (void *)internal);
        if (ret != 0) {
                DRV_LOG(ERR, "failed to create ring relay pthread.");
                return -1;
        }

        return 0;
}

static int
unset_vring_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

static int
ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
{
        int ret;
        int vid = internal->vid;

        /* stop the direct IO data path */
        unset_notify_relay(internal);
        vdpa_ifcvf_stop(internal);
        vdpa_disable_vfio_intr(internal);

        ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
        if (ret && ret != -ENOTSUP)
                goto error;

        /* set up interrupt for interrupt relay */
        ret = vdpa_enable_vfio_intr(internal, true);
        if (ret)
                goto unmap;

        /* config the VF */
        ret = m_ifcvf_start(internal);
        if (ret)
                goto unset_intr;

        /* set up vring relay thread */
        ret = setup_vring_relay(internal);
        if (ret)
                goto stop_vf;

        rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);

        internal->sw_fallback_running = true;

        return 0;

stop_vf:
        m_ifcvf_stop(internal);
unset_intr:
        vdpa_disable_vfio_intr(internal);
unmap:
        ifcvf_dma_map(internal, false);
error:
        return -1;
}

static int
ifcvf_dev_config(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;
        internal->vid = vid;
        rte_atomic32_set(&internal->dev_attached, 1);
        update_datapath(internal);

        if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
                DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
                                vdev->device->name);

        internal->configured = 1;
        return 0;
}

static int
ifcvf_dev_close(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;

        if (internal->sw_fallback_running) {
                /* unset ring relay */
                unset_vring_relay(internal);

                /* reset VF */
                m_ifcvf_stop(internal);

                /* remove interrupt setting */
                vdpa_disable_vfio_intr(internal);

                /* unset DMA map for guest memory */
                ifcvf_dma_map(internal, false);

                internal->sw_fallback_running = false;
        } else {
                rte_atomic32_set(&internal->dev_attached, 0);
                update_datapath(internal);
        }

        internal->configured = 0;
        return 0;
}

static int
ifcvf_set_features(int vid)
{
        uint64_t features = 0;
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        uint64_t log_base = 0, log_size = 0;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;
        rte_vhost_get_negotiated_features(vid, &features);

        if (!RTE_VHOST_NEED_LOG(features))
                return 0;

        if (internal->sw_lm) {
                ifcvf_sw_fallback_switchover(internal);
        } else {
                rte_vhost_get_log_base(vid, &log_base, &log_size);
                rte_vfio_container_dma_map(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
        }

        return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        return list->internal->vfio_dev_fd;
}

static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        int ret;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;

        reg.index = ifcvf_get_notify_region(&internal->hw);
        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
        if (ret) {
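/*
 * Switch a running device from the direct hardware datapath to the mediated
 * (software live-migration) datapath without disrupting the guest.
 */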
                DRV_LOG(ERR, "Can not get device region info: %s",
                                strerror(errno));
                return -1;
        }

        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
        *size = 0x1000;

        return 0;
}

static int
ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
{
        struct internal_list *list;

        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        *queue_num = list->internal->max_queues;

        return 0;
}

static int
ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
        struct internal_list *list;

        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        *features = list->internal->features;

        return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
                 1ULL << VHOST_USER_PROTOCOL_F_STATUS)
static int
ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
        RTE_SET_USED(vdev);

        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
        return 0;
}

static int
ifcvf_set_vring_state(int vid, int vring, int state)
{
        struct rte_vdpa_device *vdev;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct ifcvf_hw *hw;
        struct ifcvf_pci_common_cfg *cfg;
        int ret = 0;

        vdev = rte_vhost_get_vdpa_device(vid);
        list = find_internal_resource_by_vdev(vdev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
                return -1;
        }

        internal = list->internal;
        if (vring < 0 || vring >= internal->max_queues * 2) {
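/*
 * Enable or disable a single vring on the VF and re-program the VFIO MSI-X
 * routing when the enable state changes.
 */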
                DRV_LOG(ERR, "Vring index %d out of range", vring);
                return -1;
        }

        hw = &internal->hw;
        if (!internal->configured)
                goto exit;

        cfg = hw->common_cfg;
        IFCVF_WRITE_REG16(vring, &cfg->queue_select);
        IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);

        if (!state && hw->vring[vring].enable) {
                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        return ret;
        }

        if (state && !hw->vring[vring].enable) {
                ret = vdpa_enable_vfio_intr(internal, false);
                if (ret)
                        return ret;
        }

exit:
        hw->vring[vring].enable = !!state;
        return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
        .get_queue_num = ifcvf_get_queue_num,
        .get_features = ifcvf_get_vdpa_features,
        .get_protocol_features = ifcvf_get_protocol_features,
        .dev_conf = ifcvf_dev_config,
        .dev_close = ifcvf_dev_close,
        .set_vring_state = ifcvf_set_vring_state,
        .set_features = ifcvf_set_features,
        .migration_done = NULL,
        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
        .get_notify_area = ifcvf_get_notify_area,
};

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                struct rte_pci_device *pci_dev)
{
        uint64_t features;
        struct ifcvf_internal *internal = NULL;
        struct internal_list *list = NULL;
        int vdpa_mode = 0;
        int sw_fallback_lm = 0;
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        if (!pci_dev->device.devargs)
                return 1;

        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
                        ifcvf_valid_arguments);
        if (kvlist == NULL)
                return 1;

        /* probe only when vdpa mode is specified */
        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
                        &vdpa_mode);
        if (ret < 0 || vdpa_mode == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
        if (list == NULL)
                goto error;

        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
        if (internal == NULL)
                goto error;

        internal->pdev = pci_dev;
        rte_spinlock_init(&internal->lock);

        if (ifcvf_vfio_setup(internal) < 0) {
                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
                goto error;
        }

        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
                goto error;
        }

        internal->configured = 0;
        internal->max_queues = IFCVF_MAX_QUEUES;
        features = ifcvf_get_features(&internal->hw);
        internal->features = (features &
                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
                (1ULL << VIRTIO_NET_F_STATUS) |
                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
                (1ULL << VHOST_F_LOG_ALL);

        list->internal = internal;

        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
                                &open_int, &sw_fallback_lm);
                if (ret < 0)
                        goto error;
        }
        internal->sw_lm = sw_fallback_lm;

        internal->vdev = rte_vdpa_register_device(&pci_dev->device, &ifcvf_ops);
        if (internal->vdev == NULL) {
                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
                goto error;
        }

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_atomic32_set(&internal->started, 1);
        update_datapath(internal);

        rte_kvargs_free(kvlist);
        return 0;

error:
        rte_kvargs_free(kvlist);
        rte_free(list);
        rte_free(internal);
        return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
        struct ifcvf_internal *internal;
        struct internal_list *list;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        list = find_internal_resource_by_dev(pci_dev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->started, 0);
        update_datapath(internal);

        rte_pci_unmap_device(internal->pdev);
        rte_vfio_container_destroy(internal->vfio_container_fd);
        rte_vdpa_unregister_device(internal->vdev);

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_free(list);
        rte_free(internal);

        return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as virtio net PCI
 * device, with its specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
        { .class_id = RTE_CLASS_ANY_ID,
          .vendor_id = IFCVF_VENDOR_ID,
          .device_id = IFCVF_DEVICE_ID,
          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
        },

        { .vendor_id = 0, /* sentinel */
        },
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
        .id_table = pci_id_ifcvf_map,
        .drv_flags = 0,
        .probe = ifcvf_pci_probe,
        .remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
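/* vDPA callbacks exposed to the vhost library. */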
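/*
 * PCI probe: only claims the device when the 'vdpa=1' devarg is present,
 * sets up VFIO, computes the advertised feature set and registers the vDPA
 * device. 'sw-live-migration=1' selects the software fallback for dirty
 * page logging.
 */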