dpdk.git: drivers/vdpa/ifc/ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
13
14 #include <rte_eal_paging.h>
15 #include <rte_malloc.h>
16 #include <rte_memory.h>
17 #include <rte_bus_pci.h>
18 #include <rte_vhost.h>
19 #include <rte_vdpa.h>
20 #include <rte_vdpa_dev.h>
21 #include <rte_vfio.h>
22 #include <rte_spinlock.h>
23 #include <rte_log.h>
24 #include <rte_kvargs.h>
25 #include <rte_devargs.h>
26
27 #include "base/ifcvf.h"
28
29 RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
30 #define DRV_LOG(level, fmt, args...) \
31         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
32                 "IFCVF %s(): " fmt "\n", __func__, ##args)
33
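/*
 * Size in bytes of a used ring with @size entries: the flags/idx header,
 * the used elements and the trailing event word. Used when logging the
 * used rings as dirty for live migration.
 */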
34 #define IFCVF_USED_RING_LEN(size) \
35         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
36
37 #define IFCVF_VDPA_MODE         "vdpa"
38 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
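/*
 * Device arguments understood by this driver: "vdpa" must be non-zero for
 * the device to be probed in vDPA mode, and "sw-live-migration" selects the
 * software relay fallback for dirty-page logging. A typical EAL allow-list
 * entry might look like the following (the PCI address is illustrative):
 *   -a 0000:06:00.3,vdpa=1,sw-live-migration=1
 */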
39
40 static const char * const ifcvf_valid_arguments[] = {
41         IFCVF_VDPA_MODE,
42         IFCVF_SW_FALLBACK_LM,
43         NULL
44 };
45
46 struct ifcvf_internal {
47         struct rte_pci_device *pdev;
48         struct ifcvf_hw hw;
49         int configured;
50         int vfio_container_fd;
51         int vfio_group_fd;
52         int vfio_dev_fd;
53         pthread_t tid;  /* thread for notify relay */
54         int epfd;
55         int vid;
56         struct rte_vdpa_device *vdev;
57         uint16_t max_queues;
58         uint64_t features;
59         rte_atomic32_t started;
60         rte_atomic32_t dev_attached;
61         rte_atomic32_t running;
62         rte_spinlock_t lock;
63         bool sw_lm;
64         bool sw_fallback_running;
65         /* mediated vring for sw fallback */
66         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
67         /* eventfd for used ring interrupt */
68         int intr_fd[IFCVF_MAX_QUEUES * 2];
69 };
70
71 struct internal_list {
72         TAILQ_ENTRY(internal_list) next;
73         struct ifcvf_internal *internal;
74 };
75
76 TAILQ_HEAD(internal_list_head, internal_list);
77 static struct internal_list_head internal_list =
78         TAILQ_HEAD_INITIALIZER(internal_list);
79
80 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
81
82 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
83
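/*
 * Helpers that walk the global internal_list under internal_list_lock and
 * return the entry matching a given vDPA device or PCI address, or NULL if
 * no such device has been probed.
 */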
84 static struct internal_list *
85 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
86 {
87         int found = 0;
88         struct internal_list *list;
89
90         pthread_mutex_lock(&internal_list_lock);
91
92         TAILQ_FOREACH(list, &internal_list, next) {
93                 if (vdev == list->internal->vdev) {
94                         found = 1;
95                         break;
96                 }
97         }
98
99         pthread_mutex_unlock(&internal_list_lock);
100
101         if (!found)
102                 return NULL;
103
104         return list;
105 }
106
107 static struct internal_list *
108 find_internal_resource_by_dev(struct rte_pci_device *pdev)
109 {
110         int found = 0;
111         struct internal_list *list;
112
113         pthread_mutex_lock(&internal_list_lock);
114
115         TAILQ_FOREACH(list, &internal_list, next) {
116                 if (!rte_pci_addr_cmp(&pdev->addr,
117                                         &list->internal->pdev->addr)) {
118                         found = 1;
119                         break;
120                 }
121         }
122
123         pthread_mutex_unlock(&internal_list_lock);
124
125         if (!found)
126                 return NULL;
127
128         return list;
129 }
130
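/*
 * Bind the VF to a freshly created VFIO container and group, map its PCI
 * BARs, and mirror the BAR addresses and lengths into the ifcvf_hw structure
 * so the base code can program the device registers.
 */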
131 static int
132 ifcvf_vfio_setup(struct ifcvf_internal *internal)
133 {
134         struct rte_pci_device *dev = internal->pdev;
135         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
136         int iommu_group_num;
137         int i, ret;
138
139         internal->vfio_dev_fd = -1;
140         internal->vfio_group_fd = -1;
141         internal->vfio_container_fd = -1;
142
143         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
144         ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
145                         &iommu_group_num);
146         if (ret <= 0) {
147                 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
148                 return -1;
149         }
150
151         internal->vfio_container_fd = rte_vfio_container_create();
152         if (internal->vfio_container_fd < 0)
153                 return -1;
154
155         internal->vfio_group_fd = rte_vfio_container_group_bind(
156                         internal->vfio_container_fd, iommu_group_num);
157         if (internal->vfio_group_fd < 0)
158                 goto err;
159
160         if (rte_pci_map_device(dev))
161                 goto err;
162
163         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
164
165         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
166                         i++) {
167                 internal->hw.mem_resource[i].addr =
168                         internal->pdev->mem_resource[i].addr;
169                 internal->hw.mem_resource[i].phys_addr =
170                         internal->pdev->mem_resource[i].phys_addr;
171                 internal->hw.mem_resource[i].len =
172                         internal->pdev->mem_resource[i].len;
173         }
174
175         return 0;
176
177 err:
178         rte_vfio_container_destroy(internal->vfio_container_fd);
179         return -1;
180 }
181
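/*
 * (Un)map every guest memory region reported by vhost into the VFIO
 * container so the VF can DMA directly to guest physical addresses.
 * do_map selects map (1) or unmap (0).
 */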
182 static int
183 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
184 {
185         uint32_t i;
186         int ret;
187         struct rte_vhost_memory *mem = NULL;
188         int vfio_container_fd;
189
190         ret = rte_vhost_get_mem_table(internal->vid, &mem);
191         if (ret < 0) {
192                 DRV_LOG(ERR, "failed to get VM memory layout.");
193                 goto exit;
194         }
195
196         vfio_container_fd = internal->vfio_container_fd;
197
198         for (i = 0; i < mem->nregions; i++) {
199                 struct rte_vhost_mem_region *reg;
200
201                 reg = &mem->regions[i];
202                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
203                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
204                         do_map ? "DMA map" : "DMA unmap", i,
205                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
206
207                 if (do_map) {
208                         ret = rte_vfio_container_dma_map(vfio_container_fd,
209                                 reg->host_user_addr, reg->guest_phys_addr,
210                                 reg->size);
211                         if (ret < 0) {
212                                 DRV_LOG(ERR, "DMA map failed.");
213                                 goto exit;
214                         }
215                 } else {
216                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
217                                 reg->host_user_addr, reg->guest_phys_addr,
218                                 reg->size);
219                         if (ret < 0) {
220                                 DRV_LOG(ERR, "DMA unmap failed.");
221                                 goto exit;
222                         }
223                 }
224         }
225
226 exit:
227         if (mem)
228                 free(mem);
229         return ret;
230 }
231
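/*
 * Translate a host virtual address into a guest physical address by
 * searching the vhost memory regions; returns 0 when no region covers
 * the address.
 */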
232 static uint64_t
233 hva_to_gpa(int vid, uint64_t hva)
234 {
235         struct rte_vhost_memory *mem = NULL;
236         struct rte_vhost_mem_region *reg;
237         uint32_t i;
238         uint64_t gpa = 0;
239
240         if (rte_vhost_get_mem_table(vid, &mem) < 0)
241                 goto exit;
242
243         for (i = 0; i < mem->nregions; i++) {
244                 reg = &mem->regions[i];
245
246                 if (hva >= reg->host_user_addr &&
247                                 hva < reg->host_user_addr + reg->size) {
248                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
249                         break;
250                 }
251         }
252
253 exit:
254         if (mem)
255                 free(mem);
256         return gpa;
257 }
258
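/*
 * Program the hardware with the guest physical addresses, sizes and last
 * avail/used indexes of every vring, then start the datapath in the VF.
 */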
259 static int
260 vdpa_ifcvf_start(struct ifcvf_internal *internal)
261 {
262         struct ifcvf_hw *hw = &internal->hw;
263         int i, nr_vring;
264         int vid;
265         struct rte_vhost_vring vq;
266         uint64_t gpa;
267
268         vid = internal->vid;
269         nr_vring = rte_vhost_get_vring_num(vid);
270         rte_vhost_get_negotiated_features(vid, &hw->req_features);
271
272         for (i = 0; i < nr_vring; i++) {
273                 rte_vhost_get_vhost_vring(vid, i, &vq);
274                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
275                 if (gpa == 0) {
276                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
277                         return -1;
278                 }
279                 hw->vring[i].desc = gpa;
280
281                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
282                 if (gpa == 0) {
283                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
284                         return -1;
285                 }
286                 hw->vring[i].avail = gpa;
287
288                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
289                 if (gpa == 0) {
290                         DRV_LOG(ERR, "Failed to get GPA for used ring.");
291                         return -1;
292                 }
293                 hw->vring[i].used = gpa;
294
295                 hw->vring[i].size = vq.size;
296                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
297                                 &hw->vring[i].last_used_idx);
298         }
299         hw->nr_vring = i;
300
301         return ifcvf_start_hw(&internal->hw);
302 }
303
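/*
 * Stop the VF datapath and hand the last ring indexes back to vhost. For
 * hardware-assisted live migration, also disable device logging and mark
 * the used rings dirty, since the device does not track them itself.
 */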
304 static void
305 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
306 {
307         struct ifcvf_hw *hw = &internal->hw;
308         uint32_t i;
309         int vid;
310         uint64_t features = 0;
311         uint64_t log_base = 0, log_size = 0;
312         uint64_t len;
313
314         vid = internal->vid;
315         ifcvf_stop_hw(hw);
316
317         for (i = 0; i < hw->nr_vring; i++)
318                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
319                                 hw->vring[i].last_used_idx);
320
321         if (internal->sw_lm)
322                 return;
323
324         rte_vhost_get_negotiated_features(vid, &features);
325         if (RTE_VHOST_NEED_LOG(features)) {
326                 ifcvf_disable_logging(hw);
327                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
328                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
329                                 log_base, IFCVF_LOG_BASE, log_size);
330                 /*
331                  * IFCVF only marks dirty pages for the packet buffers; software
332                  * marks the used rings as dirty after the device has stopped.
333                  */
334                 for (i = 0; i < hw->nr_vring; i++) {
335                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
336                         rte_vhost_log_used_vring(vid, i, 0, len);
337                 }
338         }
339 }
340
341 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
342                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
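/*
 * Enable MSI-X routing through VFIO: vector 0 carries the device config
 * interrupt and vectors 1..N are wired to the vring callfds. When m_rx is
 * set (software relay mode), the even (Rx) vrings get a private eventfd
 * instead, so the relay thread can intercept used-ring interrupts.
 */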
343 static int
344 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
345 {
346         int ret;
347         uint32_t i, nr_vring;
348         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
349         struct vfio_irq_set *irq_set;
350         int *fd_ptr;
351         struct rte_vhost_vring vring;
352         int fd;
353
354         vring.callfd = -1;
355
356         nr_vring = rte_vhost_get_vring_num(internal->vid);
357
358         irq_set = (struct vfio_irq_set *)irq_set_buf;
359         irq_set->argsz = sizeof(irq_set_buf);
360         irq_set->count = nr_vring + 1;
361         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
362                          VFIO_IRQ_SET_ACTION_TRIGGER;
363         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
364         irq_set->start = 0;
365         fd_ptr = (int *)&irq_set->data;
366         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
367
368         for (i = 0; i < nr_vring; i++)
369                 internal->intr_fd[i] = -1;
370
371         for (i = 0; i < nr_vring; i++) {
372                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
373                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
374                 if ((i & 1) == 0 && m_rx == true) {
375                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
376                         if (fd < 0) {
377                                 DRV_LOG(ERR, "can't setup eventfd: %s",
378                                         strerror(errno));
379                                 return -1;
380                         }
381                         internal->intr_fd[i] = fd;
382                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
383                 }
384         }
385
386         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
387         if (ret) {
388                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
389                                 strerror(errno));
390                 return -1;
391         }
392
393         return 0;
394 }
395
396 static int
397 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
398 {
399         int ret;
400         uint32_t i, nr_vring;
401         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
402         struct vfio_irq_set *irq_set;
403
404         irq_set = (struct vfio_irq_set *)irq_set_buf;
405         irq_set->argsz = sizeof(irq_set_buf);
406         irq_set->count = 0;
407         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
408         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
409         irq_set->start = 0;
410
411         nr_vring = rte_vhost_get_vring_num(internal->vid);
412         for (i = 0; i < nr_vring; i++) {
413                 if (internal->intr_fd[i] >= 0)
414                         close(internal->intr_fd[i]);
415                 internal->intr_fd[i] = -1;
416         }
417
418         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
419         if (ret) {
420                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
421                                 strerror(errno));
422                 return -1;
423         }
424
425         return 0;
426 }
427
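/*
 * Notify relay thread for the pass-through datapath: epoll on each vring's
 * kickfd and forward every guest kick to the device notify region.
 */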
428 static void *
429 notify_relay(void *arg)
430 {
431         int i, kickfd, epfd, nfds = 0;
432         uint32_t qid, q_num;
433         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
434         struct epoll_event ev;
435         uint64_t buf;
436         int nbytes;
437         struct rte_vhost_vring vring;
438         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
439         struct ifcvf_hw *hw = &internal->hw;
440
441         q_num = rte_vhost_get_vring_num(internal->vid);
442
443         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
444         if (epfd < 0) {
445                 DRV_LOG(ERR, "failed to create epoll instance.");
446                 return NULL;
447         }
448         internal->epfd = epfd;
449
450         vring.kickfd = -1;
451         for (qid = 0; qid < q_num; qid++) {
452                 ev.events = EPOLLIN | EPOLLPRI;
453                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
454                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
455                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
456                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
457                         return NULL;
458                 }
459         }
460
461         for (;;) {
462                 nfds = epoll_wait(epfd, events, q_num, -1);
463                 if (nfds < 0) {
464                         if (errno == EINTR)
465                                 continue;
466                         DRV_LOG(ERR, "epoll_wait returned an error");
467                         return NULL;
468                 }
469
470                 for (i = 0; i < nfds; i++) {
471                         qid = events[i].data.u32;
472                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
473                         do {
474                                 nbytes = read(kickfd, &buf, 8);
475                                 if (nbytes < 0) {
476                                         if (errno == EINTR ||
477                                             errno == EWOULDBLOCK ||
478                                             errno == EAGAIN)
479                                                 continue;
480                                         DRV_LOG(INFO, "Error reading "
481                                                 "kickfd: %s",
482                                                 strerror(errno));
483                                 }
484                                 break;
485                         } while (1);
486
487                         ifcvf_notify_queue(hw, qid);
488                 }
489         }
490
491         return NULL;
492 }
493
494 static int
495 setup_notify_relay(struct ifcvf_internal *internal)
496 {
497         int ret;
498
499         ret = pthread_create(&internal->tid, NULL, notify_relay,
500                         (void *)internal);
501         if (ret) {
502                 DRV_LOG(ERR, "failed to create notify relay pthread.");
503                 return -1;
504         }
505         return 0;
506 }
507
508 static int
509 unset_notify_relay(struct ifcvf_internal *internal)
510 {
511         void *status;
512
513         if (internal->tid) {
514                 pthread_cancel(internal->tid);
515                 pthread_join(internal->tid, &status);
516         }
517         internal->tid = 0;
518
519         if (internal->epfd >= 0)
520                 close(internal->epfd);
521         internal->epfd = -1;
522
523         return 0;
524 }
525
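/*
 * Reconcile the datapath with the started/dev_attached flags: bring it up
 * (DMA map, interrupts, HW start, notify relay) when both are set, and tear
 * it down when either is cleared. Protected by internal->lock.
 */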
526 static int
527 update_datapath(struct ifcvf_internal *internal)
528 {
529         int ret;
530
531         rte_spinlock_lock(&internal->lock);
532
533         if (!rte_atomic32_read(&internal->running) &&
534             (rte_atomic32_read(&internal->started) &&
535              rte_atomic32_read(&internal->dev_attached))) {
536                 ret = ifcvf_dma_map(internal, 1);
537                 if (ret)
538                         goto err;
539
540                 ret = vdpa_enable_vfio_intr(internal, 0);
541                 if (ret)
542                         goto err;
543
544                 ret = vdpa_ifcvf_start(internal);
545                 if (ret)
546                         goto err;
547
548                 ret = setup_notify_relay(internal);
549                 if (ret)
550                         goto err;
551
552                 rte_atomic32_set(&internal->running, 1);
553         } else if (rte_atomic32_read(&internal->running) &&
554                    (!rte_atomic32_read(&internal->started) ||
555                     !rte_atomic32_read(&internal->dev_attached))) {
556                 ret = unset_notify_relay(internal);
557                 if (ret)
558                         goto err;
559
560                 vdpa_ifcvf_stop(internal);
561
562                 ret = vdpa_disable_vfio_intr(internal);
563                 if (ret)
564                         goto err;
565
566                 ret = ifcvf_dma_map(internal, 0);
567                 if (ret)
568                         goto err;
569
570                 rte_atomic32_set(&internal->running, 0);
571         }
572
573         rte_spinlock_unlock(&internal->lock);
574         return 0;
575 err:
576         rte_spinlock_unlock(&internal->lock);
577         return ret;
578 }
579
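/*
 * Start the VF in mediated (software live-migration) mode: allocate a local
 * mirror of each vring and DMA-map it at the IFCVF_MEDIATED_VRING IOVA.
 * Tx queues keep using the guest used ring directly, while Rx queues write
 * into the mirrored used ring that the relay thread copies back to the guest.
 */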
580 static int
581 m_ifcvf_start(struct ifcvf_internal *internal)
582 {
583         struct ifcvf_hw *hw = &internal->hw;
584         uint32_t i, nr_vring;
585         int vid, ret;
586         struct rte_vhost_vring vq;
587         void *vring_buf;
588         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
589         uint64_t size;
590         uint64_t gpa;
591
592         memset(&vq, 0, sizeof(vq));
593         vid = internal->vid;
594         nr_vring = rte_vhost_get_vring_num(vid);
595         rte_vhost_get_negotiated_features(vid, &hw->req_features);
596
597         for (i = 0; i < nr_vring; i++) {
598                 rte_vhost_get_vhost_vring(vid, i, &vq);
599
600                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
601                                 rte_mem_page_size());
602                 vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
                if (vring_buf == NULL) {
                        DRV_LOG(ERR, "Failed to allocate mediated vring.");
                        goto error;
                }
603                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
604                                 rte_mem_page_size());
605
606                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
607                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
608                 if (ret < 0) {
609                         DRV_LOG(ERR, "mediated vring DMA map failed.");
610                         goto error;
611                 }
612
613                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
614                 if (gpa == 0) {
615                         DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
616                         return -1;
617                 }
618                 hw->vring[i].desc = gpa;
619
620                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
621                 if (gpa == 0) {
622                         DRV_LOG(ERR, "Fail to get GPA for available ring.");
623                         return -1;
624                 }
625                 hw->vring[i].avail = gpa;
626
627                 /* Direct I/O for Tx queue, relay for Rx queue */
628                 if (i & 1) {
629                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
630                         if (gpa == 0) {
631                                 DRV_LOG(ERR, "Fail to get GPA for used ring.");
632                                 return -1;
633                         }
634                         hw->vring[i].used = gpa;
635                 } else {
636                         hw->vring[i].used = m_vring_iova +
637                                 (char *)internal->m_vring[i].used -
638                                 (char *)internal->m_vring[i].desc;
639                 }
640
641                 hw->vring[i].size = vq.size;
642
643                 rte_vhost_get_vring_base(vid, i,
644                                 &internal->m_vring[i].avail->idx,
645                                 &internal->m_vring[i].used->idx);
646
647                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
648                                 &hw->vring[i].last_used_idx);
649
650                 m_vring_iova += size;
651         }
652         hw->nr_vring = nr_vring;
653
654         return ifcvf_start_hw(&internal->hw);
655
656 error:
657         for (i = 0; i < nr_vring; i++)
658                 if (internal->m_vring[i].desc)
659                         rte_free(internal->m_vring[i].desc);
660
661         return -1;
662 }
663
664 static int
665 m_ifcvf_stop(struct ifcvf_internal *internal)
666 {
667         int vid;
668         uint32_t i;
669         struct rte_vhost_vring vq;
670         struct ifcvf_hw *hw = &internal->hw;
671         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
672         uint64_t size, len;
673
674         vid = internal->vid;
675         ifcvf_stop_hw(hw);
676
677         for (i = 0; i < hw->nr_vring; i++) {
678                 /* synchronize remaining new used entries if any */
679                 if ((i & 1) == 0)
680                         update_used_ring(internal, i);
681
682                 rte_vhost_get_vhost_vring(vid, i, &vq);
683                 len = IFCVF_USED_RING_LEN(vq.size);
684                 rte_vhost_log_used_vring(vid, i, 0, len);
685
686                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
687                                 rte_mem_page_size());
688                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
689                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
690                         m_vring_iova, size);
691
692                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
693                                 hw->vring[i].last_used_idx);
694                 rte_free(internal->m_vring[i].desc);
695                 m_vring_iova += size;
696         }
697
698         return 0;
699 }
700
701 static void
702 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
703 {
704         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
705         rte_vhost_vring_call(internal->vid, qid);
706 }
707
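/*
 * Relay thread for the mediated datapath: forwards guest kicks to the device
 * and, on device interrupts (epoll entries tagged with bit 0 set), relays the
 * mirrored used ring back to the guest and signals the callfd.
 */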
708 static void *
709 vring_relay(void *arg)
710 {
711         int i, vid, epfd, fd, nfds;
712         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
713         struct rte_vhost_vring vring;
714         uint16_t qid, q_num;
715         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
716         struct epoll_event ev;
717         int nbytes;
718         uint64_t buf;
719
720         vid = internal->vid;
721         q_num = rte_vhost_get_vring_num(vid);
722
723         /* add notify fd and interrupt fd to epoll */
724         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
725         if (epfd < 0) {
726                 DRV_LOG(ERR, "failed to create epoll instance.");
727                 return NULL;
728         }
729         internal->epfd = epfd;
730
731         vring.kickfd = -1;
732         for (qid = 0; qid < q_num; qid++) {
733                 ev.events = EPOLLIN | EPOLLPRI;
734                 rte_vhost_get_vhost_vring(vid, qid, &vring);
735                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
736                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
737                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
738                         return NULL;
739                 }
740         }
741
742         for (qid = 0; qid < q_num; qid += 2) {
743                 ev.events = EPOLLIN | EPOLLPRI;
744                 /* set bit 0 to mark this entry as an interrupt fd */
745                 ev.data.u64 = 1 | qid << 1 |
746                         (uint64_t)internal->intr_fd[qid] << 32;
747                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
748                                 < 0) {
749                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
750                         return NULL;
751                 }
752                 update_used_ring(internal, qid);
753         }
754
755         /* start relay with a first kick */
756         for (qid = 0; qid < q_num; qid++)
757                 ifcvf_notify_queue(&internal->hw, qid);
758
759         /* listen to the events and react accordingly */
760         for (;;) {
761                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
762                 if (nfds < 0) {
763                         if (errno == EINTR)
764                                 continue;
765                         DRV_LOG(ERR, "epoll_wait returned an error");
766                         return NULL;
767                 }
768
769                 for (i = 0; i < nfds; i++) {
770                         fd = (uint32_t)(events[i].data.u64 >> 32);
771                         do {
772                                 nbytes = read(fd, &buf, 8);
773                                 if (nbytes < 0) {
774                                         if (errno == EINTR ||
775                                             errno == EWOULDBLOCK ||
776                                             errno == EAGAIN)
777                                                 continue;
778                                         DRV_LOG(INFO, "Error reading "
779                                                 "kickfd: %s",
780                                                 strerror(errno));
781                                 }
782                                 break;
783                         } while (1);
784
785                         qid = events[i].data.u32 >> 1;
786
787                         if (events[i].data.u32 & 1)
788                                 update_used_ring(internal, qid);
789                         else
790                                 ifcvf_notify_queue(&internal->hw, qid);
791                 }
792         }
793
794         return NULL;
795 }
796
797 static int
798 setup_vring_relay(struct ifcvf_internal *internal)
799 {
800         int ret;
801
802         ret = pthread_create(&internal->tid, NULL, vring_relay,
803                         (void *)internal);
804         if (ret) {
805                 DRV_LOG(ERR, "failed to create ring relay pthread.");
806                 return -1;
807         }
808         return 0;
809 }
810
811 static int
812 unset_vring_relay(struct ifcvf_internal *internal)
813 {
814         void *status;
815
816         if (internal->tid) {
817                 pthread_cancel(internal->tid);
818                 pthread_join(internal->tid, &status);
819         }
820         internal->tid = 0;
821
822         if (internal->epfd >= 0)
823                 close(internal->epfd);
824         internal->epfd = -1;
825
826         return 0;
827 }
828
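/*
 * Switch a running device from the pass-through datapath to the mediated
 * datapath when the guest requests dirty-page logging: stop the direct I/O
 * path, re-arm interrupts for the relay, reconfigure the VF on the mirrored
 * rings and start the vring relay thread.
 */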
829 static int
830 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
831 {
832         int ret;
833         int vid = internal->vid;
834
835         /* stop the direct IO data path */
836         unset_notify_relay(internal);
837         vdpa_ifcvf_stop(internal);
838         vdpa_disable_vfio_intr(internal);
839
840         ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
841         if (ret && ret != -ENOTSUP)
842                 goto error;
843
844         /* set up interrupt for interrupt relay */
845         ret = vdpa_enable_vfio_intr(internal, 1);
846         if (ret)
847                 goto unmap;
848
849         /* config the VF */
850         ret = m_ifcvf_start(internal);
851         if (ret)
852                 goto unset_intr;
853
854         /* set up vring relay thread */
855         ret = setup_vring_relay(internal);
856         if (ret)
857                 goto stop_vf;
858
859         rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
860
861         internal->sw_fallback_running = true;
862
863         return 0;
864
865 stop_vf:
866         m_ifcvf_stop(internal);
867 unset_intr:
868         vdpa_disable_vfio_intr(internal);
869 unmap:
870         ifcvf_dma_map(internal, 0);
871 error:
872         return -1;
873 }
874
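/*
 * vhost-user vDPA callbacks start here. ifcvf_dev_config() is invoked once
 * the vhost device becomes ready: it attaches the device, brings up the
 * datapath and tries to install host notifiers for doorbell pass-through,
 * falling back to the kick relay thread otherwise.
 */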
875 static int
876 ifcvf_dev_config(int vid)
877 {
878         struct rte_vdpa_device *vdev;
879         struct internal_list *list;
880         struct ifcvf_internal *internal;
881
882         vdev = rte_vhost_get_vdpa_device(vid);
883         list = find_internal_resource_by_vdev(vdev);
884         if (list == NULL) {
885                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
886                 return -1;
887         }
888
889         internal = list->internal;
890         internal->vid = vid;
891         rte_atomic32_set(&internal->dev_attached, 1);
892         update_datapath(internal);
893
894         if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
895                 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
896                                 vdev->device->name);
897
898         internal->configured = 1;
899         return 0;
900 }
901
902 static int
903 ifcvf_dev_close(int vid)
904 {
905         struct rte_vdpa_device *vdev;
906         struct internal_list *list;
907         struct ifcvf_internal *internal;
908
909         vdev = rte_vhost_get_vdpa_device(vid);
910         list = find_internal_resource_by_vdev(vdev);
911         if (list == NULL) {
912                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
913                 return -1;
914         }
915
916         internal = list->internal;
917
918         if (internal->sw_fallback_running) {
919                 /* unset ring relay */
920                 unset_vring_relay(internal);
921
922                 /* reset VF */
923                 m_ifcvf_stop(internal);
924
925                 /* remove interrupt setting */
926                 vdpa_disable_vfio_intr(internal);
927
928                 /* unset DMA map for guest memory */
929                 ifcvf_dma_map(internal, 0);
930
931                 internal->sw_fallback_running = false;
932         } else {
933                 rte_atomic32_set(&internal->dev_attached, 0);
934                 update_datapath(internal);
935         }
936
937         internal->configured = 0;
938         return 0;
939 }
940
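/*
 * Called when vhost features are (re)negotiated. If VHOST_F_LOG_ALL was
 * requested, either hand the log buffer to the hardware logging facility or,
 * with sw-live-migration enabled, switch over to the software relay.
 */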
941 static int
942 ifcvf_set_features(int vid)
943 {
944         uint64_t features = 0;
945         struct rte_vdpa_device *vdev;
946         struct internal_list *list;
947         struct ifcvf_internal *internal;
948         uint64_t log_base = 0, log_size = 0;
949
950         vdev = rte_vhost_get_vdpa_device(vid);
951         list = find_internal_resource_by_vdev(vdev);
952         if (list == NULL) {
953                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
954                 return -1;
955         }
956
957         internal = list->internal;
958         rte_vhost_get_negotiated_features(vid, &features);
959
960         if (!RTE_VHOST_NEED_LOG(features))
961                 return 0;
962
963         if (internal->sw_lm) {
964                 ifcvf_sw_fallback_switchover(internal);
965         } else {
966                 rte_vhost_get_log_base(vid, &log_base, &log_size);
967                 rte_vfio_container_dma_map(internal->vfio_container_fd,
968                                 log_base, IFCVF_LOG_BASE, log_size);
969                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
970         }
971
972         return 0;
973 }
974
975 static int
976 ifcvf_get_vfio_group_fd(int vid)
977 {
978         struct rte_vdpa_device *vdev;
979         struct internal_list *list;
980
981         vdev = rte_vhost_get_vdpa_device(vid);
982         list = find_internal_resource_by_vdev(vdev);
983         if (list == NULL) {
984                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
985                 return -1;
986         }
987
988         return list->internal->vfio_group_fd;
989 }
990
991 static int
992 ifcvf_get_vfio_device_fd(int vid)
993 {
994         struct rte_vdpa_device *vdev;
995         struct internal_list *list;
996
997         vdev = rte_vhost_get_vdpa_device(vid);
998         list = find_internal_resource_by_vdev(vdev);
999         if (list == NULL) {
1000                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1001                 return -1;
1002         }
1003
1004         return list->internal->vfio_dev_fd;
1005 }
1006
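/*
 * Report the VFIO offset and size of a queue's notify (doorbell) area so
 * the vhost library can mmap it and install it as a host notifier for
 * direct doorbell writes.
 */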
1007 static int
1008 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1009 {
1010         struct rte_vdpa_device *vdev;
1011         struct internal_list *list;
1012         struct ifcvf_internal *internal;
1013         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1014         int ret;
1015
1016         vdev = rte_vhost_get_vdpa_device(vid);
1017         list = find_internal_resource_by_vdev(vdev);
1018         if (list == NULL) {
1019                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1020                 return -1;
1021         }
1022
1023         internal = list->internal;
1024
1025         reg.index = ifcvf_get_notify_region(&internal->hw);
1026         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1027         if (ret) {
1028                 DRV_LOG(ERR, "Cannot get device region info: %s",
1029                                 strerror(errno));
1030                 return -1;
1031         }
1032
1033         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1034         *size = 0x1000;
1035
1036         return 0;
1037 }
1038
1039 static int
1040 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1041 {
1042         struct internal_list *list;
1043
1044         list = find_internal_resource_by_vdev(vdev);
1045         if (list == NULL) {
1046                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1047                 return -1;
1048         }
1049
1050         *queue_num = list->internal->max_queues;
1051
1052         return 0;
1053 }
1054
1055 static int
1056 ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1057 {
1058         struct internal_list *list;
1059
1060         list = find_internal_resource_by_vdev(vdev);
1061         if (list == NULL) {
1062                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1063                 return -1;
1064         }
1065
1066         *features = list->internal->features;
1067
1068         return 0;
1069 }
1070
1071 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1072                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1073                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1074                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1075                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1076                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
1077                  1ULL << VHOST_USER_PROTOCOL_F_STATUS)
1078 static int
1079 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1080 {
1081         RTE_SET_USED(vdev);
1082
1083         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1084         return 0;
1085 }
1086
1087 static int
1088 ifcvf_set_vring_state(int vid, int vring, int state)
1089 {
1090         struct rte_vdpa_device *vdev;
1091         struct internal_list *list;
1092         struct ifcvf_internal *internal;
1093         struct ifcvf_hw *hw;
1094         struct ifcvf_pci_common_cfg *cfg;
1095         int ret = 0;
1096
1097         vdev = rte_vhost_get_vdpa_device(vid);
1098         list = find_internal_resource_by_vdev(vdev);
1099         if (list == NULL) {
1100                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1101                 return -1;
1102         }
1103
1104         internal = list->internal;
1105         if (vring < 0 || vring >= internal->max_queues * 2) {
1106                 DRV_LOG(ERR, "Invalid vring index: %d", vring);
1107                 return -1;
1108         }
1109
1110         hw = &internal->hw;
1111         if (!internal->configured)
1112                 goto exit;
1113
1114         cfg = hw->common_cfg;
1115         IFCVF_WRITE_REG16(vring, &cfg->queue_select);
1116         IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);
1117
1118         if (!state && hw->vring[vring].enable) {
1119                 ret = vdpa_disable_vfio_intr(internal);
1120                 if (ret)
1121                         return ret;
1122         }
1123
1124         if (state && !hw->vring[vring].enable) {
1125                 ret = vdpa_enable_vfio_intr(internal, 0);
1126                 if (ret)
1127                         return ret;
1128         }
1129
1130 exit:
1131         hw->vring[vring].enable = !!state;
1132         return 0;
1133 }
1134
1135 static struct rte_vdpa_dev_ops ifcvf_ops = {
1136         .get_queue_num = ifcvf_get_queue_num,
1137         .get_features = ifcvf_get_vdpa_features,
1138         .get_protocol_features = ifcvf_get_protocol_features,
1139         .dev_conf = ifcvf_dev_config,
1140         .dev_close = ifcvf_dev_close,
1141         .set_vring_state = ifcvf_set_vring_state,
1142         .set_features = ifcvf_set_features,
1143         .migration_done = NULL,
1144         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1145         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1146         .get_notify_area = ifcvf_get_notify_area,
1147 };
1148
1149 static inline int
1150 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1151 {
1152         uint16_t *n = extra_args;
1153
1154         if (value == NULL || extra_args == NULL)
1155                 return -EINVAL;
1156
1157         *n = (uint16_t)strtoul(value, NULL, 0);
1158         if (*n == USHRT_MAX && errno == ERANGE)
1159                 return -1;
1160
1161         return 0;
1162 }
1163
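/*
 * PCI probe: only claims the VF when the "vdpa" devarg is present and
 * non-zero, sets up VFIO and the hardware, builds the advertised virtio
 * feature set and registers the device with the vDPA framework.
 */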
1164 static int
1165 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1166                 struct rte_pci_device *pci_dev)
1167 {
1168         uint64_t features;
1169         struct ifcvf_internal *internal = NULL;
1170         struct internal_list *list = NULL;
1171         int vdpa_mode = 0;
1172         int sw_fallback_lm = 0;
1173         struct rte_kvargs *kvlist = NULL;
1174         int ret = 0;
1175
1176         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1177                 return 0;
1178
1179         if (!pci_dev->device.devargs)
1180                 return 1;
1181
1182         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1183                         ifcvf_valid_arguments);
1184         if (kvlist == NULL)
1185                 return 1;
1186
1187         /* probe only when vdpa mode is specified */
1188         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1189                 rte_kvargs_free(kvlist);
1190                 return 1;
1191         }
1192
1193         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1194                         &vdpa_mode);
1195         if (ret < 0 || vdpa_mode == 0) {
1196                 rte_kvargs_free(kvlist);
1197                 return 1;
1198         }
1199
1200         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1201         if (list == NULL)
1202                 goto error;
1203
1204         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1205         if (internal == NULL)
1206                 goto error;
1207
1208         internal->pdev = pci_dev;
1209         rte_spinlock_init(&internal->lock);
1210
1211         if (ifcvf_vfio_setup(internal) < 0) {
1212                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1213                 goto error;
1214         }
1215
1216         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1217                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1218                 goto error;
1219         }
1220
1221         internal->configured = 0;
1222         internal->max_queues = IFCVF_MAX_QUEUES;
1223         features = ifcvf_get_features(&internal->hw);
1224         internal->features = (features &
1225                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1226                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1227                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1228                 (1ULL << VIRTIO_NET_F_STATUS) |
1229                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1230                 (1ULL << VHOST_F_LOG_ALL);
1231
1232         list->internal = internal;
1233
1234         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1235                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1236                                 &open_int, &sw_fallback_lm);
1237                 if (ret < 0)
1238                         goto error;
1239         }
1240         internal->sw_lm = sw_fallback_lm;
1241
1242         internal->vdev = rte_vdpa_register_device(&pci_dev->device, &ifcvf_ops);
1243         if (internal->vdev == NULL) {
1244                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1245                 goto error;
1246         }
1247
1248         pthread_mutex_lock(&internal_list_lock);
1249         TAILQ_INSERT_TAIL(&internal_list, list, next);
1250         pthread_mutex_unlock(&internal_list_lock);
1251
1252         rte_atomic32_set(&internal->started, 1);
1253         update_datapath(internal);
1254
1255         rte_kvargs_free(kvlist);
1256         return 0;
1257
1258 error:
1259         rte_kvargs_free(kvlist);
1260         rte_free(list);
1261         rte_free(internal);
1262         return -1;
1263 }
1264
1265 static int
1266 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1267 {
1268         struct ifcvf_internal *internal;
1269         struct internal_list *list;
1270
1271         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1272                 return 0;
1273
1274         list = find_internal_resource_by_dev(pci_dev);
1275         if (list == NULL) {
1276                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1277                 return -1;
1278         }
1279
1280         internal = list->internal;
1281         rte_atomic32_set(&internal->started, 0);
1282         update_datapath(internal);
1283
1284         rte_pci_unmap_device(internal->pdev);
1285         rte_vfio_container_destroy(internal->vfio_container_fd);
1286         rte_vdpa_unregister_device(internal->vdev);
1287
1288         pthread_mutex_lock(&internal_list_lock);
1289         TAILQ_REMOVE(&internal_list, list, next);
1290         pthread_mutex_unlock(&internal_list_lock);
1291
1292         rte_free(list);
1293         rte_free(internal);
1294
1295         return 0;
1296 }
1297
1298 /*
1299  * IFCVF exposes the same vendor ID and device ID as a virtio-net PCI
1300  * device and is distinguished by its own subsystem vendor and device IDs.
1301  */
1302 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1303         { .class_id = RTE_CLASS_ANY_ID,
1304           .vendor_id = IFCVF_VENDOR_ID,
1305           .device_id = IFCVF_DEVICE_ID,
1306           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1307           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1308         },
1309
1310         { .vendor_id = 0, /* sentinel */
1311         },
1312 };
1313
1314 static struct rte_pci_driver rte_ifcvf_vdpa = {
1315         .id_table = pci_id_ifcvf_map,
1316         .drv_flags = 0,
1317         .probe = ifcvf_pci_probe,
1318         .remove = ifcvf_pci_remove,
1319 };
1320
1321 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1322 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1323 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");