/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>
#include <stdbool.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
                "IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_USED_RING_LEN(size) \
        ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)

#define IFCVF_VDPA_MODE         "vdpa"
#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"

static const char * const ifcvf_valid_arguments[] = {
        IFCVF_VDPA_MODE,
        IFCVF_SW_FALLBACK_LM,
        NULL
};

static int ifcvf_vdpa_logtype;

struct ifcvf_internal {
        struct rte_vdpa_dev_addr dev_addr;
        struct rte_pci_device *pdev;
        struct ifcvf_hw hw;
        int vfio_container_fd;
        int vfio_group_fd;
        int vfio_dev_fd;
        pthread_t tid;  /* thread for notify relay */
        int epfd;
        int vid;
        int did;
        uint16_t max_queues;
        uint64_t features;
        rte_atomic32_t started;
        rte_atomic32_t dev_attached;
        rte_atomic32_t running;
        rte_spinlock_t lock;
        bool sw_lm;
        bool sw_fallback_running;
        /* mediated vring for sw fallback */
        struct vring m_vring[IFCVF_MAX_QUEUES * 2];
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

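/* Look up the internal device context by vDPA device id (did). */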
static struct internal_list *
find_internal_resource_by_did(int did)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (did == list->internal->did) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

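/* Look up the internal device context by PCI device. */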
static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (pdev == list->internal->pdev) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

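/*
 * Create a VFIO container, bind the device's IOMMU group to it, map the
 * PCI device, and mirror its BAR resources into the HW abstraction.
 */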
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
        struct rte_pci_device *dev = internal->pdev;
        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
        int iommu_group_num;
        int i;

        internal->vfio_dev_fd = -1;
        internal->vfio_group_fd = -1;
        internal->vfio_container_fd = -1;

        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
        rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
                        &iommu_group_num);

        internal->vfio_container_fd = rte_vfio_container_create();
        if (internal->vfio_container_fd < 0)
                return -1;

        internal->vfio_group_fd = rte_vfio_container_group_bind(
                        internal->vfio_container_fd, iommu_group_num);
        if (internal->vfio_group_fd < 0)
                goto err;

        if (rte_pci_map_device(dev))
                goto err;

        internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
                        i++) {
                internal->hw.mem_resource[i].addr =
                        internal->pdev->mem_resource[i].addr;
                internal->hw.mem_resource[i].phys_addr =
                        internal->pdev->mem_resource[i].phys_addr;
                internal->hw.mem_resource[i].len =
                        internal->pdev->mem_resource[i].len;
        }

        return 0;

err:
        rte_vfio_container_destroy(internal->vfio_container_fd);
        return -1;
}

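/*
 * DMA map/unmap all guest memory regions into the VFIO container,
 * according to the vhost memory table of the attached device.
 */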
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
        uint32_t i;
        int ret;
        struct rte_vhost_memory *mem = NULL;
        int vfio_container_fd;

        ret = rte_vhost_get_mem_table(internal->vid, &mem);
        if (ret < 0) {
                DRV_LOG(ERR, "failed to get VM memory layout.");
                goto exit;
        }

        vfio_container_fd = internal->vfio_container_fd;

        for (i = 0; i < mem->nregions; i++) {
                struct rte_vhost_mem_region *reg;

                reg = &mem->regions[i];
                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
                        do_map ? "DMA map" : "DMA unmap", i,
                        reg->host_user_addr, reg->guest_phys_addr, reg->size);

                if (do_map) {
                        ret = rte_vfio_container_dma_map(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA map failed.");
                                goto exit;
                        }
                } else {
                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA unmap failed.");
                                goto exit;
                        }
                }
        }

exit:
        if (mem)
                free(mem);
        return ret;
}

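/* Translate a host virtual address into a guest physical address. */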
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
        struct rte_vhost_memory *mem = NULL;
        struct rte_vhost_mem_region *reg;
        uint32_t i;
        uint64_t gpa = 0;

        if (rte_vhost_get_mem_table(vid, &mem) < 0)
                goto exit;

        for (i = 0; i < mem->nregions; i++) {
                reg = &mem->regions[i];

                if (hva >= reg->host_user_addr &&
                                hva < reg->host_user_addr + reg->size) {
                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
                        break;
                }
        }

exit:
        if (mem)
                free(mem);
        return gpa;
}

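/*
 * Program the VF vrings with guest physical addresses and the saved
 * ring indexes, then start the hardware datapath.
 */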
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        int i, nr_vring;
        int vid;
        struct rte_vhost_vring vq;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for used ring.");
                        return -1;
                }
                hw->vring[i].used = gpa;

                hw->vring[i].size = vq.size;
                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);
        }
        hw->nr_vring = i;

        return ifcvf_start_hw(&internal->hw);
}

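/*
 * Stop the hardware datapath, save the ring indexes back to vhost and,
 * if dirty logging was negotiated, tear the logging setup down.
 */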
static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i;
        int vid;
        uint64_t features;
        uint64_t log_base, log_size;
        uint64_t len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++)
                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);

        if (internal->sw_lm)
                return;

        rte_vhost_get_negotiated_features(vid, &features);
        if (RTE_VHOST_NEED_LOG(features)) {
                ifcvf_disable_logging(hw);
                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                /*
                 * IFCVF marks dirty pages for packet buffers only; software
                 * marks the used rings as dirty after the device stops.
                 */
                for (i = 0; i < hw->nr_vring; i++) {
                        len = IFCVF_USED_RING_LEN(hw->vring[i].size);
                        rte_vhost_log_used_vring(vid, i, 0, len);
                }
        }
}

#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
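/*
 * Route the device config interrupt and one MSI-X vector per vring to
 * the vhost callfd eventfds in a single VFIO_DEVICE_SET_IRQS ioctl.
 */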
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;
        struct rte_vhost_vring vring;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = nr_vring + 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *)&irq_set->data;
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

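/*
 * Notify relay thread: epoll on the vhost kickfds and forward each
 * guest kick to the VF notify register.
 */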
static void *
notify_relay(void *arg)
{
        int i, kickfd, epfd, nfds = 0;
        uint32_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
        struct epoll_event ev;
        uint64_t buf;
        int nbytes;
        struct rte_vhost_vring vring;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct ifcvf_hw *hw = &internal->hw;

        q_num = rte_vhost_get_vring_num(internal->vid);

        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (;;) {
                nfds = epoll_wait(epfd, events, q_num, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        qid = events[i].data.u32;
                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(kickfd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        ifcvf_notify_queue(hw, qid);
                }
        }

        return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, notify_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create notify relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

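/*
 * Reconcile the datapath with the driver state: start it once the driver
 * is started and a vhost device is attached, stop it when either goes
 * away. Serialized by the internal spinlock.
 */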
static int
update_datapath(struct ifcvf_internal *internal)
{
        int ret;

        rte_spinlock_lock(&internal->lock);

        if (!rte_atomic32_read(&internal->running) &&
            (rte_atomic32_read(&internal->started) &&
             rte_atomic32_read(&internal->dev_attached))) {
                ret = ifcvf_dma_map(internal, 1);
                if (ret)
                        goto err;

                ret = vdpa_enable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = vdpa_ifcvf_start(internal);
                if (ret)
                        goto err;

                ret = setup_notify_relay(internal);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 1);
        } else if (rte_atomic32_read(&internal->running) &&
                   (!rte_atomic32_read(&internal->started) ||
                    !rte_atomic32_read(&internal->dev_attached))) {
                ret = unset_notify_relay(internal);
                if (ret)
                        goto err;

                vdpa_ifcvf_stop(internal);

                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = ifcvf_dma_map(internal, 0);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 0);
        }

        rte_spinlock_unlock(&internal->lock);
        return 0;
err:
        rte_spinlock_unlock(&internal->lock);
        return ret;
}

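/*
 * Start the VF for SW-assisted live migration: descriptor and available
 * rings keep pointing at guest memory, while the used rings are redirected
 * to mediated buffers mapped at the reserved IFCVF_MEDIATED_VRING IOVA.
 */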
static int
m_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i, nr_vring;
        int vid, ret;
        struct rte_vhost_vring vq;
        void *vring_buf;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
                                PAGE_SIZE);
                vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                if (vring_buf == NULL) {
                        DRV_LOG(ERR, "Failed to allocate mediated vring.");
                        goto error;
                }
                vring_init(&internal->m_vring[i], vq.size, vring_buf,
                                PAGE_SIZE);

                ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
                if (ret < 0) {
                        DRV_LOG(ERR, "mediated vring DMA map failed.");
                        goto error;
                }

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
                        goto error;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for available ring.");
                        goto error;
                }
                hw->vring[i].avail = gpa;

                hw->vring[i].used = m_vring_iova +
                        (char *)internal->m_vring[i].used -
                        (char *)internal->m_vring[i].desc;

                hw->vring[i].size = vq.size;

                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);

                m_vring_iova += size;
        }
        hw->nr_vring = nr_vring;

        return ifcvf_start_hw(&internal->hw);

error:
        for (i = 0; i < nr_vring; i++)
                if (internal->m_vring[i].desc)
                        rte_free(internal->m_vring[i].desc);

        return -1;
}

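/*
 * Stop the VF used in SW-assisted live migration: log the used rings as
 * dirty, unmap and free the mediated buffers, and save the ring indexes.
 */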
static int
m_ifcvf_stop(struct ifcvf_internal *internal)
{
        int vid;
        uint32_t i;
        struct rte_vhost_vring vq;
        struct ifcvf_hw *hw = &internal->hw;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size, len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                len = IFCVF_USED_RING_LEN(vq.size);
                rte_vhost_log_used_vring(vid, i, 0, len);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
                                PAGE_SIZE);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)internal->m_vring[i].desc,
                        m_vring_iova, size);

                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);
                rte_free(internal->m_vring[i].desc);
                m_vring_iova += size;
        }

        return 0;
}

static int
m_enable_vfio_intr(struct ifcvf_internal *internal)
{
        uint32_t nr_vring;
        struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;
        int ret;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        ret = rte_intr_efd_enable(intr_handle, nr_vring);
        if (ret)
                return -1;

        ret = rte_intr_enable(intr_handle);
        if (ret)
                return -1;

        return 0;
}

static void
m_disable_vfio_intr(struct ifcvf_internal *internal)
{
        struct rte_intr_handle *intr_handle = &internal->pdev->intr_handle;

        rte_intr_efd_disable(intr_handle);
        rte_intr_disable(intr_handle);
}

static void
update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
{
        rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
        rte_vhost_vring_call(internal->vid, qid);
}

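/*
 * Vring relay thread for SW-assisted live migration: forward guest kicks
 * to the VF, and on VF interrupts relay used entries from the mediated
 * ring back to the guest and trigger the callfd.
 */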
static void *
vring_relay(void *arg)
{
        int i, vid, epfd, fd, nfds;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct rte_vhost_vring vring;
        struct rte_intr_handle *intr_handle;
        uint16_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 4];
        struct epoll_event ev;
        int nbytes;
        uint64_t buf;

        vid = internal->vid;
        q_num = rte_vhost_get_vring_num(vid);
        /* prepare the mediated vring */
        for (qid = 0; qid < q_num; qid++)
                rte_vhost_get_vring_base(vid, qid,
                                &internal->m_vring[qid].avail->idx,
                                &internal->m_vring[qid].used->idx);

        /* add notify fd and interrupt fd to epoll */
        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(vid, qid, &vring);
                ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        intr_handle = &internal->pdev->intr_handle;
        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                ev.data.u64 = 1 | qid << 1 |
                        (uint64_t)intr_handle->efds[qid] << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_handle->efds[qid], &ev)
                                < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        /* start relay with a first kick */
        for (qid = 0; qid < q_num; qid++)
                ifcvf_notify_queue(&internal->hw, qid);

        /* listen to the events and react accordingly */
        for (;;) {
                nfds = epoll_wait(epfd, events, q_num * 2, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        fd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(fd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        qid = events[i].data.u32 >> 1;

                        if (events[i].data.u32 & 1)
                                update_used_ring(internal, qid);
                        else
                                ifcvf_notify_queue(&internal->hw, qid);
                }
        }

        return NULL;
}

static int
setup_vring_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, vring_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create ring relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_vring_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

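/*
 * Switch a running device from the direct hardware datapath to the
 * SW-assisted relay datapath when live migration starts.
 */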
static int
ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
{
        int ret;
        int vid = internal->vid;

        /* stop the direct IO data path */
        unset_notify_relay(internal);
        vdpa_ifcvf_stop(internal);
        vdpa_disable_vfio_intr(internal);

        ret = rte_vhost_host_notifier_ctrl(vid, false);
        if (ret && ret != -ENOTSUP)
                goto error;

        /* set up interrupt for interrupt relay */
        ret = m_enable_vfio_intr(internal);
        if (ret)
                goto unmap;

        /* config the VF */
        ret = m_ifcvf_start(internal);
        if (ret)
                goto unset_intr;

        /* set up vring relay thread */
        ret = setup_vring_relay(internal);
        if (ret)
                goto stop_vf;

        rte_vhost_host_notifier_ctrl(vid, true);

        internal->sw_fallback_running = true;

        return 0;

stop_vf:
        m_ifcvf_stop(internal);
unset_intr:
        m_disable_vfio_intr(internal);
unmap:
        ifcvf_dma_map(internal, 0);
error:
        return -1;
}

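/* vDPA dev_conf callback: attach the vhost device and start the datapath. */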
static int
ifcvf_dev_config(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        internal->vid = vid;
        rte_atomic32_set(&internal->dev_attached, 1);
        update_datapath(internal);

        if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
                DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

        return 0;
}

static int
ifcvf_dev_close(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;

        if (internal->sw_fallback_running) {
                /* unset ring relay */
                unset_vring_relay(internal);

                /* reset VF */
                m_ifcvf_stop(internal);

                /* remove interrupt setting */
                m_disable_vfio_intr(internal);

                /* unset DMA map for guest memory */
                ifcvf_dma_map(internal, 0);

                internal->sw_fallback_running = false;
        } else {
                rte_atomic32_set(&internal->dev_attached, 0);
                update_datapath(internal);
        }

        return 0;
}

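/*
 * vDPA set_features callback: when dirty logging is requested, either
 * switch to the SW relay datapath (sw-live-migration=1) or point the
 * hardware logger at the vhost log region.
 */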
static int
ifcvf_set_features(int vid)
{
        uint64_t features;
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        uint64_t log_base, log_size;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        rte_vhost_get_negotiated_features(vid, &features);

        if (!RTE_VHOST_NEED_LOG(features))
                return 0;

        if (internal->sw_lm) {
                ifcvf_sw_fallback_switchover(internal);
        } else {
                rte_vhost_get_log_base(vid, &log_base, &log_size);
                rte_vfio_container_dma_map(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
        }

        return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_dev_fd;
}

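/*
 * Report offset and size of the queue notify register within the device
 * notify region, so that vhost can expose it to the guest.
 */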
static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        int ret;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;

        reg.index = ifcvf_get_notify_region(&internal->hw);
        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
        if (ret) {
                DRV_LOG(ERR, "Failed to get device region info: %s",
                                strerror(errno));
                return -1;
        }

        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
        *size = 0x1000;

        return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *queue_num = list->internal->max_queues;

        return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *features = list->internal->features;

        return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
        return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
        .get_queue_num = ifcvf_get_queue_num,
        .get_features = ifcvf_get_vdpa_features,
        .get_protocol_features = ifcvf_get_protocol_features,
        .dev_conf = ifcvf_dev_config,
        .dev_close = ifcvf_dev_close,
        .set_vring_state = NULL,
        .set_features = ifcvf_set_features,
        .migration_done = NULL,
        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
        .get_notify_area = ifcvf_get_notify_area,
};

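/* Devarg handler: parse an unsigned integer value. */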
static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        errno = 0;
        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

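/*
 * PCI probe: claim the device only when the "vdpa" devarg is set, then
 * set up VFIO, init the hardware and register with the vDPA framework.
 */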
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                struct rte_pci_device *pci_dev)
{
        uint64_t features;
        struct ifcvf_internal *internal = NULL;
        struct internal_list *list = NULL;
        int vdpa_mode = 0;
        int sw_fallback_lm = 0;
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        if (pci_dev->device.devargs == NULL)
                return 1;

        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
                        ifcvf_valid_arguments);
        if (kvlist == NULL)
                return 1;

        /* probe only when vdpa mode is specified */
        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
                        &vdpa_mode);
        if (ret < 0 || vdpa_mode == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
        if (list == NULL)
                goto error;

        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
        if (internal == NULL)
                goto error;

        internal->pdev = pci_dev;
        rte_spinlock_init(&internal->lock);

        if (ifcvf_vfio_setup(internal) < 0) {
                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
                goto error;
        }

        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
                goto error;
        }

        internal->max_queues = IFCVF_MAX_QUEUES;
        features = ifcvf_get_features(&internal->hw);
        internal->features = (features &
                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
                (1ULL << VIRTIO_NET_F_STATUS) |
                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
                (1ULL << VHOST_F_LOG_ALL);

        internal->dev_addr.pci_addr = pci_dev->addr;
        internal->dev_addr.type = PCI_ADDR;
        list->internal = internal;

        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
                                &open_int, &sw_fallback_lm);
                if (ret < 0)
                        goto error;
        }
        internal->sw_lm = sw_fallback_lm;

        internal->did = rte_vdpa_register_device(&internal->dev_addr,
                                &ifcvf_ops);
        if (internal->did < 0) {
                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
                goto error;
        }

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_atomic32_set(&internal->started, 1);
        update_datapath(internal);

        rte_kvargs_free(kvlist);
        return 0;

error:
        rte_kvargs_free(kvlist);
        rte_free(list);
        rte_free(internal);
        return -1;
}

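/* PCI remove: stop the datapath, release VFIO and unregister the device. */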
static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
        struct ifcvf_internal *internal;
        struct internal_list *list;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        list = find_internal_resource_by_dev(pci_dev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->started, 0);
        update_datapath(internal);

        rte_pci_unmap_device(internal->pdev);
        rte_vfio_container_destroy(internal->vfio_container_fd);
        rte_vdpa_unregister_device(internal->did);

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_free(list);
        rte_free(internal);

        return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as the virtio net PCI
 * device, but with its own subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
        { .class_id = RTE_CLASS_ANY_ID,
          .vendor_id = IFCVF_VENDOR_ID,
          .device_id = IFCVF_DEVICE_ID,
          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
        },

        { .vendor_id = 0, /* sentinel */
        },
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
        .id_table = pci_id_ifcvf_map,
        .drv_flags = 0,
        .probe = ifcvf_pci_probe,
        .remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
        ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
        if (ifcvf_vdpa_logtype >= 0)
                rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}