vdpa/ifc: set notify and vring relay thread names
[dpdk.git] drivers/vdpa/ifc/ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
13
14 #include <rte_eal_paging.h>
15 #include <rte_malloc.h>
16 #include <rte_memory.h>
17 #include <rte_bus_pci.h>
18 #include <rte_vhost.h>
19 #include <rte_vdpa.h>
20 #include <rte_vdpa_dev.h>
21 #include <rte_vfio.h>
22 #include <rte_spinlock.h>
23 #include <rte_log.h>
24 #include <rte_kvargs.h>
25 #include <rte_devargs.h>
26
27 #include "base/ifcvf.h"
28
29 RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
30 #define DRV_LOG(level, fmt, args...) \
31         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
32                 "IFCVF %s(): " fmt "\n", __func__, ##args)
33
34 #define IFCVF_USED_RING_LEN(size) \
35         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
36
37 #define IFCVF_VDPA_MODE         "vdpa"
38 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
39
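/*
 * Buffer size for the relay thread names set up below.  Linux limits a
 * thread name to 16 bytes including the terminating NUL, so snprintf()
 * truncates longer names to fit.
 */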
40 #define THREAD_NAME_LEN 16
41
42 static const char * const ifcvf_valid_arguments[] = {
43         IFCVF_VDPA_MODE,
44         IFCVF_SW_FALLBACK_LM,
45         NULL
46 };
47
48 struct ifcvf_internal {
49         struct rte_pci_device *pdev;
50         struct ifcvf_hw hw;
51         int configured;
52         int vfio_container_fd;
53         int vfio_group_fd;
54         int vfio_dev_fd;
55         pthread_t tid;  /* thread for notify/vring relay */
56         int epfd;
57         int vid;
58         struct rte_vdpa_device *vdev;
59         uint16_t max_queues;
60         uint64_t features;
61         rte_atomic32_t started;
62         rte_atomic32_t dev_attached;
63         rte_atomic32_t running;
64         rte_spinlock_t lock;
65         bool sw_lm;
66         bool sw_fallback_running;
67         /* mediated vring for sw fallback */
68         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
69         /* eventfd for used ring interrupt */
70         int intr_fd[IFCVF_MAX_QUEUES * 2];
71 };
72
73 struct internal_list {
74         TAILQ_ENTRY(internal_list) next;
75         struct ifcvf_internal *internal;
76 };
77
78 TAILQ_HEAD(internal_list_head, internal_list);
79 static struct internal_list_head internal_list =
80         TAILQ_HEAD_INITIALIZER(internal_list);
81
82 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
83
84 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
85
86 static struct internal_list *
87 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
88 {
89         int found = 0;
90         struct internal_list *list;
91
92         pthread_mutex_lock(&internal_list_lock);
93
94         TAILQ_FOREACH(list, &internal_list, next) {
95                 if (vdev == list->internal->vdev) {
96                         found = 1;
97                         break;
98                 }
99         }
100
101         pthread_mutex_unlock(&internal_list_lock);
102
103         if (!found)
104                 return NULL;
105
106         return list;
107 }
108
109 static struct internal_list *
110 find_internal_resource_by_dev(struct rte_pci_device *pdev)
111 {
112         int found = 0;
113         struct internal_list *list;
114
115         pthread_mutex_lock(&internal_list_lock);
116
117         TAILQ_FOREACH(list, &internal_list, next) {
118                 if (!rte_pci_addr_cmp(&pdev->addr,
119                                         &list->internal->pdev->addr)) {
120                         found = 1;
121                         break;
122                 }
123         }
124
125         pthread_mutex_unlock(&internal_list_lock);
126
127         if (!found)
128                 return NULL;
129
130         return list;
131 }
132
133 static int
134 ifcvf_vfio_setup(struct ifcvf_internal *internal)
135 {
136         struct rte_pci_device *dev = internal->pdev;
137         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
138         int iommu_group_num;
139         int i, ret;
140
141         internal->vfio_dev_fd = -1;
142         internal->vfio_group_fd = -1;
143         internal->vfio_container_fd = -1;
144
145         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
146         ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
147                         &iommu_group_num);
148         if (ret <= 0) {
149                 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
150                 return -1;
151         }
152
153         internal->vfio_container_fd = rte_vfio_container_create();
154         if (internal->vfio_container_fd < 0)
155                 return -1;
156
157         internal->vfio_group_fd = rte_vfio_container_group_bind(
158                         internal->vfio_container_fd, iommu_group_num);
159         if (internal->vfio_group_fd < 0)
160                 goto err;
161
162         if (rte_pci_map_device(dev))
163                 goto err;
164
165         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
166
167         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
168                         i++) {
169                 internal->hw.mem_resource[i].addr =
170                         internal->pdev->mem_resource[i].addr;
171                 internal->hw.mem_resource[i].phys_addr =
172                         internal->pdev->mem_resource[i].phys_addr;
173                 internal->hw.mem_resource[i].len =
174                         internal->pdev->mem_resource[i].len;
175         }
176
177         return 0;
178
179 err:
180         rte_vfio_container_destroy(internal->vfio_container_fd);
181         return -1;
182 }
183
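/*
 * Map (do_map != 0) or unmap all regions of the guest memory table in
 * the VFIO container, so the VF can DMA using guest physical addresses.
 */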
184 static int
185 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
186 {
187         uint32_t i;
188         int ret;
189         struct rte_vhost_memory *mem = NULL;
190         int vfio_container_fd;
191
192         ret = rte_vhost_get_mem_table(internal->vid, &mem);
193         if (ret < 0) {
194                 DRV_LOG(ERR, "failed to get VM memory layout.");
195                 goto exit;
196         }
197
198         vfio_container_fd = internal->vfio_container_fd;
199
200         for (i = 0; i < mem->nregions; i++) {
201                 struct rte_vhost_mem_region *reg;
202
203                 reg = &mem->regions[i];
204                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
205                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
206                         do_map ? "DMA map" : "DMA unmap", i,
207                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
208
209                 if (do_map) {
210                         ret = rte_vfio_container_dma_map(vfio_container_fd,
211                                 reg->host_user_addr, reg->guest_phys_addr,
212                                 reg->size);
213                         if (ret < 0) {
214                                 DRV_LOG(ERR, "DMA map failed.");
215                                 goto exit;
216                         }
217                 } else {
218                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
219                                 reg->host_user_addr, reg->guest_phys_addr,
220                                 reg->size);
221                         if (ret < 0) {
222                                 DRV_LOG(ERR, "DMA unmap failed.");
223                                 goto exit;
224                         }
225                 }
226         }
227
228 exit:
229         if (mem)
230                 free(mem);
231         return ret;
232 }
233
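/*
 * Translate a host virtual address to a guest physical address using the
 * vhost memory table.  Returns 0 when the address is not covered by any
 * region, which callers treat as an error.
 */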
234 static uint64_t
235 hva_to_gpa(int vid, uint64_t hva)
236 {
237         struct rte_vhost_memory *mem = NULL;
238         struct rte_vhost_mem_region *reg;
239         uint32_t i;
240         uint64_t gpa = 0;
241
242         if (rte_vhost_get_mem_table(vid, &mem) < 0)
243                 goto exit;
244
245         for (i = 0; i < mem->nregions; i++) {
246                 reg = &mem->regions[i];
247
248                 if (hva >= reg->host_user_addr &&
249                                 hva < reg->host_user_addr + reg->size) {
250                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
251                         break;
252                 }
253         }
254
255 exit:
256         if (mem)
257                 free(mem);
258         return gpa;
259 }
260
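/*
 * Program the guest physical addresses, sizes and last indexes of all
 * vrings into the VF, then start the hardware datapath.
 */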
261 static int
262 vdpa_ifcvf_start(struct ifcvf_internal *internal)
263 {
264         struct ifcvf_hw *hw = &internal->hw;
265         int i, nr_vring;
266         int vid;
267         struct rte_vhost_vring vq;
268         uint64_t gpa;
269
270         vid = internal->vid;
271         nr_vring = rte_vhost_get_vring_num(vid);
272         rte_vhost_get_negotiated_features(vid, &hw->req_features);
273
274         for (i = 0; i < nr_vring; i++) {
275                 rte_vhost_get_vhost_vring(vid, i, &vq);
276                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
277                 if (gpa == 0) {
278                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
279                         return -1;
280                 }
281                 hw->vring[i].desc = gpa;
282
283                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
284                 if (gpa == 0) {
285                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
286                         return -1;
287                 }
288                 hw->vring[i].avail = gpa;
289
290                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
291                 if (gpa == 0) {
292                         DRV_LOG(ERR, "Failed to get GPA for used ring.");
293                         return -1;
294                 }
295                 hw->vring[i].used = gpa;
296
297                 hw->vring[i].size = vq.size;
298                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
299                                 &hw->vring[i].last_used_idx);
300         }
301         hw->nr_vring = i;
302
303         return ifcvf_start_hw(&internal->hw);
304 }
305
306 static void
307 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
308 {
309         struct ifcvf_hw *hw = &internal->hw;
310         uint32_t i;
311         int vid;
312         uint64_t features = 0;
313         uint64_t log_base = 0, log_size = 0;
314         uint64_t len;
315
316         vid = internal->vid;
317         ifcvf_stop_hw(hw);
318
319         for (i = 0; i < hw->nr_vring; i++)
320                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
321                                 hw->vring[i].last_used_idx);
322
323         if (internal->sw_lm)
324                 return;
325
326         rte_vhost_get_negotiated_features(vid, &features);
327         if (RTE_VHOST_NEED_LOG(features)) {
328                 ifcvf_disable_logging(hw);
329                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
330                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
331                                 log_base, IFCVF_LOG_BASE, log_size);
332                 /*
333                  * IFCVF marks dirty pages only for the packet buffers, so
334                  * software marks the used rings as dirty after the device stops.
335                  */
336                 for (i = 0; i < hw->nr_vring; i++) {
337                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
338                         rte_vhost_log_used_vring(vid, i, 0, len);
339                 }
340         }
341 }
342
343 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
344                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
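/*
 * Route the device MSI-X vectors to eventfds.  Normally each queue vector
 * goes to the guest's callfd; when m_rx is true, RX queues (even indexes)
 * get a driver-owned eventfd instead, so the vring relay thread can
 * intercept used ring interrupts during the software fallback.
 */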
345 static int
346 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
347 {
348         int ret;
349         uint32_t i, nr_vring;
350         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
351         struct vfio_irq_set *irq_set;
352         int *fd_ptr;
353         struct rte_vhost_vring vring;
354         int fd;
355
356         vring.callfd = -1;
357
358         nr_vring = rte_vhost_get_vring_num(internal->vid);
359
360         irq_set = (struct vfio_irq_set *)irq_set_buf;
361         irq_set->argsz = sizeof(irq_set_buf);
362         irq_set->count = nr_vring + 1;
363         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364                          VFIO_IRQ_SET_ACTION_TRIGGER;
365         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
366         irq_set->start = 0;
367         fd_ptr = (int *)&irq_set->data;
368         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
369
370         for (i = 0; i < nr_vring; i++)
371                 internal->intr_fd[i] = -1;
372
373         for (i = 0; i < nr_vring; i++) {
374                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
375                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
376                 if ((i & 1) == 0 && m_rx == true) {
377                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
378                         if (fd < 0) {
379                                 DRV_LOG(ERR, "cannot set up eventfd: %s",
380                                         strerror(errno));
381                                 return -1;
382                         }
383                         internal->intr_fd[i] = fd;
384                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
385                 }
386         }
387
388         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
389         if (ret) {
390                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
391                                 strerror(errno));
392                 return -1;
393         }
394
395         return 0;
396 }
397
398 static int
399 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
400 {
401         int ret;
402         uint32_t i, nr_vring;
403         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
404         struct vfio_irq_set *irq_set;
405
406         irq_set = (struct vfio_irq_set *)irq_set_buf;
407         irq_set->argsz = sizeof(irq_set_buf);
408         irq_set->count = 0;
409         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
410         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
411         irq_set->start = 0;
412
413         nr_vring = rte_vhost_get_vring_num(internal->vid);
414         for (i = 0; i < nr_vring; i++) {
415                 if (internal->intr_fd[i] >= 0)
416                         close(internal->intr_fd[i]);
417                 internal->intr_fd[i] = -1;
418         }
419
420         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
421         if (ret) {
422                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
423                                 strerror(errno));
424                 return -1;
425         }
426
427         return 0;
428 }
429
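/*
 * Notify relay thread: wait on the guest kick eventfds with epoll and
 * forward every kick to the VF notify (doorbell) register.
 */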
430 static void *
431 notify_relay(void *arg)
432 {
433         int i, kickfd, epfd, nfds = 0;
434         uint32_t qid, q_num;
435         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
436         struct epoll_event ev;
437         uint64_t buf;
438         int nbytes;
439         struct rte_vhost_vring vring;
440         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
441         struct ifcvf_hw *hw = &internal->hw;
442
443         q_num = rte_vhost_get_vring_num(internal->vid);
444
445         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
446         if (epfd < 0) {
447                 DRV_LOG(ERR, "failed to create epoll instance.");
448                 return NULL;
449         }
450         internal->epfd = epfd;
451
452         vring.kickfd = -1;
453         for (qid = 0; qid < q_num; qid++) {
454                 ev.events = EPOLLIN | EPOLLPRI;
455                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
456                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
457                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
458                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
459                         return NULL;
460                 }
461         }
462
463         for (;;) {
464                 nfds = epoll_wait(epfd, events, q_num, -1);
465                 if (nfds < 0) {
466                         if (errno == EINTR)
467                                 continue;
468                         DRV_LOG(ERR, "epoll_wait returned error: %s", strerror(errno));
469                         return NULL;
470                 }
471
472                 for (i = 0; i < nfds; i++) {
473                         qid = events[i].data.u32;
474                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
475                         do {
476                                 nbytes = read(kickfd, &buf, 8);
477                                 if (nbytes < 0) {
478                                         if (errno == EINTR ||
479                                             errno == EWOULDBLOCK ||
480                                             errno == EAGAIN)
481                                                 continue;
482                                         DRV_LOG(INFO, "Error reading "
483                                                 "kickfd: %s",
484                                                 strerror(errno));
485                                 }
486                                 break;
487                         } while (1);
488
489                         ifcvf_notify_queue(hw, qid);
490                 }
491         }
492
493         return NULL;
494 }
495
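/*
 * Create the notify relay thread with an explicit name, "ifc-notify-<vid>",
 * which rte_ctrl_thread_create() applies to the thread so it can be
 * identified in tools such as ps and top.
 */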
496 static int
497 setup_notify_relay(struct ifcvf_internal *internal)
498 {
499         char name[THREAD_NAME_LEN];
500         int ret;
501
502         snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
503         ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
504                                      (void *)internal);
505         if (ret != 0) {
506                 DRV_LOG(ERR, "failed to create notify relay pthread.");
507                 return -1;
508         }
509
510         return 0;
511 }
512
513 static int
514 unset_notify_relay(struct ifcvf_internal *internal)
515 {
516         void *status;
517
518         if (internal->tid) {
519                 pthread_cancel(internal->tid);
520                 pthread_join(internal->tid, &status);
521         }
522         internal->tid = 0;
523
524         if (internal->epfd >= 0)
525                 close(internal->epfd);
526         internal->epfd = -1;
527
528         return 0;
529 }
530
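/*
 * Bring up the datapath (DMA map, VFIO interrupts, HW start, notify relay)
 * once the device is both started and attached, and tear it down again when
 * either condition is cleared.  Serialized by internal->lock.
 */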
531 static int
532 update_datapath(struct ifcvf_internal *internal)
533 {
534         int ret;
535
536         rte_spinlock_lock(&internal->lock);
537
538         if (!rte_atomic32_read(&internal->running) &&
539             (rte_atomic32_read(&internal->started) &&
540              rte_atomic32_read(&internal->dev_attached))) {
541                 ret = ifcvf_dma_map(internal, 1);
542                 if (ret)
543                         goto err;
544
545                 ret = vdpa_enable_vfio_intr(internal, 0);
546                 if (ret)
547                         goto err;
548
549                 ret = vdpa_ifcvf_start(internal);
550                 if (ret)
551                         goto err;
552
553                 ret = setup_notify_relay(internal);
554                 if (ret)
555                         goto err;
556
557                 rte_atomic32_set(&internal->running, 1);
558         } else if (rte_atomic32_read(&internal->running) &&
559                    (!rte_atomic32_read(&internal->started) ||
560                     !rte_atomic32_read(&internal->dev_attached))) {
561                 ret = unset_notify_relay(internal);
562                 if (ret)
563                         goto err;
564
565                 vdpa_ifcvf_stop(internal);
566
567                 ret = vdpa_disable_vfio_intr(internal);
568                 if (ret)
569                         goto err;
570
571                 ret = ifcvf_dma_map(internal, 0);
572                 if (ret)
573                         goto err;
574
575                 rte_atomic32_set(&internal->running, 0);
576         }
577
578         rte_spinlock_unlock(&internal->lock);
579         return 0;
580 err:
581         rte_spinlock_unlock(&internal->lock);
582         return ret;
583 }
584
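/*
 * Start the VF with mediated vrings for the software live-migration
 * fallback: descriptor and available rings are passed through directly,
 * while RX used rings are redirected to driver-allocated shadow rings
 * (DMA-mapped at IFCVF_MEDIATED_VRING) so software can relay used entries
 * back to the guest rings.
 */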
585 static int
586 m_ifcvf_start(struct ifcvf_internal *internal)
587 {
588         struct ifcvf_hw *hw = &internal->hw;
589         uint32_t i, nr_vring;
590         int vid, ret;
591         struct rte_vhost_vring vq;
592         void *vring_buf;
593         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
594         uint64_t size;
595         uint64_t gpa;
596
597         memset(&vq, 0, sizeof(vq));
598         vid = internal->vid;
599         nr_vring = rte_vhost_get_vring_num(vid);
600         rte_vhost_get_negotiated_features(vid, &hw->req_features);
601
602         for (i = 0; i < nr_vring; i++) {
603                 rte_vhost_get_vhost_vring(vid, i, &vq);
604
605                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
606                                 rte_mem_page_size());
607                 vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
                    if (vring_buf == NULL) {
                            DRV_LOG(ERR, "Failed to allocate mediated vring.");
                            goto error;
                    }
608                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
609                                 rte_mem_page_size());
610
611                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
612                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
613                 if (ret < 0) {
614                         DRV_LOG(ERR, "mediated vring DMA map failed.");
615                         goto error;
616                 }
617
618                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
619                 if (gpa == 0) {
620                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
621                         return -1;
622                 }
623                 hw->vring[i].desc = gpa;
624
625                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
626                 if (gpa == 0) {
627                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
628                         return -1;
629                 }
630                 hw->vring[i].avail = gpa;
631
632                 /* Direct I/O for Tx queue, relay for Rx queue */
633                 if (i & 1) {
634                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
635                         if (gpa == 0) {
636                                 DRV_LOG(ERR, "Failed to get GPA for used ring.");
637                                 return -1;
638                         }
639                         hw->vring[i].used = gpa;
640                 } else {
641                         hw->vring[i].used = m_vring_iova +
642                                 (char *)internal->m_vring[i].used -
643                                 (char *)internal->m_vring[i].desc;
644                 }
645
646                 hw->vring[i].size = vq.size;
647
648                 rte_vhost_get_vring_base(vid, i,
649                                 &internal->m_vring[i].avail->idx,
650                                 &internal->m_vring[i].used->idx);
651
652                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
653                                 &hw->vring[i].last_used_idx);
654
655                 m_vring_iova += size;
656         }
657         hw->nr_vring = nr_vring;
658
659         return ifcvf_start_hw(&internal->hw);
660
661 error:
662         for (i = 0; i < nr_vring; i++)
663                 if (internal->m_vring[i].desc)
664                         rte_free(internal->m_vring[i].desc);
665
666         return -1;
667 }
668
669 static int
670 m_ifcvf_stop(struct ifcvf_internal *internal)
671 {
672         int vid;
673         uint32_t i;
674         struct rte_vhost_vring vq;
675         struct ifcvf_hw *hw = &internal->hw;
676         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
677         uint64_t size, len;
678
679         vid = internal->vid;
680         ifcvf_stop_hw(hw);
681
682         for (i = 0; i < hw->nr_vring; i++) {
683                 /* synchronize remaining new used entries if any */
684                 if ((i & 1) == 0)
685                         update_used_ring(internal, i);
686
687                 rte_vhost_get_vhost_vring(vid, i, &vq);
688                 len = IFCVF_USED_RING_LEN(vq.size);
689                 rte_vhost_log_used_vring(vid, i, 0, len);
690
691                 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
692                                 rte_mem_page_size());
693                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
694                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
695                         m_vring_iova, size);
696
697                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
698                                 hw->vring[i].last_used_idx);
699                 rte_free(internal->m_vring[i].desc);
700                 m_vring_iova += size;
701         }
702
703         return 0;
704 }
705
706 static void
707 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
708 {
709         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
710         rte_vhost_vring_call(internal->vid, qid);
711 }
712
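/*
 * Vring relay thread for the software fallback: guest kicks are forwarded
 * to the VF, and used ring interrupts from the VF trigger update_used_ring(),
 * which copies used entries from the mediated ring to the guest ring and
 * then interrupts the guest.
 */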
713 static void *
714 vring_relay(void *arg)
715 {
716         int i, vid, epfd, fd, nfds;
717         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
718         struct rte_vhost_vring vring;
719         uint16_t qid, q_num;
720         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
721         struct epoll_event ev;
722         int nbytes;
723         uint64_t buf;
724
725         vid = internal->vid;
726         q_num = rte_vhost_get_vring_num(vid);
727
728         /* add notify fd and interrupt fd to epoll */
729         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
730         if (epfd < 0) {
731                 DRV_LOG(ERR, "failed to create epoll instance.");
732                 return NULL;
733         }
734         internal->epfd = epfd;
735
736         vring.kickfd = -1;
737         for (qid = 0; qid < q_num; qid++) {
738                 ev.events = EPOLLIN | EPOLLPRI;
739                 rte_vhost_get_vhost_vring(vid, qid, &vring);
740                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
741                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
742                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
743                         return NULL;
744                 }
745         }
746
747         for (qid = 0; qid < q_num; qid += 2) {
748                 ev.events = EPOLLIN | EPOLLPRI;
749                 /* set bit 0 to mark this entry as an interrupt fd */
750                 ev.data.u64 = 1 | qid << 1 |
751                         (uint64_t)internal->intr_fd[qid] << 32;
752                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
753                                 < 0) {
754                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
755                         return NULL;
756                 }
757                 update_used_ring(internal, qid);
758         }
759
760         /* start relay with a first kick */
761         for (qid = 0; qid < q_num; qid++)
762                 ifcvf_notify_queue(&internal->hw, qid);
763
764         /* listen to the events and react accordingly */
765         for (;;) {
766                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
767                 if (nfds < 0) {
768                         if (errno == EINTR)
769                                 continue;
770                         DRV_LOG(ERR, "epoll_wait returned error: %s", strerror(errno));
771                         return NULL;
772                 }
773
774                 for (i = 0; i < nfds; i++) {
775                         fd = (uint32_t)(events[i].data.u64 >> 32);
776                         do {
777                                 nbytes = read(fd, &buf, 8);
778                                 if (nbytes < 0) {
779                                         if (errno == EINTR ||
780                                             errno == EWOULDBLOCK ||
781                                             errno == EAGAIN)
782                                                 continue;
783                                         DRV_LOG(INFO, "Error reading "
784                                                 "kickfd: %s",
785                                                 strerror(errno));
786                                 }
787                                 break;
788                         } while (1);
789
790                         qid = events[i].data.u32 >> 1;
791
792                         if (events[i].data.u32 & 1)
793                                 update_used_ring(internal, qid);
794                         else
795                                 ifcvf_notify_queue(&internal->hw, qid);
796                 }
797         }
798
799         return NULL;
800 }
801
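/* Create the vring relay thread, named "ifc-vring-<vid>" for identification. */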
802 static int
803 setup_vring_relay(struct ifcvf_internal *internal)
804 {
805         char name[THREAD_NAME_LEN];
806         int ret;
807
808         snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
809         ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
810                                      (void *)internal);
811         if (ret != 0) {
812                 DRV_LOG(ERR, "failed to create ring relay pthread.");
813                 return -1;
814         }
815
816         return 0;
817 }
818
819 static int
820 unset_vring_relay(struct ifcvf_internal *internal)
821 {
822         void *status;
823
824         if (internal->tid) {
825                 pthread_cancel(internal->tid);
826                 pthread_join(internal->tid, &status);
827         }
828         internal->tid = 0;
829
830         if (internal->epfd >= 0)
831                 close(internal->epfd);
832         internal->epfd = -1;
833
834         return 0;
835 }
836
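/*
 * Switch from the direct hardware datapath to the software relay datapath.
 * Called from ifcvf_set_features() when dirty page logging is requested
 * and the sw-live-migration devarg is enabled.
 */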
837 static int
838 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
839 {
840         int ret;
841         int vid = internal->vid;
842
843         /* stop the direct IO data path */
844         unset_notify_relay(internal);
845         vdpa_ifcvf_stop(internal);
846         vdpa_disable_vfio_intr(internal);
847
848         ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
849         if (ret && ret != -ENOTSUP)
850                 goto error;
851
852         /* set up interrupts for the interrupt relay */
853         ret = vdpa_enable_vfio_intr(internal, 1);
854         if (ret)
855                 goto unmap;
856
857         /* configure the VF with the mediated vrings */
858         ret = m_ifcvf_start(internal);
859         if (ret)
860                 goto unset_intr;
861
862         /* set up vring relay thread */
863         ret = setup_vring_relay(internal);
864         if (ret)
865                 goto stop_vf;
866
867         rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
868
869         internal->sw_fallback_running = true;
870
871         return 0;
872
873 stop_vf:
874         m_ifcvf_stop(internal);
875 unset_intr:
876         vdpa_disable_vfio_intr(internal);
877 unmap:
878         ifcvf_dma_map(internal, 0);
879 error:
880         return -1;
881 }
882
883 static int
884 ifcvf_dev_config(int vid)
885 {
886         struct rte_vdpa_device *vdev;
887         struct internal_list *list;
888         struct ifcvf_internal *internal;
889
890         vdev = rte_vhost_get_vdpa_device(vid);
891         list = find_internal_resource_by_vdev(vdev);
892         if (list == NULL) {
893                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
894                 return -1;
895         }
896
897         internal = list->internal;
898         internal->vid = vid;
899         rte_atomic32_set(&internal->dev_attached, 1);
900         update_datapath(internal);
901
902         if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
903                 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
904                                 vdev->device->name);
905
906         internal->configured = 1;
907         return 0;
908 }
909
910 static int
911 ifcvf_dev_close(int vid)
912 {
913         struct rte_vdpa_device *vdev;
914         struct internal_list *list;
915         struct ifcvf_internal *internal;
916
917         vdev = rte_vhost_get_vdpa_device(vid);
918         list = find_internal_resource_by_vdev(vdev);
919         if (list == NULL) {
920                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
921                 return -1;
922         }
923
924         internal = list->internal;
925
926         if (internal->sw_fallback_running) {
927                 /* unset ring relay */
928                 unset_vring_relay(internal);
929
930                 /* reset VF */
931                 m_ifcvf_stop(internal);
932
933                 /* remove interrupt setting */
934                 vdpa_disable_vfio_intr(internal);
935
936                 /* unset DMA map for guest memory */
937                 ifcvf_dma_map(internal, 0);
938
939                 internal->sw_fallback_running = false;
940         } else {
941                 rte_atomic32_set(&internal->dev_attached, 0);
942                 update_datapath(internal);
943         }
944
945         internal->configured = 0;
946         return 0;
947 }
948
949 static int
950 ifcvf_set_features(int vid)
951 {
952         uint64_t features = 0;
953         struct rte_vdpa_device *vdev;
954         struct internal_list *list;
955         struct ifcvf_internal *internal;
956         uint64_t log_base = 0, log_size = 0;
957
958         vdev = rte_vhost_get_vdpa_device(vid);
959         list = find_internal_resource_by_vdev(vdev);
960         if (list == NULL) {
961                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
962                 return -1;
963         }
964
965         internal = list->internal;
966         rte_vhost_get_negotiated_features(vid, &features);
967
968         if (!RTE_VHOST_NEED_LOG(features))
969                 return 0;
970
971         if (internal->sw_lm) {
972                 ifcvf_sw_fallback_switchover(internal);
973         } else {
974                 rte_vhost_get_log_base(vid, &log_base, &log_size);
975                 rte_vfio_container_dma_map(internal->vfio_container_fd,
976                                 log_base, IFCVF_LOG_BASE, log_size);
977                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
978         }
979
980         return 0;
981 }
982
983 static int
984 ifcvf_get_vfio_group_fd(int vid)
985 {
986         struct rte_vdpa_device *vdev;
987         struct internal_list *list;
988
989         vdev = rte_vhost_get_vdpa_device(vid);
990         list = find_internal_resource_by_vdev(vdev);
991         if (list == NULL) {
992                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
993                 return -1;
994         }
995
996         return list->internal->vfio_group_fd;
997 }
998
999 static int
1000 ifcvf_get_vfio_device_fd(int vid)
1001 {
1002         struct rte_vdpa_device *vdev;
1003         struct internal_list *list;
1004
1005         vdev = rte_vhost_get_vdpa_device(vid);
1006         list = find_internal_resource_by_vdev(vdev);
1007         if (list == NULL) {
1008                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1009                 return -1;
1010         }
1011
1012         return list->internal->vfio_dev_fd;
1013 }
1014
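/*
 * Report the offset and size of the queue notify (doorbell) area within
 * the VFIO device fd, so vhost can map it as a host notifier for the guest.
 */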
1015 static int
1016 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1017 {
1018         struct rte_vdpa_device *vdev;
1019         struct internal_list *list;
1020         struct ifcvf_internal *internal;
1021         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1022         int ret;
1023
1024         vdev = rte_vhost_get_vdpa_device(vid);
1025         list = find_internal_resource_by_vdev(vdev);
1026         if (list == NULL) {
1027                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1028                 return -1;
1029         }
1030
1031         internal = list->internal;
1032
1033         reg.index = ifcvf_get_notify_region(&internal->hw);
1034         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1035         if (ret) {
1036                 DRV_LOG(ERR, "Can not get device region info: %s",
1037                                 strerror(errno));
1038                 return -1;
1039         }
1040
1041         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1042         *size = 0x1000;
1043
1044         return 0;
1045 }
1046
1047 static int
1048 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1049 {
1050         struct internal_list *list;
1051
1052         list = find_internal_resource_by_vdev(vdev);
1053         if (list == NULL) {
1054                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1055                 return -1;
1056         }
1057
1058         *queue_num = list->internal->max_queues;
1059
1060         return 0;
1061 }
1062
1063 static int
1064 ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1065 {
1066         struct internal_list *list;
1067
1068         list = find_internal_resource_by_vdev(vdev);
1069         if (list == NULL) {
1070                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1071                 return -1;
1072         }
1073
1074         *features = list->internal->features;
1075
1076         return 0;
1077 }
1078
1079 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1080                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1081                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1082                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1083                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1084                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
1085                  1ULL << VHOST_USER_PROTOCOL_F_STATUS)
1086 static int
1087 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1088 {
1089         RTE_SET_USED(vdev);
1090
1091         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1092         return 0;
1093 }
1094
1095 static int
1096 ifcvf_set_vring_state(int vid, int vring, int state)
1097 {
1098         struct rte_vdpa_device *vdev;
1099         struct internal_list *list;
1100         struct ifcvf_internal *internal;
1101         struct ifcvf_hw *hw;
1102         struct ifcvf_pci_common_cfg *cfg;
1103         int ret = 0;
1104
1105         vdev = rte_vhost_get_vdpa_device(vid);
1106         list = find_internal_resource_by_vdev(vdev);
1107         if (list == NULL) {
1108                 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1109                 return -1;
1110         }
1111
1112         internal = list->internal;
1113         if (vring < 0 || vring >= internal->max_queues * 2) {
1114                 DRV_LOG(ERR, "Invalid vring index %d", vring);
1115                 return -1;
1116         }
1117
1118         hw = &internal->hw;
1119         if (!internal->configured)
1120                 goto exit;
1121
1122         cfg = hw->common_cfg;
1123         IFCVF_WRITE_REG16(vring, &cfg->queue_select);
1124         IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);
1125
1126         if (!state && hw->vring[vring].enable) {
1127                 ret = vdpa_disable_vfio_intr(internal);
1128                 if (ret)
1129                         return ret;
1130         }
1131
1132         if (state && !hw->vring[vring].enable) {
1133                 ret = vdpa_enable_vfio_intr(internal, 0);
1134                 if (ret)
1135                         return ret;
1136         }
1137
1138 exit:
1139         hw->vring[vring].enable = !!state;
1140         return 0;
1141 }
1142
1143 static struct rte_vdpa_dev_ops ifcvf_ops = {
1144         .get_queue_num = ifcvf_get_queue_num,
1145         .get_features = ifcvf_get_vdpa_features,
1146         .get_protocol_features = ifcvf_get_protocol_features,
1147         .dev_conf = ifcvf_dev_config,
1148         .dev_close = ifcvf_dev_close,
1149         .set_vring_state = ifcvf_set_vring_state,
1150         .set_features = ifcvf_set_features,
1151         .migration_done = NULL,
1152         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1153         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1154         .get_notify_area = ifcvf_get_notify_area,
1155 };
1156
1157 static inline int
1158 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1159 {
1160         uint16_t *n = extra_args;
1161
1162         if (value == NULL || extra_args == NULL)
1163                 return -EINVAL;
1164
1165         *n = (uint16_t)strtoul(value, NULL, 0);
1166         if (*n == USHRT_MAX && errno == ERANGE)
1167                 return -1;
1168
1169         return 0;
1170 }
1171
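/*
 * The driver only takes over a VF when the "vdpa" devarg is set, e.g. with
 * testpmd (PCI address illustrative):
 *     dpdk-testpmd -a 0000:06:01.1,vdpa=1[,sw-live-migration=1] ...
 */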
1172 static int
1173 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1174                 struct rte_pci_device *pci_dev)
1175 {
1176         uint64_t features;
1177         struct ifcvf_internal *internal = NULL;
1178         struct internal_list *list = NULL;
1179         int vdpa_mode = 0;
1180         int sw_fallback_lm = 0;
1181         struct rte_kvargs *kvlist = NULL;
1182         int ret = 0;
1183
1184         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1185                 return 0;
1186
1187         if (!pci_dev->device.devargs)
1188                 return 1;
1189
1190         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1191                         ifcvf_valid_arguments);
1192         if (kvlist == NULL)
1193                 return 1;
1194
1195         /* probe only when vdpa mode is specified */
1196         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1197                 rte_kvargs_free(kvlist);
1198                 return 1;
1199         }
1200
1201         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1202                         &vdpa_mode);
1203         if (ret < 0 || vdpa_mode == 0) {
1204                 rte_kvargs_free(kvlist);
1205                 return 1;
1206         }
1207
1208         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1209         if (list == NULL)
1210                 goto error;
1211
1212         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1213         if (internal == NULL)
1214                 goto error;
1215
1216         internal->pdev = pci_dev;
1217         rte_spinlock_init(&internal->lock);
1218
1219         if (ifcvf_vfio_setup(internal) < 0) {
1220                 DRV_LOG(ERR, "failed to set up device %s", pci_dev->name);
1221                 goto error;
1222         }
1223
1224         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1225                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1226                 goto error;
1227         }
1228
1229         internal->configured = 0;
1230         internal->max_queues = IFCVF_MAX_QUEUES;
1231         features = ifcvf_get_features(&internal->hw);
1232         internal->features = (features &
1233                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1234                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1235                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1236                 (1ULL << VIRTIO_NET_F_STATUS) |
1237                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1238                 (1ULL << VHOST_F_LOG_ALL);
1239
1240         list->internal = internal;
1241
1242         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1243                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1244                                 &open_int, &sw_fallback_lm);
1245                 if (ret < 0)
1246                         goto error;
1247         }
1248         internal->sw_lm = sw_fallback_lm;
1249
1250         internal->vdev = rte_vdpa_register_device(&pci_dev->device, &ifcvf_ops);
1251         if (internal->vdev == NULL) {
1252                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1253                 goto error;
1254         }
1255
1256         pthread_mutex_lock(&internal_list_lock);
1257         TAILQ_INSERT_TAIL(&internal_list, list, next);
1258         pthread_mutex_unlock(&internal_list_lock);
1259
1260         rte_atomic32_set(&internal->started, 1);
1261         update_datapath(internal);
1262
1263         rte_kvargs_free(kvlist);
1264         return 0;
1265
1266 error:
1267         rte_kvargs_free(kvlist);
1268         rte_free(list);
1269         rte_free(internal);
1270         return -1;
1271 }
1272
1273 static int
1274 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1275 {
1276         struct ifcvf_internal *internal;
1277         struct internal_list *list;
1278
1279         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1280                 return 0;
1281
1282         list = find_internal_resource_by_dev(pci_dev);
1283         if (list == NULL) {
1284                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1285                 return -1;
1286         }
1287
1288         internal = list->internal;
1289         rte_atomic32_set(&internal->started, 0);
1290         update_datapath(internal);
1291
1292         rte_pci_unmap_device(internal->pdev);
1293         rte_vfio_container_destroy(internal->vfio_container_fd);
1294         rte_vdpa_unregister_device(internal->vdev);
1295
1296         pthread_mutex_lock(&internal_list_lock);
1297         TAILQ_REMOVE(&internal_list, list, next);
1298         pthread_mutex_unlock(&internal_list_lock);
1299
1300         rte_free(list);
1301         rte_free(internal);
1302
1303         return 0;
1304 }
1305
1306 /*
1307  * IFCVF has the same vendor ID and device ID as a virtio-net PCI
1308  * device, and is matched by its own subsystem vendor and device IDs.
1309  */
1310 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1311         { .class_id = RTE_CLASS_ANY_ID,
1312           .vendor_id = IFCVF_VENDOR_ID,
1313           .device_id = IFCVF_DEVICE_ID,
1314           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1315           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1316         },
1317
1318         { .vendor_id = 0, /* sentinel */
1319         },
1320 };
1321
1322 static struct rte_pci_driver rte_ifcvf_vdpa = {
1323         .id_table = pci_id_ifcvf_map,
1324         .drv_flags = 0,
1325         .probe = ifcvf_pci_probe,
1326         .remove = ifcvf_pci_remove,
1327 };
1328
1329 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1330 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1331 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");